# In [None]:
import multiprocessing
import tensorflow as tf
import pandas as pd
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from collections import Counter
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional, SpatialDropout1D
import os
import re
import nltk
from transformers import AutoTokenizer, TFAutoModel
from sklearn.feature_extraction.text import TfidfVectorizer
import gc
from transformers import AutoModel, AutoTokenizer
import pickle
from pathlib import Path
import csv
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder
import nltk

# --- Module-level runtime setup (executed on import) ------------------------

# Enable on-demand GPU memory growth for every GPU TensorFlow can see.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
    except RuntimeError as e:
        # Best effort: the error is printed and the script continues with
        # TensorFlow's current memory settings.
        # NOTE(review): per the TensorFlow docs this is raised when the GPUs
        # were already initialised before memory growth was requested.
        print(e)

# Mixed precision is currently disabled; the two lines below are kept for
# reference only.
#policy = tf.keras.mixed_precision.Policy('mixed_float16')
#tf.keras.mixed_precision.set_global_policy(policy)

# Fetch the 'punkt_tab' NLTK resource at import time.
# NOTE(review): presumably required by word_tokenize in process_text; because
# this runs at module level it is repeated on every import of this module.
nltk.download('punkt_tab')
# NOTE(review): this is assigned after `import tensorflow` has already run;
# confirm the setting still takes effect, otherwise it needs to be set before
# the TensorFlow import.
os.environ['TF_ENABLE_ONEDNN_OPTS'] = '0'

# Running total updated by load_data_with_tensorflow: the summed len() of every
# kept 'content' value. main() reports it converted to GB.
total_data_loaded = 0  # Variable to accumulate the total size of loaded data

def load_data_with_tensorflow(filepath, chunksize=50000, max_chunks=20):
    """Yield cleaned ``(content, type)`` batches read from a CSV file.

    The file is read with ``tf.data.experimental.make_csv_dataset`` in batches
    of ``chunksize`` rows, keeping only the ``content`` and ``type`` columns
    (``type`` is used as the label). Within each batch, rows whose content or
    label is blank after stripping whitespace are dropped.

    Side effect: the module-level ``total_data_loaded`` counter is increased
    by the summed ``len()`` of every kept content value.

    NOTE(review): ``make_csv_dataset`` shuffles rows by default according to
    the TensorFlow docs, so the rows that fall inside the first ``max_chunks``
    batches may differ between runs -- confirm whether that is intended.

    Args:
        filepath: CSV path (or pattern) passed straight to
            ``make_csv_dataset``.
        chunksize: Number of rows per batch.
        max_chunks: Maximum number of batches to read. The default of 20
            matches the limit that used to be hard-coded; pass ``None`` to
            read the whole file.

    Yields:
        Tuple ``(valid_features, valid_labels)`` of NumPy arrays holding the
        kept content values and their labels for one batch. The values are
        whatever ``Tensor.numpy()`` returns for the string columns; callers in
        this file decode them from UTF-8 bytes.
    """
    global total_data_loaded

    dataset = tf.data.experimental.make_csv_dataset(
        filepath,
        batch_size=chunksize,
        label_name='type',
        select_columns=['content', 'type'],
        num_epochs=1,
        ignore_errors=True,
        header=True
    )
    if max_chunks is not None:
        dataset = dataset.take(max_chunks)

    for features_dict, labels in tqdm(dataset, desc="Loading Data"):
        features_np = features_dict['content'].numpy()
        labels_np = labels.numpy()

        # Keep only rows where both the content and the type are non-blank.
        valid_indices = [
            i
            for i, (feature, label) in enumerate(zip(features_np, labels_np))
            if feature.strip() and label.strip()
        ]
        valid_features = features_np[valid_indices]
        valid_labels = labels_np[valid_indices]

        total_data_loaded += sum(len(feature) for feature in valid_features)
        yield valid_features, valid_labels

def preprocess_data(features, labels, stop_words, stemmer):
    """Clean one batch of documents in parallel and map labels to classes.

    Every entry of ``features`` is passed to ``process_text`` in a worker
    pool sized to the CPU count. Documents whose processed text comes back
    empty are dropped together with their label, so all returned lists stay
    aligned index by index.

    Args:
        features: Iterable of raw documents (UTF-8 bytes or str).
        labels: Iterable of raw labels aligned with ``features``; bytes
            values are decoded as UTF-8 before classification.
        stop_words: Collection of tokens to remove, forwarded to
            ``process_text``.
        stemmer: Object with a ``stem(token)`` method, forwarded to
            ``process_text``. It must be picklable because it is sent to the
            worker processes.

    Returns:
        Tuple of seven lists: processed texts, class labels produced by
        ``classify_news_type``, vocabulary sizes before filtering, vocabulary
        sizes after filtering, URL counts, date counts and numeric counts.
    """
    # The context manager guarantees the workers are torn down even when
    # starmap raises; starmap itself blocks until every result is back.
    with multiprocessing.Pool(processes=multiprocessing.cpu_count()) as pool:
        processed_results = pool.starmap(
            process_text,
            [(text, stop_words, stemmer) for text in features],
        )

    processed_features = []
    processed_labels = []
    vocab_sizes = []  # Vocabulary size per document before filtering
    vocab_sizes_after_filtering = []
    urls_counts = []
    dates_counts = []
    numerics_counts = []
    for result, label in zip(processed_results, labels):
        # process_text always returns a 6-tuple, which is truthy even for an
        # empty document, so the processed text itself has to be checked.
        if not result or not result[0]:
            continue

        (processed_text, vocab_size, vocab_size_after_filtering,
         num_urls, num_dates, num_numerics) = result
        if isinstance(label, bytes):
            label = label.decode('utf-8', errors='replace')

        processed_features.append(processed_text)
        processed_labels.append(classify_news_type(label))
        vocab_sizes.append(vocab_size)
        vocab_sizes_after_filtering.append(vocab_size_after_filtering)
        urls_counts.append(num_urls)
        dates_counts.append(num_dates)
        numerics_counts.append(num_numerics)

    return (processed_features, processed_labels, vocab_sizes,
            vocab_sizes_after_filtering, urls_counts, dates_counts,
            numerics_counts)

# Compiled once at import time so the patterns are not looked up again for
# every document.
_URL_PATTERN = re.compile(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+')
_DATE_PATTERN = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}')  # e.g. 01/01/2022
_NUMERIC_PATTERN = re.compile(r'\b\d+\b')  # standalone integers


def process_text(text, stop_words, stemmer):
    """Tokenise, filter and stem one document and collect simple statistics.

    Args:
        text: The raw document, either UTF-8 encoded ``bytes`` or ``str``.
            Bytes that are not valid UTF-8 are replaced with U+FFFD instead
            of raising, so a single malformed document cannot abort a whole
            batch.
        stop_words: Collection of lower-case tokens to drop.
        stemmer: Object exposing ``stem(token)``.

    Returns:
        Tuple ``(processed_text, vocab_size, vocab_size_after_filtering,
        num_urls, num_dates, num_numerics)``:

        * ``processed_text`` -- space-joined stems of the alphabetic,
          non-stop-word tokens of the lower-cased text;
        * ``vocab_size`` -- number of distinct tokens before filtering;
        * ``vocab_size_after_filtering`` -- number of distinct stems kept;
        * ``num_urls`` / ``num_dates`` / ``num_numerics`` -- regex match
          counts on the decoded text.

        A blank document yields ``('', 0, 0, 0, 0, 0)``.
    """
    if not text.strip():  # Empty or whitespace-only input
        return '', 0, 0, 0, 0, 0

    # Decode only when needed so that str input is accepted as well.
    if isinstance(text, bytes):
        text = text.decode('utf-8', errors='replace')

    num_urls = len(_URL_PATTERN.findall(text))
    num_dates = len(_DATE_PATTERN.findall(text))
    num_numerics = len(_NUMERIC_PATTERN.findall(text))

    tokens = word_tokenize(text.lower())
    vocab_size = len(set(tokens))
    filtered_tokens = [
        stemmer.stem(token)
        for token in tokens
        if token not in stop_words and token.isalpha()
    ]
    vocab_size_after_filtering = len(set(filtered_tokens))
    processed_text = ' '.join(filtered_tokens)
    return (processed_text, vocab_size, vocab_size_after_filtering,
            num_urls, num_dates, num_numerics)

def classify_news_type(news_type):
    """Collapse a raw corpus label into 'Fake', 'Reliable' or 'Neutral'.

    The comparison is case-insensitive. Any label that is not listed in the
    table below is reported as 'Neutral'.
    """
    category_by_label = {
        'fake': 'Fake',
        'conspiracy': 'Fake',
        'unreliable': 'Fake',
        'satire': 'Fake',
        'bias': 'Fake',
        'political': 'Reliable',
        'reliable': 'Reliable',
    }
    return category_by_label.get(news_type.lower(), 'Neutral')

def get_tokens_size_on_disk(tokens, filename):
    """Write ``tokens`` to a one-column CSV and return the file size in bytes.

    The file at ``filename`` is created or overwritten. It starts with a
    'Stemmed Tokens' header row, followed by one row per token.
    """
    header_row = ['Stemmed Tokens']
    with open(filename, 'w', newline='', encoding='utf-8') as csv_file:
        csv_writer = csv.writer(csv_file)
        csv_writer.writerow(header_row)
        csv_writer.writerows([token] for token in tokens)
    return os.stat(filename).st_size

def bytes_to_gb(size_in_bytes):
    """Convert a byte count to gigabytes, using 1 GB = 1024 * 1024 * 1024 bytes."""
    bytes_per_gb = 1024 * 1024 * 1024
    return size_in_bytes / bytes_per_gb


def build_lstm_model(max_features, embedding_dim, lstm_units, maxlen, num_classes=3):
    """Build and compile the embedding + LSTM text classifier.

    Layer stack: Embedding -> SpatialDropout1D(0.3) -> LSTM -> softmax Dense.
    The model is compiled with categorical cross-entropy, the Adam optimizer
    and accuracy as the tracked metric, so it expects one-hot encoded labels.

    Args:
        max_features: Size of the embedding vocabulary (input token ids must
            be smaller than this value).
        embedding_dim: Width of each embedding vector.
        lstm_units: Nominal LSTM size. Note that the LSTM layer is created
            with ``lstm_units // 2`` units.
        maxlen: Length of the padded input sequences.
        num_classes: Number of output classes. Defaults to 3 (Fake, Reliable,
            Neutral); it must equal the width of the one-hot label matrix.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential([
        Embedding(max_features, embedding_dim, input_length=maxlen),
        SpatialDropout1D(0.3),
        LSTM(lstm_units // 2, dropout=0.1, recurrent_dropout=0.1),
        Dense(num_classes, activation='softmax'),
    ])
    model.compile(
        loss='categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return model

# Helper that wraps in-memory arrays in a batched, prefetching tf.data pipeline.
def create_tf_dataset(features, labels, batch_size=512, shuffle=True, buffer_size=10000):
    """Wrap aligned feature/label arrays in a batched ``tf.data.Dataset``.

    Args:
        features: Array-like of model inputs; sliced along the first axis.
        labels: Array-like of targets aligned with ``features``.
        batch_size: Number of examples per batch.
        shuffle: Whether to shuffle examples before batching. Defaults to
            True, which is the previous unconditional behaviour; pass False
            for evaluation data where ordering does not need to change.
        buffer_size: Size of the shuffle buffer. Only used when ``shuffle``
            is true.

    Returns:
        A ``tf.data.Dataset`` yielding ``(features, labels)`` batches with
        prefetching tuned by ``tf.data.AUTOTUNE``.
    """
    dataset = tf.data.Dataset.from_tensor_slices((features, labels))
    if shuffle:
        dataset = dataset.shuffle(buffer_size=buffer_size)
    return dataset.batch(batch_size).prefetch(tf.data.AUTOTUNE)

def _report_metrics(y_true, y_pred, class_names):
    """Print the per-class report followed by weighted summary metrics."""
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, target_names=class_names))

    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted')
    recall = recall_score(y_true, y_pred, average='weighted')
    f1 = f1_score(y_true, y_pred, average='weighted')

    print(f"\nAccuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")


def _plot_training_history(history):
    """Show train/validation accuracy and loss curves side by side."""
    plt.figure(figsize=(12, 4))
    panels = (
        (1, 'accuracy', 'Accuracy'),
        (2, 'loss', 'Loss'),
    )
    for position, metric, title in panels:
        plt.subplot(1, 2, position)
        plt.plot(history.history[metric], label=f'Train {title}')
        plt.plot(history.history[f'val_{metric}'], label=f'Validation {title}')
        plt.title(f'Model {title}')
        plt.ylabel(title)
        plt.xlabel('Epoch')
        plt.legend()
    plt.show()


def main():
    """Run the full pipeline: load, preprocess, train, evaluate and plot.

    Steps:
        1. Stream the CSV in batches and preprocess each batch in parallel.
        2. Print corpus statistics (vocabulary sizes, loaded data volume and
           the summed on-disk size of the per-document token files).
        3. One-hot encode the labels and split 80/20 into train and test.
        4. Tokenise and pad the texts, then train the LSTM with early
           stopping on the validation loss.
        5. Print the classification metrics and plot the training history.

    The function returns early, after printing a message, when no data could
    be loaded.
    """
    filepath = 'news_cleaned_2018_02_13.csv'  # Adjust the file path as needed
    chunksize = 50000
    nltk.download('punkt', quiet=True)
    nltk.download('stopwords', quiet=True)
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()

    dataset = load_data_with_tensorflow(filepath, chunksize=chunksize)

    X_chunks = []
    y_chunks = []
    total_vocab_size = 0
    total_vocab_size_after_filtering = 0
    total_tokens_size_on_disk = 0
    tokens_filename = "tokens.csv"
    for chunk_num, (features, labels) in enumerate(dataset, start=1):
        # The URL/date/numeric counts are returned but not used here.
        (processed_features, processed_labels, vocab_sizes,
         vocab_sizes_after_filtering, _urls_count, _dates_count,
         _numerics_count) = preprocess_data(features, labels, stop_words, stemmer)

        if not processed_features:
            print(f"No features loaded in chunk {chunk_num}.")
        if not processed_labels:
            print(f"No labels loaded in chunk {chunk_num}.")

        # Measure the on-disk size of each document's tokens. The same file
        # is overwritten for every document; only the summed size is kept.
        total_tokens_size_on_disk += sum(
            get_tokens_size_on_disk(text.split(), tokens_filename)
            for text in processed_features
        )

        X_chunks.extend(processed_features)
        y_chunks.extend(processed_labels)
        total_vocab_size += sum(vocab_sizes)
        total_vocab_size_after_filtering += sum(vocab_sizes_after_filtering)

    print(f"Total Vocab Size: {total_vocab_size}")
    print(f"Total Vocab Size after stemming: {total_vocab_size_after_filtering}")
    print(f"Total data loaded: {total_data_loaded / (1024 ** 3):.6f} GB")
    print(f"Total size of all tokens on disk: {bytes_to_gb(total_tokens_size_on_disk):.6f} GB")

    if not X_chunks or not y_chunks:
        print("No data loaded. Please check the dataset or adjust parameters.")
        return

    X = X_chunks
    y = y_chunks

    # Encode labels as integers, then one-hot for categorical cross-entropy.
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y)
    y_categorical = to_categorical(y_encoded)

    # Split data
    X_train, X_test, y_train, y_test = train_test_split(
        X, y_categorical, test_size=0.2, random_state=42
    )

    # TF-IDF vectorization. These features are not consumed by the LSTM
    # below; they are kept sparse so the (documents x 5000) matrix is never
    # materialised as a dense array.
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    # Tokenization for LSTM (vocabulary fitted on the training split only)
    max_features = 5000
    tokenizer = Tokenizer(num_words=max_features, oov_token="<OOV>")
    tokenizer.fit_on_texts(X_train)
    X_train_seq = tokenizer.texts_to_sequences(X_train)
    X_test_seq = tokenizer.texts_to_sequences(X_test)

    # Defensive clamp so that no token id can fall outside the embedding
    # table of size max_features.
    X_train_seq = [[min(token, max_features - 1) for token in seq] for seq in X_train_seq]
    X_test_seq = [[min(token, max_features - 1) for token in seq] for seq in X_test_seq]

    # Padding sequences
    maxlen = 200
    X_train_pad = pad_sequences(X_train_seq, maxlen=maxlen)
    X_test_pad = pad_sequences(X_test_seq, maxlen=maxlen)

    # Build LSTM model
    embedding_dim = 128
    lstm_units = 64
    model = build_lstm_model(max_features, embedding_dim, lstm_units, maxlen)

    # Batched, prefetching input pipelines for training and validation
    train_dataset = create_tf_dataset(X_train_pad, y_train, batch_size=512)
    val_dataset = create_tf_dataset(X_test_pad, y_test, batch_size=512)

    # Stop once the validation loss has not improved for two epochs and
    # roll back to the best weights seen.
    early_stop = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss',
        patience=2,
        restore_best_weights=True
    )

    # Train the model
    history = model.fit(
        train_dataset,
        epochs=5,
        validation_data=val_dataset,
        callbacks=[early_stop],
        verbose=1
    )

    # Evaluate the model on the held-out split
    y_pred = model.predict(X_test_pad)
    y_pred_classes = np.argmax(y_pred, axis=1)
    y_test_classes = np.argmax(y_test, axis=1)

    _report_metrics(y_test_classes, y_pred_classes, label_encoder.classes_)
    _plot_training_history(history)

# Entry-point guard: run the pipeline only when this file is executed as a
# script, not when it is imported. preprocess_data uses multiprocessing.Pool,
# and worker processes started with the "spawn" method re-import this module,
# so the guard also keeps workers from re-running main().
if __name__ == "__main__":
    main()

