In [1]:
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Bidirectional, Dense, Dropout

# Fetch the NLTK resources requested by this notebook.
for nltk_package in ('stopwords', 'punkt'):
    nltk.download(nltk_package)

# Locations of the two source CSV files.
true_news_path, fake_news_path = './True.csv', './Fake.csv'

# Read each file and attach its class label in the same step: 1 = True, 0 = Fake.
true_news_df = pd.read_csv(true_news_path).assign(label=1)
fake_news_df = pd.read_csv(fake_news_path).assign(label=0)

# Stack both frames into one dataset with a fresh consecutive index.
df = pd.concat([true_news_df, fake_news_df], ignore_index=True)
print("Dataset loaded. Sample data:")
print(df.sample(5))

[nltk_data] Downloading package stopwords to C:\Users\PC
[nltk_data]     VISION\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\PC
[nltk_data]     VISION\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Dataset loaded. Sample data:
                                                   title  \
5769   Highlights: The Trump presidency on February 1...   
24756   George Takei NAILS Republican Hypocrisy On Tr...   
24283   Trump Just Signed Away Our Environment, And A...   
16070  New Zealand PM says ban on foreign home buyers...   
39371  WATCH: RACIST RAPPER WHO HUNG WHITE KID In Lat...   

                                                    text       subject  \
5769   (Reuters) - Highlights of the day for U.S. Pre...  politicsNews   
24756  Donald Trump insists that he doesn t have to d...          News   
24283  Donald Trump betrayed Americans on Tuesday by ...          News   
16070  WELLINGTON (Reuters) - New Zealand Prime Minis...     worldnews   
39371  Rapper XXXTentacion released a controversial v...     left-news   

                    date  label  
5769   February 1, 2017       1  
24756  December 20, 2016      0  
24283   January 24, 2017      0  
16070  October 31, 2017      

Step 1: Import Libraries and Load Dataset

Step 2: Preprocessing the Text

In [2]:
# Function to clean text
_ENGLISH_STOP_WORDS = None  # filled on first use so the NLTK corpus is read only once


def clean_text(text, stop_words=None):
    """Strip punctuation, lowercase, and drop stop words from one document.

    Args:
        text: The raw document. Anything that is not a ``str`` (for example
            the NaN pandas produces for an empty CSV field) is treated as an
            empty document instead of raising inside ``re.sub``.
        stop_words: Optional collection of lowercase words to remove. When
            omitted, NLTK's English stop-word list is used; it is built once
            and cached rather than rebuilt on every call.

    Returns:
        The remaining words joined by single spaces.
    """
    global _ENGLISH_STOP_WORDS
    if not isinstance(text, str):
        return ''
    if stop_words is None:
        if _ENGLISH_STOP_WORDS is None:
            _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
        stop_words = _ENGLISH_STOP_WORDS
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = text.lower()  # Lowercase so matching against the stop words is case-insensitive
    return ' '.join(word for word in text.split() if word not in stop_words)

# Normalise every article body with clean_text and keep the result in its own column.
df['cleaned_text'] = df['text'].map(clean_text)

# Build the vocabulary: the 10000 most frequent words are kept, everything else maps to '<OOV>'.
# NOTE(review): the tokenizer is fitted on all rows, i.e. before the train/validation/test
# split performed in the next cell.
tokenizer = Tokenizer(oov_token='<OOV>', num_words=10000)
tokenizer.fit_on_texts(df['cleaned_text'])

# Turn each document into integer ids, then pad/truncate at the end to exactly 100 ids.
sequences = tokenizer.texts_to_sequences(df['cleaned_text'])
padded_sequences = pad_sequences(
    sequences,
    maxlen=100,
    truncating='post',
    padding='post',
)

# Target vector (1 = true news, 0 = fake news), as a NumPy array.
labels = df['label'].to_numpy()

Step 3: Dataset Splitting

In [3]:
# Split dataset into training, validation, and test sets.
# Step 1: hold out 30% of the rows; the remaining 70% is the training set.
X_train, X_temp, y_train, y_temp = train_test_split(
    padded_sequences,
    labels,
    random_state=42,
    test_size=0.3,
)
# Step 2: cut the hold-out in half, giving validation and test sets of equal size.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp,
    y_temp,
    random_state=42,
    test_size=0.5,
)

print(f"Training set size: {len(X_train)}, Validation set size: {len(X_val)}, Test set size: {len(X_test)}")

Training set size: 31428, Validation set size: 6735, Test set size: 6735


Step 4: Pretrained Embeddings Setup
Word2Vec, FastText, and GloVe Pretrained Embeddings

In [None]:
import gensim.downloader as api

# Load pretrained embeddings
# Both vector sets are fetched by name through gensim's downloader (imported above as `api`).
# Further down in this cell they are passed to create_embedding_matrix with dim=300,
# matching the "-300" suffix of the resource names.
word2vec_model = api.load("word2vec-google-news-300")
fasttext_model = api.load("fasttext-wiki-news-subwords-300")
# Download GloVe embeddings and load them
import requests, zipfile, os

glove_url = "http://nlp.stanford.edu/data/glove.6B.zip"
glove_zip_path = "glove.6B.zip"
glove_vectors_file = "glove.6B.100d.txt"  # the only archive member read later in this cell

if os.path.exists(glove_vectors_file):
    # Re-running the cell must not trigger another multi-hundred-megabyte download.
    print("GloVe embeddings already present; skipping download.")
else:
    if not os.path.exists(glove_zip_path):
        # Stream to a temporary name so an interrupted transfer never leaves a
        # truncated file that looks like a complete archive.
        partial_zip_path = glove_zip_path + ".part"
        with requests.get(glove_url, stream=True, timeout=60) as response:
            response.raise_for_status()  # fail loudly instead of saving an HTML error page as a .zip
            with open(partial_zip_path, 'wb') as f:
                # 1 MiB chunks keep memory use flat regardless of archive size.
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        os.replace(partial_zip_path, glove_zip_path)

    with zipfile.ZipFile(glove_zip_path, 'r') as zip_ref:
        zip_ref.extractall()
    print("GloVe embeddings downloaded and extracted.")

# Load GloVe (100-dimensional)
def load_glove_embeddings(filepath):
    """Read a GloVe-style text file into a ``{word: vector}`` dictionary.

    Every non-blank line is split on whitespace; the first field is the
    token and the remaining fields are its coefficients.

    Args:
        filepath: Path to the UTF-8 encoded vectors file.

    Returns:
        dict mapping each token to a 1-D ``float32`` NumPy array.

    Raises:
        ValueError: If a coefficient field cannot be converted to float.
    """
    embeddings = {}
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            values = line.split()
            if not values:
                # A blank (typically trailing) line has no token; skip it
                # rather than failing with IndexError on values[0].
                continue
            embeddings[values[0]] = np.asarray(values[1:], dtype='float32')
    return embeddings

# Parse the 100d vectors file into a {word: vector} dict via load_glove_embeddings.
glove_embeddings = load_glove_embeddings("glove.6B.100d.txt")

# Create embedding matrices
def create_embedding_matrix(embeddings, tokenizer, dim):
    """Assemble a weight matrix aligned with the tokenizer's word indices.

    Args:
        embeddings: Lookup that supports ``word in embeddings`` and
            ``embeddings[word]`` and yields a length-``dim`` vector.
        tokenizer: Object exposing a ``word_index`` mapping of word -> row.
        dim: Number of columns in the matrix.

    Returns:
        A ``(len(tokenizer.word_index) + 1, dim)`` NumPy array. Rows whose
        word is missing from ``embeddings`` are left as zeros.
    """
    vocabulary = tokenizer.word_index
    weights = np.zeros((len(vocabulary) + 1, dim))
    # Lazily pair each covered word with its target row, in vocabulary order.
    covered = ((row, token) for token, row in vocabulary.items() if token in embeddings)
    for row, token in covered:
        weights[row] = embeddings[token]
    return weights

# One weight matrix per pretrained source, each row-aligned with tokenizer.word_index.
# The dim argument mirrors the size suffix of each resource loaded above
# ("-300" for the two gensim models, "100d" for the GloVe file).
word2vec_matrix = create_embedding_matrix(word2vec_model, tokenizer, 300)
fasttext_matrix = create_embedding_matrix(fasttext_model, tokenizer, 300)
glove_matrix = create_embedding_matrix(glove_embeddings, tokenizer, 100)

Step 5: Model Creation Function

In [None]:
def create_model(embedding_matrix):
    """Build and compile an LSTM -> GRU binary classifier.

    Args:
        embedding_matrix: 2-D array whose shape supplies the Embedding
            layer's input and output sizes and whose values are used as
            its (non-trainable) weights.

    Returns:
        A compiled ``Sequential`` model with a single sigmoid output,
        trained with Adam on binary cross-entropy and reporting accuracy.
    """
    model = Sequential()
    # Embedding weights come from embedding_matrix and are frozen (trainable=False).
    model.add(Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=False,
    ))
    # return_sequences=True hands the full sequence of states to the GRU that follows.
    model.add(LSTM(128, return_sequences=True))
    model.add(Dropout(0.4))
    model.add(GRU(64))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

Step 6: Train and Evaluate Models

In [None]:
# Train and evaluate with different embeddings
pretrained_matrices = dict(
    Word2Vec=word2vec_matrix,
    FastText=fasttext_matrix,
    GloVe=glove_matrix,
)

# results[name] holds the (loss, accuracy) pair measured on the test split.
results = {}
for name in pretrained_matrices:
    print(f"Training model with {name} embeddings...")
    model = create_model(pretrained_matrices[name])
    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        batch_size=32,
        epochs=3,
    )
    loss, accuracy = model.evaluate(X_test, y_test)
    results[name] = (loss, accuracy)
    print(f"{name} - Loss: {loss}, Accuracy: {accuracy}")

Step 7: Custom Embeddings Training

In [None]:
from gensim.models import Word2Vec, FastText
from glove import Corpus, Glove

# Split every cleaned document into tokens once; all three trainers below consume the
# same token lists, so re-tokenising the whole corpus for each of them is wasted work.
tokenized_texts = [text.split() for text in df['cleaned_text']]

# Train custom Word2Vec embeddings
custom_word2vec = Word2Vec(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

# Train custom FastText embeddings
custom_fasttext = FastText(sentences=tokenized_texts, vector_size=100, window=5, min_count=1, workers=4)

# Train custom GloVe embeddings: build the co-occurrence matrix, fit vectors on it,
# then attach the word -> id dictionary so vectors can be looked up by word.
corpus = Corpus()
corpus.fit(tokenized_texts, window=5)
custom_glove = Glove(no_components=100, learning_rate=0.05)
custom_glove.fit(corpus.matrix, epochs=10, no_threads=4, verbose=True)
custom_glove.add_dictionary(corpus.dictionary)

# Save models
custom_word2vec.save("custom_word2vec.model")
custom_fasttext.save("custom_fasttext.model")
custom_glove.save("custom_glove.model")

Step 8: Custom Embeddings Evaluation

In [None]:
# Create matrices from custom embeddings
custom_word2vec_matrix = create_embedding_matrix(custom_word2vec.wv, tokenizer, 100)
custom_fasttext_matrix = create_embedding_matrix(custom_fasttext.wv, tokenizer, 100)

# custom_glove.dictionary maps word -> integer row id, not word -> vector. Feeding it
# straight into create_embedding_matrix would broadcast that integer across the whole
# row. Resolve each id to its trained vector in custom_glove.word_vectors first.
custom_glove_vectors = {
    word: custom_glove.word_vectors[index]
    for word, index in custom_glove.dictionary.items()
}
custom_glove_matrix = create_embedding_matrix(custom_glove_vectors, tokenizer, 100)

custom_matrices = {
    "Custom Word2Vec": custom_word2vec_matrix,
    "Custom FastText": custom_fasttext_matrix,
    "Custom GloVe": custom_glove_matrix
}

# Evaluate: one fresh model per custom embedding, scored on the test split.
for name in custom_matrices:
    print(f"Training model with {name} embeddings...")
    model = create_model(custom_matrices[name])
    model.fit(
        X_train,
        y_train,
        validation_data=(X_val, y_val),
        batch_size=32,
        epochs=3,
    )
    loss, accuracy = model.evaluate(X_test, y_test)
    results[name] = (loss, accuracy)
    print(f"{name} - Loss: {loss}, Accuracy: {accuracy}")

Step 1: Model Architecture Definitions

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, GRU, Bidirectional, Dense, Dropout

# Create LSTM Model
def create_lstm_model(embedding_matrix):
    """Build and compile a two-layer stacked LSTM binary classifier.

    Args:
        embedding_matrix: 2-D array used both for the Embedding layer's
            dimensions and as its frozen weights.

    Returns:
        A compiled ``Sequential`` model (Adam, binary cross-entropy,
        accuracy metric) ending in a single sigmoid unit.
    """
    frozen_embedding = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=False,
    )
    model = Sequential()
    for layer in (
        frozen_embedding,
        LSTM(128, return_sequences=True),  # keep every step for the second LSTM
        Dropout(0.4),
        LSTM(64),
        Dropout(0.4),
        Dense(1, activation='sigmoid'),
    ):
        model.add(layer)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Create GRU Model
def create_gru_model(embedding_matrix):
    """Build and compile a two-layer stacked GRU binary classifier.

    Args:
        embedding_matrix: 2-D array used both for the Embedding layer's
            dimensions and as its frozen weights.

    Returns:
        A compiled ``Sequential`` model (Adam, binary cross-entropy,
        accuracy metric) ending in a single sigmoid unit.
    """
    frozen_embedding = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=False,
    )
    model = Sequential()
    for layer in (
        frozen_embedding,
        GRU(128, return_sequences=True),  # keep every step for the second GRU
        Dropout(0.4),
        GRU(64),
        Dropout(0.4),
        Dense(1, activation='sigmoid'),
    ):
        model.add(layer)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Create Bi-LSTM Model
def create_bilstm_model(embedding_matrix):
    """Build and compile a two-layer bidirectional LSTM binary classifier.

    Args:
        embedding_matrix: 2-D array used both for the Embedding layer's
            dimensions and as its frozen weights.

    Returns:
        A compiled ``Sequential`` model (Adam, binary cross-entropy,
        accuracy metric) ending in a single sigmoid unit.
    """
    frozen_embedding = Embedding(
        input_dim=embedding_matrix.shape[0],
        output_dim=embedding_matrix.shape[1],
        weights=[embedding_matrix],
        trainable=False,
    )
    model = Sequential()
    for layer in (
        frozen_embedding,
        Bidirectional(LSTM(128, return_sequences=True)),  # keep every step for the next layer
        Dropout(0.4),
        Bidirectional(LSTM(64)),
        Dropout(0.4),
        Dense(1, activation='sigmoid'),
    ):
        model.add(layer)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

Step 2: Train and Evaluate Each Architecture (LSTM, GRU, Bi-LSTM) with Each Pretrained Embedding

In [None]:
# Dictionary of pretrained embedding matrices
pretrained_matrices = {
    "Word2Vec": word2vec_matrix,
    "FastText": fasttext_matrix,
    "GloVe": glove_matrix
}

# Model factories, keyed by the label used in the printed log lines and in `results`.
# Driving the run from this table replaces three copy-pasted train/evaluate stanzas.
architecture_builders = {
    "LSTM": create_lstm_model,
    "GRU": create_gru_model,
    "Bi-LSTM": create_bilstm_model,
}

# Train and evaluate each architecture with each embedding.
# results[embedding][architecture] -> (loss, accuracy) on the test split.
# NOTE(review): rebinding `results` discards the flat (loss, accuracy) entries that the
# earlier embedding-comparison cells stored under the same name.
results = {}
for name, matrix in pretrained_matrices.items():
    print(f"\nUsing {name} embeddings:")
    results[name] = {}

    for architecture, build_model in architecture_builders.items():
        print(f"Training {architecture}...")
        model = build_model(matrix)
        model.fit(X_train, y_train, epochs=3, batch_size=32, validation_data=(X_val, y_val))
        loss, accuracy = model.evaluate(X_test, y_test)
        print(f"{architecture} - Loss: {loss}, Accuracy: {accuracy}")

        # Record immediately so finished runs survive an interruption later in the sweep.
        results[name][architecture] = (loss, accuracy)

Step 3: Display Results

In [None]:
# Report the test loss/accuracy of every architecture, grouped by embedding.
for embedding_name in results:
    print(f"\nResults for {embedding_name} embeddings:")
    per_architecture = results[embedding_name]
    for model_type in per_architecture:
        loss, accuracy = per_architecture[model_type]
        print(f"{model_type} - Loss: {loss:.4f}, Accuracy: {accuracy:.4f}")

Step 9: Using BERT for Transformer-Based Embeddings

In [None]:
from transformers import BertTokenizer, TFBertModel
import tensorflow as tf

# Load BERT tokenizer and model
# Both come from the same "bert-base-uncased" checkpoint. `bert_model` is a module-level
# object that create_bert_model (defined in a later cell) calls directly.
bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
bert_model = TFBertModel.from_pretrained("bert-base-uncased")

# Prepare the dataset for BERT
def prepare_bert_input(texts, tokenizer, max_length=128):
    """Tokenize a collection of strings into fixed-length model inputs.

    All texts are encoded in one batched tokenizer call rather than one
    ``encode_plus`` call per text followed by a large ``tf.concat``.

    Args:
        texts: Iterable of strings (list, pandas Series, generator, ...).
        tokenizer: Hugging Face tokenizer; it is called directly with the
            list of texts.
        max_length: Every sequence is truncated or padded to this length.

    Returns:
        ``(input_ids, attention_masks)`` as returned by the tokenizer with
        ``return_tensors="tf"``: one row per text, ``max_length`` columns.
    """
    encoded = tokenizer(
        list(texts),  # materialise so Series and generators are handled alike
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="tf",
    )
    return encoded["input_ids"], encoded["attention_mask"]

# Tokenize train, validation, and test datasets
# X_train / X_val / X_test hold the padded integer sequences produced for the Keras
# models, which a text tokenizer cannot encode. Split the cleaned text itself, using the
# same proportions and seed as the sequence split, and take the labels from this same
# split so texts and targets are guaranteed to line up.
text_train, text_temp, bert_y_train, bert_y_temp = train_test_split(
    df['cleaned_text'], labels, test_size=0.3, random_state=42
)
text_val, text_test, bert_y_val, bert_y_test = train_test_split(
    text_temp, bert_y_temp, test_size=0.5, random_state=42
)

train_input_ids, train_attention_masks = prepare_bert_input(text_train, bert_tokenizer)
val_input_ids, val_attention_masks = prepare_bert_input(text_val, bert_tokenizer)
test_input_ids, test_attention_masks = prepare_bert_input(text_test, bert_tokenizer)

# Labels as float32 tensors, matching the sigmoid / binary cross-entropy head
train_labels = tf.convert_to_tensor(bert_y_train, dtype=tf.float32)
val_labels = tf.convert_to_tensor(bert_y_val, dtype=tf.float32)
test_labels = tf.convert_to_tensor(bert_y_test, dtype=tf.float32)

Step 10: Fine-Tuning BERT for Classification

In [None]:
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Define BERT-based classification model
def create_bert_model():
    """Wrap the module-level ``bert_model`` in a compiled binary classifier.

    The model takes two int32 inputs of length 128 named ``input_ids`` and
    ``attention_mask``, feeds element 1 of the encoder output through a
    64-unit ReLU layer, and ends in a single sigmoid unit.

    Returns:
        A compiled Keras ``Model`` (Adam at 2e-5, binary cross-entropy,
        accuracy metric).
    """
    token_ids_in = Input(shape=(128,), dtype=tf.int32, name="input_ids")
    mask_in = Input(shape=(128,), dtype=tf.int32, name="attention_mask")

    encoder_outputs = bert_model(token_ids_in, attention_mask=mask_in)
    pooled = encoder_outputs[1]  # Pooled output
    hidden = Dense(64, activation="relu")(pooled)
    probability = Dense(1, activation="sigmoid")(hidden)

    classifier = Model(inputs=[token_ids_in, mask_in], outputs=probability)
    classifier.compile(
        loss="binary_crossentropy",
        optimizer=Adam(learning_rate=2e-5),
        metrics=["accuracy"],
    )
    return classifier

# Instantiate and train the model
bert_classifier = create_bert_model()

# Inputs are passed by name; the keys match the Input layer names in create_bert_model.
bert_train_inputs = {"input_ids": train_input_ids, "attention_mask": train_attention_masks}
bert_val_inputs = {"input_ids": val_input_ids, "attention_mask": val_attention_masks}

bert_classifier.fit(
    x=bert_train_inputs,
    y=train_labels,
    validation_data=(bert_val_inputs, val_labels),
    batch_size=16,
    epochs=3,
)

Step 11: Evaluate BERT Model

In [None]:
# Score the fine-tuned BERT classifier on the held-out test split.
bert_test_inputs = {"input_ids": test_input_ids, "attention_mask": test_attention_masks}
test_loss, test_accuracy = bert_classifier.evaluate(x=bert_test_inputs, y=test_labels)

print(f"BERT Test Loss: {test_loss}")
print(f"BERT Test Accuracy: {test_accuracy}")

Step 12: Comparative Analysis of All Approaches
To summarize and compare results:

In [None]:
# Collect results in a dictionary
results["BERT"] = (test_loss, test_accuracy)

# Print all results.
# `results` can hold two shapes: a flat (loss, accuracy) pair (as for BERT above), or a
# dict of architecture -> (loss, accuracy) as written by the architecture-comparison
# cell. Unpacking every value as a pair would raise ValueError on the nested entries.
for embedding_type, scores in results.items():
    if isinstance(scores, dict):
        for model_type, (loss, accuracy) in scores.items():
            print(f"{embedding_type} / {model_type}: Loss = {loss:.4f}, Accuracy = {accuracy:.4f}")
    else:
        loss, accuracy = scores
        print(f"{embedding_type}: Loss = {loss:.4f}, Accuracy = {accuracy:.4f}")

Optional: Save Models for Future Use

In [None]:
import pickle

# Persist the trained classifier in HDF5 format.
bert_classifier.save("bert_fake_news_classifier.h5")

# Pickle the tokenizer object next to it so inference can reuse the same vocabulary.
with open("bert_tokenizer.pkl", "wb") as tokenizer_file:
    pickle.dump(bert_tokenizer, tokenizer_file)

Step 13: Additional Transformer-Based Architectures

Step 1: Prepare Input Using DistilBERT Tokenizer

In [None]:
from transformers import DistilBertTokenizer, TFDistilBertModel
import tensorflow as tf
from sklearn.model_selection import train_test_split

# Load DistilBERT tokenizer and model
# Both come from the same "distilbert-base-uncased" checkpoint. `distilbert_model` is a
# module-level object that create_distilbert_model (defined in a later cell) calls directly.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
distilbert_model = TFDistilBertModel.from_pretrained("distilbert-base-uncased")

# Function to prepare input for DistilBERT
def prepare_distilbert_input(texts, tokenizer, max_length=128):
    """Tokenize a collection of strings into fixed-length model inputs.

    All texts are encoded in one batched tokenizer call rather than one
    ``encode_plus`` call per text followed by a large ``tf.concat``.

    Args:
        texts: Iterable of strings (list, pandas Series, generator, ...).
        tokenizer: Hugging Face tokenizer; it is called directly with the
            list of texts.
        max_length: Every sequence is truncated or padded to this length.

    Returns:
        ``(input_ids, attention_masks)`` as returned by the tokenizer with
        ``return_tensors="tf"``: one row per text, ``max_length`` columns.
    """
    encoded = tokenizer(
        list(texts),  # materialise so Series and generators are handled alike
        add_special_tokens=True,
        max_length=max_length,
        truncation=True,
        padding="max_length",
        return_attention_mask=True,
        return_tensors="tf",
    )
    return encoded["input_ids"], encoded["attention_mask"]

# Prepare the dataset
# Split the cleaned text with the same proportions and seed as the sequence split.
# The text splits get their own names so that X_train / y_train and friends keep
# referring to the padded integer sequences used by the Keras models in earlier cells;
# rebinding them to strings would break those cells on re-execution.
text_train, text_temp, text_y_train, text_y_temp = train_test_split(
    df['cleaned_text'], labels, test_size=0.3, random_state=42
)
text_val, text_test, text_y_val, text_y_test = train_test_split(
    text_temp, text_y_temp, test_size=0.5, random_state=42
)

train_input_ids, train_attention_masks = prepare_distilbert_input(text_train, distilbert_tokenizer)
val_input_ids, val_attention_masks = prepare_distilbert_input(text_val, distilbert_tokenizer)
test_input_ids, test_attention_masks = prepare_distilbert_input(text_test, distilbert_tokenizer)

# Labels as float32 tensors, matching the sigmoid / binary cross-entropy head
train_labels = tf.convert_to_tensor(text_y_train, dtype=tf.float32)
val_labels = tf.convert_to_tensor(text_y_val, dtype=tf.float32)
test_labels = tf.convert_to_tensor(text_y_test, dtype=tf.float32)

Step 2: Create DistilBERT Classification Model

In [None]:
from tensorflow.keras.layers import Input, Dense, Dropout
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Define a model using DistilBERT embeddings
def create_distilbert_model():
    """Wrap the module-level ``distilbert_model`` in a compiled binary classifier.

    The model takes two int32 inputs of length 128 named ``input_ids`` and
    ``attention_mask``, slices position 0 from element 0 of the encoder
    output, and passes it through Dense(64, relu) -> Dropout(0.3) ->
    Dense(1, sigmoid).

    Returns:
        A compiled Keras ``Model`` (Adam at 2e-5, binary cross-entropy,
        accuracy metric).
    """
    token_ids_in = Input(shape=(128,), dtype=tf.int32, name="input_ids")
    mask_in = Input(shape=(128,), dtype=tf.int32, name="attention_mask")

    encoder_outputs = distilbert_model(token_ids_in, attention_mask=mask_in)
    first_token_state = encoder_outputs[0][:, 0, :]  # CLS token
    hidden = Dense(64, activation="relu")(first_token_state)
    regularised = Dropout(0.3)(hidden)
    probability = Dense(1, activation="sigmoid")(regularised)

    classifier = Model(inputs=[token_ids_in, mask_in], outputs=probability)
    classifier.compile(
        loss="binary_crossentropy",
        optimizer=Adam(learning_rate=2e-5),
        metrics=["accuracy"],
    )
    return classifier

# Instantiate the model: builds and compiles the classifier defined by
# create_distilbert_model above. Training happens in the next cell.
distilbert_classifier = create_distilbert_model()

Step 3: Train DistilBERT Model

In [None]:
# Train the model
# Inputs are passed by name; the keys match the Input layer names in create_distilbert_model.
distilbert_train_inputs = {"input_ids": train_input_ids, "attention_mask": train_attention_masks}
distilbert_val_inputs = {"input_ids": val_input_ids, "attention_mask": val_attention_masks}

distilbert_classifier.fit(
    x=distilbert_train_inputs,
    y=train_labels,
    validation_data=(distilbert_val_inputs, val_labels),
    batch_size=16,
    epochs=3,
)

Step 4: Evaluate DistilBERT Model

In [None]:
# Score the fine-tuned DistilBERT classifier on the held-out test split.
distilbert_test_inputs = {"input_ids": test_input_ids, "attention_mask": test_attention_masks}
test_loss, test_accuracy = distilbert_classifier.evaluate(x=distilbert_test_inputs, y=test_labels)

print(f"DistilBERT Test Loss: {test_loss}")
print(f"DistilBERT Test Accuracy: {test_accuracy}")

Optional: Save the Model

In [None]:
# Save the DistilBERT model and tokenizer for reuse
# `pickle` is otherwise imported only inside the optional BERT-saving cell; import it
# here too so this cell still runs when that optional cell was skipped.
import pickle

distilbert_classifier.save("distilbert_fake_news_classifier.h5")
with open("distilbert_tokenizer.pkl", "wb") as f:
    pickle.dump(distilbert_tokenizer, f)