In [46]:
# 1. Imports
import pandas as pd
import numpy as np
import re
from collections import Counter
import matplotlib.pyplot as plt

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
import seaborn as sns

# Set random seeds for reproducibility
np.random.seed(42)
torch.manual_seed(42)

<torch._C.Generator at 0x7f397c95cc30>

In [47]:
# 2. Load and preprocess datasets
# Load the Kaggle dataset
kaggle_df = pd.read_csv('Data/Data.csv')

# Load the ISOT datasets
true_df = pd.read_csv('Data/True.csv')
fake_df = pd.read_csv('Data/Fake.csv')

# For ISOT, assign labels: true news = 1 and fake news = 0
true_df['label'] = 1
fake_df['label'] = 0

# Combine the two ISOT datasets
isot_df = pd.concat([true_df, fake_df], ignore_index=True)

# Print dataset shapes
print("Kaggle dataset shape:", kaggle_df.shape)
print("ISOT dataset shape:", isot_df.shape)

# Combine Kaggle and ISOT datasets
df = pd.concat([kaggle_df, isot_df], ignore_index=True)

# Shuffle the combined dataset
df = df.sample(frac=1, random_state=42).reset_index(drop=True)
print("Combined dataset shape:", df.shape)

# Display class distribution
print("\nClass distribution:")
print(df['label'].value_counts())
print(f"Percentage of fake news: {df['label'].value_counts()[0]/len(df)*100:.2f}%")
print(f"Percentage of true news: {df['label'].value_counts()[1]/len(df)*100:.2f}%")

Kaggle dataset shape: (20800, 5)
ISOT dataset shape: (44898, 5)
Combined dataset shape: (65698, 7)

Class distribution:
label
0    33868
1    31830
Name: count, dtype: int64
Percentage of fake news: 51.55%
Percentage of true news: 48.45%


In [48]:
# 3. Handle missing values and create content column
# Drop rows missing 'text' (critical for classification)
df = df.dropna(subset=['text'])

# Fill missing 'title' values with a placeholder
df['title'] = df['title'].fillna("No Title Provided")

# If there is an 'author' column, fill missing values with "Unknown"
if 'author' in df.columns:
    df['author'] = df['author'].fillna("Unknown")

# Create combined content field
if 'content' not in df.columns:
    df['content'] = df['title'] + " " + df['text']

# Text cleaning function - keep it simple but effective
def clean_text(text):
    """Basic preprocessing of text"""
    text = text.lower()
    text = re.sub(r'http\S+|www\S+|https\S+', '', text)  # Remove URLs
    text = re.sub(r'[^\w\s]', ' ', text)  # Replace punctuation with space
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
    return text

# Apply text cleaning
df['cleaned_content'] = df['content'].apply(clean_text)

# Print a sample to verify
print("\nSample raw content:")
print(df['content'].iloc[0][:200])
print("\nSample cleaned content:")
print(df['cleaned_content'].iloc[0][:200])


Sample raw content:
‘We’re Just Not Into Her’ – Hillary Clinton Losing Support of Millennials Patrick Henningsen  21st Century WireWhat s up with the millennials? The Clinton campaign may have felt a jolt after seeing th

Sample cleaned content:
we re just not into her hillary clinton losing support of millennials patrick henningsen 21st century wirewhat s up with the millennials the clinton campaign may have felt a jolt after seeing the fron


In [49]:
# 4. Split into training, validation, and testing sets
X = df['cleaned_content']
y = df['label']

# Split 80% training, 20% testing
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# Further split training data into training and validation sets (90% train, 10% validation)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, test_size=0.1, random_state=42, stratify=y_train_full)

print(f"Training set: {len(X_train)} samples")
print(f"Validation set: {len(X_val)} samples")
print(f"Test set: {len(X_test)} samples")

# Simple but fast tokenizer function
def tokenize(text, max_words=None):
    tokens = text.split()
    if max_words:
        tokens = tokens[:max_words]
    return tokens

# Build vocabulary with better coverage (using top 15,000 words)
all_tokens = []
for text in X_train:
    all_tokens.extend(tokenize(text))

word_counts = Counter(all_tokens)
print(f"Total unique words found: {len(word_counts)}")

# Create vocabulary (0: padding, 1: unknown)
MAX_VOCAB_SIZE = 15000  # Increased from original implementation
MIN_WORD_FREQ = 3       # Only include words that appear at least 3 times

vocab = {'<pad>': 0, '<unk>': 1}
idx = 2
for word, count in word_counts.most_common(MAX_VOCAB_SIZE):
    if count >= MIN_WORD_FREQ:
        vocab[word] = idx
        idx += 1

vocab_size = len(vocab)
print(f"Vocabulary size: {vocab_size}")

# For inverse lookups
idx_to_word = {idx: word for word, idx in vocab.items()}

# Convert text to sequences
def text_to_sequence(text, vocab, max_len=300):
    tokens = tokenize(text, max_len)
    sequence = [vocab.get(token, vocab['<unk>']) for token in tokens]
    
    # Get actual length before padding
    length = len(sequence)
    
    # Pad sequence to uniform length
    if len(sequence) < max_len:
        sequence += [vocab['<pad>']] * (max_len - len(sequence))
    else:
        sequence = sequence[:max_len]
        length = max_len
        
    return sequence, length

MAX_LEN = 300
X_train_seqs = []
X_train_lengths = []
for text in X_train:
    seq, length = text_to_sequence(text, vocab, MAX_LEN)
    X_train_seqs.append(seq)
    X_train_lengths.append(length)

# Now do the same for validation set
X_val_seqs = []
X_val_lengths = []
for text in X_val:
    seq, length = text_to_sequence(text, vocab, MAX_LEN)
    X_val_seqs.append(seq)
    X_val_lengths.append(length)

X_test_seqs = []
X_test_lengths = []
for text in X_test:
    seq, length = text_to_sequence(text, vocab, MAX_LEN)
    X_test_seqs.append(seq)
    X_test_lengths.append(length)

# Convert to tensors
X_train_tensor = torch.LongTensor(X_train_seqs)
X_train_lengths = torch.LongTensor(X_train_lengths)
y_train_tensor = torch.LongTensor(y_train.values)

X_val_tensor = torch.LongTensor(X_val_seqs)
X_val_lengths = torch.LongTensor(X_val_lengths)
y_val_tensor = torch.LongTensor(y_val.values)

X_test_tensor = torch.LongTensor(X_test_seqs)
X_test_lengths = torch.LongTensor(X_test_lengths)
y_test_tensor = torch.LongTensor(y_test.values)

print(f"Training tensor shape: {X_train_tensor.shape}")
print(f"Validation tensor shape: {X_val_tensor.shape}")
print(f"Test tensor shape: {X_test_tensor.shape}")

# Create custom dataset class
class NewsDataset(Dataset):
    def __init__(self, texts, lengths, labels):
        self.texts = texts
        self.lengths = lengths
        self.labels = labels
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.texts[idx], self.lengths[idx], self.labels[idx]

# Create datasets - now with sequence lengths included
train_dataset = NewsDataset(X_train_tensor, X_train_lengths, y_train_tensor)
val_dataset = NewsDataset(X_val_tensor, X_val_lengths, y_val_tensor)
test_dataset = NewsDataset(X_test_tensor, X_test_lengths, y_test_tensor)

# Create data loaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)
test_loader = DataLoader(test_dataset, batch_size=batch_size)

# Verify the DataLoader returns the correct structure
for batch in train_loader:
    print(f"Batch contains {len(batch)} elements")
    texts, lengths, labels = batch
    print(f"Batch verification: texts shape: {texts.shape}, lengths shape: {lengths.shape}, labels shape: {labels.shape}")
    break  # Only check the first batch

Training set: 47274 samples
Validation set: 5253 samples
Test set: 13132 samples
Total unique words found: 200245
Vocabulary size: 15002
Training tensor shape: torch.Size([47274, 300])
Validation tensor shape: torch.Size([5253, 300])
Test tensor shape: torch.Size([13132, 300])
Batch contains 3 elements
Batch verification: texts shape: torch.Size([64, 300]), lengths shape: torch.Size([64]), labels shape: torch.Size([64])


In [50]:
# 5. Create Dataset, DataLoader, and add augmentation
class NewsDataset(Dataset):
    def __init__(self, sequences, seq_lengths, labels, augment=False):
        self.sequences = sequences
        self.seq_lengths = seq_lengths
        self.labels = labels
        self.augment = augment
        
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        seq = self.sequences[idx]
        length = self.seq_lengths[idx]
        
        # Apply simple data augmentation (word dropout) if enabled
        if self.augment and length > 10:  # Don't augment very short sequences
            dropout_mask = torch.rand(len(seq)) > 0.1  # 10% word dropout
            # Keep padding as is (don't dropout padding)
            dropout_mask = dropout_mask | (seq == 0)
            # Apply dropout and replace dropped tokens with <unk>
            seq = torch.where(dropout_mask, seq, torch.tensor(1, dtype=torch.long))
            
        return seq, length, self.labels[idx]

# Create datasets with stratification
# First, create a validation split from the training data
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_tensor, y_train_tensor, test_size=0.1, stratify=y_train_tensor, random_state=42
)

X_train_lengths_final = X_train_lengths[:len(X_train_final)]
X_val_lengths = X_train_lengths[len(X_train_final):]

# Create datasets
train_dataset = NewsDataset(X_train_final, X_train_lengths_final, y_train_final, augment=True)
val_dataset = NewsDataset(X_val, X_val_lengths, y_val, augment=False)
test_dataset = NewsDataset(X_test_tensor, X_test_lengths, y_test_tensor, augment=False)

# Create data loaders with variable batch sizes based on dataset size
BATCH_SIZE = 128  # Larger batch size for stability

train_loader = DataLoader(
    train_dataset, batch_size=BATCH_SIZE, shuffle=True,
    num_workers=0, pin_memory=True
)
val_loader = DataLoader(
    val_dataset, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=0, pin_memory=True
)
test_loader = DataLoader(
    test_dataset, batch_size=BATCH_SIZE, shuffle=False,
    num_workers=0, pin_memory=True
)

print(f"Number of training batches: {len(train_loader)}")
print(f"Number of validation batches: {len(val_loader)}")
print(f"Number of test batches: {len(test_loader)}")

Number of training batches: 333
Number of validation batches: 37
Number of test batches: 103


In [51]:
# 6. Define an improved RNN model with multi-head self-attention
class ImprovedRNN(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers=2, dropout=0.5):
        super(ImprovedRNN, self).__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.dropout = nn.Dropout(dropout)
        
        # Use GRU (a gated RNN) for better training dynamics (still a type of RNN)
        self.rnn = nn.GRU(embedding_dim, 
                          hidden_dim, 
                          num_layers=n_layers, 
                          bidirectional=True, 
                          batch_first=True,
                          dropout=dropout if n_layers > 1 else 0)
        
        # LayerNorm for stable training
        self.layer_norm = nn.LayerNorm(hidden_dim*2)
        
        # Multi-head self-attention block (captures richer interactions)
        self.attention = nn.MultiheadAttention(embed_dim=hidden_dim*2, num_heads=8, dropout=0.1, batch_first=True)
        
        # Fully connected layers with batch normalization and GELU activations
        self.fc1 = nn.Linear(hidden_dim*2, hidden_dim)
        self.bn1 = nn.BatchNorm1d(hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, hidden_dim//2)
        self.bn2 = nn.BatchNorm1d(hidden_dim//2)
        self.fc3 = nn.Linear(hidden_dim//2, output_dim)
        
        self.activation = nn.GELU()

    def init_weights(self):
        # Xavier initialization for weight matrices, uniform for biases
        for name, param in self.named_parameters():
            if 'weight' in name:
                if param.dim() > 1:
                    nn.init.xavier_uniform_(param)
                else:
                    nn.init.uniform_(param, -0.1, 0.1)
            elif 'bias' in name:
                nn.init.zeros_(param)
                
    def forward(self, x):
        # x shape: [batch_size, max_len]
        embedded = self.dropout(self.embedding(x))  # [B, L, emb_dim]
        
        # GRU layer
        rnn_out, _ = self.rnn(embedded)  # [B, L, hidden_dim*2]
        
        # Layer normalization
        normed = self.layer_norm(rnn_out)
        
        # Create key padding mask (True for padding positions)
        mask = (x == 0)
        
        # Multi-head self-attention; outputs same shape as input
        attn_out, _ = self.attention(normed, normed, normed, key_padding_mask=mask)
        
        # Residual connection
        context = attn_out + normed   # [B, L, hidden_dim*2]
        
        # Average pooling over the time dimension—weighting non-pad tokens only
        # mask: [B, L] -> ~mask gives valid tokens
        valid = (~mask).unsqueeze(2).float()  # [B, L, 1]
        lengths = valid.sum(dim=1)  # [B, 1]
        pooled = (context * valid).sum(dim=1) / (lengths + 1e-8)  # [B, hidden_dim*2]
        
        # Fully connected classifier
        out = self.dropout(pooled)
        out = self.fc1(out)
        out = self.bn1(out)
        out = self.activation(out)
        
        out = self.dropout(out)
        out = self.fc2(out)
        out = self.bn2(out)
        out = self.activation(out)
        
        out = self.dropout(out)
        out = self.fc3(out)
        
        return out

print("ImprovedRNN model defined successfully")

ImprovedRNN model defined successfully


In [52]:
# 7. Initialize model (updated hyperparameters for stability)
EMBEDDING_DIM = 300
HIDDEN_DIM = 384
OUTPUT_DIM = 2
N_LAYERS = 2
DROPOUT = 0.4

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

model = ImprovedRNN(
    vocab_size=vocab_size,
    embedding_dim=EMBEDDING_DIM,
    hidden_dim=HIDDEN_DIM,
    output_dim=OUTPUT_DIM,
    n_layers=N_LAYERS,
    dropout=DROPOUT
)
model.init_weights()
model = model.to(device)

# Remove label smoothing for stability and lower LR
criterion = nn.CrossEntropyLoss()  
optimizer = optim.AdamW(model.parameters(), lr=0.0005, weight_decay=1e-4)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='max', factor=0.5, patience=2, verbose=True)

print(model)
print(f"Trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")

Using device: cuda
ImprovedRNN(
  (embedding): Embedding(15002, 300, padding_idx=0)
  (dropout): Dropout(p=0.4, inplace=False)
  (rnn): GRU(300, 384, num_layers=2, batch_first=True, dropout=0.4, bidirectional=True)
  (layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (attention): MultiheadAttention(
    (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
  )
  (fc1): Linear(in_features=768, out_features=384, bias=True)
  (bn1): BatchNorm1d(384, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc2): Linear(in_features=384, out_features=192, bias=True)
  (bn2): BatchNorm1d(192, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (fc3): Linear(in_features=192, out_features=2, bias=True)
  (activation): GELU(approximate='none')
)
Trainable parameters: 11,474,618




In [53]:
import math
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score

N_EPOCHS = 5  # Number of epochs
train_losses = []
train_accs = []
train_f1s = []

print(f"Starting training for {N_EPOCHS} epochs...")

for epoch in range(N_EPOCHS):
    model.train()
    epoch_loss = 0.0
    epoch_acc = 0.0
    all_train_preds = []
    all_train_labels = []
    
    for batch_idx, (texts, lengths, labels) in enumerate(train_loader):
        texts, lengths, labels = texts.to(device), lengths.to(device), labels.to(device)
        
        optimizer.zero_grad()
        predictions = model(texts)
        loss = criterion(predictions, labels)
        
        # Check for NaN loss
        if torch.isnan(loss):
            print("NaN loss encountered. Skipping this batch.")
            continue
        
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)
        optimizer.step()
        
        # Update metrics
        _, predicted = torch.max(predictions, 1)
        correct = (predicted == labels).float().sum()
        accuracy = correct / len(labels)
        
        epoch_loss += loss.item()
        epoch_acc += accuracy.item()
        all_train_preds.extend(predicted.cpu().numpy())
        all_train_labels.extend(labels.cpu().numpy())
        
        if batch_idx % 50 == 0:
            print(f"Epoch {epoch+1}/{N_EPOCHS} | Batch {batch_idx}/{len(train_loader)} | Loss: {loss.item():.4f} | Acc: {accuracy.item():.4f}")
    
    # Compute and record average train metrics
    train_loss = epoch_loss / len(train_loader)
    train_acc = epoch_acc / len(train_loader)
    train_f1 = f1_score(all_train_labels, all_train_preds, average='weighted')
    train_losses.append(train_loss)
    train_accs.append(train_acc)
    train_f1s.append(train_f1)
    
    print(f"\nEpoch {epoch+1}/{N_EPOCHS} summary:")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f}")
    print('-' * 60)

# After training, evaluate on the test set and calculate test metrics
model.eval()
all_test_preds = []
all_test_labels = []
with torch.no_grad():
    for texts, lengths, labels in test_loader:
        texts, lengths, labels = texts.to(device), lengths.to(device), labels.to(device)
        predictions = model(texts)
        _, predicted = torch.max(predictions, 1)
        all_test_preds.extend(predicted.cpu().numpy())
        all_test_labels.extend(labels.cpu().numpy())

test_accuracy = accuracy_score(all_test_labels, all_test_preds)
test_precision = precision_score(all_test_labels, all_test_preds, average='weighted')
test_recall = recall_score(all_test_labels, all_test_preds, average='weighted')
test_f1 = f1_score(all_test_labels, all_test_preds, average='weighted')

print("\nTest Evaluation:")
print(f"Test Accuracy:  {test_accuracy:.4f}")
print(f"Test Precision: {test_precision:.4f}")
print(f"Test Recall:    {test_recall:.4f}")
print(f"Test F1-Score:  {test_f1:.4f}")

Starting training for 5 epochs...
Epoch 1/5 | Batch 0/333 | Loss: 0.6927 | Acc: 0.5000
NaN loss encountered. Skipping this batch.
Epoch 1/5 | Batch 50/333 | Loss: 0.4022 | Acc: 0.9062
Epoch 1/5 | Batch 100/333 | Loss: 0.3607 | Acc: 0.8359
Epoch 1/5 | Batch 150/333 | Loss: 0.2245 | Acc: 0.9375
NaN loss encountered. Skipping this batch.
Epoch 1/5 | Batch 200/333 | Loss: 0.2287 | Acc: 0.9141
NaN loss encountered. Skipping this batch.
Epoch 1/5 | Batch 250/333 | Loss: 0.2611 | Acc: 0.8984
NaN loss encountered. Skipping this batch.
NaN loss encountered. Skipping this batch.
Epoch 1/5 | Batch 300/333 | Loss: 0.1464 | Acc: 0.9453
NaN loss encountered. Skipping this batch.

Epoch 1/5 summary:
  Train Loss: 0.2783 | Train Acc: 0.8830 | Train F1: 0.8990
------------------------------------------------------------
Epoch 2/5 | Batch 0/333 | Loss: 0.1189 | Acc: 0.9766
Epoch 2/5 | Batch 50/333 | Loss: 0.1184 | Acc: 0.9531
NaN loss encountered. Skipping this batch.
Epoch 2/5 | Batch 100/333 | Loss: 0

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
