In [1]:
%env PATH=/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

env: PATH=/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin


# Import Libraries 

In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import math
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from bs4 import BeautifulSoup
import shap
import matplotlib.pyplot as plt
from lime.lime_text import LimeTextExplainer
import torch.ao.quantization as quantization

# Set the Random Seed (for reproducibility)

In [3]:
SEED = 42

# Seed PyTorch's default generator; when a GPU is visible, seed every CUDA device as well.
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Turn off the cuDNN autotuner and request deterministic kernels
# (reproducible runs, possibly at the cost of training speed).
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [4]:
# Read the fake-news CSV into a DataFrame and display it as the cell output.
csv_path = "myArabicEnglishGlobalFakeNewsDataWithoutFeaturesWithoutPreprocessing.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,statement,Label
0,UNHINGED Trump Supporters Visit DC For Inaugu...,1
1,إغلاق دائرة تنفيذ بعبدا بسبب كورونا أعلن أمين ...,1
2,بيراميدز يعود لفندق إقامته في دار السلام عادت ...,1
3,Lindsay Lohan<U+2019>s Strange Accent: Another...,1
4,الإفتاء المصرية: غدًا أول أيام شهر شعبان لعام ...,0
...,...,...
453149,طقس اليوم.. نزول قطرات مطرية بمجموعة من مناطق ...,0
453150,توقيف مصري أثار فيديو يظهر تحرشه بطفلة استنكار...,0
453151,رئيس المجلس الأعلى للقضاء يستقبل المبعوث الأمم...,0
453152,في ظرف 24 ساعة: 4 آلاف مكالمة على 190.. والوضع...,1


In [5]:
# Drop every row that has a missing value in any column (mutates df in place).
df.dropna(inplace=True)

In [6]:
# Column holding the raw news text, and the Series of class labels used as targets.
text_column = 'statement'
labels = df['Label']

# Initialize Tokenizers and Models

In [8]:
# Load the pretrained all-mpnet-base-v2 tokenizer and encoder; the encoder is used
# below as a frozen feature extractor for token embeddings.
# NOTE(review): the dataset mixes Arabic and English statements — confirm that this
# checkpoint's vocabulary and training data cover Arabic adequately.
mpnet_tokenizer = AutoTokenizer.from_pretrained("sentence-transformers/all-mpnet-base-v2")
mpnet_model = AutoModel.from_pretrained("sentence-transformers/all-mpnet-base-v2")

In [15]:
# Select the compute device: the CUDA GPU when one is available, otherwise the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [16]:
# Move the MPNet encoder to the selected device. `.to()` returns the module,
# so as the last expression of the cell it also prints the architecture.
mpnet_model.to(device)

MPNetModel(
  (embeddings): MPNetEmbeddings(
    (word_embeddings): Embedding(30527, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): MPNetEncoder(
    (layer): ModuleList(
      (0-11): 12 x MPNetLayer(
        (attention): MPNetAttention(
          (attn): MPNetSelfAttention(
            (q): Linear(in_features=768, out_features=768, bias=True)
            (k): Linear(in_features=768, out_features=768, bias=True)
            (v): Linear(in_features=768, out_features=768, bias=True)
            (o): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (intermediate): MPNetIntermediate(
          (dense): Linear(in_

In [17]:
# Define the dataset class
class FakeNewsDataset(Dataset):
    """Dataset that converts raw statements into frozen-encoder token embeddings.

    Every item is tokenized with ``tokenizer`` and run through ``model`` (kept in
    eval mode, without gradients) at access time, so the downstream classifier
    receives per-token embeddings rather than token ids.

    Args:
        texts: pandas Series of raw text statements.
        labels: pandas Series of integer class labels aligned with ``texts``.
        tokenizer: callable tokenizer returning ``input_ids`` and ``attention_mask``
            tensors when called with ``return_tensors='pt'``.
        model: encoder whose output exposes ``last_hidden_state``.
    """

    def __init__(self, texts, labels, tokenizer, model):
        # Reset the indices so positional access stays aligned after rows were dropped.
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.model = model
        self.model.eval()  # Ensure the model is in evaluation mode

    def __len__(self):
        """Return the number of statements in the dataset."""
        return len(self.texts)

    def _model_device(self):
        """Return the device the encoder lives on (CPU when it cannot be determined)."""
        try:
            return next(self.model.parameters()).device
        except (StopIteration, AttributeError):
            return torch.device("cpu")

    def __getitem__(self, idx):
        """Return the embeddings, attention mask and label of the ``idx``-th statement.

        Returns:
            dict with
              * ``'embeddings'``: CPU float tensor of shape [seq_len, hidden_dim],
              * ``'attention_mask'``: CPU tensor of shape [seq_len],
              * ``'labels'``: 0-d ``torch.long`` tensor.
        """
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        # Use the tokenizer given to this dataset (not a module-level global).
        # No padding here: sequences are padded per batch in the collate function.
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            padding=False,
            truncation=True,
            max_length=512        # Maximum sequence length, counted in tokens
        )

        # Run the encoder on whatever device it currently lives on, instead of
        # depending on a global `device` variable.
        model_device = self._model_device()
        input_ids = inputs['input_ids'].to(model_device)            # Shape [1, seq_len]
        attention_mask = inputs['attention_mask'].to(model_device)  # Shape [1, seq_len]

        # The encoder is a frozen feature extractor, so no gradients are needed.
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            embeddings = outputs.last_hidden_state.squeeze(0)  # Shape [seq_len, hidden_dim]

        # Hand CPU tensors back to the DataLoader.
        return {
            'embeddings': embeddings.cpu(),
            'attention_mask': attention_mask.squeeze(0).cpu(),
            'labels': torch.tensor(label, dtype=torch.long)
        }


In [18]:
def custom_collate_fn(batch):
    """Collate variable-length samples into zero-padded batch tensors.

    Args:
        batch: list of dicts with keys ``'embeddings'`` ([seq_len, hidden_dim]),
            ``'attention_mask'`` ([seq_len]) and ``'labels'``.

    Returns:
        dict with ``'embeddings'`` [batch, max_seq_len, hidden_dim],
        ``'attention_mask'`` [batch, max_seq_len] (long, 0 on padding) and
        ``'labels'`` [batch].
    """
    emb_seqs = [sample['embeddings'] for sample in batch]
    mask_seqs = [sample['attention_mask'] for sample in batch]
    labels = torch.tensor([sample['labels'] for sample in batch])

    # Every sequence is padded up to the longest one in this batch.
    longest = max([seq.size(0) for seq in emb_seqs])
    n_items = len(batch)
    width = emb_seqs[0].size(1)

    padded_embeddings = torch.zeros(n_items, longest, width)
    padded_attention_masks = torch.zeros(n_items, longest, dtype=torch.long)

    # Copy each sequence into the front of its row; the tail stays zero.
    for row, (emb, mask) in enumerate(zip(emb_seqs, mask_seqs)):
        length = emb.size(0)
        padded_embeddings[row, :length, :] = emb
        padded_attention_masks[row, :length] = mask

    return {
        'embeddings': padded_embeddings,
        'attention_mask': padded_attention_masks,
        'labels': labels
    }

In [19]:

# Fourier Positional Encoding
class FourierPositionalEncoding(nn.Module):
    """Adds a sine/cosine positional signal to a [batch, seq_len, d_model] input.

    The ``d_model // 2`` frequencies are spaced linearly between 0 and
    ``temperature``; the encoding concatenates the sines and then the cosines
    of ``position * frequency``.
    """

    def __init__(self, d_model, temperature=10000):
        super().__init__()
        self.d_model = d_model
        # Stored as a buffer so it follows the module across devices and is
        # part of the state dict, but is not trained.
        bands = torch.linspace(0, 1, steps=d_model // 2) * temperature
        self.register_buffer('freq_bands', bands)

    def forward(self, x):
        """Return ``x`` plus the positional encoding, broadcast over the batch."""
        n_positions = x.size(1)
        pos = torch.arange(n_positions, dtype=torch.float, device=x.device).unsqueeze(-1)  # [seq_len, 1]
        angles = pos * self.freq_bands  # [seq_len, d_model // 2]
        table = torch.cat([torch.sin(angles), torch.cos(angles)], dim=-1)  # [seq_len, d_model]
        return x + table.unsqueeze(0)


In [20]:

# Model components
class LayerNormalization(nn.Module):
    """Hand-written layer normalization over the trailing ``normalized_shape`` dims.

    Args:
        normalized_shape: int or sequence of ints giving the trailing dimensions
            that are normalized together.
        eps: constant added to the variance before the square root.
        elementwise_affine: when True, learn a per-element ``weight`` (initialized
            to 1) and ``bias`` (initialized to 0).
    """

    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
        super().__init__()
        shape = (normalized_shape,) if isinstance(normalized_shape, int) else normalized_shape
        self.normalized_shape = tuple(shape)
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if not self.elementwise_affine:
            # Register the names anyway so that `weight`/`bias` exist and are None.
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)
        else:
            self.weight = nn.Parameter(torch.ones(shape))
            self.bias = nn.Parameter(torch.zeros(shape))

    def forward(self, x):
        """Normalize ``x`` to zero mean / unit (biased) variance, then apply the affine."""
        # Reduce over the last len(normalized_shape) dimensions: (-1, -2, ...).
        reduce_dims = tuple(range(-1, -len(self.normalized_shape) - 1, -1))
        mu = x.mean(dim=reduce_dims, keepdim=True)
        sigma_sq = x.var(dim=reduce_dims, unbiased=False, keepdim=True)
        normed = (x - mu) / torch.sqrt(sigma_sq + self.eps)
        if not self.elementwise_affine:
            return normed
        return normed * self.weight + self.bias



In [21]:

class InfiniAttention(nn.Module):
    """Multi-head scaled dot-product self-attention.

    Despite the name, the code in this class is plain multi-head self-attention:
    queries, keys and values are linear projections of the same input and no
    additional memory mechanism is implemented here.

    Args:
        d_model: size of the input and output feature dimension.
        nhead: number of attention heads; must divide ``d_model``.
        dropout: dropout probability applied to the attention weights.

    Raises:
        ValueError: if ``d_model`` is not divisible by ``nhead``.
    """

    def __init__(self, d_model, nhead, dropout=0.1):
        super(InfiniAttention, self).__init__()
        if d_model % nhead != 0:
            raise ValueError(f"d_model ({d_model}) must be divisible by nhead ({nhead})")
        self.nhead = nhead

        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None, key_padding_mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: input of shape [B, T, C].
            mask: optional attention mask of shape [T, T], [B, T, T] or
                [B, nhead, T, T]. A boolean mask marks positions that must NOT
                be attended to with True; any other dtype is added to the
                attention scores.
            key_padding_mask: optional [B, T] mask, truthy where a key is padding.

        Returns:
            Tensor of shape [B, T, C].
        """
        B, T, C = x.size()
        d_head = C // self.nhead

        # Project and split into heads: [B, nhead, T, d_head]
        q = self.query(x).view(B, T, self.nhead, d_head).transpose(1, 2)
        k = self.key(x).view(B, T, self.nhead, d_head).transpose(1, 2)
        v = self.value(x).view(B, T, self.nhead, d_head).transpose(1, 2)

        # Scaled dot-product scores: [B, nhead, T, T]
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(k.size(-1))

        if mask is not None:
            # Previously this argument was accepted but never applied.
            if mask.dim() == 2:        # [T, T] -> [1, 1, T, T]
                mask = mask.unsqueeze(0).unsqueeze(0)
            elif mask.dim() == 3:      # [B, T, T] -> [B, 1, T, T]
                mask = mask.unsqueeze(1)
            if mask.dtype == torch.bool:
                attn_scores = attn_scores.masked_fill(mask, float('-inf'))
            else:
                attn_scores = attn_scores + mask.to(attn_scores.dtype)

        if key_padding_mask is not None:
            # [B, T] -> [B, 1, 1, T]; cast so integer 0/1 masks are accepted as well.
            key_padding_mask = key_padding_mask.to(torch.bool).unsqueeze(1).unsqueeze(2)
            attn_scores = attn_scores.masked_fill(key_padding_mask, float('-inf'))

        attn_weights = torch.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Weighted sum of the values, then merge the heads back: [B, T, C]
        attn_output = torch.matmul(attn_weights, v)  # [B, nhead, T, d_head]
        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, C)
        attn_output = self.out_proj(attn_output)
        return attn_output


In [22]:

class FeedForwardNetwork(nn.Module):
    """Position-wise feed-forward block: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: input and output feature size.
        d_ff: hidden feature size.
        dropout: dropout probability applied after the activation.
    """

    def __init__(self, d_model, d_ff, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        """Expand to ``d_ff``, apply ReLU and dropout, then project back to ``d_model``."""
        hidden = self.linear1(x)
        hidden = self.activation(hidden)
        hidden = self.dropout(hidden)
        return self.linear2(hidden)


In [23]:

class EncoderLayer(nn.Module):
    """Pre-norm transformer encoder layer: self-attention and feed-forward sub-blocks.

    Each sub-block normalizes its input, applies the transformation, applies
    dropout, and adds the result back to the un-normalized input (residual).
    """

    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1):
        super().__init__()
        self.self_attn = InfiniAttention(d_model, nhead, dropout)
        self.ffn = FeedForwardNetwork(d_model, dim_feedforward, dropout)
        self.norm1 = LayerNormalization(d_model)
        self.norm2 = LayerNormalization(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """Run the layer on ``src`` and return a tensor of the same shape."""
        # Self-attention sub-block (normalize first, then add the residual).
        attn_out = self.self_attn(
            self.norm1(src), mask=src_mask, key_padding_mask=src_key_padding_mask
        )
        src = src + self.dropout(attn_out)

        # Feed-forward sub-block, same normalize-then-residual pattern.
        ffn_out = self.ffn(self.norm2(src))
        return src + self.dropout(ffn_out)

class Encoder(nn.Module):
    """Stack of ``num_layers`` identically configured ``EncoderLayer`` modules."""

    def __init__(self, d_model, nhead, dim_feedforward, num_layers, dropout=0.1):
        super().__init__()
        stack = []
        for _ in range(num_layers):
            stack.append(EncoderLayer(d_model, nhead, dim_feedforward, dropout))
        self.layers = nn.ModuleList(stack)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        """Feed ``src`` through every layer in order, passing the same masks to each."""
        hidden = src
        for block in self.layers:
            hidden = block(hidden, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        return hidden


In [24]:

class ClassifierHead(nn.Module):
    """Two-layer MLP head: Dropout -> Linear -> ReLU -> Dropout -> Linear.

    Args:
        input_dim: size of the pooled input features.
        num_classes: number of output logits.
        dropout_rate: dropout probability used before each linear layer.
    """

    def __init__(self, input_dim, num_classes, dropout_rate=0.5):
        super().__init__()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(input_dim, input_dim // 2)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(input_dim // 2, num_classes)

    def forward(self, x):
        """Map pooled features to unnormalized class logits."""
        hidden = self.relu(self.fc1(self.dropout(x)))
        return self.fc2(self.dropout(hidden))


In [25]:
from torch.ao.quantization import QConfig, default_qconfig, default_per_channel_qconfig
from torch.ao.quantization.observer import MinMaxObserver, MovingAverageMinMaxObserver

def prepare_model_for_qat(model):
    """
    Attach a quantization config to ``model`` and prepare it in place for QAT.

    Activations use a ``MovingAverageMinMaxObserver`` (quint8) and weights a
    ``MinMaxObserver`` (qint8). Layers are not fused because this is a custom model.

    NOTE(review): the qconfig is built from plain observers rather than
    fake-quantize modules — confirm that this gives the intended
    quantization-aware behaviour during training.

    Returns:
        The same ``model`` object, modified in place.
    """
    activation_observer = MovingAverageMinMaxObserver.with_args(dtype=torch.quint8)
    weight_observer = MinMaxObserver.with_args(dtype=torch.qint8)

    # The root-level qconfig is propagated to submodules by prepare_qat.
    model.qconfig = QConfig(activation=activation_observer, weight=weight_observer)
    quantization.prepare_qat(model, inplace=True)
    return model

In [26]:
class CombinedModel(nn.Module):
    """Classifier on top of precomputed token embeddings.

    Pipeline: QuantStub -> Fourier positional encoding -> transformer encoder
    (8 heads, feed-forward width ``hidden_dim * 4``) -> masked mean pooling ->
    classifier head -> DeQuantStub.

    Args:
        embedding_dim: feature size of the incoming token embeddings.
        hidden_dim: base width; the encoder feed-forward size is ``hidden_dim * 4``.
        num_classes: number of output logits.
        num_encoder_layers: number of encoder layers.
    """

    def __init__(self, embedding_dim=768, hidden_dim=256, num_classes=2, num_encoder_layers=6):
        super().__init__()
        self.quant = torch.ao.quantization.QuantStub()
        self.pos_encoder = FourierPositionalEncoding(embedding_dim)
        self.encoder = Encoder(
            embedding_dim,
            nhead=8,
            dim_feedforward=hidden_dim * 4,
            num_layers=num_encoder_layers,
        )
        self.classifier = ClassifierHead(embedding_dim, num_classes)
        self.dequant = torch.ao.quantization.DeQuantStub()

    def forward(self, embeddings, attention_mask):
        """Return class logits for a padded batch.

        Args:
            embeddings: [batch, seq_len, embedding_dim] token embeddings.
            attention_mask: [batch, seq_len] mask, 0 on padded positions.
        """
        # Quantization entry point, then inject position information.
        hidden = self.pos_encoder(self.quant(embeddings))

        # Positions where the mask is 0 are padding and are hidden from attention.
        padding_positions = attention_mask == 0
        hidden = self.encoder(hidden, src_key_padding_mask=padding_positions)

        # Masked mean pooling: zero out padded positions, then average over real tokens.
        token_mask = attention_mask.unsqueeze(-1)
        summed = (hidden * token_mask).sum(dim=1)
        token_counts = attention_mask.sum(dim=1).unsqueeze(-1)
        pooled = summed / token_counts

        # Classify, then leave the quantized region.
        return self.dequant(self.classifier(pooled))


In [27]:
# Initialize the model and apply QAT
# NOTE(review): a later cell (In [29]) rebinds `model` to a fresh CombinedModel()
# without calling prepare_model_for_qat, so the QAT-prepared instance built here
# does not appear to be the one that is trained — confirm which is intended.
model = CombinedModel().to(device)
model = prepare_model_for_qat(model)

In [28]:

# Split data and create DataLoaders
def split_data(texts, labels, tokenizer, model, train_size=0.8, val_size=0.1, test_size=0.1, batch_size=64, seed=None):
    """Build the dataset, split it randomly and wrap each split in a DataLoader.

    Args:
        texts: pandas Series of raw statements.
        labels: pandas Series of labels aligned with ``texts``.
        tokenizer: tokenizer handed to ``FakeNewsDataset``.
        model: encoder handed to ``FakeNewsDataset``.
        train_size, val_size, test_size: split proportions; must sum to 1.
        batch_size: batch size used by all three loaders.
        seed: optional integer. When given, the split is drawn from a dedicated
            generator seeded with it, so it does not depend on how much of the
            global RNG stream was consumed earlier (e.g. by model initialization).
            When None, the global RNG is used as before.

    Returns:
        Tuple ``(train_dataloader, val_dataloader, test_dataloader)``. Only the
        training loader shuffles.

    Raises:
        AssertionError: if the proportions do not sum to 1.
    """
    # Compare with a tolerance: exact float equality rejects valid splits such
    # as 0.7 + 0.2 + 0.1, which does not sum to exactly 1.0 in floating point.
    assert abs(train_size + val_size + test_size - 1.0) < 1e-9, "Split proportions must sum to 1."

    # Create the dataset
    dataset = FakeNewsDataset(texts, labels, tokenizer, model)

    # The test split absorbs the rounding remainder so the lengths always add up.
    total = len(dataset)
    train_len = int(train_size * total)
    val_len = int(val_size * total)
    test_len = total - train_len - val_len

    split_kwargs = {}
    if seed is not None:
        split_kwargs['generator'] = torch.Generator().manual_seed(seed)
    train_dataset, val_dataset, test_dataset = random_split(
        dataset, [train_len, val_len, test_len], **split_kwargs)

    # Create DataLoaders with the custom collate function (pads per batch)
    train_dataloader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, collate_fn=custom_collate_fn)
    val_dataloader = DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, collate_fn=custom_collate_fn)
    test_dataloader = DataLoader(
        test_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, collate_fn=custom_collate_fn)

    return train_dataloader, val_dataloader, test_dataloader

# Create the train/validation/test DataLoaders with the default 80/10/10 split.
# Embeddings are computed by the MPNet encoder when items are fetched.
train_dataloader, val_dataloader, test_dataloader = split_data(
    df[text_column],
    labels,
    mpnet_tokenizer,
    mpnet_model,
    batch_size=64  # Adjust batch size as needed
)


In [29]:

# Initialize the model and move it to device
# NOTE(review): this rebinds `model`, discarding the QAT-prepared instance created
# in In [27]; the model trained below is a plain CombinedModel — confirm this is intended.
model = CombinedModel()
model.to(device)

CombinedModel(
  (quant): QuantStub()
  (pos_encoder): FourierPositionalEncoding()
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (self_attn): InfiniAttention(
          (query): Linear(in_features=768, out_features=768, bias=True)
          (key): Linear(in_features=768, out_features=768, bias=True)
          (value): Linear(in_features=768, out_features=768, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (out_proj): Linear(in_features=768, out_features=768, bias=True)
        )
        (ffn): FeedForwardNetwork(
          (linear1): Linear(in_features=768, out_features=1024, bias=True)
          (activation): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=1024, out_features=768, bias=True)
        )
        (norm1): LayerNormalization()
        (norm2): LayerNormalization()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (classifier): ClassifierHead

In [30]:

# Set up optimizer and scheduler
def setup_optimizer_and_scheduler(model, learning_rate=2e-5, step_size=7, gamma=0.1, weight_decay=0.01):
    """Create an AdamW optimizer over ``model``'s parameters and a StepLR scheduler.

    Args:
        model: module whose parameters are optimized.
        learning_rate: initial learning rate.
        step_size: number of scheduler steps between learning-rate decays.
        gamma: multiplicative decay factor applied every ``step_size`` steps.
        weight_decay: AdamW weight-decay coefficient.

    Returns:
        Tuple ``(optimizer, scheduler)``.
    """
    adamw = optim.AdamW(
        model.parameters(),
        lr=learning_rate,
        weight_decay=weight_decay,
    )
    step_lr = StepLR(adamw, step_size=step_size, gamma=gamma)
    return adamw, step_lr

# Cross-entropy loss over the classifier's logits (CombinedModel defaults to 2 classes).
criterion = nn.CrossEntropyLoss()

# Build the AdamW optimizer and StepLR scheduler for the model with the default hyperparameters.
optimizer, scheduler = setup_optimizer_and_scheduler(model)

from torch.cuda.amp import autocast, GradScaler

from tqdm import tqdm

def train_one_epoch(model, dataloader, optimizer, criterion, device):
    """Train ``model`` for one pass over ``dataloader`` using mixed precision.

    Args:
        model: module called as ``model(embeddings, attention_mask)``.
        dataloader: yields dicts with ``'embeddings'``, ``'attention_mask'``, ``'labels'``.
        optimizer: optimizer stepped once per batch through the gradient scaler.
        criterion: loss called as ``criterion(outputs, labels)``.
        device: device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, f1)`` where ``avg_loss`` is the mean of the
        per-batch losses and the metrics are computed over all training predictions.
    """
    model.train()

    # A fresh gradient scaler is created for every epoch.
    scaler = GradScaler()
    running_loss = 0
    epoch_preds, epoch_labels = [], []

    # Progress bar over the batches; its description shows the latest batch loss.
    progress_bar = tqdm(dataloader, desc="Training", leave=False)

    for batch in progress_bar:
        embeddings = batch['embeddings'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Forward pass and loss under autocast (mixed precision on CUDA).
        with autocast():
            outputs = model(embeddings, attention_mask)
            loss = criterion(outputs, labels)

        # Backpropagate the scaled loss, step the optimizer, update the scale.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        batch_loss = loss.item()
        running_loss += batch_loss

        # Predicted class = index of the largest logit.
        preds = torch.max(outputs, 1)[1]
        epoch_preds.extend(preds.detach().cpu().numpy())
        epoch_labels.extend(labels.detach().cpu().numpy())

        progress_bar.set_description(f"Training Loss: {batch_loss:.4f}")

    avg_loss = running_loss / len(dataloader)
    accuracy = accuracy_score(epoch_labels, epoch_preds)
    f1 = f1_score(epoch_labels, epoch_preds)

    return avg_loss, accuracy, f1

In [31]:
def evaluate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without computing gradients.

    Args:
        model: module called as ``model(embeddings, attention_mask)``.
        dataloader: yields dicts with ``'embeddings'``, ``'attention_mask'``, ``'labels'``.
        criterion: loss called as ``criterion(outputs, labels)``.
        device: device the batch tensors are moved to.

    Returns:
        Tuple ``(avg_loss, accuracy, f1, all_preds, all_labels)``; the last two
        are flat lists of per-sample predictions and labels.
    """
    model.eval()
    running_loss = 0
    collected_preds, collected_labels = [], []

    with torch.no_grad():
        for batch in dataloader:
            embeddings = batch['embeddings'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(embeddings, attention_mask)
            running_loss += criterion(outputs, labels).item()

            # Predicted class = index of the largest logit.
            preds = torch.max(outputs, 1)[1]
            collected_preds.extend(preds.cpu().numpy())
            collected_labels.extend(labels.cpu().numpy())

    avg_loss = running_loss / len(dataloader)
    accuracy = accuracy_score(collected_labels, collected_preds)
    f1 = f1_score(collected_labels, collected_preds)

    return avg_loss, accuracy, f1, collected_preds, collected_labels

In [32]:
# Modify training_loop to include early stopping
def training_loop(model, train_dataloader, val_dataloader, optimizer, scheduler, criterion, num_epochs, device, patience=3,
                  checkpoint_path='best_model_checkpoint.pth'):
    """Train with per-epoch validation, best-model checkpointing and early stopping.

    Args:
        model: model to train.
        train_dataloader: loader used by ``train_one_epoch``.
        val_dataloader: loader used by ``evaluate``.
        optimizer: optimizer passed to ``train_one_epoch``.
        scheduler: learning-rate scheduler, stepped once per epoch.
        criterion: loss function.
        num_epochs: maximum number of epochs.
        device: device used for training and evaluation.
        patience: stop once validation F1 has not improved for this many
            consecutive epochs.
        checkpoint_path: file the best checkpoint is written to (defaults to the
            previously hard-coded ``'best_model_checkpoint.pth'``).

    Returns:
        None. Metrics are printed and the best checkpoint is written to disk.
    """
    best_val_f1 = 0
    epochs_no_improve = 0

    for epoch in range(num_epochs):
        # Train for one epoch
        train_loss, train_accuracy, train_f1 = train_one_epoch(model, train_dataloader, optimizer, criterion, device)

        # Evaluate on the validation set
        val_loss, val_accuracy, val_f1, val_preds, val_labels = evaluate(model, val_dataloader, criterion, device)

        # Step the learning rate scheduler
        scheduler.step()

        # Print training and validation metrics
        print(f"Epoch: {epoch+1:02}")
        print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_accuracy*100:.2f}% | Train F1: {train_f1:.3f}")
        print(f"\tVal. Loss: {val_loss:.3f} | Val. Acc: {val_accuracy*100:.2f}% | Val. F1: {val_f1:.3f}")

        # Check for improvement in validation F1 score
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            epochs_no_improve = 0

            # Save the best model checkpoint. Predictions and labels are stored as
            # plain Python ints rather than numpy scalars so the file contains only
            # tensors and primitive types and can be read back by a restricted
            # (weights_only) torch.load.
            torch.save({
                'epoch': epoch,  # zero-based epoch index
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': val_loss,
                'val_preds': [int(p) for p in val_preds],    # Save validation predictions
                'val_labels': [int(y) for y in val_labels],  # Save validation labels
            }, checkpoint_path)
            print("\tModel saved!")
        else:
            epochs_no_improve += 1
            if epochs_no_improve == patience:
                print(f"Early stopping after {epoch+1} epochs.")
                break

    print('Training complete!')

In [None]:
# Start training: at most 10 epochs, with the default early-stopping patience of 3.
num_epochs = 10
training_loop(model, train_dataloader, val_dataloader, optimizer, scheduler, criterion, num_epochs, device)

                                                                              

Epoch: 01
	Train Loss: 0.601 | Train Acc: 66.47% | Train F1: 0.586
	Val. Loss: 0.566 | Val. Acc: 68.96% | Val. F1: 0.577
	Model saved!


                                                                              

Epoch: 02
	Train Loss: 0.560 | Train Acc: 70.06% | Train F1: 0.651
	Val. Loss: 0.544 | Val. Acc: 70.60% | Val. F1: 0.618
	Model saved!


                                                                              

Epoch: 03
	Train Loss: 0.538 | Train Acc: 71.63% | Train F1: 0.671
	Val. Loss: 0.521 | Val. Acc: 72.64% | Val. F1: 0.669
	Model saved!


                                                                              

Epoch: 04
	Train Loss: 0.518 | Train Acc: 72.83% | Train F1: 0.685
	Val. Loss: 0.501 | Val. Acc: 73.41% | Val. F1: 0.664


                                                                              

Epoch: 05
	Train Loss: 0.501 | Train Acc: 73.76% | Train F1: 0.698
	Val. Loss: 0.488 | Val. Acc: 74.32% | Val. F1: 0.697
	Model saved!


                                                                              

Epoch: 06
	Train Loss: 0.488 | Train Acc: 74.48% | Train F1: 0.706
	Val. Loss: 0.474 | Val. Acc: 75.35% | Val. F1: 0.712
	Model saved!


Training Loss: 0.4464:  13%|█▎        | 754/5665 [17:31<1:53:00,  1.38s/it]