# Import Libraries 

In [1]:
%env PATH=/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin

env: PATH=/usr/local/cuda/bin:/opt/conda/bin:/opt/conda/condabin:/opt/conda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin


In [2]:
import pandas as pd
from transformers import AutoTokenizer, AutoModel
import torch
from torch.utils.data import Dataset, DataLoader, random_split
import torch.nn as nn
import math
import torch.optim as optim
from torch.optim.lr_scheduler import StepLR
import torch.nn.functional as F
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from bs4 import BeautifulSoup
import shap
import matplotlib.pyplot as plt
from lime.lime_text import LimeTextExplainer
import torch.ao.quantization as quantization
from sklearn.model_selection import train_test_split

# ADD THE SEED (for reproducibility)

In [3]:
SEED = 42  

# Set seed for PyTorch
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Ensure deterministic behavior in PyTorch (may slow down training)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

In [6]:
csv_path = "myEnglishGlobalFakeNewsDataWithoutFeaturesWithoutPreprocessing.csv"
english_df = pd.read_csv(csv_path)
english_df.dropna(inplace=True)
print(english_df)

                                               statement  Label
0      End of eviction moratorium means millions of A...      0
1      The Trump administration worked to free 5,000 ...      0
2      In Afghanistan, over 100 billion dollars spent...      0
3      A photo shows two COVID-19 patients lying on t...      0
4      Its been over 50 years since minimum (wage) an...      0
...                                                  ...    ...
73039  Mayor Fung wants to punish our childrens educa...      1
73040  There are a larger number of shark attacks in ...      0
73041  Democrats have now become the party of the [At...      0
73042  On lifting the U.S. Cuban embargo and allowing...      1
73043  The Department of Veterans Affairs has a manua...      1

[73044 rows x 2 columns]


In [7]:
# Split the English dataset into training and test sets
english_train, english_test = train_test_split(english_df, test_size=0.2, random_state=42)

In [8]:
text_column = 'statement'
labels = english_train['Label']

# Initialize Tokenizers and Models

In [9]:
byt5_tokenizer = AutoTokenizer.from_pretrained("google/byt5-xxl")
en_model = AutoModel.from_pretrained("Alibaba-NLP/gte-large-en-v1.5", trust_remote_code=True)

config.json:   0%|          | 0.00/1.35k [00:00<?, ?B/s]

configuration.py:   0%|          | 0.00/7.13k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- configuration.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling.py:   0%|          | 0.00/59.0k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Alibaba-NLP/new-impl:
- modeling.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

In [10]:
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [11]:
# Move en to device
en_model.to(device)

NewModel(
  (embeddings): NewEmbeddings(
    (word_embeddings): Embedding(30528, 1024, padding_idx=0)
    (rotary_emb): NTKScalingRotaryEmbedding()
    (token_type_embeddings): Embedding(2, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): NewEncoder(
    (layer): ModuleList(
      (0-23): 24 x NewLayer(
        (attention): NewAttention(
          (qkv_proj): Linear(in_features=1024, out_features=3072, bias=True)
          (dropout): Dropout(p=0.0, inplace=False)
          (o_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (mlp): NewGatedMLP(
          (up_gate_proj): Linear(in_features=1024, out_features=8192, bias=False)
          (down_proj): Linear(in_features=4096, out_features=1024, bias=True)
          (act_fn): GELUActivation()
          (hidden_dropout): Dropout(p=0.1, inplace=False)
        )
        (attn_ln): LayerNorm((1024,), eps=1e-12, elementwise_af

In [12]:
# Define the dataset class
class FakeNewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, model):
        self.texts = texts.reset_index(drop=True)
        self.labels = labels.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.model = model
        self.model.eval()  # Ensure the model is in evaluation mode

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts.iloc[idx]
        label = self.labels.iloc[idx]

        # Tokenize the text with truncation but no padding
        inputs = self.tokenizer(
            text,
            return_tensors='pt',
            padding=False,  # No padding here (dynamic padding in collate_fn)
            truncation=True,  # Truncate to max_length
            max_length=512  # Set a maximum length (in bytes)
        )

        input_ids = inputs['input_ids'].squeeze(0)  # Shape [seq_len]
        attention_mask = inputs['attention_mask'].squeeze(0)  # Shape [seq_len]

        # Move inputs to the same device as the model
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)

        # Get embeddings from the model
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids.unsqueeze(0), attention_mask=attention_mask.unsqueeze(0))
            embeddings = outputs.last_hidden_state.squeeze(0)  # Shape [seq_len, hidden_dim]

        # Move data back to CPU for DataLoader
        embeddings = embeddings.cpu()
        attention_mask = attention_mask.cpu()

        return {
            'embeddings': embeddings,
            'attention_mask': attention_mask,
            'labels': torch.tensor(label, dtype=torch.long)
        }

In [13]:
# Custom collate function
def custom_collate_fn(batch):
    embeddings_list = [item['embeddings'] for item in batch]
    attention_masks_list = [item['attention_mask'] for item in batch]
    labels = torch.tensor([item['labels'] for item in batch])

    max_seq_len = max([embeddings.size(0) for embeddings in embeddings_list])

    batch_size = len(batch)
    hidden_dim = embeddings_list[0].size(1)
    padded_embeddings = torch.zeros(batch_size, max_seq_len, hidden_dim)
    padded_attention_masks = torch.zeros(batch_size, max_seq_len, dtype=torch.long)

    for i in range(batch_size):
        seq_len = embeddings_list[i].size(0)
        padded_embeddings[i, :seq_len, :] = embeddings_list[i]
        padded_attention_masks[i, :seq_len] = attention_masks_list[i]

    return {
        'embeddings': padded_embeddings,
        'attention_mask': padded_attention_masks,
        'labels': labels
    }


In [14]:

# Fourier Positional Encoding
class FourierPositionalEncoding(nn.Module):
    def __init__(self, d_model, temperature=10000):
        super(FourierPositionalEncoding, self).__init__()
        self.d_model = d_model
        self.register_buffer(
            'freq_bands',
            torch.linspace(0, 1, steps=d_model // 2) * temperature
        )

    def forward(self, x):
        seq_len = x.size(1)
        positions = torch.arange(seq_len, dtype=torch.float, device=x.device).unsqueeze(-1)  # Shape [seq_len, 1]
        proj = positions * self.freq_bands  # Shape [seq_len, d_model/2]
        sin_encoding = torch.sin(proj)
        cos_encoding = torch.cos(proj)
        encoding = torch.cat([sin_encoding, cos_encoding], dim=-1)
        encoding = encoding.unsqueeze(0)  # Shape [1, seq_len, d_model]
        return x + encoding


In [15]:

# Model components
class LayerNormalization(nn.Module):
    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
        super(LayerNormalization, self).__init__()
        if isinstance(normalized_shape, int):
            normalized_shape = (normalized_shape,)
        self.normalized_shape = tuple(normalized_shape)
        self.eps = eps
        self.elementwise_affine = elementwise_affine
        if self.elementwise_affine:
            self.weight = nn.Parameter(torch.ones(normalized_shape))
            self.bias = nn.Parameter(torch.zeros(normalized_shape))
        else:
            self.register_parameter('weight', None)
            self.register_parameter('bias', None)

    def forward(self, x):
        dims = tuple(-(i + 1) for i in range(len(self.normalized_shape)))
        mean = x.mean(dim=dims, keepdim=True)
        var = x.var(dim=dims, unbiased=False, keepdim=True)
        x = (x - mean) / torch.sqrt(var + self.eps)
        if self.elementwise_affine:
            x = x * self.weight + self.bias
        return x



In [16]:
class MultiScaleAttention(nn.Module):
    def __init__(self, d_model, nhead, dropout=0.1):
        super(MultiScaleAttention, self).__init__()
        self.nhead = nhead
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.dropout = nn.Dropout(dropout)
        self.out_proj = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None, key_padding_mask=None):
        B, T, C = x.size()

        # Transform to query, key, value
        q = self.query(x).view(B, T, self.nhead, C // self.nhead).transpose(1, 2)  # [B, nhead, T, d_head]
        k = self.key(x).view(B, T, self.nhead, C // self.nhead).transpose(1, 2)
        v = self.value(x).view(B, T, self.nhead, C // self.nhead).transpose(1, 2)

        # Compute attention scores
        attn_scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(k.size(-1))  # [B, nhead, T, T]

        # Apply mask if provided
        if mask is not None:
            attn_scores = attn_scores.masked_fill(mask == 0, float('-inf'))

        # Apply key_padding_mask if provided
        if key_padding_mask is not None:
            # key_padding_mask shape: [B, T] -> [B, 1, 1, T]
            key_padding_mask = key_padding_mask.unsqueeze(1).unsqueeze(2)
            attn_scores = attn_scores.masked_fill(key_padding_mask, float('-inf'))

        # Compute attention weights
        attn_weights = torch.softmax(attn_scores, dim=-1)
        attn_weights = self.dropout(attn_weights)

        # Apply attention to values
        attn_output = torch.matmul(attn_weights, v)  # [B, nhead, T, d_head]
        attn_output = attn_output.transpose(1, 2).contiguous().view(B, T, C)

        # Final projection
        attn_output = self.out_proj(attn_output)

        return attn_output

In [17]:

class FeedForwardNetwork(nn.Module):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(FeedForwardNetwork, self).__init__()
        self.linear1 = nn.Linear(d_model, d_ff)
        self.activation = nn.ReLU()
        self.dropout = nn.Dropout(dropout)
        self.linear2 = nn.Linear(d_ff, d_model)

    def forward(self, x):
        return self.linear2(self.dropout(self.activation(self.linear1(x))))


In [18]:

class EncoderLayer(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward, dropout=0.1):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiScaleAttention(d_model, nhead, dropout)
        self.ffn = FeedForwardNetwork(d_model, dim_feedforward, dropout)
        self.norm1 = LayerNormalization(d_model)
        self.norm2 = LayerNormalization(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        src2 = self.self_attn(src)
        src = src + self.dropout(src2)  # Residual connection
        src = self.norm1(src)
        src2 = self.ffn(src)
        src = src + self.dropout(src2)  # Residual connection
        src = self.norm2(src)
        return src

        return src

class Encoder(nn.Module):
    def __init__(self, d_model, nhead, dim_feedforward, num_layers, dropout=0.1):
        super(Encoder, self).__init__()
        self.layers = nn.ModuleList([
            EncoderLayer(d_model, nhead, dim_feedforward, dropout)
            for _ in range(num_layers)
        ])

    def forward(self, src, src_mask=None, src_key_padding_mask=None):
        output = src
        for layer in self.layers:
            output = layer(output, src_mask=src_mask, src_key_padding_mask=src_key_padding_mask)
        return output


In [19]:

class ClassifierHead(nn.Module):
    def __init__(self, input_dim, num_classes, dropout_rate=0.3):
        super(ClassifierHead, self).__init__()
        self.dropout = nn.Dropout(dropout_rate)
        self.fc1 = nn.Linear(input_dim, input_dim // 2)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(input_dim // 2, num_classes)

    def forward(self, x):
        x = self.dropout(x)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        return x


In [20]:
class CombinedModel(nn.Module):
    def __init__(self, embedding_dim=1024, hidden_dim=256, num_classes=2, num_encoder_layers=6):
        super(CombinedModel, self).__init__()
        self.quant = torch.ao.quantization.QuantStub()  # Add QuantStub
        self.pos_encoder = FourierPositionalEncoding(embedding_dim)
        self.encoder = Encoder(embedding_dim, nhead=8, dim_feedforward=hidden_dim * 6, num_layers=num_encoder_layers)
        self.classifier = ClassifierHead(embedding_dim, num_classes)
        self.dequant = torch.ao.quantization.DeQuantStub()  # Add DeQuantStub

    def forward(self, embeddings, attention_mask):
        # Apply quantization
        embeddings = self.quant(embeddings)
        
        # Apply positional encoding
        embeddings = self.pos_encoder(embeddings)
        
        # Create src_key_padding_mask from attention_mask
        src_key_padding_mask = attention_mask == 0
        
        # Pass through the encoder
        encoder_output = self.encoder(embeddings, src_key_padding_mask=src_key_padding_mask)
        
        # Masked mean pooling
        masked_encoder_output = encoder_output * attention_mask.unsqueeze(-1)
        sum_embeddings = masked_encoder_output.sum(dim=1)
        lengths = attention_mask.sum(dim=1).unsqueeze(-1)
        pooled_output = sum_embeddings / lengths
        
        # Classifier
        logits = self.classifier(pooled_output)
        
        # Dequantize the output
        logits = self.dequant(logits)
        
        return logits


In [21]:
# Split the combined training data into training and validation sets
def split_data(texts, labels, tokenizer, model, train_size=0.8, val_size=0.2, batch_size=64):
    assert train_size + val_size == 1, "Split proportions must sum to 1."
    
    # Create the dataset
    dataset = FakeNewsDataset(texts, labels, tokenizer, model)
    
    # Split the dataset
    train_len = int(train_size * len(dataset))
    val_len = len(dataset) - train_len
    train_dataset, val_dataset = random_split(dataset, [train_len, val_len])
    
    # Create DataLoaders with the custom collate function
    train_dataloader = DataLoader(
        train_dataset, batch_size=batch_size, shuffle=True, pin_memory=True, collate_fn=custom_collate_fn
    )
    val_dataloader = DataLoader(
        val_dataset, batch_size=batch_size, shuffle=False, pin_memory=True, collate_fn=custom_collate_fn
    )
    
    return train_dataloader, val_dataloader

In [22]:
train_dataloader, val_dataloader = split_data(
    english_train['statement'], english_train['Label'],
    byt5_tokenizer, en_model, train_size=0.8, val_size=0.2,
    batch_size=64
)

In [23]:
english_test_dataset = FakeNewsDataset(english_test['statement'], english_test['Label'], byt5_tokenizer, en_model)
english_test_dataloader = DataLoader(english_test_dataset, batch_size=64, shuffle=False, collate_fn=custom_collate_fn)

In [24]:

# Initialize the model and move it to device
model = CombinedModel()
model.to(device)

CombinedModel(
  (quant): QuantStub()
  (pos_encoder): FourierPositionalEncoding()
  (encoder): Encoder(
    (layers): ModuleList(
      (0-5): 6 x EncoderLayer(
        (self_attn): MultiScaleAttention(
          (query): Linear(in_features=1024, out_features=1024, bias=True)
          (key): Linear(in_features=1024, out_features=1024, bias=True)
          (value): Linear(in_features=1024, out_features=1024, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
          (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
        )
        (ffn): FeedForwardNetwork(
          (linear1): Linear(in_features=1024, out_features=1536, bias=True)
          (activation): ReLU()
          (dropout): Dropout(p=0.1, inplace=False)
          (linear2): Linear(in_features=1536, out_features=1024, bias=True)
        )
        (norm1): LayerNormalization()
        (norm2): LayerNormalization()
        (dropout): Dropout(p=0.1, inplace=False)
      )
    )
  )
  (classifier): 

In [29]:


# Set up optimizer and scheduler
def setup_optimizer_and_scheduler(model, learning_rate=2e-5, step_size=7, gamma=0.1, weight_decay=0.01):
    optimizer = optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=200)
    return optimizer, scheduler

# Loss function for binary classification
criterion = nn.CrossEntropyLoss()

# Set up optimizer and scheduler
optimizer, scheduler = setup_optimizer_and_scheduler(model)

from torch.cuda.amp import autocast, GradScaler

from tqdm import tqdm

from sklearn.metrics import precision_score, recall_score, confusion_matrix

def train_one_epoch(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    all_preds = []
    all_labels = []

    # Initialize GradScaler for mixed precision training
    scaler = GradScaler()

    # Wrap the dataloader with tqdm for a progress bar
    progress_bar = tqdm(dataloader, desc="Training", leave=False)

    for batch in progress_bar:
        embeddings = batch['embeddings'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()

        # Use autocast for mixed precision (CUDA)
        with torch.amp.autocast('cuda'):
            outputs = model(embeddings, attention_mask)
            loss = criterion(outputs, labels)

        # Scale the loss and backpropagate
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        # Accumulate loss and predictions
        total_loss += loss.item()
        _, preds = torch.max(outputs, 1)
        all_preds.extend(preds.detach().cpu().numpy())
        all_labels.extend(labels.detach().cpu().numpy())

        # Update the progress bar description with the current loss
        progress_bar.set_description(f"Training Loss: {loss.item():.4f}")

    # Calculate average loss and metrics
    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)

    # Calculate specificity
    tn, fp, fn, tp = confusion_matrix(all_labels, all_preds).ravel()
    specificity = tn / (tn + fp)

    return avg_loss, accuracy, f1, precision, recall, specificity

In [30]:
def evaluate(model, dataloader, criterion, device):
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []

    with torch.no_grad():
        for batch in dataloader:
            embeddings = batch['embeddings'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(embeddings, attention_mask)
            loss = criterion(outputs, labels)

            total_loss += loss.item()

            _, preds = torch.max(outputs, 1)
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    avg_loss = total_loss / len(dataloader)
    accuracy = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds)
    precision = precision_score(all_labels, all_preds)
    recall = recall_score(all_labels, all_preds)

    # Calculate specificity
    tn, fp, fn, tp = confusion_matrix(all_labels, all_preds).ravel()
    specificity = tn / (tn + fp)

    return avg_loss, accuracy, f1, precision, recall, specificity, all_preds, all_labels

In [31]:
def training_loop(model, train_dataloader, val_dataloader, optimizer, scheduler, criterion, num_epochs, device, patience=5):
    best_val_f1 = 0
    epochs_no_improve = 0

    for epoch in range(num_epochs):
        # Train for one epoch
        train_loss, train_accuracy, train_f1, train_precision, train_recall, train_specificity = train_one_epoch(model, train_dataloader, optimizer, criterion, device)

        # Evaluate on the validation set
        val_loss, val_accuracy, val_f1, val_precision, val_recall, val_specificity, val_preds, val_labels = evaluate(model, val_dataloader, criterion, device)

        # Step the learning rate scheduler
        scheduler.step()

        # Print training and validation metrics
        print(f"Epoch: {epoch+1:02}")
        print(f"\tTrain Loss: {train_loss:.3f} | Train Acc: {train_accuracy*100:.2f}% | Train F1: {train_f1:.3f} | Train Precision: {train_precision:.3f} | Train Recall: {train_recall:.3f} | Train Specificity: {train_specificity:.3f}")
        print(f"\tVal. Loss: {val_loss:.3f} | Val. Acc: {val_accuracy*100:.2f}% | Val. F1: {val_f1:.3f} | Val. Precision: {val_precision:.3f} | Val. Recall: {val_recall:.3f} | Val. Specificity: {val_specificity:.3f}")

        # Check for improvement in validation F1 score
        if val_f1 > best_val_f1:
            best_val_f1 = val_f1
            epochs_no_improve = 0

            # Save the best model checkpoint
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': val_loss,
                'val_preds': val_preds,
                'val_labels': val_labels,
            }, 'best_model_checkpoint.pth')
            print("\tModel saved!")
        else:
            epochs_no_improve += 1
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': val_loss,
                'val_preds': val_preds,
                'val_labels': val_labels,
            }, 'no_improve_model_checkpoint.pth')
            print("\t No Improve Model saved!")
            if epochs_no_improve == patience:
                print(f"Early stopping after {epoch+1} epochs.")
                break

    print('Training complete!')

In [32]:
num_epochs = 10  
training_loop(model, train_dataloader, val_dataloader, optimizer, scheduler, criterion, num_epochs, device)

                                                                        

Epoch: 01
	Train Loss: 0.527 | Train Acc: 71.49% | Train F1: 0.689 | Train Precision: 0.724 | Train Recall: 0.656 | Train Specificity: 0.769
	Val. Loss: 0.397 | Val. Acc: 80.20% | Val. F1: 0.790 | Val. Precision: 0.806 | Val. Recall: 0.774 | Val. Specificity: 0.827
	Model saved!


                                                                        

Epoch: 02
	Train Loss: 0.357 | Train Acc: 80.93% | Train F1: 0.790 | Train Precision: 0.839 | Train Recall: 0.745 | Train Specificity: 0.868
	Val. Loss: 0.327 | Val. Acc: 82.46% | Val. F1: 0.815 | Val. Precision: 0.824 | Val. Recall: 0.807 | Val. Specificity: 0.841
	Model saved!


                                                                        

Epoch: 03
	Train Loss: 0.327 | Train Acc: 82.61% | Train F1: 0.810 | Train Precision: 0.851 | Train Recall: 0.773 | Train Specificity: 0.875
	Val. Loss: 0.320 | Val. Acc: 83.55% | Val. F1: 0.830 | Val. Precision: 0.824 | Val. Recall: 0.836 | Val. Specificity: 0.835
	Model saved!


                                                                        

Epoch: 04
	Train Loss: 0.307 | Train Acc: 83.82% | Train F1: 0.826 | Train Precision: 0.855 | Train Recall: 0.799 | Train Specificity: 0.874
	Val. Loss: 0.287 | Val. Acc: 84.96% | Val. F1: 0.838 | Val. Precision: 0.866 | Val. Recall: 0.813 | Val. Specificity: 0.884
	Model saved!


                                                                        

Epoch: 05
	Train Loss: 0.290 | Train Acc: 85.17% | Train F1: 0.841 | Train Precision: 0.866 | Train Recall: 0.818 | Train Specificity: 0.883
	Val. Loss: 0.278 | Val. Acc: 85.69% | Val. F1: 0.844 | Val. Precision: 0.883 | Val. Recall: 0.809 | Val. Specificity: 0.901
	Model saved!


                                                                        

Epoch: 06
	Train Loss: 0.274 | Train Acc: 85.89% | Train F1: 0.849 | Train Precision: 0.872 | Train Recall: 0.827 | Train Specificity: 0.888
	Val. Loss: 0.273 | Val. Acc: 86.07% | Val. F1: 0.848 | Val. Precision: 0.892 | Val. Recall: 0.808 | Val. Specificity: 0.910
	Model saved!


                                                                        

Epoch: 07
	Train Loss: 0.264 | Train Acc: 86.55% | Train F1: 0.857 | Train Precision: 0.875 | Train Recall: 0.839 | Train Specificity: 0.890
	Val. Loss: 0.263 | Val. Acc: 86.98% | Val. F1: 0.864 | Val. Precision: 0.866 | Val. Recall: 0.863 | Val. Specificity: 0.876
	Model saved!


                                                                        

Epoch: 08
	Train Loss: 0.253 | Train Acc: 87.06% | Train F1: 0.863 | Train Precision: 0.880 | Train Recall: 0.845 | Train Specificity: 0.894
	Val. Loss: 0.259 | Val. Acc: 87.08% | Val. F1: 0.862 | Val. Precision: 0.883 | Val. Recall: 0.842 | Val. Specificity: 0.897
	 No Improve Model saved!


                                                                        

Epoch: 09
	Train Loss: 0.241 | Train Acc: 87.71% | Train F1: 0.870 | Train Precision: 0.885 | Train Recall: 0.855 | Train Specificity: 0.898
	Val. Loss: 0.261 | Val. Acc: 86.86% | Val. F1: 0.852 | Val. Precision: 0.926 | Val. Recall: 0.789 | Val. Specificity: 0.942
	 No Improve Model saved!


                                                                        

Epoch: 10
	Train Loss: 0.232 | Train Acc: 88.16% | Train F1: 0.874 | Train Precision: 0.892 | Train Recall: 0.857 | Train Specificity: 0.904
	Val. Loss: 0.248 | Val. Acc: 87.46% | Val. F1: 0.867 | Val. Precision: 0.885 | Val. Recall: 0.849 | Val. Specificity: 0.899
	Model saved!
Training complete!


In [26]:
# Load the checkpoint
checkpoint = torch.load('/kaggle/working/best_model_checkpoint.pth')  # Replace with your checkpoint file name

# Load the model state
model.load_state_dict(checkpoint['model_state_dict'])

# Load the optimizer state
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])

# Load the scheduler state (if used)

if 'scheduler_state_dict' in checkpoint:
    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])

# Get the epoch number (if saved in the checkpoint)
start_epoch = checkpoint.get('epoch', 0) + 1  # Start from the next epoch

You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


In [None]:
num_epochs = 10  
training_loop(model, train_dataloader, val_dataloader, optimizer, scheduler, criterion, num_epochs, device)

In [27]:
checkpoint = torch.load('/kaggle/working/best_model_checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])

You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.


<All keys matched successfully>

In [28]:
import torch
import torch.onnx

# Assuming you have already loaded the best model checkpoint
checkpoint = torch.load('best_model_checkpoint.pth')
model.load_state_dict(checkpoint['model_state_dict'])
model.to(device)

# Set the model to evaluation mode
model.eval()

# Create a dummy input with the same shape as the model's input
# The input shape should match the expected input of the model
# For example, if your model expects embeddings of shape [batch_size, seq_len, embedding_dim]
batch_size = 1
seq_len = 512  # Adjust based on your model's expected input sequence length
embedding_dim = 1024  # Adjust based on your model's embedding dimension

dummy_input = torch.randn(batch_size, seq_len, embedding_dim).to(device)
dummy_attention_mask = torch.ones(batch_size, seq_len, dtype=torch.long).to(device)

# Export the model to ONNX format
onnx_model_path = "best_model.onnx"
torch.onnx.export(
    model,  # Model to be exported
    (dummy_input, dummy_attention_mask),  # Model input (tuple or tensor)
    onnx_model_path,  # Output file path
    export_params=True,  # Store the trained parameter weights inside the model file
    opset_version=11,  # ONNX opset version to export the model to
    do_constant_folding=True,  # Whether to execute constant folding for optimization
    input_names=['embeddings', 'attention_mask'],  # Input names
    output_names=['logits'],  # Output names
    dynamic_axes={  # Dynamic axes for variable-length inputs
        'embeddings': {0: 'batch_size', 1: 'seq_len'},  # Batch and sequence length can vary
        'attention_mask': {0: 'batch_size', 1: 'seq_len'},  # Batch and sequence length can vary
        'logits': {0: 'batch_size'}  # Batch size can vary
    }
)

print(f"Model saved in ONNX format at {onnx_model_path}")

You are using `torch.load` with `weights_only=False` (the current default value), which uses the default pickle module implicitly. It is possible to construct malicious pickle data which will execute arbitrary code during unpickling (See https://github.com/pytorch/pytorch/blob/main/SECURITY.md#untrusted-models for more details). In a future release, the default value for `weights_only` will be flipped to `True`. This limits the functions that could be executed during unpickling. Arbitrary objects will no longer be allowed to be loaded via this mode unless they are explicitly allowlisted by the user via `torch.serialization.add_safe_globals`. We recommend you start setting `weights_only=True` for any use case where you don't have full control of the loaded file. Please open an issue on GitHub for any issues related to this experimental feature.
Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will 

Model saved in ONNX format at best_model.onnx


In [29]:
import onnx

# Load the ONNX model
onnx_model = onnx.load(onnx_model_path)

# Check that the model is well formed
onnx.checker.check_model(onnx_model)

print("ONNX model is valid!")

ONNX model is valid!


In [None]:
# Evaluate on English test set
english_loss, english_accuracy, english_f1, english_precision, english_recall, english_specificity, _, _ = evaluate(model, english_test_dataloader, criterion, device)
print(f"English Test Loss: {english_loss:.3f} | English Test Acc: {english_accuracy*100:.2f}% | English Test F1: {english_f1:.3f} | English Test Precision: {english_precision:.3f} | English Test Recall: {english_recall:.3f} | English Test Specificity: {english_specificity:.3f}")

# Quantization Aware Training

In [None]:
model.eval()  # Ensure the model is in evaluation mode
model = quantization.convert(model, inplace=True)

# Save the quantized model
torch.save(model.state_dict(), 'quantized_model.pth')

In [None]:
import torch
import requests
import json
import re
from collections import Counter
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
from sklearn.feature_selection import mutual_info_classif
from scipy.sparse import csr_matrix
import torch.nn.functional as F
import os
from datetime import datetime

In [None]:
class AdaptiveSparseEncoder:
    def __init__(self, initial_features=1024, target_sparsity=0.1, adaptation_rate=0.01):
        self.initial_features = initial_features
        self.target_sparsity = target_sparsity
        self.adaptation_rate = adaptation_rate
        self.feature_importance = torch.ones(initial_features, device=device)
        self.feature_mask = torch.ones(initial_features, device=device)
        self.selected_features = None
        self.feature_history = []
        
    def update_feature_importance(self, embeddings, labels):
        """Update feature importance based on mutual information with labels"""
        with torch.no_grad():
            # Convert to numpy for mutual information calculation
            embeddings_np = embeddings.cpu().numpy()
            labels_np = labels.cpu().numpy()
            
            # Calculate mutual information for each feature
            mi_scores = mutual_info_classif(embeddings_np, labels_np)
            new_importance = torch.tensor(mi_scores, device=device)
            
            # Exponential moving average update
            self.feature_importance = (1 - self.adaptation_rate) * self.feature_importance + \
                                    self.adaptation_rate * new_importance
            
            # Update feature mask based on target sparsity
            k = int(self.initial_features * self.target_sparsity)
            top_k_values, top_k_indices = torch.topk(self.feature_importance, k)
            self.feature_mask.zero_()
            self.feature_mask[top_k_indices] = 1.0
            
            # Store selected features for analysis
            self.selected_features = top_k_indices.cpu().numpy()
            self.feature_history.append(self.selected_features)
    
    def encode(self, embeddings):
        """Apply sparse encoding to embeddings"""
        return embeddings * self.feature_mask
    
    def get_active_features(self):
        """Return indices of currently active features"""
        return torch.nonzero(self.feature_mask).squeeze()

In [None]:
def extract_keywords(text):
     stopwords = {
    'في', 'من', 'على', 'و', 'إلى', 'عن', 'مع', 'كان', 'هذا', 'ذلك', 'هي', 'هو',
    'لـ', 'كانوا', 'كانت', 'كما', 'أيضاً', 'بعد', 'قبل', 'أو', 'إذا', 'ما', 'لا',
    'لم', 'لن', 'ليس', 'لكن', 'حتى', 'بين', 'بما', 'ذلك', 'بعد',
    'أكثر', 'أقل', 'بعض', 'لذلك', 'هناك', 'هؤلاء', 'الذين', 'اللواتي', 'اللاتي',
    'الذي', 'التي', 'الذين', 'اللاتين', 'إليكم', 'إليكن', 'هناك', 'أياً', 'كل',
    'كلما', 'أي', 'أيضاً', 'أيها', 'أيا', 'أيّ', 'امام', 'أيما', 'أين', 'أم',
    'أمام', 'أمامك', 'أمامكم', 'أمامهم', 'أمامهن', 'أمامنا', 'أيضا', 'أحد', 'أحدهم',
    'أحدهن', 'آخر', 'آخرون', 'آخرين', 'كلية', 'كليةً'
}

     words = re.findall(r'\b\w+\b', text.lower())
     keywords = [word for word in words if word not in stopwords and len(word) > 2]
     return keywords

In [None]:
def get_recent_keywords():
    url = "https://google.serper.dev/search?engine=google_news&q=latest&apiKey=a4efc96fdbe0af101682967e30c852ec04aaf725"
    response = requests.get(url)
    
    if response.status_code != 200:
        print("Error fetching recent news:", response.status_code)
        return []
    
    response_data = response.json()
    keywords = []
    
    organic_items = response_data.get('organic', [])
    for item in organic_items:
        keywords.extend(extract_keywords(item.get('title', '') + ' ' + item.get('snippet', '')))

    keyword_counts = Counter(keywords)
    most_common_keywords = [keyword for keyword, count in keyword_counts.most_common(10)]
    
    return most_common_keywords

In [None]:
def get_fact_checked_news(keywords):
    api_key = 'AIzaSyB8CWmQx1lxf_ByZiCo7GduINgchV0BLOc'
    results = []

    for query in keywords:
        encoded_query = requests.utils.quote(query)
        url = f'https://factchecktools.googleapis.com/v1alpha1/claims:search?query={encoded_query}&key={api_key}'
        response = requests.get(url)
        
        if response.status_code == 200:
            data = response.json()
            if 'claims' in data:
                for claim in data['claims']:
                    for review in claim.get('claimReview', []):
                        if review.get('languageCode') == 'ar':
                            results.append({
                                'text': claim.get('text', ''),
                                'rating': review.get('textualRating', ''),
                                'date': review.get('reviewDate', ''),
                                'url': review.get('url', '')
                            })
    return results

In [None]:
# Custom Dataset for news articles
class NewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        inputs = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        
        return {
            'input_ids': inputs['input_ids'].squeeze(0),
            'attention_mask': inputs['attention_mask'].squeeze(0),
            'labels': torch.tensor(self.labels[idx], dtype=torch.long)
        }

In [None]:
# Modified Combined Model with Sparse Encoding
class AdaptiveSparseModel(nn.Module):
    def __init__(self, embedding_dim=1024, hidden_dim=256, num_classes=2, num_encoder_layers=12):
        super(AdaptiveSparseModel, self).__init__()
        self.sparse_encoder = AdaptiveSparseEncoder(embedding_dim)
        self.pos_encoder = FourierPositionalEncoding(embedding_dim)  # From your original code
        self.encoder = Encoder(embedding_dim, nhead=8, dim_feedforward=hidden_dim * 6, 
                             num_layers=num_encoder_layers)  # From your original code
        self.classifier = ClassifierHead(embedding_dim, num_classes)  # From your original code
        
    def forward(self, embeddings, attention_mask, labels=None, training=True):
        # Apply sparse encoding
        if training and labels is not None:
            self.sparse_encoder.update_feature_importance(embeddings.mean(dim=1), labels)
        
        sparse_embeddings = self.sparse_encoder.encode(embeddings)
        
        # Apply positional encoding
        pos_embeddings = self.pos_encoder(sparse_embeddings)
        
        # Create padding mask
        src_key_padding_mask = attention_mask == 0
        
        # Pass through encoder
        encoder_output = self.encoder(pos_embeddings, src_key_padding_mask=src_key_padding_mask)
        
        # Masked mean pooling
        masked_encoder_output = encoder_output * attention_mask.unsqueeze(-1)
        sum_embeddings = masked_encoder_output.sum(dim=1)
        lengths = attention_mask.sum(dim=1).unsqueeze(-1)
        pooled_output = sum_embeddings / lengths
        
        # Classification
        logits = self.classifier(pooled_output)
        return logits

In [None]:
class EWC:
    def __init__(self, model, dataloader, criterion, device):
        self.model = model
        self.dataloader = dataloader
        self.criterion = criterion
        self.device = device
        self.fisher_matrix = None
        self.optimal_params = None

    def compute_fisher_information(self):
        self.model.eval()
        fisher_matrix = {name: torch.zeros_like(param) for name, param in self.model.named_parameters()}

        for batch in self.dataloader:
            # Extract inputs from the batch
            input_ids = batch['input_ids'].to(self.device)
            attention_mask = batch['attention_mask'].to(self.device)
            labels = batch['labels'].to(self.device)

            # Compute embeddings using en
            with torch.no_grad():
                outputs = en_model(input_ids=input_ids, attention_mask=attention_mask)
                embeddings = outputs.last_hidden_state

            self.model.zero_grad()
            # Forward pass through the model
            logits = self.model(embeddings, attention_mask)
            loss = self.criterion(logits, labels)
            loss.backward()

            # Accumulate the Fisher information
            for name, param in self.model.named_parameters():
                if param.grad is not None:  # Avoid parameters without gradients
                    fisher_matrix[name] += param.grad.data ** 2 / len(self.dataloader)

        self.fisher_matrix = fisher_matrix


    def save_optimal_params(self):
        self.optimal_params = {name: param.clone() for name, param in self.model.named_parameters()}

    def ewc_loss(self):
        loss = 0
        for name, param in self.model.named_parameters():
            if name in self.fisher_matrix:
                loss += (self.fisher_matrix[name] * (param - self.optimal_params[name]) ** 2).sum()
        return loss


In [None]:
class ContinuousLearningPipeline:
    def __init__(self, model_path='best_model_checkpoint.pth', save_dir='model_checkpoints'):
        self.model = AdaptiveSparseModel().to(device)
        self.save_dir = save_dir
        self.ewc = None  # Initialize EWC as None
        if not os.path.exists(save_dir):
            os.makedirs(save_dir)
        self.load_checkpoint(model_path)
        self.optimizer = torch.optim.AdamW(self.model.parameters(), lr=2e-5)
        self.criterion = nn.CrossEntropyLoss()
        self.best_performance = float('inf')  # Track best loss for saving best model
        
    def load_checkpoint(self, model_path):
        if os.path.exists(model_path):
            checkpoint = torch.load(model_path)
            self.model.load_state_dict(checkpoint['model_state_dict'])
            if 'best_performance' in checkpoint:
                self.best_performance = checkpoint['best_performance']
            print(f"Loaded model checkpoint from {model_path}")
        else:
            print(f"No checkpoint found at {model_path}, starting with fresh model")
    
    def save_checkpoint(self, performance_metric, is_best=False):
        """Save model checkpoint with timestamp and performance metric"""
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        
        # Prepare checkpoint data
        checkpoint = {
            'model_state_dict': self.model.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'feature_importance': self.model.sparse_encoder.feature_importance,
            'feature_mask': self.model.sparse_encoder.feature_mask,
            'best_performance': performance_metric,
            'timestamp': timestamp,
        }
        if self.ewc:
            checkpoint['fisher_matrix'] = self.ewc.fisher_matrix
            checkpoint['optimal_params'] = self.ewc.optimal_params

        # Save regular checkpoint
        checkpoint_path = os.path.join(self.save_dir, f'checkpoint_{timestamp}.pth')
        torch.save(checkpoint, checkpoint_path)
        print(f"Saved checkpoint to {checkpoint_path}")
        
        # Save best model if this is the best performance
        if is_best:
            best_model_path = os.path.join(self.save_dir, 'best_model_checkpoint.pth')
            torch.save(checkpoint, best_model_path)
            print(f"Saved best model to {best_model_path}")

    def analyze_feature_adaptation(self):
            """Analyze how features have adapted over time"""
            feature_history = self.model.sparse_encoder.feature_history
            if len(feature_history) > 1:
                feature_stability = np.zeros(len(feature_history) - 1)
                for i in range(len(feature_history) - 1):
                    feature_stability[i] = len(set(feature_history[i]) & set(feature_history[i + 1])) / \
                                    len(feature_history[i])
                return {
                    'feature_stability': feature_stability.mean(),
                    'active_features': len(self.model.sparse_encoder.get_active_features()),
                    'importance_stats': {
                        'mean': float(self.model.sparse_encoder.feature_importance.mean()),
                        'std': float(self.model.sparse_encoder.feature_importance.std())
                    }
                }
            return None
    
    def predict_with_confidence(self, text):
            """Make prediction with confidence scores and feature importance analysis"""
            self.model.eval()
            with torch.no_grad():
                # Tokenize and get embeddings
                inputs = byt5_tokenizer(text, return_tensors='pt', padding=True, truncation=True, 
                                   max_length=512).to(device)
                mpnet_outputs = en_model(**inputs)
                embeddings = mpnet_outputs.last_hidden_state
            
                # Get prediction
                logits = self.model(embeddings, inputs['attention_mask'], training=False)
                probabilities = F.softmax(logits, dim=1)
                prediction = torch.argmax(probabilities, dim=1)
            
                # Get active features for this prediction
                active_features = self.model.sparse_encoder.get_active_features()
            
                return {
                    'prediction': prediction.item(),
                    'confidence': float(probabilities.max()),
                    'active_features_count': len(active_features),
                    'top_feature_indices': active_features[:10].cpu().tolist()
                }

    def update_model(self, new_texts, new_labels):
        """Update model with new data and save checkpoints"""
        self.model.train()
        
        # Create dataset from new examples
        dataset = NewsDataset(new_texts, new_labels, byt5_tokenizer)
        dataloader = DataLoader(dataset, batch_size=16, shuffle=True)
        
        total_loss = 0
        num_batches = 0
        
        for batch in dataloader:
            # Move batch to device
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)
            
            # Get embeddings
            with torch.no_grad():
                outputs = en_model(input_ids=input_ids, attention_mask=attention_mask)
                embeddings = outputs.last_hidden_state
            
            # Forward pass with labels for feature adaptation
            self.optimizer.zero_grad()
            logits = self.model(embeddings, attention_mask, labels, training=True)
            loss = self.criterion(logits, labels)
            
            # Add EWC loss if applicable
            if self.ewc:
                loss += self.ewc.ewc_loss()
            
            # Backward pass
            loss.backward()
            self.optimizer.step()
            
            total_loss += loss.item()
            num_batches += 1
        
        # Calculate average loss
        avg_loss = total_loss / num_batches if num_batches > 0 else float('inf')
        
        # Save checkpoint if this is the best performance
        is_best = avg_loss < self.best_performance
        if is_best:
            self.best_performance = avg_loss
        
        self.save_checkpoint(avg_loss, is_best)
        
        # Compute Fisher Information and Save Optimal Params for EWC
        self.ewc = EWC(self.model, dataloader, self.criterion, device)
        self.ewc.compute_fisher_information()
        self.ewc.save_optimal_params()
        
        # Analyze feature adaptation
        adaptation_stats = self.analyze_feature_adaptation()
        return adaptation_stats

In [None]:
def run_continuous_pipeline():
    pipeline = ContinuousLearningPipeline()
    
    # Fetch and process new data
    keywords = get_recent_keywords()
    if keywords:
        fact_checked_results = get_fact_checked_news(keywords)
        
        # Process new examples
        new_texts = []
        new_labels = []
        
        print("\nAnalyzing and learning from new claims...")
        for result in fact_checked_results:
            text = result['text']
            
            # Make prediction
            prediction_result = pipeline.predict_with_confidence(text)
            
            print(f"\nClaim: {text}")
            print(f"Model Prediction: {'Fake' if prediction_result['prediction'] == 1 else 'Real'}")
            print(f"Confidence: {prediction_result['confidence']*100:.2f}%")
            print(f"Active Features: {prediction_result['active_features_count']}")
            
            # Add to training data if we have ground truth
            if 'rating' in result:
                label = 1 if 'false' in result['rating'].lower() else 0
                new_texts.append(text)
                new_labels.append(label)
                
                # Display actual label alongside prediction
                print(f"Actual Label: {'Fake' if label == 1 else 'Real'}") # Added this line

        
        # Update model if we have new labeled data
        if new_texts:
            print("\nUpdating model with new data...")
            adaptation_stats = pipeline.update_model(new_texts, new_labels)
            
            if adaptation_stats:
                print("\nFeature Adaptation Statistics:")
                print(f"Feature Stability: {adaptation_stats['feature_stability']:.3f}")
                print(f"Active Features: {adaptation_stats['active_features']}")
                print(f"Feature Importance - Mean: {adaptation_stats['importance_stats']['mean']:.3f}, "
                      f"Std: {adaptation_stats['importance_stats']['std']:.3f}")
    
    print("\nPipeline completed successfully.")

In [None]:
if __name__ == "__main__":
    run_continuous_pipeline()

In [None]:
def predict_and_justify_with_lime_and_shap(model, text, tokenizer, en_model):
    model.eval()  # Set the model to evaluation mode

    # Tokenize and get model prediction
    with torch.no_grad():  # Disable gradient calculations
        inputs = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512).to(device)
        mpnet_outputs = en_model(**inputs)
        embeddings = mpnet_outputs.last_hidden_state
        logits = model(embeddings, inputs['attention_mask'], training=False)  # training=False for AdaptiveSparseModel
        probabilities = F.softmax(logits, dim=1)
        prediction = torch.argmax(probabilities, dim=1).item()  # Get the predicted class (0 or 1)

    
    # SHAP Integration
    def model_predict_shap(texts):
        if isinstance(texts, str):
            texts = [texts]
        elif not isinstance(texts, list):
            texts = list(texts)
        texts = [str(text) for text in texts]

        # Tokenize and predict
        inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        with torch.no_grad():
            mpnet_outputs = en_model(**inputs)
            embeddings = mpnet_outputs.last_hidden_state
            logits = model(embeddings, inputs['attention_mask'], training=False)

        return logits.cpu().numpy()

    # Create SHAP explainer
    explainer_shap = shap.Explainer(model_predict_shap, masker=shap.maskers.Text(tokenizer))

    # Generate SHAP values
    shap_values = explainer_shap([text])

    # Display words with SHAP values
    words = tokenizer.tokenize(text)
    word_shap_values = shap_values.values[0]

    print("\nWords with SHAP Values:")
    for word, shap_value in zip(words, word_shap_values):
        print(f"Word: {word}, SHAP Value: {shap_value}")

    # LIME Integration
    class_names = ['Real', 'Fake']  # Define class names for LIME visualization

    def model_predict_lime(texts):
        if isinstance(texts, str):
            texts = [texts]
        elif not isinstance(texts, list):
            texts = list(texts)
        texts = [str(text) for text in texts]

        # Tokenize and predict
        inputs = tokenizer(texts, return_tensors='pt', padding=True, truncation=True, max_length=512)
        inputs = {key: val.to(device) for key, val in inputs.items()}

        with torch.no_grad():
            mpnet_outputs = en_model(**inputs)
            embeddings = mpnet_outputs.last_hidden_state
            logits = model(embeddings, inputs['attention_mask'], training=False)
            probabilities = F.softmax(logits, dim=1)

        return probabilities.cpu().numpy()

    # Initialize LIME explainer
    explainer_lime = LimeTextExplainer(class_names=class_names)

    # Generate LIME explanation
    lime_explanation = explainer_lime.explain_instance(
        text,  # Input text
        model_predict_lime,  # Prediction function
        num_features=10,  # Number of top words to display
        labels=[0, 1]  # Labels to explain (both Real and Fake classes)
    )

    # Visualize LIME explanation for the predicted class
    print("\nLIME Explanation:")
    lime_explanation.show_in_notebook(text=True)

    # Save LIME explanation as an HTML file (optional)
    lime_explanation.save_to_file('lime_explanation.html')

    # Return all results
    return {
        'prediction': prediction,  # 0 or 1
        'probability': probabilities[0][prediction].item(),  # Probability of predicted class
        'shap_values': shap_values.values[0].tolist(),  # SHAP values
        'shap_base_values': shap_values.base_values[0].tolist(),  # SHAP base values
        'lime_explanation': lime_explanation,
    }


In [None]:
model = AdaptiveSparseModel() #Or CombinedModel (according to your use case)
model.load_state_dict(torch.load('best_model_checkpoint.pth')['model_state_dict'])
model.to(device)

In [None]:
# Example usage
new_claim = """اغتيال عالم الكيمياء الدكتور حمدي"""

result = predict_and_justify_with_lime_and_shap(model, new_claim, byt5_tokenizer, en_model)

# Print the results
print(f"Claim: {new_claim}")
print(f"Prediction: {'Fake' if result['prediction'] == 1 else 'Real'}")
print(f"Probability: {result['probability']:.4f}")

# If LIME explanation is saved, inform the user
print("\nLIME explanation saved as 'lime_explanation.html'. Open the file in a browser to view it.")

In [None]:
# Step 1: Reinitialize the scaler
scaler = torch.cuda.amp.GradScaler()

# Step 2: Load the existing checkpoint
checkpoint_path = 'best_model_checkpoint.pth'
checkpoint = torch.load(checkpoint_path)

# Step 3: Add the scaler state to the checkpoint
checkpoint['scaler_state_dict'] = scaler.state_dict()

# Step 4: Save the updated checkpoint
torch.save(checkpoint, 'updated_best_model_checkpoint.pth')