In [1]:
!pip install torch transformers



In [11]:
!pip install supabase

Collecting supabase
  Downloading supabase-2.4.5-py3-none-any.whl.metadata (9.3 kB)
Collecting gotrue<3.0,>=1.3 (from supabase)
  Downloading gotrue-2.4.2-py3-none-any.whl.metadata (6.1 kB)
Collecting postgrest<0.17.0,>=0.14 (from supabase)
  Downloading postgrest-0.16.4-py3-none-any.whl.metadata (5.1 kB)
Collecting realtime<2.0.0,>=1.0.0 (from supabase)
  Downloading realtime-1.0.4-py3-none-any.whl.metadata (2.6 kB)
Collecting storage3<0.8.0,>=0.5.3 (from supabase)
  Downloading storage3-0.7.4-py3-none-any.whl.metadata (1.9 kB)
Collecting supafunc<0.5.0,>=0.3.1 (from supabase)
  Downloading supafunc-0.4.5-py3-none-any.whl.metadata (1.2 kB)
Collecting strenum<0.5.0,>=0.4.9 (from postgrest<0.17.0,>=0.14->supabase)
  Downloading StrEnum-0.4.15-py3-none-any.whl.metadata (5.3 kB)
Collecting typing-extensions<5.0.0,>=4.11.0 (from realtime<2.0.0,>=1.0.0->supabase)
  Downloading typing_extensions-4.11.0-py3-none-any.whl.metadata (3.0 kB)
Downloading supabase-2.4.5-py3-none-any.whl (15 kB)
Dow

In [19]:
!huggingface-cli login --token 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

Token has not been saved to git credential helper. Pass `add_to_git_credential=True` if you want to set the git credential as well.
Token is valid (permission: write).
Your token has been saved to /root/.cache/huggingface/token
Login successful


In [12]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm 
import torch
from torch.utils.data import DataLoader, TensorDataset
from torch.nn import CrossEntropyLoss
from transformers import AutoTokenizer, RobertaForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report, confusion_matrix
from kaggle_secrets import UserSecretsClient
from supabase import create_client, ClientOptions
import matplotlib.pyplot as plt
import seaborn as sns

In [4]:
class EWC:
    """Elastic Weight Consolidation penalty anchored at a model's current weights.

    At construction time the weights of ``prior_model`` are snapshotted and a
    diagonal Fisher-information estimate is computed from ``data_samples``.
    ``compute_penalty_loss`` then returns
    ``0.5 * lambda_ * sum_i F_i * (theta_i - theta_prior_i) ** 2`` as a
    differentiable torch scalar, so adding it to a training loss pulls the
    parameters back towards the snapshot.

    Attributes:
        prior_model: the model the snapshot was taken from.
        prior_weights: list of NumPy arrays, one per parameter, holding the snapshot.
        fisher_matrix: list of NumPy arrays, one per parameter (diagonal Fisher).
        num_sample: number of random draws used for the Fisher estimate.
        data_samples: tensor whose rows are fed to the model one at a time.
        lambda_: strength of the penalty.
    """

    def __init__(self, prior_model, data_samples, num_sample=30, lambda_=0.1):
        """Snapshot ``prior_model`` and estimate its Fisher information.

        Args:
            prior_model: callable ``torch.nn.Module`` whose output exposes ``.logits``.
            data_samples: tensor indexed along dim 0; each row is passed to the
                model as a batch of one.
            num_sample: how many rows to draw (with replacement) for the estimate.
            lambda_: multiplier applied to the penalty.

        Raises:
            ValueError: if ``num_sample`` is not positive or ``data_samples`` is empty.
        """
        self.prior_model = prior_model
        # Clone so the anchor is a true snapshot: on CPU, ``.numpy()`` of a live
        # parameter shares memory with it and would silently follow the training.
        self._prior_tensors = [weight.detach().clone() for weight in prior_model.parameters()]
        self.prior_weights = [anchor.cpu().numpy() for anchor in self._prior_tensors]
        self.num_sample = num_sample
        self.data_samples = data_samples
        self.lambda_ = lambda_
        # compute_fisher() also caches the torch version used by the penalty.
        self.fisher_matrix = self.compute_fisher()

    def compute_fisher(self):
        """Estimate the diagonal Fisher information of ``prior_model``.

        ``num_sample`` rows are drawn with replacement from ``data_samples``
        (using ``np.random``). For each one the cross-entropy between the logits
        and the model's own predicted class is differentiated with respect to
        the parameters, and the squared gradients are averaged.

        The model's predicted class is used as the target because no labels are
        available here; it is a common deterministic approximation of sampling
        the label from the model's output distribution.

        NOTE: only the raw sample is passed to the model (no attention mask).

        Returns:
            list of NumPy arrays, one per model parameter, in parameter order.

        Raises:
            ValueError: if ``num_sample`` is not positive or ``data_samples`` is empty.
        """
        available = self.data_samples.shape[0]
        if self.num_sample <= 0 or available == 0:
            raise ValueError(
                "EWC needs at least one data sample and num_sample >= 1 "
                "to estimate the Fisher information"
            )

        params = list(self.prior_model.parameters())
        device = params[0].device
        # Frozen parameters cannot be differentiated; they keep a zero Fisher entry.
        trainable = [(pos, p) for pos, p in enumerate(params) if p.requires_grad]
        trainable_params = [p for _, p in trainable]
        # Accumulate on the model's device to avoid a device->host copy per sample.
        fisher_accum = [torch.zeros_like(p) for p in params]
        criterion = torch.nn.CrossEntropyLoss()

        # Dropout would add noise to the estimate; restore the mode afterwards.
        was_training = self.prior_model.training
        self.prior_model.eval()
        try:
            for _ in tqdm(range(self.num_sample)):
                idx = np.random.randint(available)
                input_sample = self.data_samples[idx].unsqueeze(0).to(device)
                logits = self.prior_model(input_sample).logits
                target = logits.detach().argmax(dim=-1)
                loss = criterion(logits, target)
                if not trainable_params:
                    continue
                # First-order gradients only: no create_graph, which would keep a
                # second-order graph alive for every sample.
                gradients = torch.autograd.grad(loss, trainable_params, allow_unused=True)
                for (pos, _), grad in zip(trainable, gradients):
                    if grad is not None:  # parameter not used in the forward pass
                        fisher_accum[pos].add_(grad.detach() ** 2)
        finally:
            self.prior_model.train(was_training)

        self._fisher_tensors = [acc / self.num_sample for acc in fisher_accum]
        return [fisher.cpu().numpy() for fisher in self._fisher_tensors]

    def compute_penalty_loss(self, model):
        """Return the EWC penalty for ``model`` as a differentiable scalar tensor.

        The penalty is computed entirely in torch against the weight snapshot
        taken in ``__init__`` (not against ``prior_model``'s live parameters,
        which may be the very model being trained), so ``backward()`` propagates
        it into ``model``'s parameters.

        Args:
            model: module whose parameters line up, in order and shape, with
                those of ``prior_model``.

        Returns:
            0-dim tensor ``0.5 * lambda_ * sum(F * (theta - theta_prior) ** 2)``.
        """
        params = list(model.parameters())
        penalty = params[0].new_zeros(()) if params else torch.zeros(())
        for fisher, anchor, param in zip(self._fisher_tensors, self._prior_tensors, params):
            # .to() is a no-op when the cached tensors already live on param's device.
            fisher = fisher.to(param.device)
            anchor = anchor.to(param.device)
            penalty = penalty + (fisher * (param - anchor) ** 2).sum()
        return 0.5 * self.lambda_ * penalty

    def get_fisher(self):
        """Return the diagonal Fisher estimate as a list of NumPy arrays."""
        return self.fisher_matrix

In [5]:
def initialize_ewc(model, tokenizer, ewc_texts, lambda_=10):
    """Build an ``EWC`` regulariser for ``model`` from raw texts.

    The texts are tokenized, their ``input_ids`` are placed on the device the
    model's parameters live on, and every encoded text counts towards
    ``num_sample`` (one Fisher draw per text).

    Args:
        model: the model whose current weights become the EWC anchor.
        tokenizer: callable returning a mapping of tensors that contains
            ``'input_ids'`` when called with ``return_tensors='pt'``.
        ewc_texts: a string or an iterable of strings used for the Fisher estimate.
        lambda_: penalty strength forwarded to ``EWC``.

    Returns:
        The constructed ``EWC`` instance.

    Raises:
        ValueError: if ``ewc_texts`` is empty.
    """
    # Accept any iterable of strings (e.g. a pandas Series), not only lists.
    if not isinstance(ewc_texts, (str, list)):
        ewc_texts = list(ewc_texts)
    if len(ewc_texts) == 0:
        raise ValueError("ewc_texts must contain at least one text")

    # Follow the model rather than global CUDA availability, so the samples end
    # up where the forward pass will actually run.
    device = next(model.parameters()).device

    ewc_encodings = tokenizer(ewc_texts, truncation=True, padding=True, return_tensors='pt')
    ewc_encodings = {key: value.to(device) for key, value in ewc_encodings.items()}
    data_samples = ewc_encodings['input_ids']  # all encoded texts; only the ids are used

    # One Fisher draw per available sample.
    num_sample = data_samples.size(0)

    return EWC(prior_model=model, data_samples=data_samples, num_sample=num_sample, lambda_=lambda_)

In [21]:
def fine_tune_roberta_for_rumor_detection(model, tokenizer, train_loader, val_loader, model_save_path, ewc=None, epochs=20, learning_rate=1e-5, patience=3):
    """Fine-tune ``model`` with early stopping on the validation loss.

    Each epoch trains on every batch of ``train_loader`` and then evaluates on
    ``val_loader``. Training stops once the average validation loss has failed
    to improve for ``patience`` epochs. The weights from the epoch with the
    lowest validation loss are restored before returning, so the caller never
    receives the later, worse weights that triggered the stop.

    Args:
        model: module called as ``model(input_ids, attention_mask=..., labels=...)``
            whose output exposes ``.loss`` and ``.logits``.
        tokenizer: returned unchanged alongside the model.
        train_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        val_loader: iterable of ``(input_ids, attention_mask, labels)`` batches.
        model_save_path: accepted for interface compatibility; not used here.
        ewc: optional object with ``compute_penalty_loss(model)``; its result is
            added to every training loss.
        epochs: maximum number of epochs.
        learning_rate: AdamW learning rate.
        patience: number of non-improving epochs tolerated before stopping.

    Returns:
        ``(model, tokenizer)``.

    Raises:
        ValueError: if either loader yields no batches.
    """
    # torch's AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are set to that implementation's defaults to keep the same
    # optimisation behaviour.
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, eps=1e-6, weight_decay=0.0)

    best_val_loss = float('inf')
    best_state = None
    no_improvement_counter = 0

    for epoch in range(epochs):
        # ---- training pass ----
        model.train()
        train_loss_sum = 0.0
        train_batches = 0
        train_correct_predictions = 0
        train_total_samples = 0

        for input_ids, attention_mask, labels in train_loader:
            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

            if ewc is not None:
                # Out-of-place add: leaves outputs.loss untouched.
                loss = loss + ewc.compute_penalty_loss(model)

            loss.backward()
            optimizer.step()

            train_loss_sum += loss.item()
            train_batches += 1
            predictions = torch.argmax(outputs.logits, dim=1)
            train_correct_predictions += (predictions == labels).sum().item()
            train_total_samples += labels.size(0)

        if train_batches == 0:
            raise ValueError('train_loader produced no batches')

        # ---- validation pass ----
        model.eval()
        val_loss = 0.0
        val_batches = 0
        correct_predictions = 0
        total_samples = 0

        with torch.no_grad():
            for input_ids, attention_mask, labels in val_loader:
                outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
                val_loss += outputs.loss.item()
                val_batches += 1
                predictions = torch.argmax(outputs.logits, dim=1)
                correct_predictions += (predictions == labels).sum().item()
                total_samples += labels.size(0)

        if val_batches == 0:
            raise ValueError('val_loader produced no batches')

        # Report epoch averages (the training loss includes the EWC penalty).
        average_train_loss = train_loss_sum / train_batches
        average_val_loss = val_loss / val_batches
        accuracy = correct_predictions / total_samples if total_samples else 0.0
        train_accuracy = train_correct_predictions / train_total_samples if train_total_samples else 0.0

        print(f'Epoch {epoch + 1}/{epochs}, '
              f'Train Loss: {average_train_loss:.4f}, Train Accuracy: {train_accuracy:.4f}, '
              f'Val Loss: {average_val_loss:.4f}, Val Accuracy: {accuracy:.4f}')

        # ---- early stopping bookkeeping ----
        if average_val_loss < best_val_loss:
            best_val_loss = average_val_loss
            no_improvement_counter = 0
            # Keep a detached CPU copy of the best weights seen so far.
            best_state = {
                name: tensor.detach().to('cpu', copy=True)
                for name, tensor in model.state_dict().items()
            }
        else:
            no_improvement_counter += 1

        if no_improvement_counter >= patience:
            print(f'Early stopping after {epoch + 1} epochs without improvement.')
            break

    if best_state is not None:
        model.load_state_dict(best_state)

    return model, tokenizer

In [7]:
def train_model_with_ewc(train_texts, val_texts, train_labels, val_labels, ewc_texts, model_save_path='NFRD/nfrd-model', lambda_=10):
    """Continue training the model at ``model_save_path`` with an EWC penalty and push it.

    The tokenizer and a 2-label ``RobertaForSequenceClassification`` are loaded
    from ``model_save_path``, an EWC regulariser is built from ``ewc_texts`` on
    the freshly loaded weights, the model is fine-tuned on the training data
    with that regulariser, and the resulting model and tokenizer are pushed to
    the Hub under the same ``model_save_path``.

    Args:
        train_texts, val_texts: strings to train / validate on (any iterable of str).
        train_labels, val_labels: integer class labels aligned with the texts
            (list, NumPy array or pandas Series).
        ewc_texts: texts used to estimate the Fisher information for EWC.
        model_save_path: Hub repo id (or path) used both to load and to push.
        lambda_: EWC penalty strength.

    Returns:
        The fine-tuned model.
    """
    device = 'cuda' if torch.cuda.is_available() else 'cpu'

    tokenizer = AutoTokenizer.from_pretrained(model_save_path)
    model = RobertaForSequenceClassification.from_pretrained(model_save_path, num_labels=2)
    model.to(device)

    # Anchor EWC on the weights as loaded, before any fine-tuning step.
    ewc = initialize_ewc(model, tokenizer, ewc_texts, lambda_)

    # The tokenizer only accepts str / list inputs; materialise other iterables
    # (e.g. the pandas Series returned by train_test_split).
    train_texts, val_texts = (
        texts if isinstance(texts, (str, list)) else list(texts)
        for texts in (train_texts, val_texts)
    )

    # Tokenize and encode the training and validation sets
    train_encodings = tokenizer(train_texts, truncation=True, padding=True, return_tensors='pt')
    val_encodings = tokenizer(val_texts, truncation=True, padding=True, return_tensors='pt')

    # Move everything to the training device up front
    train_encodings = {key: value.to(device) for key, value in train_encodings.items()}
    val_encodings = {key: value.to(device) for key, value in val_encodings.items()}
    # Going through NumPy avoids positional-index errors on a shuffled pandas
    # Series; the explicit long dtype is what the classification loss expects.
    train_labels = torch.as_tensor(np.asarray(train_labels), dtype=torch.long).to(device)
    val_labels = torch.as_tensor(np.asarray(val_labels), dtype=torch.long).to(device)

    # Create PyTorch datasets
    train_dataset = TensorDataset(train_encodings['input_ids'], train_encodings['attention_mask'], train_labels)
    val_dataset = TensorDataset(val_encodings['input_ids'], val_encodings['attention_mask'], val_labels)

    # Create PyTorch data loaders
    batch_size = 16
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

    # Fine-tune with the EWC penalty; without ewc=ewc the regulariser built
    # above would never be applied.
    new_model, new_tokenizer = fine_tune_roberta_for_rumor_detection(
        model, tokenizer, train_loader, val_loader,
        model_save_path=model_save_path, ewc=ewc,
    )

    # Publish the model and tokenizer
    new_model.push_to_hub(model_save_path)
    new_tokenizer.push_to_hub(model_save_path)
    print('Model uploaded to the Hugging Face Model Hub')

    return new_model

In [None]:
# usage
# train_model_with_ewc(train_texts, val_texts, train_labels, val_labels, ewc_texts, model_save_path)