In [4]:
# Import necessary libraries
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, classification_report, confusion_matrix
import optuna
import re
import string
import nltk
from nltk.corpus import words

In [3]:
!pip install optuna

Collecting optuna
  Downloading optuna-4.2.1-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.15.2-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.2.1-py3-none-any.whl (383 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m383.6/383.6 kB[0m [31m9.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.15.2-py3-none-any.whl (231 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m231.9/231.9 kB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.9.0-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, alembic, optuna
Successfully installed alembic-1.15.2 colorlog-6.9.0 optuna-4.2.1


In [5]:
# Download NLTK resources
nltk.download('words')
nltk.download('punkt')


[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [6]:
device_to_use = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device_to_use}")


Using device: cpu


In [9]:
data_content = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/Dataset-MisinformationData - Sheet1.csv")

# Split into train, validation, and test sets
train_split, temp_split = train_test_split(data_content, test_size=0.2, random_state=42, shuffle=True)
val_split, test_split = train_test_split(temp_split, test_size=0.5, random_state=42, shuffle=True)

# Save the splits to CSV files
train_split.to_csv("/content/drive/MyDrive/Colab Notebooks/train.csv", index=False)
val_split.to_csv("/content/drive/MyDrive/Colab Notebooks/val.csv", index=False)
test_split.to_csv("/content/drive/MyDrive/Colab Notebooks/test.csv", index=False)

print(f"Train set size: {len(train_split)}")
print(f"Validation set size: {len(val_split)}")
print(f"Test set size: {len(test_split)}")


Train set size: 7774
Validation set size: 972
Test set size: 972


In [10]:
english_words = set(words.words())
english_words.update(["india", "corona", "birthday", "to", "you"])  # Ensure key words are recognized

def segment_words(text_segment):
    n = len(text_segment)
    dp = [None] * (n + 1)
    dp[0] = []

    for i in range(1, n + 1):
        for j in range(i):
            word = text_segment[j:i]
            if word in english_words and dp[j] is not None:
                dp[i] = dp[j] + [word]
                break

    return ' '.join(dp[-1]) if dp[-1] else text_segment  # Avoid breaking into characters

def process_hashtag(hashtag):
    return segment_words(hashtag.lower())

def clean_text(text):
    if not isinstance(text, str):
        return ""

    text = text.lower()
    text = re.sub(r'http\S+', 'URL', text)
    text = re.sub(r'@(\w+)', r'\1', text)
    text = re.sub(r'#(\w+)', lambda m: process_hashtag(m.group(1)), text)
    text = ''.join([c if c not in string.punctuation else ' ' for c in text])
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra spaces

    return text

# Apply preprocessing to the text column
train_split['cleaned_tweet'] = train_split['tweet'].apply(clean_text)
val_split['cleaned_tweet'] = val_split['tweet'].apply(clean_text)
test_split['cleaned_tweet'] = test_split['tweet'].apply(clean_text)

In [11]:
label_map = {"fake": 0, "real": 1}


In [12]:
# Task 3: Obtain representations using BERT-based model
class CovidNewsDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_seq_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_seq_length = max_seq_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])
        label = self.labels[idx]
            # Convert string label to integer
        if isinstance(label, str):
          label = label_map[label]

        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_seq_length,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long)

        }

In [13]:
def calculate_metrics(pred):
    true_vals = pred.label_ids
    predicted_vals = pred.predictions.argmax(-1)
    precision, recall, f1, _ = precision_recall_fscore_support(true_vals, predicted_vals, average='binary')
    acc = accuracy_score(true_vals, predicted_vals)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

In [None]:
# Define the list of model names to evaluate
bert_model_options = [
    "bert-base-uncased",
    #"digitalepidemiologylab/covid-twitter-bert",
    #"twitter/twhin-bert-base",
    #"allenai/scibert_scivocab_uncased"
]


In [None]:
# Dictionary to store results for all models
model_results = {}

# Loop through each model
for current_model in bert_model_options:
    print(f"\n\n===== Training and evaluating {current_model} =====")

    # Initialize tokenizer
    current_tokenizer = AutoTokenizer.from_pretrained(current_model)

    # Create datasets
    train_dataset = CovidNewsDataset(
        train_split['cleaned_tweet'].values,
        train_split['label'].values,
        current_tokenizer
    )

    val_dataset = CovidNewsDataset(
        val_split['cleaned_tweet'].values,
        val_split['label'].values,
        current_tokenizer
    )

    test_dataset = CovidNewsDataset(
        test_split['cleaned_tweet'].values,
        test_split['label'].values,
        current_tokenizer
    )

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Task 4: Training Classifiers with Hyperparameter Tuning
    # Define model initialization function
    def init_model():
        return AutoModelForSequenceClassification.from_pretrained(current_model, num_labels=2)

    # Define hyperparameter search space
    def hp_search_space(trial):
        return {
            "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
            "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
            "weight_decay": trial.suggest_float("weight_decay", 0.01, 0.1),
            "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 2)
        }

    # Initialize trainer for hyperparameter search
    tuning_config = TrainingArguments(
        output_dir=f"./tuning_results_{current_model.replace('/', '_')}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to='none'
    )

    hp_tuner = Trainer(
        model_init=init_model,
        args=tuning_config,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=calculate_metrics,
    )

    # Run hyperparameter search
    print("Starting hyperparameter tuning...")
    best_run = hp_tuner.hyperparameter_search(
        direction="maximize",
        backend="optuna",
        hp_space=hp_search_space,
        n_trials=1  # Reduced for faster execution, increase for better results
    )

    print(f"Best hyperparameters: {best_run.hyperparameters}")

    # Train with best hyperparameters
    optimal_params = best_run.hyperparameters
    training_config = TrainingArguments(
        output_dir=f"./best_model_results_{current_model.replace('/', '_')}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=optimal_params["learning_rate"],
        per_device_train_batch_size=optimal_params["per_device_train_batch_size"],
        per_device_eval_batch_size=optimal_params["per_device_train_batch_size"],
        num_train_epochs=optimal_params["num_train_epochs"],
        weight_decay=optimal_params["weight_decay"],
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to='none'
    )

    optimal_model = AutoModelForSequenceClassification.from_pretrained(current_model, num_labels=2).to(device_to_use)
    model_trainer = Trainer(
        model=optimal_model,
        args=training_config,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=calculate_metrics,
    )

    print("Training with best hyperparameters...")
    model_trainer.train()

    # Task 5: Evaluating Models
    def evaluate_classifier(model, test_loader, device):
        model.eval()
        all_predictions = []
        all_labels = []

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits

                preds = torch.argmax(logits, dim=1).cpu().numpy()
                true = labels.cpu().numpy()

                all_predictions.extend(preds)
                all_labels.extend(true)

        # Calculate metrics
        print("\nTest Set Evaluation:")
        print(classification_report(all_labels, all_predictions, target_names=["Fake", "Real"]))

        # Create confusion matrix
        cm = confusion_matrix(all_labels, all_predictions)
        print("Confusion Matrix:")
        print(cm)

        # Calculate accuracy
        accuracy = accuracy_score(all_labels, all_predictions)
        print(f"Test Accuracy: {accuracy:.4f}")

        # Calculate precision, recall, f1 for both micro and macro
        precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(all_labels, all_predictions, average='micro')
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(all_labels, all_predictions, average='macro')

        eval_results = {
            'predictions': all_predictions,
            'true_labels': all_labels,
            'accuracy': accuracy,
            'f1_micro': f1_micro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            'f1_macro': f1_macro,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'best_hyperparameters': optimal_params
        }

        return eval_results

    # Evaluate the best model
    print("Evaluating model on test set...")
    evaluation_results = evaluate_classifier(optimal_model, test_loader, device_to_use)

    # Save the fine-tuned model
    model_save_path = f"./fine_tuned_{current_model.replace('/', '_')}"
    optimal_model.save_pretrained(model_save_path)
    current_tokenizer.save_pretrained(model_save_path)
    print(f"Model saved to {model_save_path}")

    # Store results for this model
    model_results[current_model] = evaluation_results

# Generate a summary report for all models
print("\n\n===== MODEL TRAINING SUMMARY =====")
for model_name, results in model_results.items():
    print(f"\nModel: {model_name}")
    print(f"Best Hyperparameters: {results['best_hyperparameters']}")
    print("Test Set Performance:")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"F1 Score (Micro): {results['f1_micro']:.4f}")
    print(f"Precision (Micro): {results['precision_micro']:.4f}")
    print(f"Recall (Micro): {results['recall_micro']:.4f}")
    print(f"F1 Score (Macro): {results['f1_macro']:.4f}")
    print(f"Precision (Macro): {results['precision_macro']:.4f}")
    print(f"Recall (Macro): {results['recall_macro']:.4f}")



===== Training and evaluating bert-base-uncased =====


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-07 21:12:53,225] A new study created in memory with name: no-name-645e04e3-286e-4b38-97c3-b89d430f731b
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting hyperparameter tuning...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2908,0.12023,0.967078,0.967611,0.971545,0.96371
2,0.0765,0.138387,0.970165,0.970677,0.973631,0.967742


[I 2025-04-07 21:20:15,250] Trial 0 finished with value: 3.8822148281520596 and parameters: {'learning_rate': 2.80944491312171e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.014356938544736958, 'num_train_epochs': 2}. Best is trial 0 with value: 3.8822148281520596.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Best hyperparameters: {'learning_rate': 2.80944491312171e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.014356938544736958, 'num_train_epochs': 2}
Training with best hyperparameters...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.2867,0.130314,0.966049,0.966901,0.962076,0.971774
2,0.0796,0.148568,0.970165,0.970854,0.967936,0.97379


Evaluating model on test set...

Test Set Evaluation:
              precision    recall  f1-score   support

        Fake       0.98      0.95      0.96       474
        Real       0.95      0.98      0.97       498

    accuracy                           0.97       972
   macro avg       0.97      0.97      0.97       972
weighted avg       0.97      0.97      0.97       972

Confusion Matrix:
[[449  25]
 [  8 490]]
Test Accuracy: 0.9660
Model saved to ./fine_tuned_bert-base-uncased


===== MODEL TRAINING SUMMARY =====

Model: bert-base-uncased
Best Hyperparameters: {'learning_rate': 2.80944491312171e-05, 'per_device_train_batch_size': 8, 'weight_decay': 0.014356938544736958, 'num_train_epochs': 2}
Test Set Performance:
Accuracy: 0.9660
F1 Score (Micro): 0.9660
Precision (Micro): 0.9660
Recall (Micro): 0.9660
F1 Score (Macro): 0.9660
Precision (Macro): 0.9670
Recall (Macro): 0.9656


In [11]:
# Define the list of model names to evaluate
bert_model_options = [
    #"bert-base-uncased",
    "digitalepidemiologylab/covid-twitter-bert",
    #"twitter/twhin-bert-base",
    #"allenai/scibert_scivocab_uncased"
]


In [12]:
# Dictionary to store results for all models
model_results = {}

# Loop through each model
for current_model in bert_model_options:
    print(f"\n\n===== Training and evaluating {current_model} =====")

    # Initialize tokenizer
    current_tokenizer = AutoTokenizer.from_pretrained(current_model)

    # Create datasets
    train_dataset = CovidNewsDataset(
        train_split['cleaned_tweet'].values,
        train_split['label'].values,
        current_tokenizer
    )

    val_dataset = CovidNewsDataset(
        val_split['cleaned_tweet'].values,
        val_split['label'].values,
        current_tokenizer
    )

    test_dataset = CovidNewsDataset(
        test_split['cleaned_tweet'].values,
        test_split['label'].values,
        current_tokenizer
    )

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Task 4: Training Classifiers with Hyperparameter Tuning
    # Define model initialization function
    def init_model():
        return AutoModelForSequenceClassification.from_pretrained(current_model, num_labels=2)

    # Define hyperparameter search space
    def hp_search_space(trial):
        return {
            "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
            "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
            "weight_decay": trial.suggest_float("weight_decay", 0.01, 0.1),
            "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 2)
        }

    # Initialize trainer for hyperparameter search
    tuning_config = TrainingArguments(
        output_dir=f"./tuning_results_{current_model.replace('/', '_')}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to='none'
    )

    hp_tuner = Trainer(
        model_init=init_model,
        args=tuning_config,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=calculate_metrics,
    )

    # Run hyperparameter search
    print("Starting hyperparameter tuning...")
    best_run = hp_tuner.hyperparameter_search(
        direction="maximize",
        backend="optuna",
        hp_space=hp_search_space,
        n_trials=1  # Reduced for faster execution, increase for better results
    )

    print(f"Best hyperparameters: {best_run.hyperparameters}")

    # Train with best hyperparameters
    optimal_params = best_run.hyperparameters
    training_config = TrainingArguments(
        output_dir=f"./best_model_results_{current_model.replace('/', '_')}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=optimal_params["learning_rate"],
        per_device_train_batch_size=optimal_params["per_device_train_batch_size"],
        per_device_eval_batch_size=optimal_params["per_device_train_batch_size"],
        num_train_epochs=optimal_params["num_train_epochs"],
        weight_decay=optimal_params["weight_decay"],
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to='none'
    )

    optimal_model = AutoModelForSequenceClassification.from_pretrained(current_model, num_labels=2).to(device_to_use)
    model_trainer = Trainer(
        model=optimal_model,
        args=training_config,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=calculate_metrics,
    )

    print("Training with best hyperparameters...")
    model_trainer.train()

    # Task 5: Evaluating Models
    def evaluate_classifier(model, test_loader, device):
        model.eval()
        all_predictions = []
        all_labels = []

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits

                preds = torch.argmax(logits, dim=1).cpu().numpy()
                true = labels.cpu().numpy()

                all_predictions.extend(preds)
                all_labels.extend(true)

        # Calculate metrics
        print("\nTest Set Evaluation:")
        print(classification_report(all_labels, all_predictions, target_names=["Fake", "Real"]))

        # Create confusion matrix
        cm = confusion_matrix(all_labels, all_predictions)
        print("Confusion Matrix:")
        print(cm)

        # Calculate accuracy
        accuracy = accuracy_score(all_labels, all_predictions)
        print(f"Test Accuracy: {accuracy:.4f}")

        # Calculate precision, recall, f1 for both micro and macro
        precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(all_labels, all_predictions, average='micro')
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(all_labels, all_predictions, average='macro')

        eval_results = {
            'predictions': all_predictions,
            'true_labels': all_labels,
            'accuracy': accuracy,
            'f1_micro': f1_micro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            'f1_macro': f1_macro,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'best_hyperparameters': optimal_params
        }

        return eval_results

    # Evaluate the best model
    print("Evaluating model on test set...")
    evaluation_results = evaluate_classifier(optimal_model, test_loader, device_to_use)

    # Save the fine-tuned model
    model_save_path = f"./fine_tuned_{current_model.replace('/', '_')}"
    optimal_model.save_pretrained(model_save_path)
    current_tokenizer.save_pretrained(model_save_path)
    print(f"Model saved to {model_save_path}")

    # Store results for this model
    model_results[current_model] = evaluation_results

# Generate a summary report for all models
print("\n\n===== MODEL TRAINING SUMMARY =====")
for model_name, results in model_results.items():
    print(f"\nModel: {model_name}")
    print(f"Best Hyperparameters: {results['best_hyperparameters']}")
    print("Test Set Performance:")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"F1 Score (Micro): {results['f1_micro']:.4f}")
    print(f"Precision (Micro): {results['precision_micro']:.4f}")
    print(f"Recall (Micro): {results['recall_micro']:.4f}")
    print(f"F1 Score (Macro): {results['f1_macro']:.4f}")
    print(f"Precision (Macro): {results['precision_macro']:.4f}")
    print(f"Recall (Macro): {results['recall_macro']:.4f}")



===== Training and evaluating digitalepidemiologylab/covid-twitter-bert =====


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/421 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/1.35G [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-08 14:34:16,905] A new study created in memory with name: no-name-fcb3a071-4160-4ff6-bda6-603670e1042d


Starting hyperparameter tuning...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.119636,0.966049,0.9667,0.967677,0.965726


[I 2025-04-08 14:44:41,326] Trial 0 finished with value: 3.866152259568951 and parameters: {'learning_rate': 3.903632666192493e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.04795722402711263, 'num_train_epochs': 1}. Best is trial 0 with value: 3.866152259568951.


Best hyperparameters: {'learning_rate': 3.903632666192493e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.04795722402711263, 'num_train_epochs': 1}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at digitalepidemiologylab/covid-twitter-bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with best hyperparameters...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.098148,0.975309,0.975806,0.975806,0.975806


Evaluating model on test set...

Test Set Evaluation:
              precision    recall  f1-score   support

        Fake       0.98      0.96      0.97       474
        Real       0.97      0.98      0.97       498

    accuracy                           0.97       972
   macro avg       0.97      0.97      0.97       972
weighted avg       0.97      0.97      0.97       972

Confusion Matrix:
[[457  17]
 [ 10 488]]
Test Accuracy: 0.9722
Model saved to ./fine_tuned_digitalepidemiologylab_covid-twitter-bert


===== MODEL TRAINING SUMMARY =====

Model: digitalepidemiologylab/covid-twitter-bert
Best Hyperparameters: {'learning_rate': 3.903632666192493e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.04795722402711263, 'num_train_epochs': 1}
Test Set Performance:
Accuracy: 0.9722
F1 Score (Micro): 0.9722
Precision (Micro): 0.9722
Recall (Micro): 0.9722
F1 Score (Macro): 0.9722
Precision (Macro): 0.9725
Recall (Macro): 0.9720


In [13]:
# Define the list of model names to evaluate
bert_model_options = [
    #"bert-base-uncased",
    #"digitalepidemiologylab/covid-twitter-bert",
    "twitter/twhin-bert-base",
    #"allenai/scibert_scivocab_uncased"
]


In [14]:
# Dictionary to store results for all models
model_results = {}

# Loop through each model
for current_model in bert_model_options:
    print(f"\n\n===== Training and evaluating {current_model} =====")

    # Initialize tokenizer
    current_tokenizer = AutoTokenizer.from_pretrained(current_model)

    # Create datasets
    train_dataset = CovidNewsDataset(
        train_split['cleaned_tweet'].values,
        train_split['label'].values,
        current_tokenizer
    )

    val_dataset = CovidNewsDataset(
        val_split['cleaned_tweet'].values,
        val_split['label'].values,
        current_tokenizer
    )

    test_dataset = CovidNewsDataset(
        test_split['cleaned_tweet'].values,
        test_split['label'].values,
        current_tokenizer
    )

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Task 4: Training Classifiers with Hyperparameter Tuning
    # Define model initialization function
    def init_model():
        return AutoModelForSequenceClassification.from_pretrained(current_model, num_labels=2)

    # Define hyperparameter search space
    def hp_search_space(trial):
        return {
            "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
            "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
            "weight_decay": trial.suggest_float("weight_decay", 0.01, 0.1),
            "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 2)
        }

    # Initialize trainer for hyperparameter search
    tuning_config = TrainingArguments(
        output_dir=f"./tuning_results_{current_model.replace('/', '_')}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to='none'
    )

    hp_tuner = Trainer(
        model_init=init_model,
        args=tuning_config,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=calculate_metrics,
    )

    # Run hyperparameter search
    print("Starting hyperparameter tuning...")
    best_run = hp_tuner.hyperparameter_search(
        direction="maximize",
        backend="optuna",
        hp_space=hp_search_space,
        n_trials=1  # Reduced for faster execution, increase for better results
    )

    print(f"Best hyperparameters: {best_run.hyperparameters}")

    # Train with best hyperparameters
    optimal_params = best_run.hyperparameters
    training_config = TrainingArguments(
        output_dir=f"./best_model_results_{current_model.replace('/', '_')}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=optimal_params["learning_rate"],
        per_device_train_batch_size=optimal_params["per_device_train_batch_size"],
        per_device_eval_batch_size=optimal_params["per_device_train_batch_size"],
        num_train_epochs=optimal_params["num_train_epochs"],
        weight_decay=optimal_params["weight_decay"],
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to='none'
    )

    optimal_model = AutoModelForSequenceClassification.from_pretrained(current_model, num_labels=2).to(device_to_use)
    model_trainer = Trainer(
        model=optimal_model,
        args=training_config,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=calculate_metrics,
    )

    print("Training with best hyperparameters...")
    model_trainer.train()

    # Task 5: Evaluating Models
    def evaluate_classifier(model, test_loader, device):
        model.eval()
        all_predictions = []
        all_labels = []

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits

                preds = torch.argmax(logits, dim=1).cpu().numpy()
                true = labels.cpu().numpy()

                all_predictions.extend(preds)
                all_labels.extend(true)

        # Calculate metrics
        print("\nTest Set Evaluation:")
        print(classification_report(all_labels, all_predictions, target_names=["Fake", "Real"]))

        # Create confusion matrix
        cm = confusion_matrix(all_labels, all_predictions)
        print("Confusion Matrix:")
        print(cm)

        # Calculate accuracy
        accuracy = accuracy_score(all_labels, all_predictions)
        print(f"Test Accuracy: {accuracy:.4f}")

        # Calculate precision, recall, f1 for both micro and macro
        precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(all_labels, all_predictions, average='micro')
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(all_labels, all_predictions, average='macro')

        eval_results = {
            'predictions': all_predictions,
            'true_labels': all_labels,
            'accuracy': accuracy,
            'f1_micro': f1_micro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            'f1_macro': f1_macro,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'best_hyperparameters': optimal_params
        }

        return eval_results

    # Evaluate the best model
    print("Evaluating model on test set...")
    evaluation_results = evaluate_classifier(optimal_model, test_loader, device_to_use)

    # Save the fine-tuned model
    model_save_path = f"./fine_tuned_{current_model.replace('/', '_')}"
    optimal_model.save_pretrained(model_save_path)
    current_tokenizer.save_pretrained(model_save_path)
    print(f"Model saved to {model_save_path}")

    # Store results for this model
    model_results[current_model] = evaluation_results

# Generate a summary report for all models
print("\n\n===== MODEL TRAINING SUMMARY =====")
for model_name, results in model_results.items():
    print(f"\nModel: {model_name}")
    print(f"Best Hyperparameters: {results['best_hyperparameters']}")
    print("Test Set Performance:")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"F1 Score (Micro): {results['f1_micro']:.4f}")
    print(f"Precision (Micro): {results['precision_micro']:.4f}")
    print(f"Recall (Micro): {results['recall_micro']:.4f}")
    print(f"F1 Score (Macro): {results['f1_macro']:.4f}")
    print(f"Precision (Macro): {results['precision_macro']:.4f}")
    print(f"Recall (Macro): {results['recall_macro']:.4f}")



===== Training and evaluating twitter/twhin-bert-base =====


tokenizer_config.json:   0%|          | 0.00/372 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/632 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at twitter/twhin-bert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-08 15:08:15,117] A new study created in memory with name: no-name-ebdd389b-62bd-4df2-b367-bf5af2094f17


Starting hyperparameter tuning...


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at twitter/twhin-bert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.158177,0.965021,0.965657,0.967611,0.96371
2,0.208900,0.163326,0.968107,0.968592,0.973523,0.96371


[I 2025-04-08 15:17:14,481] Trial 0 finished with value: 3.873931786888671 and parameters: {'learning_rate': 2.4074414455643288e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.047820737349219086, 'num_train_epochs': 2}. Best is trial 0 with value: 3.873931786888671.


Best hyperparameters: {'learning_rate': 2.4074414455643288e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.047820737349219086, 'num_train_epochs': 2}


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at twitter/twhin-bert-base and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with best hyperparameters...


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.159095,0.957819,0.958035,0.972973,0.943548
2,0.205500,0.145451,0.969136,0.969574,0.97551,0.96371


Evaluating model on test set...

Test Set Evaluation:
              precision    recall  f1-score   support

        Fake       0.97      0.97      0.97       474
        Real       0.97      0.98      0.97       498

    accuracy                           0.97       972
   macro avg       0.97      0.97      0.97       972
weighted avg       0.97      0.97      0.97       972

Confusion Matrix:
[[458  16]
 [ 12 486]]
Test Accuracy: 0.9712
Model saved to ./fine_tuned_twitter_twhin-bert-base


===== MODEL TRAINING SUMMARY =====

Model: twitter/twhin-bert-base
Best Hyperparameters: {'learning_rate': 2.4074414455643288e-05, 'per_device_train_batch_size': 16, 'weight_decay': 0.047820737349219086, 'num_train_epochs': 2}
Test Set Performance:
Accuracy: 0.9712
F1 Score (Micro): 0.9712
Precision (Micro): 0.9712
Recall (Micro): 0.9712
F1 Score (Macro): 0.9712
Precision (Macro): 0.9713
Recall (Macro): 0.9711


In [16]:
# Define the list of model names to evaluate
bert_model_options = [
    #"bert-base-uncased",
    #"digitalepidemiologylab/covid-twitter-bert",
    #"twitter/twhin-bert-base",
    "sarkerlab/SocBERT-base"
]

In [None]:
# Dictionary to store results for all models
model_results = {}

# Loop through each model
for current_model in bert_model_options:
    print(f"\n\n===== Training and evaluating {current_model} =====")

    # Initialize tokenizer
    current_tokenizer = AutoTokenizer.from_pretrained(current_model)

    # Create datasets
    train_dataset = CovidNewsDataset(
        train_split['cleaned_tweet'].values,
        train_split['label'].values,
        current_tokenizer
    )

    val_dataset = CovidNewsDataset(
        val_split['cleaned_tweet'].values,
        val_split['label'].values,
        current_tokenizer
    )

    test_dataset = CovidNewsDataset(
        test_split['cleaned_tweet'].values,
        test_split['label'].values,
        current_tokenizer
    )

    # Create data loaders
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)

    # Task 4: Training Classifiers with Hyperparameter Tuning
    # Define model initialization function
    def init_model():
        return AutoModelForSequenceClassification.from_pretrained(current_model, num_labels=2)

    # Define hyperparameter search space
    def hp_search_space(trial):
        return {
            "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
            "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
            "weight_decay": trial.suggest_float("weight_decay", 0.01, 0.1),
            "num_train_epochs": trial.suggest_int("num_train_epochs", 1, 2)
        }

    # Initialize trainer for hyperparameter search
    tuning_config = TrainingArguments(
        output_dir=f"./tuning_results_{current_model.replace('/', '_')}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        num_train_epochs=1,
        weight_decay=0.01,
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to='none'
    )

    hp_tuner = Trainer(
        model_init=init_model,
        args=tuning_config,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=calculate_metrics,
    )

    # Run hyperparameter search
    print("Starting hyperparameter tuning...")
    best_run = hp_tuner.hyperparameter_search(
        direction="maximize",
        backend="optuna",
        hp_space=hp_search_space,
        n_trials=1  # Reduced for faster execution, increase for better results
    )

    print(f"Best hyperparameters: {best_run.hyperparameters}")

    # Train with best hyperparameters
    optimal_params = best_run.hyperparameters
    training_config = TrainingArguments(
        output_dir=f"./best_model_results_{current_model.replace('/', '_')}",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=optimal_params["learning_rate"],
        per_device_train_batch_size=optimal_params["per_device_train_batch_size"],
        per_device_eval_batch_size=optimal_params["per_device_train_batch_size"],
        num_train_epochs=optimal_params["num_train_epochs"],
        weight_decay=optimal_params["weight_decay"],
        load_best_model_at_end=True,
        metric_for_best_model="accuracy",
        report_to='none'
    )

    optimal_model = AutoModelForSequenceClassification.from_pretrained(current_model, num_labels=2).to(device_to_use)
    model_trainer = Trainer(
        model=optimal_model,
        args=training_config,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=calculate_metrics,
    )

    print("Training with best hyperparameters...")
    model_trainer.train()

    # Task 5: Evaluating Models
    def evaluate_classifier(model, test_loader, device):
        model.eval()
        all_predictions = []
        all_labels = []

        with torch.no_grad():
            for batch in test_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                labels = batch['labels'].to(device)

                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                logits = outputs.logits

                preds = torch.argmax(logits, dim=1).cpu().numpy()
                true = labels.cpu().numpy()

                all_predictions.extend(preds)
                all_labels.extend(true)

        # Calculate metrics
        print("\nTest Set Evaluation:")
        print(classification_report(all_labels, all_predictions, target_names=["Fake", "Real"]))

        # Create confusion matrix
        cm = confusion_matrix(all_labels, all_predictions)
        print("Confusion Matrix:")
        print(cm)

        # Calculate accuracy
        accuracy = accuracy_score(all_labels, all_predictions)
        print(f"Test Accuracy: {accuracy:.4f}")

        # Calculate precision, recall, f1 for both micro and macro
        precision_micro, recall_micro, f1_micro, _ = precision_recall_fscore_support(all_labels, all_predictions, average='micro')
        precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(all_labels, all_predictions, average='macro')

        eval_results = {
            'predictions': all_predictions,
            'true_labels': all_labels,
            'accuracy': accuracy,
            'f1_micro': f1_micro,
            'precision_micro': precision_micro,
            'recall_micro': recall_micro,
            'f1_macro': f1_macro,
            'precision_macro': precision_macro,
            'recall_macro': recall_macro,
            'best_hyperparameters': optimal_params
        }

        return eval_results

    # Evaluate the best model
    print("Evaluating model on test set...")
    evaluation_results = evaluate_classifier(optimal_model, test_loader, device_to_use)

    # Save the fine-tuned model
    model_save_path = f"./fine_tuned_{current_model.replace('/', '_')}"
    optimal_model.save_pretrained(model_save_path)
    current_tokenizer.save_pretrained(model_save_path)
    print(f"Model saved to {model_save_path}")

    # Store results for this model
    model_results[current_model] = evaluation_results

# Generate a summary report for all models
print("\n\n===== MODEL TRAINING SUMMARY =====")
for model_name, results in model_results.items():
    print(f"\nModel: {model_name}")
    print(f"Best Hyperparameters: {results['best_hyperparameters']}")
    print("Test Set Performance:")
    print(f"Accuracy: {results['accuracy']:.4f}")
    print(f"F1 Score (Micro): {results['f1_micro']:.4f}")
    print(f"Precision (Micro): {results['precision_micro']:.4f}")
    print(f"Recall (Micro): {results['recall_micro']:.4f}")
    print(f"F1 Score (Macro): {results['f1_macro']:.4f}")
    print(f"Precision (Macro): {results['precision_macro']:.4f}")
    print(f"Recall (Macro): {results['recall_macro']:.4f}")



===== Training and evaluating sarkerlab/SocBERT-base =====


tokenizer_config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.24M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/735k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.13M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/766 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/572M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/572M [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sarkerlab/SocBERT-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-04-08 16:12:26,260] A new study created in memory with name: no-name-8da41245-4b0e-428a-914f-fc1b8c1faec3


Starting hyperparameter tuning...


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sarkerlab/SocBERT-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,No log,0.144955,0.943416,0.944945,0.93837,0.951613


[I 2025-04-08 18:16:54,220] Trial 0 finished with value: 3.778343267342961 and parameters: {'learning_rate': 1.8602739617378416e-05, 'per_device_train_batch_size': 32, 'weight_decay': 0.05146221231744363, 'num_train_epochs': 1}. Best is trial 0 with value: 3.778343267342961.


Best hyperparameters: {'learning_rate': 1.8602739617378416e-05, 'per_device_train_batch_size': 32, 'weight_decay': 0.05146221231744363, 'num_train_epochs': 1}


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at sarkerlab/SocBERT-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Training with best hyperparameters...


Epoch,Training Loss,Validation Loss
