## **NLP - Text Classification Project**
Group H - August 2025

Classification of tweets from Twitter that have been manually tagged for sentiment analysis.

In [None]:
# # Needed for Google Colab
# !pip install --quiet evaluate transformers optuna datasets nltk scikit-learn
# !pip install numpy==1.26.4

In [None]:
%env CUDA_LAUNCH_BLOCKING=1

from wordcloud import WordCloud, STOPWORDS
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix, cohen_kappa_score
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset
from torch.utils.data import DataLoader, TensorDataset
from optuna.pruners import MedianPruner
from scipy.stats import pearsonr
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import torch.nn.functional as F
import os
import re
import string
import time
import glob
import nltk
import evaluate
import transformers
import torch
import optuna
import wandb
wandb.login()
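# Never paste the W&B API key into the notebook: wandb.login() prompts for it or reads the WANDB_API_KEY environment variable. A key that was previously committed on this line should be revoked.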

# Number of training rows in the stratified subset used for the Optuna search;
# the value is also embedded in the W&B project name, study files and model paths.
num_train_samples = 5000

# W&B reads its settings from the environment.
os.environ.update({
    "WANDB_PROJECT": f"tweet-sentiment-classification_split_to_test_maxl_256_{num_train_samples}_samples_optuna",
    "WANDB_INIT_TIMEOUT": "180",
})

# NLTK resources, fetched once up front.
for nltk_resource in ('stopwords', 'wordnet', 'omw-1.4'):
    nltk.download(nltk_resource)

In [None]:
# Load data: both CSV files are read from Google Drive with latin1 decoding.
df_train, df_test = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_train.csv',
        '/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_test.csv',
    )
)

In [None]:
# Pool the two CSV splits into a single frame and shuffle the rows with a fixed
# seed, so the stratified splits below are drawn from the combined data.
df_full = (
    pd.concat([df_train, df_test], ignore_index=True)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

### **Pre-processing the Data**

Three pre-processing strategies were considered:<br>
1) No pre-processing – using tweets in their original form.<br>
2) Minimal pre-processing – removing extra spaces.<br>
3) Full pre-processing – lowercasing text, removing URLs, mentions, hashtags, special characters, and extra whitespace.

The no pre-processing approach was selected, as it achieved the highest performance.

In [None]:
# Variant 1 (selected): feed the tweets to the models exactly as written.
# `is_preprocessed` is a tag that ends up in run names, study names and model paths.
is_preprocessed = "no_preprocess"
df_full = df_full.assign(clean_text=df_full["OriginalTweet"])

In [None]:
# # Try minimal pre-processing
# def light_preprocess(text):
#     return text.strip()                             # Remove unnecessary spaces

# is_preprocessed = "minimal_preprocess"
# df_full["clean_text"] = df_full["OriginalTweet"].apply(light_preprocess)

In [None]:
# # Try pre-processing
# def clean_text(text):
#     text = str(text).lower()
#     text = re.sub(r"http\S+|www\S+|https\S+", '', text, flags=re.MULTILINE)  # URLs
#     text = re.sub(r'\@\w+|\#','', text)  # Mentions & hashtags
#     text = re.sub(r'\n', ' ', text)  # Line breaks
#     text = re.sub(r"[^a-zA-Z']", ' ', text)  # Keep letters only
#     text = re.sub(r'\s+', ' ', text).strip()  # Extra whitespace
#     return text

# is_preprocessed = "w_preprocess"
# df_full["clean_text"] = df_full["OriginalTweet"].apply(clean_text)

**Encode Sentiment Labels**

Map each unique sentiment label to a numeric ID for model compatibility, and apply this mapping to the dataset.

In [None]:
# Build the sentiment-name -> integer-id table and add the encoded column.
# The names are sorted as strings, so the ids follow alphabetical order,
# not sentiment intensity.
unique_labels = sorted(df_full["Sentiment"].unique())
label2id = dict(zip(unique_labels, range(len(unique_labels))))
df_full["label"] = df_full["Sentiment"].map(label2id)

#### Stratified Data Splitting
The dataset is split into train (70%), validation (15%), and test (15%) sets using stratified sampling to preserve the original label distribution across all subsets.

In [None]:
# Two-stage stratified split: 70% train, 15% val, 15% test.
# Stage 1: hold out 15% of all rows as the test set.
train_val_df, test_df = train_test_split(
    df_full,
    test_size=0.15,
    stratify=df_full["label"],
    random_state=42,
)
# Stage 2: 0.1765 of the remaining 85% is roughly 15% of all rows -> validation.
train_df, val_df = train_test_split(
    train_val_df,
    test_size=0.1765,
    stratify=train_val_df["label"],
    random_state=42,
)

# Confirm sizes
for size_caption, split_frame in (
    ("Train size:", train_df),
    ("Val size:", val_df),
    ("Test size:", test_df),
):
    print(size_caption, len(split_frame))

**Use Small Subsets for Quick Evaluation**

Select shuffled samples from each training and validation dataset for both BERT and RoBERTa. This allows faster experimentation during model development.

In [None]:
# Draw label-stratified subsets for the hyperparameter search: `num_train_samples`
# training rows and 500 validation rows. Only the part selected by `train_size`
# is kept; the remainder of each split is discarded.
train_subset_df = train_test_split(
    train_df[["clean_text", "label"]],
    train_size=num_train_samples,
    stratify=train_df["label"],
    random_state=42,
)[0]

val_subset_df = train_test_split(
    val_df[["clean_text", "label"]],
    train_size=500,
    stratify=val_df["label"],
    random_state=42,
)[0]

## Fine-Tuning Pretrained Language Models

Fine-tuning two pretrained transformer-based models from the Hugging Face library — BERT and RoBERTa — on our sentiment classification task. These models will be trained using the Hugging Face API. Model performance will be monitored and tuned using hyperparameter optimization (Optuna) and experiment tracking (Weights & Biases).

**Load Pretrained Models**

Initialize tokenizers and models for BERT and RoBERTa, both widely used transformer architectures for text classification.

In [None]:
# Checkpoint identifiers of the two pretrained models.
bert_model_name, roberta_model_name = "bert-base-uncased", "roberta-base"

# The classification head gets one output per distinct sentiment value in the data.
sentiment_labels = df_full['Sentiment'].unique()
n_labels = len(sentiment_labels)

# Tokenizers first ...
bert_tokenizer = AutoTokenizer.from_pretrained(bert_model_name)
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)

# ... then the sequence-classification models with an `n_labels`-way head.
bert_model = AutoModelForSequenceClassification.from_pretrained(
    bert_model_name, num_labels=n_labels
)
roberta_model = AutoModelForSequenceClassification.from_pretrained(
    roberta_model_name, num_labels=n_labels
)

**Tokenization**

Custom tokenization functions are defined for each model. The dataset splits are converted to Hugging Face Dataset objects and tokenized separately for BERT and RoBERTa.

In [None]:
# Tokenize function
def tokenize_function_bert(examples):
    """Tokenize the ``clean_text`` field of a batch with the BERT tokenizer.

    Sequences are truncated to, and padded up to, a fixed length of 256 tokens.
    """
    encoding_options = {"truncation": True, "padding": 'max_length', "max_length": 256}
    batch_texts = examples["clean_text"]
    return bert_tokenizer(batch_texts, **encoding_options)

def tokenize_function_roberta(examples):
    """Tokenize the ``clean_text`` field of a batch with the RoBERTa tokenizer.

    Sequences are truncated to, and padded up to, a fixed length of 256 tokens.
    """
    encoding_options = {"truncation": True, "padding": 'max_length', "max_length": 256}
    batch_texts = examples["clean_text"]
    return roberta_tokenizer(batch_texts, **encoding_options)

In [None]:
# Wrap every pandas split in a Hugging Face Dataset.
# The search subsets were already reduced to the clean_text / label columns.
hf_subset_train, hf_subset_val = (
    Dataset.from_pandas(subset_frame)
    for subset_frame in (train_subset_df, val_subset_df)
)

# The full splits still carry every CSV column, so select the two needed here.
hf_train, hf_val, hf_test = (
    Dataset.from_pandas(split_frame[["clean_text", "label"]])
    for split_frame in (train_df, val_df, test_df)
)

In [None]:
# Tokenize subsets
def _tokenize_subset(dataset, tokenize_fn):
    """Apply ``tokenize_fn`` in batches and expose the model columns as torch tensors."""
    encoded = dataset.map(tokenize_fn, batched=True)
    encoded.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    return encoded

# Tokenize for BERT
tokenized_bert_train_sub = _tokenize_subset(hf_subset_train, tokenize_function_bert)
tokenized_bert_val_sub = _tokenize_subset(hf_subset_val, tokenize_function_bert)

# Tokenize for RoBERTa
tokenized_roberta_train_sub = _tokenize_subset(hf_subset_train, tokenize_function_roberta)
tokenized_roberta_val_sub = _tokenize_subset(hf_subset_val, tokenize_function_roberta)

In [None]:
# Tokenize full dataset
full_splits = (hf_train, hf_val, hf_test)

# Tokenize for BERT
tokenized_bert_train, tokenized_bert_val, tokenized_bert_test = (
    full_split.map(tokenize_function_bert, batched=True) for full_split in full_splits
)

# Tokenize for RoBERTa
tokenized_roberta_train, tokenized_roberta_val, tokenized_roberta_test = (
    full_split.map(tokenize_function_roberta, batched=True) for full_split in full_splits
)

# Expose only the columns the models consume, as torch tensors.
for tokenized_split in (
    tokenized_bert_train,
    tokenized_bert_val,
    tokenized_bert_test,
    tokenized_roberta_train,
    tokenized_roberta_val,
    tokenized_roberta_test,
):
    tokenized_split.set_format("torch", columns=["input_ids", "attention_mask", "label"])

**Metric Computation**

The evaluation metrics are accuracy and macro-averaged F1, precision, and recall. The `compute_metrics` function converts model logits into predicted labels and calculates all four metrics; the macro averages weight every class equally, providing a balanced performance measure across all classes.

In [None]:
# Load the four `evaluate` metrics used by compute_metrics.
accuracy_metric, f1_metric, precision_metric, recall_metric = (
    evaluate.load(metric_id)
    for metric_id in ("accuracy", "f1", "precision", "recall")
)

def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and macro F1/precision/recall.

    The predicted class of each row is the argmax of its logits along axis 1.
    Returns a dict with the keys ``accuracy``, ``f1_macro``, ``precision_macro``
    and ``recall_macro``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=1)

    def _macro(metric, result_key):
        # Macro-averaged score of one `evaluate` metric for the current predictions.
        return metric.compute(
            predictions=predicted_ids, references=labels, average="macro"
        )[result_key]

    scores = {
        "accuracy": accuracy_metric.compute(
            predictions=predicted_ids, references=labels
        )["accuracy"]
    }
    scores["f1_macro"] = _macro(f1_metric, "f1")
    scores["precision_macro"] = _macro(precision_metric, "precision")
    scores["recall_macro"] = _macro(recall_metric, "recall")
    return scores

**Hyperparameters Tuning with Optuna**

Hyperparameters for BERT and RoBERTa are tuned with Optuna.
The `build_trainer` function configures a Hugging Face `Trainer` with the parameters suggested by Optuna: learning rate, batch size, and number of epochs. Each model is trained and evaluated on a subset of the dataset, with macro F1 score as the optimization target. Results are logged to Weights & Biases, and each study is created with Optuna’s Median Pruner, which is intended to stop underperforming trials early.

In [None]:
def build_trainer(model_checkpoint, trial, run_prefix, train_dataset, val_dataset, num_labels=5):
    """Create a Hugging Face ``Trainer`` configured from an Optuna trial.

    Args:
        model_checkpoint: Checkpoint name or path; a fresh model and tokenizer
            are loaded from it on every call.
        trial: Optuna trial that supplies ``learning_rate``, ``batch_size`` and
            ``num_train_epochs``. The notebook also passes a study's
            ``best_trial`` here to rebuild the winning configuration.
        run_prefix: Prefix for the run name and for the output/log directories.
        train_dataset: Tokenized training dataset.
        val_dataset: Tokenized dataset evaluated after every epoch.
        num_labels: Size of the classification head. Defaults to 5, the value
            that used to be hard-coded.

    Returns:
        A ``Trainer`` that has not been trained yet.
    """
    # Sample hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_epochs = trial.suggest_int("num_train_epochs", 2, 5)
    n_samples = len(train_dataset)

    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # The timestamp keeps output directories of repeated runs apart.
    run_name = f"{run_prefix}-ep{num_epochs}-lr{learning_rate}-bs{batch_size}-samples{n_samples}-run{int(time.time())}-{is_preprocessed}"

    args = TrainingArguments(
        output_dir=f"./results/{run_prefix}/{run_name}",
        disable_tqdm=True,
        # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only
        # runtimes instead of failing when the arguments are validated.
        fp16=torch.cuda.is_available(),
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        weight_decay=0.01,
        label_smoothing_factor=0.1,
        # Keep only the checkpoint with the best validation macro-F1 and reload it at the end.
        load_best_model_at_end=True,
        save_total_limit=1,
        logging_strategy="epoch",
        logging_dir=f"./logs/{run_prefix}/{run_name}",
        run_name=run_name,
        report_to="wandb",
        metric_for_best_model="f1_macro",
        greater_is_better=True
    )

    # NOTE(review): newer transformers releases deprecate `tokenizer=` in favour
    # of `processing_class=` - confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    return trainer

In [None]:
def objective_bert(trial):
    """Optuna objective for BERT: train on the subset and return validation macro-F1.

    NOTE(review): nothing here reports intermediate values to ``trial``, so the
    pruner configured on the study has nothing to act on - confirm whether
    pruning is actually wanted.
    """
    trainer = build_trainer(
        model_checkpoint="bert-base-uncased",
        trial=trial,
        run_prefix="bert",
        train_dataset=tokenized_bert_train_sub,
        val_dataset=tokenized_bert_val_sub
    )
    try:
        trainer.train()
        eval_result = trainer.evaluate()
    finally:
        # Close the W&B run even if training fails, so the next trial
        # does not keep logging into a stale run.
        wandb.finish()
    return eval_result["eval_f1_macro"]

def objective_roberta(trial):
    """Optuna objective for RoBERTa: train on the subset and return validation macro-F1.

    NOTE(review): nothing here reports intermediate values to ``trial``, so the
    pruner configured on the study has nothing to act on - confirm whether
    pruning is actually wanted.
    """
    trainer = build_trainer(
        model_checkpoint="roberta-base",
        trial=trial,
        run_prefix="roberta",
        train_dataset=tokenized_roberta_train_sub,
        val_dataset=tokenized_roberta_val_sub
    )
    try:
        trainer.train()
        eval_result = trainer.evaluate()
    finally:
        # Close the W&B run even if training fails, so the next trial
        # does not keep logging into a stale run.
        wandb.finish()
    return eval_result["eval_f1_macro"]

In [None]:
# BERT search: maximise the objective's macro-F1. The study is stored in a SQLite
# file on Drive and reloaded if it already exists, so re-running this cell adds
# five more trials to the same study.
bert_study_storage = f"sqlite:////content/drive/MyDrive/Colab Notebooks/nlp_project/optuna/bert_study_stratify_maxl_256_{is_preprocessed}_{num_train_samples}_samples_optuna.db"
study_bert = optuna.create_study(
    study_name=f"bert_study_stratify_{is_preprocessed}",
    storage=bert_study_storage,
    load_if_exists=True,
    direction="maximize",
    pruner=MedianPruner(n_startup_trials=2, n_warmup_steps=1),
)
study_bert.optimize(objective_bert, n_trials=5)
wandb.finish()

In [None]:
# RoBERTa search: maximise the objective's macro-F1. The study is stored in a
# SQLite file on Drive and reloaded if it already exists, so re-running this cell
# adds five more trials to the same study.
roberta_study_storage = f"sqlite:////content/drive/MyDrive/Colab Notebooks/nlp_project/optuna/roberta_study_stratify_maxl_256_{is_preprocessed}_{num_train_samples}_samples_optuna.db"
study_roberta = optuna.create_study(
    study_name=f"roberta_study_stratify_{is_preprocessed}",
    storage=roberta_study_storage,
    load_if_exists=True,
    direction="maximize",
    pruner=MedianPruner(n_startup_trials=2, n_warmup_steps=1),
)
study_roberta.optimize(objective_roberta, n_trials=5)
wandb.finish()

Display best trial parameters

In [None]:
# Best trial of each study, with its hyperparameters printed under a heading.
best_trial_bert = study_bert.best_trial
print('Bert best trial on subset:', best_trial_bert.params, sep='\n')

best_trial_roberta = study_roberta.best_trial
print('RoBerta best trial on subset:', best_trial_roberta.params, sep='\n')

#### Final Model Training with Best Hyperparameters
The best hyperparameters from the Optuna search are used to retrain BERT and RoBERTa on the full training set.
Each model is trained and evaluated using the Hugging Face Trainer with W&B logging.


In [None]:
# Retrain BERT on the full training split with the best hyperparameters found.
best_params_bert = best_trial_bert.params
run_name_bert = f"bert_final_stratify_{is_preprocessed}-ep{best_params_bert['num_train_epochs']}-lr{best_params_bert['learning_rate']:.1e}-bs{best_params_bert['batch_size']}"
wandb.init(project=f"tweet-sentiment-classification_split_to_test_maxl_256_{num_train_samples}_samples_optuna", name=run_name_bert, reinit=True)

# Passing the best trial makes build_trainer reuse that trial's hyperparameters.
final_trainer_bert = build_trainer(
    model_checkpoint="bert-base-uncased",
    trial=best_trial_bert,
    run_prefix=f"bert_final_stratify_{is_preprocessed}",
    train_dataset=tokenized_bert_train,
    val_dataset=tokenized_bert_val
)
try:
    final_trainer_bert.train()
    # Keep the held-out scores: the "test" prefix stops them from being logged
    # under the same `eval_*` keys as the per-epoch validation metrics.
    test_metrics_bert = final_trainer_bert.evaluate(tokenized_bert_test, metric_key_prefix="test")
    print("BERT test metrics:", test_metrics_bert)
finally:
    # Close the W&B run even if training or evaluation fails.
    wandb.finish()

In [None]:
# Retrain RoBERTa on the full training split with the best hyperparameters found.
best_params_roberta = best_trial_roberta.params
run_name_roberta = f"roberta_final_stratify_{is_preprocessed}-ep{best_params_roberta['num_train_epochs']}-lr{best_params_roberta['learning_rate']:.1e}-bs{best_params_roberta['batch_size']}"
wandb.init(project=f"tweet-sentiment-classification_split_to_test_maxl_256_{num_train_samples}_samples_optuna", name=run_name_roberta, reinit=True)

# Passing the best trial makes build_trainer reuse that trial's hyperparameters.
final_trainer_roberta = build_trainer(
    model_checkpoint="roberta-base",
    trial=best_trial_roberta,
    run_prefix=f"roberta_final_stratify_{is_preprocessed}",
    train_dataset=tokenized_roberta_train,
    val_dataset=tokenized_roberta_val
)
try:
    final_trainer_roberta.train()
    # Keep the held-out scores: the "test" prefix stops them from being logged
    # under the same `eval_*` keys as the per-epoch validation metrics.
    test_metrics_roberta = final_trainer_roberta.evaluate(tokenized_roberta_test, metric_key_prefix="test")
    print("RoBERTa test metrics:", test_metrics_roberta)
finally:
    # Close the W&B run even if training or evaluation fails.
    wandb.finish()

In [None]:
# Saving final models
import shutil  # stdlib; used instead of `!cp -r` so re-running this cell is safe

# Each model and its tokenizer are written to a local folder first and then
# copied to Google Drive. copytree(..., dirs_exist_ok=True) creates missing
# parent folders and overwrites an existing copy in place, whereas `cp -r`
# would nest the new copy inside a destination folder that already exists.
for model_prefix, final_trainer, final_tokenizer in (
    ("bert", final_trainer_bert, bert_tokenizer),
    ("roberta", final_trainer_roberta, roberta_tokenizer),
):
    local_model_dir = f"models/{model_prefix}_final_stratify_{is_preprocessed}_{num_train_samples}_samples_optuna"
    drive_model_dir = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/{model_prefix}_best_model_stratify_maxl_256_{is_preprocessed}_{num_train_samples}_samples_optuna"

    final_trainer.save_model(local_model_dir)
    final_tokenizer.save_pretrained(local_model_dir)
    shutil.copytree(local_model_dir, drive_model_dir, dirs_exist_ok=True)

### **Ensemble Models**

In this section, predictions from multiple fine-tuned transformer models (BERT, RoBERTa, and DeBERTa) are combined using soft-vote ensembling. Instead of relying on a single model’s predictions, the class probabilities from each model are averaged, with weights optimized by a grid search on the validation set, to leverage their complementary strengths and reduce individual model biases, which often leads to improved accuracy and robustness.

Generate Class Probabilities

In [None]:
# Tokenize texts, run in eval mode and outputs softmax probabilities for each class
def model_probs(model, tokenizer, texts, batch_size=32, max_length=256):
    """Return softmax class probabilities for ``texts`` as a NumPy array.

    Args:
        model: Sequence-classification model whose forward pass accepts
            ``input_ids`` and ``attention_mask`` and returns an object with a
            ``.logits`` attribute.
        tokenizer: Callable tokenizer matching ``model``.
        texts: Iterable of input strings.
        batch_size: Number of texts per forward pass.
        max_length: Truncation length passed to the tokenizer.

    Returns:
        Array with one row per text and one column per class. For empty input
        an array with zero rows is returned.
    """
    model.eval()

    texts = list(texts)
    if not texts:
        # Nothing to score; np.concatenate would raise on an empty list below.
        n_classes = getattr(getattr(model, "config", None), "num_labels", 0)
        return np.empty((0, n_classes), dtype=np.float32)

    # Use the device the model already lives on instead of a notebook-level
    # `device` variable, so the function does not depend on cell execution order.
    first_param = next(model.parameters(), None)
    model_device = first_param.device if first_param is not None else torch.device("cpu")

    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        max_length=max_length,
        return_tensors="pt"
    )
    ds = TensorDataset(inputs["input_ids"], inputs["attention_mask"])
    dl = DataLoader(ds, batch_size=batch_size)

    probs_all = []
    with torch.no_grad():
        for input_ids, attn_mask in dl:
            logits = model(
                input_ids=input_ids.to(model_device),
                attention_mask=attn_mask.to(model_device),
            ).logits
            probs_all.append(torch.softmax(logits, dim=1).cpu().numpy())
    return np.concatenate(probs_all, axis=0)

Load Fine-Tuned Models and Generate Predictions

In [None]:
# Load best finetuned checkpoints
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")


def _load_finetuned(checkpoint_dir):
    """Load the model (moved to ``device``) and the tokenizer saved in ``checkpoint_dir``."""
    finetuned_model = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir).to(device)
    finetuned_tokenizer = AutoTokenizer.from_pretrained(checkpoint_dir)
    return finetuned_model, finetuned_tokenizer


def _val_and_test_probs(model, tokenizer, max_length):
    """Class probabilities of one model on the validation texts, then on the test texts."""
    return tuple(
        model_probs(model, tokenizer, split_texts, batch_size=32, max_length=max_length)
        for split_texts in (val_texts, test_texts)
    )


best_bert_path = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/bert_best_model_stratify_maxl_256_{is_preprocessed}_{num_train_samples}_samples_optuna"
bert_model, bert_tokenizer = _load_finetuned(best_bert_path)

best_roberta_path = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/roberta_best_model_stratify_maxl_256_{is_preprocessed}_{num_train_samples}_samples_optuna"
roberta_model, roberta_tokenizer = _load_finetuned(best_roberta_path)

# NOTE(review): none of the cells above trains or saves a DeBERTa model -
# presumably this checkpoint comes from a separate run (its path says maxl_128,
# which matches the max_length used for it below). Confirm it exists on Drive.
best_deberta_path = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/deberta_best_model_stratify_maxl_128_{is_preprocessed}_{num_train_samples}_samples_optuna"
deberta_model, deberta_tokenizer = _load_finetuned(best_deberta_path)

# Texts and integer labels of the validation and test splits.
val_texts, val_labels = list(val_df["clean_text"]), val_df["label"].values
test_texts, test_labels = list(test_df["clean_text"]), test_df["label"].values

# Generate class probability predictions on validation and test
bert_val_probs, bert_test_probs = _val_and_test_probs(bert_model, bert_tokenizer, 256)
roberta_val_probs, roberta_test_probs = _val_and_test_probs(roberta_model, roberta_tokenizer, 256)
deberta_val_probs, deberta_test_probs = _val_and_test_probs(deberta_model, deberta_tokenizer, 128)


#### Weighted Ensemble with Grid Search
Searching for the optimal combination of model weights using a grid search on the validation set, aiming to maximize the macro-F1 score. The best weights are then applied to the test set to evaluate the final ensemble performance.

In [None]:
def ensemble_probs(probs_list, weights):
    """Weighted soft vote: combine per-model probability arrays into one.

    Args:
        probs_list: Sequence of equally shaped probability arrays, one per model.
        weights: One non-normalised weight per model; they are rescaled to sum to 1.

    Returns:
        float32 array shaped like ``probs_list[0]`` holding the weighted average.

    Raises:
        ValueError: If ``probs_list`` is empty, if the number of weights does
            not match the number of models, or if the weights do not sum to a
            finite non-zero value.
    """
    if len(probs_list) == 0:
        raise ValueError("probs_list must contain at least one probability array.")

    w = np.asarray(weights, dtype=np.float32)
    # zip() below would silently drop models or weights on a length mismatch.
    if w.ndim != 1 or len(w) != len(probs_list):
        raise ValueError(
            f"Expected {len(probs_list)} weights (one per model), got shape {w.shape}."
        )

    total = w.sum()
    if not np.isfinite(total) or total == 0:
        # Normalising by zero would fill the result with NaN/inf.
        raise ValueError("Weights must sum to a finite, non-zero value.")
    w = w / total

    out = np.zeros_like(probs_list[0], dtype=np.float32)
    for wi, pi in zip(w, probs_list):
        out += wi * pi
    return out

def metrics_from_probs(probs, y_true):
    """Score the row-wise argmax of ``probs`` against ``y_true``.

    Returns a dict with the keys ``accuracy`` and ``macro_f1``.
    """
    predicted_ids = probs.argmax(axis=1)
    accuracy = accuracy_score(y_true, predicted_ids)
    macro_f1 = f1_score(y_true, predicted_ids, average="macro")
    return {"accuracy": accuracy, "macro_f1": macro_f1}

def grid_search_weights_val(probs_list, y_true, step=0.05):
    """Grid-search ensemble weights that maximise macro-F1 on ``y_true``.

    The first ``n - 1`` weights are taken from ``0, step, 2*step, ... <= 1`` and
    the last weight is whatever remains to reach 1, so every candidate sums to 1.
    Candidates are visited with the first weight varying slowest, and the first
    candidate reaching the best macro-F1 is kept. This is the same order and
    tie-breaking as the former 2- and 3-model branches.

    Args:
        probs_list: One probability array per model (any number >= 1).
        y_true: True integer labels for the rows of the probability arrays.
        step: Grid resolution; must be positive.

    Returns:
        Dict with ``weights`` (list of floats), ``macro_f1`` and ``accuracy``.

    Raises:
        ValueError: If ``probs_list`` is empty or ``step`` is not positive.
    """
    from itertools import product  # stdlib; local import keeps the helper self-contained

    n = len(probs_list)
    if n < 1:
        raise ValueError("Need at least one model's probabilities.")
    if step <= 0:
        raise ValueError("step must be positive.")

    best = {"weights": None, "macro_f1": -1.0, "accuracy": 0.0}
    grid = np.arange(0.0, 1.0 + 1e-9, step)

    # The search is exhaustive: len(grid) ** (n - 1) candidates before filtering,
    # which is fine for a handful of models but grows quickly beyond that.
    for leading in product(grid, repeat=n - 1):
        s = sum(leading)
        if s > 1.0 + 1e-9:
            continue
        # Clamp the remainder: float rounding can leave it a hair below zero.
        w = [float(x) for x in leading] + [max(0.0, 1.0 - float(s))]
        scores = metrics_from_probs(ensemble_probs(probs_list, w), y_true)
        if scores["macro_f1"] > best["macro_f1"]:
            best = {"weights": w, **scores}
    return best

# Lists for 3-model case (BERT + RoBERTa + DeBerta); both lists use the same
# model order so one weight vector applies to either.
val_probs_list, test_probs_list = (
    [bert_val_probs, roberta_val_probs, deberta_val_probs],
    [bert_test_probs, roberta_test_probs, deberta_test_probs],
)

# Grid search on validation
best = grid_search_weights_val(val_probs_list, val_labels, step=0.05)
print("Best weights on validation:", best)

# Apply best weights to test
ens_test_probs = ensemble_probs(test_probs_list, best["weights"])
test_scores = metrics_from_probs(ens_test_probs, test_labels)

print(
    f" Test (weighted ensemble) Accuracy: {test_scores['accuracy']:.4f}",
    f" Test (weighted ensemble) Macro-F1: {test_scores['macro_f1']:.4f}",
    sep="\n",
)

#### Final Ensemble Evaluation
Producing classification metrics (precision, recall, and F1-score) for each sentiment class. In addition, visualizing confusion matrix to show where the model performs well and where it confuses between classes.

In [None]:
# Final predictions from ensemble
ens_test_preds = ens_test_probs.argmax(axis=1)

# Class labels, ordered by the integer id the models were trained with.
# label2id numbers the sentiment names in sorted (alphabetical) order, so a
# hand-written list in intensity order would attach the wrong name to most
# rows of the report and the confusion matrix.
class_labels = sorted(label2id, key=label2id.get)
class_ids = [label2id[class_name] for class_name in class_labels]

# Per-class F1, Precision, Recall
print("Classification Report:")
print(classification_report(test_labels, ens_test_preds, labels=class_ids,
                            target_names=class_labels, digits=4))

# Confusion Matrix (rows and columns follow class_ids)
cm = confusion_matrix(test_labels, ens_test_preds, labels=class_ids)

plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=class_labels,
            yticklabels=class_labels)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix - Final Weighted Ensemble")
plt.tight_layout()
plt.show()

#### Pairwise Model Agreement Analysis
Comparing predictions of each model pair (BERT, RoBERTa, and DeBERTa) on the test set. Reporting agreement rate, Cohen’s κ (chance-adjusted agreement), and prediction correlation, helping to assess model diversity.

In [None]:
# Convert probs -> hard predictions for test
bert_test_preds    = bert_test_probs.argmax(axis=1)
roberta_test_preds = roberta_test_probs.argmax(axis=1)
deberta_test_preds = deberta_test_probs.argmax(axis=1)

# label2id numbers the sentiment names alphabetically, so the raw ids are not an
# intensity scale and a Pearson correlation on them is not meaningful. Map every
# id to its position on the ordered sentiment scale before correlating.
sentiment_scale = [
    "Extremely Negative",
    "Negative",
    "Neutral",
    "Positive",
    "Extremely Positive",
]
id_to_rank = np.empty(len(label2id), dtype=int)
for sentiment_name, label_id in label2id.items():
    # .index raises ValueError on an unexpected label name instead of guessing.
    id_to_rank[label_id] = sentiment_scale.index(sentiment_name)


def pairwise_stats(name1, p1, name2, p2, ordinal_rank=None):
    """Print agreement statistics for two arrays of predicted label ids.

    Args:
        name1, name2: Display names of the two models.
        p1, p2: Predicted integer label ids, one entry per example.
        ordinal_rank: Optional array mapping a label id to its position on an
            ordered scale. When given, the correlation is computed on these
            ranks; agreement and Cohen's kappa always use the raw ids, since
            neither depends on how the classes are numbered.
    """
    agree = accuracy_score(p1, p2)
    kappa = cohen_kappa_score(p1, p2)
    if ordinal_rank is None:
        corr, _ = pearsonr(p1, p2)
    else:
        corr, _ = pearsonr(ordinal_rank[p1], ordinal_rank[p2])
    print(f"{name1} vs {name2}:")
    print(f"  • Agreement: {agree:.4f}")
    print(f"  • Cohen's κ: {kappa:.4f}")
    print(f"  • Pred. correlation: {corr:.4f}\n")

pairwise_stats("BERT",    bert_test_preds,    "RoBERTa", roberta_test_preds, ordinal_rank=id_to_rank)
pairwise_stats("BERT",    bert_test_preds,    "DeBERTa", deberta_test_preds, ordinal_rank=id_to_rank)
pairwise_stats("RoBERTa", roberta_test_preds, "DeBERTa", deberta_test_preds, ordinal_rank=id_to_rank)