<a href="https://colab.research.google.com/github/Hillascher5/nlp-tweets-sentiment-analysis/blob/main/Evaluate_saved_models.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Evaluate Saved Best Models

Evaluate models of all types, From Trainer API and manual code models.

In [1]:
# # Needed for Google Colab
# !pip install --quiet evaluate transformers optuna datasets nltk scikit-learn
# !pip install numpy==1.26.4

In [2]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from datasets import Dataset
from sklearn.metrics import classification_report
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import numpy as np
import random
import torch
import nltk
nltk.download('stopwords')

import os
os.environ["WANDB_DISABLED"] = "true"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
def set_seed_all(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)

set_seed_all(42)

#### Load test set

In [4]:
df_test = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_test_to_evaluate.csv", encoding='latin1')

In [5]:
# Mapping sentiments to unique numeric IDs
unique_labels = sorted(df_test["Sentiment"].unique())
label2id = {label: idx for idx, label in enumerate(unique_labels)}
df_test["label"] = df_test["Sentiment"].map(label2id)

In [6]:
# Minimal pre-processing
def light_preprocess(text):
    return text.strip()                             # Remove unnecessary spaces

is_preprocessed = "minimal_preprocess"
df_test["clean_text"] = df_test["OriginalTweet"].apply(light_preprocess)

In [7]:
# Convert DataFrame to Hugging Face Dataset
hf_test = Dataset.from_pandas(df_test[["clean_text", "label"]])

### Load saved models

In [8]:
model_path_hf = "/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/HF_Trainer/"
bert_model_name_hf = "bert_best_model_stratify_maxl_256_minimal_preprocess_5000_samples_optuna"
bert_model = AutoModelForSequenceClassification.from_pretrained(model_path_hf + bert_model_name_hf)
bert_tokenizer = AutoTokenizer.from_pretrained(model_path_hf + bert_model_name_hf)

roberta_model_name_hf = "roberta_best_model_stratify_maxl_256_minimal_preprocess_5000_samples_optuna"
roberta_model = AutoModelForSequenceClassification.from_pretrained(model_path_hf + roberta_model_name_hf)
roberta_tokenizer = AutoTokenizer.from_pretrained(model_path_hf + roberta_model_name_hf)

deberta_model_name_hf = "deberta_best_model_stratify_maxl_128_minimal_preprocess_5000_samples_optuna"
deberta_model = AutoModelForSequenceClassification.from_pretrained(model_path_hf + deberta_model_name_hf)
deberta_tokenizer = AutoTokenizer.from_pretrained(model_path_hf + deberta_model_name_hf)

model_path_manual = "/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/Manual_finetune_min_preproc_5000_samples_opt/"
bert_model_name_manual = "manual2hf_bert-base-uncased_5000_samples_opt"
bert_model_man = AutoModelForSequenceClassification.from_pretrained(model_path_manual + bert_model_name_manual)
bert_tokenizer_man = AutoTokenizer.from_pretrained(model_path_manual + bert_model_name_manual)

roberta_model_name_manual = "manual2hf_roberta-base_5000_samples_opt"
roberta_model_man = AutoModelForSequenceClassification.from_pretrained(model_path_manual + roberta_model_name_manual)
roberta_tokenizer_man = AutoTokenizer.from_pretrained(model_path_manual + roberta_model_name_manual)

deberta_model_name_manual = "manual2hf_deberta-base_5000_samples_opt"
deberta_model_man = AutoModelForSequenceClassification.from_pretrained(model_path_manual + deberta_model_name_manual)
deberta_tokenizer_man = AutoTokenizer.from_pretrained(model_path_manual + deberta_model_name_manual)

In [9]:
# Tokenize function
def tokenize_function_bert(examples):
    return bert_tokenizer(examples["clean_text"], truncation=True, padding='max_length', max_length=256)

def tokenize_function_roberta(examples):
    return roberta_tokenizer(examples["clean_text"], truncation=True, padding='max_length', max_length=256)

def tokenize_function_deberta(examples):
    return deberta_tokenizer(examples["clean_text"], truncation=True, padding='max_length', max_length=128)

def tokenize_function_bert_man(examples):
    return bert_tokenizer_man(examples["clean_text"], truncation=True, padding='max_length', max_length=256)

def tokenize_function_roberta_man(examples):
    return roberta_tokenizer_man(examples["clean_text"], truncation=True, padding='max_length', max_length=256)

def tokenize_function_deberta_man(examples):
    return deberta_tokenizer_man(examples["clean_text"], truncation=True, padding='max_length', max_length=128)

In [10]:
# Tokenize
tokenized_bert_test = hf_test.map(tokenize_function_bert, batched=True)
tokenized_bert_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_roberta_test = hf_test.map(tokenize_function_roberta, batched=True)
tokenized_roberta_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_deberta_test = hf_test.map(tokenize_function_deberta, batched=True)
tokenized_deberta_test.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_bert_test_man = hf_test.map(tokenize_function_bert_man, batched=True)
tokenized_bert_test_man.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_roberta_test_man = hf_test.map(tokenize_function_roberta_man, batched=True)
tokenized_roberta_test_man.set_format("torch", columns=["input_ids", "attention_mask", "label"])

tokenized_deberta_test_man = hf_test.map(tokenize_function_deberta_man, batched=True)
tokenized_deberta_test_man.set_format("torch", columns=["input_ids", "attention_mask", "label"])

Map:   0%|          | 0/6744 [00:00<?, ? examples/s]

Map:   0%|          | 0/6744 [00:00<?, ? examples/s]

Map:   0%|          | 0/6744 [00:00<?, ? examples/s]

Map:   0%|          | 0/6744 [00:00<?, ? examples/s]

Map:   0%|          | 0/6744 [00:00<?, ? examples/s]

Map:   0%|          | 0/6744 [00:00<?, ? examples/s]

In [11]:
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = logits.argmax(axis=-1)

    return {
        'accuracy': accuracy_score(labels, predictions),
        'f1_macro': f1_score(labels, predictions, average='macro'),
        'precision_macro': precision_score(labels, predictions, average='macro', zero_division=0),
        'recall_macro': recall_score(labels, predictions, average='macro'),
    }

In [12]:
# Define trainers
bert_trainer_hf = Trainer(model=bert_model, tokenizer=bert_tokenizer, compute_metrics=compute_metrics)
roberta_trainer_hf = Trainer(model=roberta_model, tokenizer=roberta_tokenizer, compute_metrics=compute_metrics)
deberta_trainer_hf = Trainer(model=deberta_model, tokenizer=deberta_tokenizer, compute_metrics=compute_metrics)

bert_trainer_man = Trainer(model=bert_model_man, tokenizer=bert_tokenizer_man, compute_metrics=compute_metrics)
roberta_trainer_man = Trainer(model=roberta_model_man, tokenizer=roberta_tokenizer_man, compute_metrics=compute_metrics)
deberta_trainer_man = Trainer(model=deberta_model_man, tokenizer=deberta_tokenizer_man, compute_metrics=compute_metrics)

  bert_trainer_hf = Trainer(model=bert_model, tokenizer=bert_tokenizer, compute_metrics=compute_metrics)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  roberta_trainer_hf = Trainer(model=roberta_model, tokenizer=roberta_tokenizer, compute_metrics=compute_metrics)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  deberta_trainer_hf = Trainer(model=deberta_model, tokenizer=deberta_tokenizer, compute_metrics=compute_metrics)
Using the `WANDB_DISABLED` environment variable is deprecated and will be removed in v5. Use the --report_to flag to control the integrations used for logging result (for instance --report_to none).
  bert_trainer_man = Trainer(model=bert_model_man, tokenizer=bert_toke

In [13]:
# Evaluate
bert_metrics = bert_trainer_hf.evaluate(tokenized_bert_test)
print(bert_metrics)

roberta_metrics = roberta_trainer_hf.evaluate(tokenized_roberta_test)
print(roberta_metrics)

deberta_metrics = deberta_trainer_hf.evaluate(tokenized_deberta_test)
print(deberta_metrics)

bert_metrics_man = bert_trainer_man.evaluate(tokenized_bert_test_man)
print(bert_metrics_man)

roberta_metrics_man = roberta_trainer_man.evaluate(tokenized_roberta_test_man)
print(roberta_metrics_man)

deberta_metrics_man = deberta_trainer_man.evaluate(tokenized_deberta_test_man)
print(deberta_metrics_man)

{'eval_loss': 0.44392096996307373, 'eval_model_preparation_time': 0.0028, 'eval_accuracy': 0.8738137603795967, 'eval_f1_macro': 0.8769761333522315, 'eval_precision_macro': 0.8740451081043522, 'eval_recall_macro': 0.8804287469513886, 'eval_runtime': 24.2378, 'eval_samples_per_second': 278.243, 'eval_steps_per_second': 34.78}


{'eval_loss': 0.4813210070133209, 'eval_model_preparation_time': 0.0027, 'eval_accuracy': 0.8480130486358244, 'eval_f1_macro': 0.8525720367542782, 'eval_precision_macro': 0.8570748717529435, 'eval_recall_macro': 0.8501132208347102, 'eval_runtime': 24.1626, 'eval_samples_per_second': 279.109, 'eval_steps_per_second': 34.889}


{'eval_loss': 0.5082252621650696, 'eval_model_preparation_time': 0.0029, 'eval_accuracy': 0.8622479240806643, 'eval_f1_macro': 0.865642445929695, 'eval_precision_macro': 0.8622872155945721, 'eval_recall_macro': 0.8703075919827932, 'eval_runtime': 21.1957, 'eval_samples_per_second': 318.177, 'eval_steps_per_second': 39.772}


{'eval_loss': 0.42912065982818604, 'eval_model_preparation_time': 0.0028, 'eval_accuracy': 0.8677342823250297, 'eval_f1_macro': 0.8709301566512441, 'eval_precision_macro': 0.8713895954642836, 'eval_recall_macro': 0.8708124710309202, 'eval_runtime': 23.9405, 'eval_samples_per_second': 281.698, 'eval_steps_per_second': 35.212}


{'eval_loss': 0.47616323828697205, 'eval_model_preparation_time': 0.0027, 'eval_accuracy': 0.858244365361803, 'eval_f1_macro': 0.8617900220146438, 'eval_precision_macro': 0.8580514783717135, 'eval_recall_macro': 0.8665737809030926, 'eval_runtime': 24.1078, 'eval_samples_per_second': 279.744, 'eval_steps_per_second': 34.968}


{'eval_loss': 0.4169268012046814, 'eval_model_preparation_time': 0.0029, 'eval_accuracy': 0.8781138790035588, 'eval_f1_macro': 0.8815493598342773, 'eval_precision_macro': 0.878641943202382, 'eval_recall_macro': 0.885041621601031, 'eval_runtime': 20.8539, 'eval_samples_per_second': 323.393, 'eval_steps_per_second': 40.424}


In [None]:
# Define which metrics to keep
keys_to_include = [
    'eval_loss',
    'eval_accuracy',
    'eval_f1_macro',
    'eval_precision_macro',
    'eval_recall_macro'
]

# Organize metrics
metrics_table = pd.DataFrame([
    {"Model": "BERT (HF)", **{k: v for k, v in bert_metrics.items() if k in keys_to_include}},
    {"Model": "RoBERTa (HF)", **{k: v for k, v in roberta_metrics.items() if k in keys_to_include}},
    {"Model": "DeBERTa (HF)", **{k: v for k, v in deberta_metrics.items() if k in keys_to_include}},
    {"Model": "BERT (Manual)", **{k: v for k, v in bert_metrics_man.items() if k in keys_to_include}},
    {"Model": "RoBERTa (Manual)", **{k: v for k, v in roberta_metrics_man.items() if k in keys_to_include}},
    {"Model": "DeBERTa (Manual)", **{k: v for k, v in deberta_metrics_man.items() if k in keys_to_include}},
])

# Set index to model name for clarity
metrics_table.set_index("Model", inplace=True)

# Round values for better readability
metrics_table = metrics_table.round(4)

from IPython.display import display
display(metrics_table)

In [15]:
# Predictions for classification report
bert_preds = bert_trainer_hf.predict(tokenized_bert_test)
bert_y_pred = bert_preds.predictions.argmax(axis=-1)
bert_y_true = bert_preds.label_ids
print('BERT HF Trainer model:')
print(classification_report(bert_y_true, bert_y_pred, digits=4))

roberta_preds = roberta_trainer_hf.predict(tokenized_roberta_test)
roberta_y_pred = roberta_preds.predictions.argmax(axis=-1)
roberta_y_true = roberta_preds.label_ids
print('RoBERTa HF Trainer model:')
print(classification_report(roberta_y_true, roberta_y_pred, digits=4))

deberta_preds = deberta_trainer_hf.predict(tokenized_deberta_test)
deberta_y_pred = deberta_preds.predictions.argmax(axis=-1)
deberta_y_true = deberta_preds.label_ids
print('DeBERTa HF Trainer model:')
print(classification_report(deberta_y_true, deberta_y_pred, digits=4))

bert_preds_man = bert_trainer_man.predict(tokenized_bert_test_man)
bert_y_pred_man = bert_preds_man.predictions.argmax(axis=-1)
bert_y_true_man = bert_preds_man.label_ids
print('BERT manual code model:')
print(classification_report(bert_y_true_man, bert_y_pred_man, digits=4))

roberta_preds_man = roberta_trainer_man.predict(tokenized_roberta_test_man)
roberta_y_pred_man = roberta_preds_man.predictions.argmax(axis=-1)
roberta_y_true_man = roberta_preds_man.label_ids
print('RoBERTA manual code model:')
print(classification_report(roberta_y_true_man, roberta_y_pred_man, digits=4))

deberta_preds_man = deberta_trainer_man.predict(tokenized_deberta_test_man)
deberta_y_pred_man = deberta_preds_man.predictions.argmax(axis=-1)
deberta_y_true_man = deberta_preds_man.label_ids
print('DeBERTA manual code model:')
print(classification_report(deberta_y_true_man, deberta_y_pred_man, digits=4))


BERT HF Trainer model:
              precision    recall  f1-score   support

           0     0.8624    0.9078    0.8845       911
           1     0.8867    0.9244    0.9051      1084
           2     0.8538    0.8491    0.8515      1644
           3     0.8848    0.8664    0.8755      1250
           4     0.8825    0.8544    0.8683      1855

    accuracy                         0.8738      6744
   macro avg     0.8740    0.8804    0.8770      6744
weighted avg     0.8739    0.8738    0.8736      6744



RoBERTa HF Trainer model:
              precision    recall  f1-score   support

           0     0.8372    0.8749    0.8556       911
           1     0.8919    0.8681    0.8799      1084
           2     0.7957    0.8364    0.8155      1644
           3     0.9324    0.8168    0.8708      1250
           4     0.8281    0.8544    0.8411      1855

    accuracy                         0.8480      6744
   macro avg     0.8571    0.8501    0.8526      6744
weighted avg     0.8510    0.8480    0.8486      6744



DeBERTa HF Trainer model:
              precision    recall  f1-score   support

           0     0.8333    0.9111    0.8705       911
           1     0.8706    0.9188    0.8941      1084
           2     0.8357    0.8230    0.8293      1644
           3     0.8994    0.8512    0.8746      1250
           4     0.8724    0.8474    0.8597      1855

    accuracy                         0.8622      6744
   macro avg     0.8623    0.8703    0.8656      6744
weighted avg     0.8629    0.8622    0.8620      6744



BERT manual code model:
              precision    recall  f1-score   support

           0     0.8444    0.8814    0.8625       911
           1     0.9213    0.8967    0.9088      1084
           2     0.8357    0.8352    0.8354      1644
           3     0.8967    0.8680    0.8821      1250
           4     0.8589    0.8728    0.8658      1855

    accuracy                         0.8677      6744
   macro avg     0.8714    0.8708    0.8709      6744
weighted avg     0.8683    0.8677    0.8679      6744



RoBERTA manual code model:
              precision    recall  f1-score   support

           0     0.8285    0.9067    0.8658       911
           1     0.8879    0.9059    0.8968      1084
           2     0.8568    0.8078    0.8316      1644
           3     0.8617    0.8672    0.8644      1250
           4     0.8554    0.8453    0.8503      1855

    accuracy                         0.8582      6744
   macro avg     0.8581    0.8666    0.8618      6744
weighted avg     0.8585    0.8582    0.8579      6744



DeBERTA manual code model:
              precision    recall  f1-score   support

           0     0.8574    0.9243    0.8896       911
           1     0.9064    0.9114    0.9089      1084
           2     0.8684    0.8467    0.8574      1644
           3     0.8843    0.8808    0.8826      1250
           4     0.8766    0.8620    0.8693      1855

    accuracy                         0.8781      6744
   macro avg     0.8786    0.8850    0.8815      6744
weighted avg     0.8782    0.8781    0.8780      6744



In [16]:
# from IPython.display import Javascript

# def disconnect_runtime():
#     display(Javascript('google.colab.kernel.disconnect()'))

# disconnect_runtime()