<a href="https://colab.research.google.com/github/Hillascher5/nlp-tweets-sentiment-analysis/blob/main/Fix_bug_in_roberta.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# # Needed for Google Colab
# !pip install --quiet evaluate transformers optuna datasets nltk scikit-learn
# !pip install numpy==1.26.4

In [None]:
%env CUDA_LAUNCH_BLOCKING=1

from wordcloud import WordCloud, STOPWORDS
from geopy.geocoders import Nominatim
from geopy.exc import GeocoderTimedOut
from tqdm import tqdm
from collections import Counter
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report, f1_score, confusion_matrix, cohen_kappa_score
from sklearn.model_selection import train_test_split
from nltk.stem import WordNetLemmatizer
from transformers import AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding, TrainingArguments, Trainer
from datasets import Dataset
from torch.utils.data import DataLoader, TensorDataset
from optuna.pruners import MedianPruner
from scipy.stats import pearsonr
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import torch.nn.functional as F
import os
import re
import string
import time
import glob
import random
import nltk
import evaluate
import transformers
import torch
import optuna

# --- Run configuration ---
USE_WANDB = True         # set to False to run without Weights & Biases
SAVE_OPTUNA_DB = False   # not referenced in the cells shown here
SAVE_BEST_MODELS = True  # gates the model-saving cell at the end

# Size of the stratified training subset; also embedded in project names.
num_train_samples = 5000

if not USE_WANDB:
    # Disable the wandb integration through its environment switch.
    os.environ["WANDB_DISABLED"] = "true"
else:
    import wandb
    wandb.login()

    # Project name and init timeout are passed to wandb via the environment.
    os.environ.update({
        "WANDB_PROJECT": f"tweet-sentiment-classification_{num_train_samples}_samples_optuna_roberta_bug_fix",
        "WANDB_INIT_TIMEOUT": "180",
    })

# NLTK resources (stop words, WordNet, Open Multilingual WordNet).
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

In [None]:
def set_seed_all(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Integer seed handed to every generator (default 42).

    Returns:
        None. The CUDA generators are seeded only when CUDA is available.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)


set_seed_all(42)

In [None]:
# Read the train and test CSVs from the mounted Drive (both latin1-encoded).
df_train, df_test = (
    pd.read_csv(csv_path, encoding='latin1')
    for csv_path in (
        '/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_train.csv',
        '/content/drive/MyDrive/Colab Notebooks/nlp_project/Data/Corona_NLP_test.csv',
    )
)

In [None]:
# Pool both CSVs into one frame and shuffle it with a fixed seed; the
# stratified train/val/test splits are drawn from this pooled frame later.
df_full = (
    pd.concat([df_train, df_test], ignore_index=True)
    .sample(frac=1.0, random_state=42)
    .reset_index(drop=True)
)

In [None]:
# Minimal pre-processing: the tweet text is kept as-is apart from trimming.
def light_preprocess(text):
    """Return `text` with leading and trailing whitespace removed."""
    trimmed = text.strip()
    return trimmed

# Tag used in run and model names to record which preprocessing was applied.
is_preprocessed = "minimal_preprocess"

# Element-wise cleanup of the raw tweet column into a new `clean_text` column.
df_full["clean_text"] = df_full["OriginalTweet"].map(light_preprocess)

In [None]:
# Give every sentiment class an integer id; sorting the class names first
# makes the name -> id assignment independent of row order.
unique_labels = sorted(df_full["Sentiment"].unique())
label2id = dict(zip(unique_labels, range(len(unique_labels))))
df_full["label"] = df_full["Sentiment"].map(label2id)

In [None]:
# Stratified split: 70% train, 15% val, 15% test.
# Step 1: hold out 15% of everything as the test set.
train_val_df, test_df = train_test_split(
    df_full,
    stratify=df_full["label"],
    test_size=0.15,
    random_state=42,
)
# Step 2: 0.1765 of the remaining 85% is ~15% of the full data (0.15 / 0.85).
train_df, val_df = train_test_split(
    train_val_df,
    stratify=train_val_df["label"],
    test_size=0.1765,
    random_state=42,
)

# Confirm sizes
print("Train size:", len(train_df))
print("Val size:", len(val_df))
print("Test size:", len(test_df))

In [None]:
# Smaller stratified subsets of the train/val splits (text + label columns
# only); the part that is not selected is discarded.
train_subset_df, _ = train_test_split(
    train_df[["clean_text", "label"]],
    stratify=train_df["label"],
    train_size=num_train_samples,
    random_state=42,
)

# Fixed-size (500 rows) validation subset, drawn the same way.
val_subset_df, _ = train_test_split(
    val_df[["clean_text", "label"]],
    stratify=val_df["label"],
    train_size=500,
    random_state=42,
)

In [None]:
# Pretrained checkpoint used for both the tokenizer and the classifier.
roberta_model_name = "roberta-base"

# One output class per distinct sentiment value found in the data.
sentiment_labels = df_full['Sentiment'].unique()
n_labels = len(sentiment_labels)

# Tokenizer first, then the sequence-classification model with a head
# sized to `n_labels`.
roberta_tokenizer = AutoTokenizer.from_pretrained(roberta_model_name)
roberta_model = AutoModelForSequenceClassification.from_pretrained(
    roberta_model_name,
    num_labels=n_labels,
)

In [None]:
def tokenize_function_roberta(examples):
    """Tokenize the `clean_text` field of `examples` with the RoBERTa tokenizer.

    Every sequence is truncated and padded to a fixed length of 256 tokens.

    Args:
        examples: Mapping with a "clean_text" entry. The datasets are mapped
            with `batched=True`, so this is presumably a list of strings.

    Returns:
        Whatever `roberta_tokenizer` returns for those texts.
    """
    texts = examples["clean_text"]
    encoded = roberta_tokenizer(
        texts,
        max_length=256,
        padding='max_length',
        truncation=True,
    )
    return encoded

In [None]:
# Wrap the pandas frames as Hugging Face Datasets.
# The subset frames already contain only the text and label columns.
hf_subset_train, hf_subset_val = (
    Dataset.from_pandas(frame) for frame in (train_subset_df, val_subset_df)
)

# The full splits are narrowed to the text and label columns first.
hf_train, hf_val, hf_test = (
    Dataset.from_pandas(frame[["clean_text", "label"]])
    for frame in (train_df, val_df, test_df)
)

In [None]:
def _tokenize_roberta_split(split):
    """Tokenize one dataset for RoBERTa and expose its tensors as torch."""
    tokenized = split.map(tokenize_function_roberta, batched=True)
    tokenized.set_format("torch", columns=["input_ids", "attention_mask", "label"])
    return tokenized


# Tokenize for RoBERTa: the two subsets first, then the full splits.
tokenized_roberta_train_sub = _tokenize_roberta_split(hf_subset_train)
tokenized_roberta_val_sub = _tokenize_roberta_split(hf_subset_val)

tokenized_roberta_train = _tokenize_roberta_split(hf_train)
tokenized_roberta_val = _tokenize_roberta_split(hf_val)
tokenized_roberta_test = _tokenize_roberta_split(hf_test)

In [None]:
# Metric objects from the `evaluate` library, loaded once and reused by
# `compute_metrics` on every evaluation pass.
accuracy_metric = evaluate.load("accuracy")
f1_metric = evaluate.load("f1")
precision_metric = evaluate.load("precision")
recall_metric = evaluate.load("recall")


def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 / precision / recall.

    Args:
        eval_pred: Pair that unpacks into `(logits, labels)`.

    Returns:
        Dict with the keys "accuracy", "f1_macro", "precision_macro" and
        "recall_macro".
    """
    logits, labels = eval_pred
    # Predicted class = index of the largest logit along axis 1.
    predictions = np.argmax(logits, axis=1)

    accuracy = accuracy_metric.compute(predictions=predictions, references=labels)
    macro_kwargs = dict(predictions=predictions, references=labels, average="macro")
    f1 = f1_metric.compute(**macro_kwargs)
    precision = precision_metric.compute(**macro_kwargs)
    recall = recall_metric.compute(**macro_kwargs)

    return {
        "accuracy": accuracy["accuracy"],
        "f1_macro": f1["f1"],
        "precision_macro": precision["precision"],
        "recall_macro": recall["recall"],
    }

In [None]:
# 1) Re-open the stored Optuna study from its SQLite file on Drive
db_path = "/content/drive/MyDrive/Colab Notebooks/nlp_project/optuna/roberta_study_stratify_maxl_256_minimal_preprocess_5000_samples_optuna.db"
study_name = "roberta_study_stratify_minimal_preprocess"

study_roberta = optuna.load_study(
    storage=f"sqlite:///{db_path}",
    study_name=study_name,
)

# The study's best trial and its parameters feed the final training run.
best_trial_roberta = study_roberta.best_trial
best_params_roberta = best_trial_roberta.params
print("Loaded best RoBERTa params:", best_params_roberta)

In [None]:
def build_trainer(model_checkpoint, trial, run_prefix, train_dataset, val_dataset, report_to="none", num_labels=5):
    """Build a Hugging Face `Trainer` whose hyperparameters come from an Optuna trial.

    A fresh model and tokenizer are loaded from `model_checkpoint` on every
    call, so each run starts from the pretrained weights.

    Args:
        model_checkpoint: Name or path handed to the `from_pretrained` loaders.
        trial: Object exposing `suggest_float`, `suggest_categorical` and
            `suggest_int`. This notebook passes the best trial of a loaded
            study; presumably that replays its stored values (confirm against
            the Optuna version in use).
        run_prefix: Prefix for the run name and for the output/log directories.
        train_dataset: Dataset used for training; its length goes into the run name.
        val_dataset: Dataset evaluated at the end of every epoch.
        report_to: Forwarded to `TrainingArguments(report_to=...)`.
        num_labels: Size of the classification head. Defaults to 5, the value
            that was previously hard-coded here.

    Returns:
        A configured `Trainer` that has not been trained yet.
    """
    # Sample hyperparameters
    learning_rate = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    num_epochs = trial.suggest_int("num_train_epochs", 2, 5)
    n_samples = len(train_dataset)

    model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
    tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

    # The timestamp suffix keeps output directories of repeated runs apart.
    run_name = f"{run_prefix}-ep{num_epochs}-lr{learning_rate}-bs{batch_size}-samples{n_samples}-run{int(time.time())}"

    args = TrainingArguments(
        output_dir=f"./results/{run_prefix}/{run_name}",
        disable_tqdm=True,
        # Mixed precision needs a CUDA device; use full precision otherwise
        # instead of failing when the arguments are constructed.
        fp16=torch.cuda.is_available(),
        eval_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        num_train_epochs=num_epochs,
        learning_rate=learning_rate,
        weight_decay=0.01,
        label_smoothing_factor=0.1,
        # Reload the checkpoint with the best validation macro-F1 after training.
        load_best_model_at_end=True,
        save_total_limit=1,
        logging_strategy="epoch",
        logging_dir=f"./logs/{run_prefix}/{run_name}",
        run_name=run_name,
        report_to=report_to,
        metric_for_best_model="f1_macro",
        greater_is_better=True
    )

    # NOTE(review): newer transformers releases rename the `tokenizer=`
    # argument to `processing_class=` - confirm against the installed version.
    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        tokenizer=tokenizer,
        compute_metrics=compute_metrics
    )

    return trainer

In [None]:
# Final run: train on the full training split with the best Optuna
# hyperparameters, then score the held-out test split.
best_params_roberta = best_trial_roberta.params
run_name_roberta = f"roberta_final_stratify_{is_preprocessed}-ep{best_params_roberta['num_train_epochs']}-lr{best_params_roberta['learning_rate']:.1e}-bs{best_params_roberta['batch_size']}"
if USE_WANDB:
    wandb.init(project=f"tweet-sentiment-classification-best_{num_train_samples}_samples_optuna_roberta_bug_fix", name=run_name_roberta, reinit=True)

final_trainer_roberta = build_trainer(
    model_checkpoint="roberta-base",
    trial=best_trial_roberta,
    run_prefix=f"roberta_final_stratify_{is_preprocessed}",
    train_dataset=tokenized_roberta_train,
    val_dataset=tokenized_roberta_val,
    report_to="wandb" if USE_WANDB else "none"
)
final_trainer_roberta.train()

# Use a dedicated "test" prefix so these numbers are not reported under the
# same `eval_*` keys as the per-epoch validation metrics. Keep and print the
# result; previously the return value was discarded.
test_metrics_roberta = final_trainer_roberta.evaluate(
    tokenized_roberta_test,
    metric_key_prefix="test",
)
print("RoBERTa test metrics:", test_metrics_roberta)

if USE_WANDB:
    wandb.finish()

In [None]:
save_dir_best = f"/content/drive/MyDrive/Colab Notebooks/nlp_project/models/w_test_split/HF_Trainer"

if SAVE_BEST_MODELS:
  final_trainer_roberta.save_model(f"{save_dir_best}/roberta_fixed_final_stratify_{is_preprocessed}_{num_train_samples}_samples_optuna")
  roberta_tokenizer.save_pretrained(f"{save_dir_best}/roberta_fixed_final_stratify_{is_preprocessed}_{num_train_samples}_samples_optuna")
  !cp -r {save_dir_best}roberta_fixed_final_stratify_{is_preprocessed}_{num_train_samples}_samples_optuna "{save_dir_best}/roberta_fixed_best_model_stratify_maxl_256_{is_preprocessed}_{num_train_samples}_samples_optuna"
