In [1]:
import pandas as pd
import math
import matplotlib.pyplot as plt
from datasets import Dataset
from transformers import (
    BertTokenizer, RobertaTokenizer, XLMRobertaTokenizer,
    AutoModelForSequenceClassification, TrainingArguments, Trainer
)
import evaluate
from sklearn.model_selection import train_test_split

In [2]:
# Read the labelled training table and the test table from the local
# dataset/ folder in a single pass.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('dataset/train.csv', 'dataset/test.csv')
)

In [3]:
# Hold out 20% of the labelled rows for validation. The split is stratified
# on the 'Sentiment' column and uses a fixed random_state so it is repeatable.
split_options = dict(
    test_size=0.2,
    stratify=train_df['Sentiment'],
    random_state=42,
)
train, val = train_test_split(train_df, **split_options)

In [4]:
# Label vocabulary: the sorted distinct values of the 'Sentiment' column,
# with lookup tables in both directions.
label_list = sorted(train_df['Sentiment'].unique())
num_labels = len(label_list)
label2id = dict(zip(label_list, range(num_labels)))
id2label = dict(enumerate(label_list))


def encode_labels(example):
    """Store the integer id of the row's 'Sentiment' value under 'labels'.

    The example is updated in place and returned, which is the form
    ``Dataset.map`` is given below.
    """
    sentiment = example['Sentiment']
    example['labels'] = label2id[sentiment]
    return example


# Wrap both pandas splits as Hugging Face datasets and attach the label ids.
train_dataset, val_dataset = (
    Dataset.from_pandas(frame).map(encode_labels)
    for frame in (train, val)
)



In [5]:
# One tokenizer per checkpoint compared in this notebook; each pair is
# (tokenizer class, checkpoint name).
bert_tok, roberta_tok, xlmr_tok = (
    tokenizer_cls.from_pretrained(checkpoint)
    for tokenizer_cls, checkpoint in (
        (BertTokenizer, 'bert-base-cased'),
        (RobertaTokenizer, 'roberta-base'),
        (XLMRobertaTokenizer, 'xlm-roberta-base'),
    )
)

In [6]:
def _tokenize_text(tokenizer, example, max_length=64):
    """Tokenize ``example['Text']`` with ``tokenizer``.

    Sequences are truncated and padded to exactly ``max_length`` tokens, so
    every encoded row has the same length.
    """
    return tokenizer(
        example['Text'],
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )


def tok_bert(example, max_length=64):
    """Encode the 'Text' field with the module-level ``bert_tok`` tokenizer.

    Args:
        example: Mapping with a 'Text' entry (a single row or a batch).
        max_length: Fixed length to truncate/pad to. Defaults to 64.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    return _tokenize_text(bert_tok, example, max_length)


def tok_roberta(example, max_length=64):
    """Encode the 'Text' field with the module-level ``roberta_tok`` tokenizer.

    Same arguments and return value as ``tok_bert``.
    """
    return _tokenize_text(roberta_tok, example, max_length)


def tok_xlmr(example, max_length=64):
    """Encode the 'Text' field with the module-level ``xlmr_tok`` tokenizer.

    Same arguments and return value as ``tok_bert``.
    """
    return _tokenize_text(xlmr_tok, example, max_length)


In [7]:
# Show how each tokenizer splits the first training sentence.
sample_text = train_dataset[0]['Text']
for preview_tok in (bert_tok, roberta_tok, xlmr_tok):
    print(preview_tok.tokenize(sample_text))

In [8]:
# Metric objects from the `evaluate` library, used by compute_metrics below.
accuracy, f1 = map(evaluate.load, ("accuracy", "f1"))

def compute_metrics(eval_pred):
    """Score predictions with accuracy and macro-averaged F1.

    Args:
        eval_pred: Pair of (prediction scores, reference labels). The
            predicted class of each row is the argmax of its scores along
            axis 1.

    Returns:
        Dict with the keys "accuracy" and "f1".
    """
    scores, references = eval_pred
    predicted = scores.argmax(axis=1)

    accuracy_result = accuracy.compute(predictions=predicted, references=references)
    f1_result = f1.compute(predictions=predicted, references=references, average="macro")

    return {"accuracy": accuracy_result["accuracy"], "f1": f1_result["f1"]}

def train_and_eval(model_name, tokenizer_func, tokenizer=None):
    """Fine-tune ``model_name`` on the training split and evaluate it.

    Args:
        model_name: Checkpoint name passed to
            ``AutoModelForSequenceClassification.from_pretrained``; also used
            as the sub-directory of ``./results`` for this run's outputs.
        tokenizer_func: Batched mapping function applied to the module-level
            ``train_dataset`` and ``val_dataset`` to produce model inputs.
        tokenizer: Tokenizer object handed to ``Trainer``. When omitted, the
            tokenizer belonging to ``model_name`` is loaded with
            ``AutoTokenizer`` so that each model is paired with its own
            vocabulary rather than a fixed BERT tokenizer.

    Returns:
        Tuple ``(metrics, log_history)``: the dict returned by
        ``trainer.evaluate()`` and ``trainer.state.log_history``.
    """
    if tokenizer is None:
        # Local import: the notebook's import cell does not bring in AutoTokenizer.
        from transformers import AutoTokenizer
        tokenizer = AutoTokenizer.from_pretrained(model_name)

    # Encode both splits and drop the raw text column once it has been tokenized.
    tokenized_train = train_dataset.map(tokenizer_func, batched=True).remove_columns(['Text'])
    tokenized_val = val_dataset.map(tokenizer_func, batched=True).remove_columns(['Text'])
    tokenized_train.set_format('torch')
    tokenized_val.set_format('torch')

    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

    args = TrainingArguments(
        output_dir=f"./results/{model_name}",
        # Evaluate and checkpoint on the same schedule (every epoch);
        # load_best_model_at_end below relies on the two matching.
        eval_strategy="epoch",
        save_strategy="epoch",
        learning_rate=2e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=5,
        weight_decay=0.01,
        logging_steps=10,
        load_best_model_at_end=True,
        disable_tqdm=False
    )

    trainer = Trainer(
        model=model,
        args=args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        # Must be the tokenizer of the model being trained, not a fixed one.
        # NOTE(review): newer transformers releases name this argument
        # `processing_class`; switch if the installed version requires it.
        tokenizer=tokenizer,
        compute_metrics=compute_metrics,
    )

    trainer.train()
    return trainer.evaluate(), trainer.state.log_history


In [None]:
# Final validation metrics and full training log, keyed by display name.
results, loss = {}, {}

# (display name, checkpoint, tokenizer function) for each model under comparison.
for run_name, checkpoint, tokenize_fn in (
    ('BERT', 'bert-base-cased', tok_bert),
    ('RoBERTa', 'roberta-base', tok_roberta),
    ('XLM-R', 'xlm-roberta-base', tok_xlmr),
):
    results[run_name], loss[run_name] = train_and_eval(checkpoint, tokenize_fn)

results

In [22]:
def plot_loss_curves(loss_dict, cols=3):
    """Plot training and validation loss for each model on a grid of panels.

    Args:
        loss_dict: Mapping of model name to its log history, i.e. a list of
            record dicts (anything ``pd.DataFrame`` accepts). Records are
            read for the keys 'step', 'loss' and 'eval_loss'; keys that are
            absent simply produce an empty curve.
        cols: Number of panels per row. Must be at least 1.

    Returns:
        None. The figure is shown with ``plt.show()``. An empty ``loss_dict``
        returns immediately without creating a figure.

    Raises:
        ValueError: If ``cols`` is smaller than 1.
    """
    if cols < 1:
        raise ValueError("cols must be a positive integer")
    if not loss_dict:
        # Nothing to draw; a grid with zero rows cannot be created.
        return

    n_models = len(loss_dict)
    rows = math.ceil(n_models / cols)
    # squeeze=False keeps `axes` a 2-D array even for a 1x1 grid, so
    # flatten() works for any grid shape.
    fig, axes = plt.subplots(rows, cols, figsize=(6 * cols, 4 * rows), squeeze=False)
    axes = axes.flatten()

    for ax, (model_name, log_history) in zip(axes, loss_dict.items()):
        # reindex guarantees the three columns exist even when a history
        # has no training-loss or eval-loss records.
        history = pd.DataFrame(log_history).reindex(columns=['step', 'loss', 'eval_loss'])
        train_loss = history.dropna(subset=['loss'])
        eval_loss = history.dropna(subset=['eval_loss'])

        ax.plot(train_loss['step'], train_loss['loss'], label='Train Loss', color='tab:blue')
        ax.plot(eval_loss['step'], eval_loss['eval_loss'], label='Val Loss', color='tab:orange', linestyle='--')
        ax.set_title(f"{model_name}")
        ax.set_xlabel("Training Step")
        ax.set_ylabel("Loss")
        ax.grid(True, linestyle=":")
        ax.legend()

    # Remove the unused panels in the last row of the grid.
    for unused_ax in axes[n_models:]:
        fig.delaxes(unused_ax)

    plt.tight_layout(rect=[0, 0, 1, 0.95])
    plt.show()

In [23]:
# Draw the train/validation loss panels for every model trained above.
plot_loss_curves(loss_dict=loss)