In [None]:
import numpy as np
import os
import pandas as pd
import torch

from sklearn.metrics import accuracy_score
from tqdm import tqdm
from transformers import AutoTokenizer, pipeline

In [None]:
# Choose the compute device in order of preference: CUDA, then the MPS backend, then CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
elif torch.backends.mps.is_available():
    device = torch.device("mps")
else:
    device = torch.device("cpu")

In [None]:
# Make sure the folder that receives the prediction CSVs exists (no error if it already does).
os.makedirs(name="output/", exist_ok=True)

In [None]:
# Load the processed test split; later cells read its "text", "sentiment" and "review_id" columns.
test = pd.read_csv(filepath_or_buffer="data/processed/test.csv")

RoBERTa has a context length of 512 tokens. The Hugging Face pipeline does not support automatic truncation of longer sequences. A simple workaround is to truncate the text before classification, but since tokenization occurs at the subword level, the exact truncation point is unpredictable. To avoid exceeding the limit, it is necessary to pass the model shorter sequences than it could actually process (e.g., 1500 characters, which corresponds to roughly 450 tokens on average).

A more precise approach is to tokenize the text and then truncate it. However, since the pipeline does not accept tokenized input, the truncated text must be decoded before classification. This process is inefficient, as it requires encoding the text twice, but it makes it possible to use the full model capacity.

We also truncate from the left, i.e., we drop the beginning of an over-long review, to preserve the summaries that sometimes appear at the end of reviews.

# Zero-shot classification with RoBERTa

In [None]:
# Build the tokenizer on its own so its truncation behaviour can be configured
# before the pipeline uses it: cut over-long inputs from the left, cap at 512 tokens.
tokenizer = AutoTokenizer.from_pretrained("roberta-large-mnli")
tokenizer.model_max_length = 512
tokenizer.truncation_side = "left"

# Zero-shot classification pipeline (PyTorch) using the tokenizer configured above
# and running on the device selected earlier.
classifier = pipeline(
    task="zero-shot-classification",
    model="roberta-large-mnli",
    tokenizer=tokenizer,
    framework="pt",
    device=device,
)

In [None]:
# Candidate label pairs tried with the zero-shot classifier.
# Convention used by the cells below: element 0 is the positive-class wording,
# element 1 is the negative-class wording.
labels_set = [
    ["positive", "negative"],
    ["positive sentiment", "negative sentiment"],
    ["positive review", "negative review"],
    ["favorable opinion", "unfavorable opinion"],
    ["good movie", "bad movie"],
    ["excellent", "terrible"],
]

In [None]:
def get_sentiment(text, labels):
    """Return the candidate label the zero-shot classifier lists first for `text`.

    Args:
        text: The review text to classify.
        labels: Candidate labels handed to the classifier.

    Returns:
        The first entry of the "labels" list in the classifier's result
        (per the pipeline documentation, labels are ordered by likelihood).
    """
    prediction = classifier(text, candidate_labels=labels, truncation=True)
    top_label = prediction["labels"][0]
    return top_label

models = []  # names of the prediction columns added to `test`, in labels_set order

# Iterate over the label pairs directly: the index from enumerate() was never used,
# and binding it to `_` shadows IPython's "last output" variable in a notebook session.
for labels in labels_set:
    positive_label, negative_label = labels[0], labels[1]

    # Name the prediction column after the positive label of the pair
    column_name = f"RoBERTa_{positive_label.replace(' ', '_')}"
    models.append(column_name)

    # Classify every review with the current label pair, showing a progress bar
    tqdm.pandas(desc=f"Processing with {labels}", unit=" reviews")
    predicted = test["text"].progress_apply(lambda x: get_sentiment(x, labels))

    # Standardize labels so every prediction column uses "positive" / "negative"
    mapping = {positive_label: "positive", negative_label: "negative"}
    test[column_name] = predicted.replace(mapping)

In [None]:
# Accuracy of each label set's predictions against the reference "sentiment" column
accuracies = {
    column: accuracy_score(test["sentiment"], test[column])
    for column in models
}

# Two-column summary table, rendered without its index
accuracy_avg = pd.DataFrame(list(accuracies.items()), columns=["Labels", "Accuracy"])
accuracy_avg.style.hide(axis="index")

In [None]:
# Pick the prediction column with the highest accuracy. The summary table names its
# first column "Labels" (it has no "Model" column, so looking that up raised a KeyError);
# reading the winner straight from the `accuracies` dict avoids depending on the header.
# On ties max() keeps the first entry, the same choice idxmax() would make.
best_model = max(accuracies, key=accuracies.get)

# Expose the winning column under a fixed name and export it with the review ids
test = test.rename(columns={best_model: "RoBERTa_base"})
test[['review_id', 'RoBERTa_base']].to_csv("output/RoBERTa_base.csv")

# Zero-shot classification with SiEBERT

In [None]:
# Build the SiEBERT tokenizer on its own so its truncation behaviour can be configured
# before the pipeline uses it: cut over-long inputs from the left, cap at 512 tokens.
tokenizer = AutoTokenizer.from_pretrained("siebert/sentiment-roberta-large-english")
tokenizer.model_max_length = 512
tokenizer.truncation_side = "left"

# Sentiment-analysis pipeline (PyTorch) using the tokenizer configured above.
# Note: this rebinds `tokenizer` and `classifier`, replacing the RoBERTa zero-shot ones.
classifier = pipeline(
    task="sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
    tokenizer=tokenizer,
    framework="pt",
    device=device,
)

In [None]:
def get_sentiment(text):
    """Return the lower-cased label of the classifier's first prediction for `text`.

    Args:
        text: The review text to classify.

    Returns:
        The "label" field of the first result item, converted to lower case.
    """
    first_prediction = classifier(text, truncation=True)[0]
    return first_prediction["label"].lower()

# Register tqdm's progress_apply on pandas objects, then label every review
# with the SiEBERT classifier and store the result in a new "SiEBERT" column.
tqdm.pandas(unit=" reviews")
test["SiEBERT"] = test["text"].progress_apply(get_sentiment)

In [None]:
# Export the review ids together with the SiEBERT predictions
test.to_csv("output/SiEBERT.csv", columns=["review_id", "SiEBERT"])

In [None]:
# Share of reviews whose SiEBERT prediction equals the reference "sentiment" value
accuracy = accuracy_score(y_true=test["sentiment"], y_pred=test["SiEBERT"])
print(f"Accuracy: {accuracy:.4f}")