In [1]:
import numpy as np
import os
import pandas as pd
import torch

from sklearn.metrics import accuracy_score
from tqdm import tqdm
from transformers import pipeline

In [2]:
# Choose the compute device: CUDA if available, otherwise Apple MPS, otherwise CPU.
# The conditional expression short-circuits, so MPS is only probed when CUDA is absent.
device = torch.device(
    "cuda" if torch.cuda.is_available()
    else "mps" if torch.backends.mps.is_available()
    else "cpu"
)

# Turn off tokenizer parallelism to avoid issues with multithreading
os.environ["TOKENIZERS_PARALLELISM"] = "false"

In [3]:
# Read the processed train and test splits (in that order) into DataFrames.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ("data/processed/train.csv", "data/processed/test.csv")
)

# Zero-shot classification with RoBERTa

In [4]:
# Build the zero-shot classification pipeline backed by roberta-large-mnli,
# placed on the device selected earlier in the notebook.
classifier = pipeline(
    task="zero-shot-classification",
    model="roberta-large-mnli",
    framework="pt",  # Using PyTorch to avoid conflicts with Keras
    device=device,
)

Some weights of the model checkpoint at roberta-large-mnli were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Device set to use mps


In [5]:
def get_sentiment(text, max_chars=512, candidate_labels=("positive", "negative")):
    """Label one review with the notebook-level zero-shot ``classifier``.

    Args:
        text: Review text; it is sliced, so it must support ``text[:n]``
            (a string in this notebook).
        max_chars: How many leading *characters* of ``text`` are sent to the
            classifier. Defaults to 512, the value originally hard-coded here.
            Pass ``None`` to send the full text.
            NOTE(review): this is a character cap, not a token cap; it was
            presumably meant to keep inputs within the model's length limit,
            and it discards the rest of longer reviews. Confirm the intent
            before relying on ``None`` for very long inputs.
        candidate_labels: Labels offered to the classifier. Defaults to
            ``("positive", "negative")``, the labels originally hard-coded.

    Returns:
        The first entry of the pipeline's ``"labels"`` list, i.e. the label
        the pipeline ranks first.
    """
    # text[:None] is the whole string, so max_chars=None disables the slice.
    result = classifier(text[:max_chars], candidate_labels=list(candidate_labels))
    return result["labels"][0]

# Register tqdm's progress_apply on pandas objects; progress is counted in reviews.
tqdm.pandas(unit="reviews")

# Run get_sentiment on every test review and keep the result as a new column.
test["RoBERTa_base"] = (
    test["text"]
    .progress_apply(get_sentiment)
)

100%|██████████| 25000/25000 [19:28<00:00, 21.39reviews/s]


In [6]:
# Make sure the output directory exists, then write the id and prediction columns.
# The DataFrame index is written as well (to_csv default), as the first column.
os.makedirs("output/", exist_ok=True)
test.to_csv("output/RoBERTa_base.csv", columns=["review_id", "RoBERTa_base"])

In [7]:
# Compare the RoBERTa predictions against the labelled sentiment column.
accuracy = accuracy_score(
    y_true=test["sentiment"],
    y_pred=test["RoBERTa_base"],
)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.7953


# Sentiment classification with SiEBERT (pre-trained sentiment model, no fine-tuning)

In [8]:
# Build the sentiment-analysis pipeline backed by the SiEBERT checkpoint,
# placed on the device selected earlier in the notebook.
classifier = pipeline(
    task="sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
    framework="pt",  # Using PyTorch to avoid conflicts with Keras
    device=device,
)

Device set to use mps


In [9]:
def get_sentiment(text, max_chars=512, max_tokens=512):
    """Label one review with the notebook-level sentiment ``classifier``.

    Args:
        text: Review text; it is sliced, so it must support ``text[:n]``
            (a string in this notebook).
        max_chars: How many leading *characters* of ``text`` are sent to the
            classifier. Defaults to 512, the value originally hard-coded here.
            Pass ``None`` to skip the character slice and rely on tokenizer
            truncation only.
        max_tokens: Maximum tokenized length, forwarded to the tokenizer as
            ``max_length``. Defaults to 512.
            NOTE(review): 512 is assumed to match this checkpoint's input
            limit; confirm against the model configuration.

    Returns:
        The ``"label"`` of the first prediction, lower-cased.
    """
    # A character slice does not by itself bound the token count, so tokenizer
    # truncation is enabled too; inputs that already fit are unaffected.
    prediction = classifier(
        text[:max_chars],  # text[:None] is the whole string
        truncation=True,
        max_length=max_tokens,
    )[0]
    return prediction["label"].lower()

# Register tqdm's progress_apply on pandas objects; progress is counted in reviews.
tqdm.pandas(unit="reviews")

# Run get_sentiment on every test review and keep the result as a new column.
test["SiEBERT"] = (
    test["text"]
    .progress_apply(get_sentiment)
)

100%|██████████| 25000/25000 [09:54<00:00, 42.03reviews/s]


In [10]:
# Make sure the output directory exists, then write the id and prediction columns.
# The DataFrame index is written as well (to_csv default), as the first column.
os.makedirs("output/", exist_ok=True)
test.to_csv("output/SiEBERT.csv", columns=["review_id", "SiEBERT"])

In [11]:
# Compare the SiEBERT predictions against the labelled sentiment column.
accuracy = accuracy_score(
    y_true=test["sentiment"],
    y_pred=test["SiEBERT"],
)
print(f"Accuracy: {accuracy:.4f}")

Accuracy: 0.9048
