# Imports

In [None]:
import pandas as pd
from transformers import pipeline
from tqdm import tqdm
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load dataset

In [6]:
df = pd.read_csv('data/IMDB-movie-reviews.csv', sep=';', encoding='ISO-8859-1')
df = df[['review', 'sentiment']]

In [7]:
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


# Use pretrained models

In [8]:
# Create the pipelines
sentiment_model_1 = pipeline(
    "sentiment-analysis",
    model="distilbert-base-uncased-finetuned-sst-2-english",
    device=-1
)
sentiment_model_2 = pipeline(
    "sentiment-analysis",
    model="nlptown/bert-base-multilingual-uncased-sentiment",
    device=-1
)
sentiment_model_3 = pipeline(
    "sentiment-analysis",
    model="cardiffnlp/twitter-roberta-base-sentiment",
    device=-1
)
sentiment_model_4 = pipeline(
    "sentiment-analysis",
    model="siebert/sentiment-roberta-large-english",
    device=-1
)

# Function to predict in batches (faster than per review)
def batch_predict(pipe, texts, batch_size=16, max_length=512):
    results = []
    for i in tqdm(range(0, len(texts), batch_size)):
        batch = texts[i : i + batch_size]
        # ensure the input is not too long for the model
        batch = [t[:max_length] for t in batch]
        preds = pipe(batch)
        results.extend(preds)
    return results

# Make predictions
texts = df['review'].tolist()
preds1 = batch_predict(sentiment_model_1, texts)
preds2 = batch_predict(sentiment_model_2, texts)
preds3 = batch_predict(sentiment_model_3, texts)
preds4 = batch_predict(sentiment_model_4, texts)

# Unpack to labels and scores
df['label_model1'] = [p['label'] for p in preds1]
df['score_model1'] = [p['score'] for p in preds1]

df['label_model2'] = [p['label'] for p in preds2]
df['score_model2'] = [p['score'] for p in preds2]

df['label_model3'] = [p['label'] for p in preds3]
df['score_model3'] = [p['score'] for p in preds3]

df['label_model4'] = [p['label'] for p in preds4]
df['score_model4'] = [p['score'] for p in preds4]

# Check the first few rows
df.head()

config.json:   0%|          | 0.00/747 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/499M [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/687 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


pytorch_model.bin:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/256 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/150 [00:00<?, ?B/s]

100%|██████████| 7/7 [00:05<00:00,  1.33it/s]
100%|██████████| 7/7 [00:09<00:00,  1.40s/it]
100%|██████████| 7/7 [00:08<00:00,  1.23s/it]
100%|██████████| 7/7 [00:29<00:00,  4.17s/it]


Unnamed: 0,review,sentiment,label_model1,score_model1,label_model2,score_model2,label_model3,score_model3,label_model4,score_model4
0,One of the other reviewers has mentioned that ...,positive,NEGATIVE,0.601758,2 stars,0.266522,LABEL_1,0.470995,POSITIVE,0.998774
1,A wonderful little production. <br /><br />The...,positive,POSITIVE,0.9997,5 stars,0.506213,LABEL_2,0.973879,POSITIVE,0.998925
2,I thought this was a wonderful way to spend ti...,positive,POSITIVE,0.999031,4 stars,0.421487,LABEL_2,0.821539,POSITIVE,0.998933
3,Basically there's a family where a little boy ...,negative,NEGATIVE,0.999282,3 stars,0.426351,LABEL_0,0.596891,NEGATIVE,0.999474
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,POSITIVE,0.999811,4 stars,0.532169,LABEL_2,0.870845,POSITIVE,0.998887


# Save results

In [9]:
df.to_csv('result/sentiment_benchmarks.csv', index=False)

# Compute evaluation metrics

In [None]:
# normalize model1 labels to lowercase
df['pred1'] = df['label_model1'].str.lower()

# map nlptown star-ratings to binary sentiment
def star2bin(label):
    stars = int(label.split()[0])
    return 'negative' if stars <= 2 else 'positive'
df['pred2'] = df['label_model2'].apply(star2bin)

# map Twitter-RoBERTa three-way sentiment to binary (treat neutral as negative)
def three2bin(label):
    lab = label.lower()
    return 'positive' if lab == 'positive' else 'negative'
df['pred3'] = df['label_model3'].apply(three2bin)

# normalize RoBERTa-large labels to lowercase
df['pred4'] = df['label_model4'].str.lower()

# true labels
y_true = df['sentiment']

# compute metrics for each model
for i in range(1, 5):
    y_pred = df[f'pred{i}']
    print(f"=== Model {i} ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print()

=== Model 1 ===
Accuracy: 0.86
              precision    recall  f1-score   support

    negative       0.87      0.90      0.88        58
    positive       0.85      0.81      0.83        42

    accuracy                           0.86       100
   macro avg       0.86      0.85      0.86       100
weighted avg       0.86      0.86      0.86       100

Confusion Matrix:
 [[52  6]
 [ 8 34]]

=== Model 2 ===
Accuracy: 0.81
              precision    recall  f1-score   support

    negative       0.91      0.74      0.82        58
    positive       0.72      0.90      0.80        42

    accuracy                           0.81       100
   macro avg       0.82      0.82      0.81       100
weighted avg       0.83      0.81      0.81       100

Confusion Matrix:
 [[43 15]
 [ 4 38]]

=== Model 3 ===
Accuracy: 0.58
              precision    recall  f1-score   support

    negative       0.58      1.00      0.73        58
    positive       0.00      0.00      0.00        42

    accurac

# Save benchmark results

In [13]:
# Write a consolidated benchmark report for all models
with open('result/benchmark_report.txt', 'w') as f:
    for i in range(1, 5):
        # Select the right pred column
        y_pred = df[f'pred{i}']
        # Model name mapping
        model_name = {
            1: "DistilBERT-SST2",
            2: "nlptown Multilingual",
            3: "Twitter-RoBERTa",
            4: "RoBERTa-large SST2"
        }[i]

        f.write(f"Model {i} ({model_name})\n")
        f.write(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}\n\n")
        f.write(classification_report(y_true, y_pred, zero_division=0))
        f.write("\n" + ("-"*40) + "\n\n")