# Imports

In [1]:
import uvicorn
import gradio as gr
import pandas as pd
import nest_asyncio
from tqdm import tqdm
from pathlib import Path
from fastapi import FastAPI
from threading import Thread
from pydantic import BaseModel
from transformers import pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

  from .autonotebook import tqdm as notebook_tqdm
Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Load dataset

In [2]:
# The reviews file is ';'-separated and stored as ISO-8859-1 (Latin-1) text
df = pd.read_csv(
    Path('data').joinpath('IMDB-movie-reviews.csv'),
    encoding='ISO-8859-1',
    sep=';',
)
df.head(10)

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive
5,"Probably my all-time favorite movie, a story o...",positive
6,I sure would like to see a resurrection of a u...,positive
7,"This show was an amazing, fresh & innovative i...",negative
8,Encouraged by the positive comments about this...,negative
9,If you like original gut wrenching laughter yo...,positive


# Use pretrained models

In [3]:
# Create the sentiment analysis pipelines
def _load_sentiment_pipeline(model_id):
    """Build a "sentiment-analysis" pipeline for the given Hugging Face model id.

    Every model in this notebook is loaded with device=-1
    (presumably CPU-only inference — confirm against the transformers docs).
    """
    return pipeline("sentiment-analysis", model=model_id, device=-1)


# https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english
distilbert_pipeline = _load_sentiment_pipeline(
    "distilbert-base-uncased-finetuned-sst-2-english"
)

# https://huggingface.co/siebert/sentiment-roberta-large-english
roberta_large_pipeline = _load_sentiment_pipeline(
    "siebert/sentiment-roberta-large-english"
)

# https://huggingface.co/nlptown/bert-base-multilingual-uncased-sentiment
multilingual_pipeline = _load_sentiment_pipeline(
    "nlptown/bert-base-multilingual-uncased-sentiment"
)

# https://huggingface.co/textattack/bert-base-uncased-SST-2
textattack_bert_pipeline = _load_sentiment_pipeline(
    "textattack/bert-base-uncased-SST-2"
)

# https://huggingface.co/textattack/roberta-base-SST-2
textattack_roberta_pipeline = _load_sentiment_pipeline(
    "textattack/roberta-base-SST-2"
)

# https://huggingface.co/cardiffnlp/twitter-roberta-base-sentiment
twitter_roberta_pipeline = _load_sentiment_pipeline(
    "cardiffnlp/twitter-roberta-base-sentiment"
)

# Function to predict in batches
def batch_predict(pipe, texts, batch_size=16, max_length=512):
    """Run `pipe` over `texts` in consecutive chunks and collect every prediction.

    Args:
        pipe: Callable that takes a list of texts and returns an iterable of predictions.
        texts: Sliceable sequence of texts (assumed to be str — each item is sliced).
        batch_size: Number of texts handed to `pipe` per call.
        max_length: Each text is cut to its first `max_length` characters
            (a character count, not a token count) before being passed to `pipe`.

    Returns:
        A flat list with the predictions of all chunks, in input order.
    """
    collected = []
    chunk_starts = range(0, len(texts), batch_size)
    for start in tqdm(chunk_starts, desc="Predicting"):
        chunk = texts[start : start + batch_size]
        clipped = [text[:max_length] for text in chunk]
        collected.extend(pipe(clipped))
    return collected

# Make predictions
texts = df['review'].tolist()

# One full pass over the reviews per model
preds_distilbert = batch_predict(distilbert_pipeline, texts)
preds_roberta_large = batch_predict(roberta_large_pipeline, texts)
preds_multilingual = batch_predict(multilingual_pipeline, texts)
preds_textattack_bert = batch_predict(textattack_bert_pipeline, texts)
preds_textattack_roberta = batch_predict(textattack_roberta_pipeline, texts)
preds_twitter_roberta = batch_predict(twitter_roberta_pipeline, texts)

# Unpack to labels and scores: each model contributes a label_<key> and a score_<key> column
predictions_by_model = {
    'distilbert': preds_distilbert,
    'roberta_large': preds_roberta_large,
    'multilingual': preds_multilingual,
    'textattack_bert': preds_textattack_bert,
    'textattack_roberta': preds_textattack_roberta,
    'twitter_roberta': preds_twitter_roberta,
}
for model_key, model_preds in predictions_by_model.items():
    df[f'label_{model_key}'] = [pred['label'] for pred in model_preds]
    df[f'score_{model_key}'] = [pred['score'] for pred in model_preds]

Some weights of the model checkpoint at textattack/roberta-base-SST-2 were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Predicting: 100%|██████████| 7/7 [00:05<00:00,  1.34it/s]
Predicting: 100%|██████████| 7/7 [00:46<00:00,  6.69s/it]
Predicting: 100%|██████████| 7/7 [00:09<00:00,  1.39s/it]
Predicting: 100%|██████████| 7/7 [00:08<00:00,  1.28s/it]
Predicting: 100%|██████████| 7/7 [00:09<00:00,  1.33s/it]
Predicting: 10

In [4]:
# Preview the first 10 rows, which now carry a label_* / score_* column pair per model
df.head(10)

Unnamed: 0,review,sentiment,label_distilbert,score_distilbert,label_roberta_large,score_roberta_large,label_multilingual,score_multilingual,label_textattack_bert,score_textattack_bert,label_textattack_roberta,score_textattack_roberta,label_twitter_roberta,score_twitter_roberta
0,One of the other reviewers has mentioned that ...,positive,NEGATIVE,0.601758,POSITIVE,0.998774,2 stars,0.266522,LABEL_1,0.989746,LABEL_1,0.946431,LABEL_1,0.470995
1,A wonderful little production. <br /><br />The...,positive,POSITIVE,0.9997,POSITIVE,0.998925,5 stars,0.506213,LABEL_1,0.999622,LABEL_1,0.999189,LABEL_2,0.973879
2,I thought this was a wonderful way to spend ti...,positive,POSITIVE,0.999031,POSITIVE,0.998933,4 stars,0.421487,LABEL_1,0.999445,LABEL_1,0.997534,LABEL_2,0.821539
3,Basically there's a family where a little boy ...,negative,NEGATIVE,0.999282,NEGATIVE,0.999474,3 stars,0.426351,LABEL_0,0.98381,LABEL_0,0.984107,LABEL_0,0.596891
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,POSITIVE,0.999811,POSITIVE,0.998887,4 stars,0.532169,LABEL_1,0.999538,LABEL_1,0.994273,LABEL_2,0.870845
5,"Probably my all-time favorite movie, a story o...",positive,POSITIVE,0.999543,POSITIVE,0.998879,5 stars,0.527373,LABEL_1,0.999295,LABEL_1,0.99708,LABEL_2,0.96592
6,I sure would like to see a resurrection of a u...,positive,POSITIVE,0.973266,POSITIVE,0.998406,3 stars,0.305226,LABEL_1,0.997984,LABEL_1,0.990563,LABEL_2,0.79583
7,"This show was an amazing, fresh & innovative i...",negative,NEGATIVE,0.999639,NEGATIVE,0.999511,1 star,0.527377,LABEL_0,0.998305,LABEL_0,0.99124,LABEL_0,0.887934
8,Encouraged by the positive comments about this...,negative,NEGATIVE,0.999747,NEGATIVE,0.999517,1 star,0.780873,LABEL_0,0.998682,LABEL_0,0.995693,LABEL_0,0.941357
9,If you like original gut wrenching laughter yo...,positive,POSITIVE,0.999701,POSITIVE,0.9989,5 stars,0.736615,LABEL_1,0.999583,LABEL_1,0.998446,LABEL_2,0.978246


# Save results

In [5]:
# Persist the reviews together with every model's raw label and score.
# to_csv does not create missing folders, so make sure result/ exists first;
# otherwise a fresh checkout fails here after all predictions have been computed.
results_csv = Path('result') / 'sentiment_benchmarks.csv'
results_csv.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(results_csv, index=False)

# Compute evaluation metrics

In [6]:
# Mapping helpers
def star2bin(label):
    """Map an 'N star(s)' label to 'positive' (N > 2) or 'negative' (N <= 2).

    The star count is read from the first whitespace-separated token of `label`.
    """
    leading_token = label.split()[0]
    if int(leading_token) > 2:
        return 'positive'
    return 'negative'

def ta_label2bin(label):
    """Return 'positive' for 'LABEL_1'; any other label is treated as 'negative'."""
    if label != 'LABEL_1':
        return 'negative'
    return 'positive'

def tw_roberta3bin(label):
    """Collapse a Twitter-RoBERTa label into 'positive' / 'negative'.

    Only 'LABEL_2' or a case-insensitive 'positive' counts as positive;
    every other label (including the middle class 'LABEL_1') becomes 'negative'.
    """
    if label == 'LABEL_2':
        return 'positive'
    if label.lower() == 'positive':
        return 'positive'
    return 'negative'

# Create binary predictions matching your renamed columns
# These two models emit POSITIVE / NEGATIVE directly, so lower-casing is all that is needed
lowercase_only_models = ['distilbert', 'roberta_large']
for model_key in lowercase_only_models:
    df[f'pred_{model_key}'] = df[f'label_{model_key}'].str.lower()

# The remaining models need a dedicated label -> 'positive' / 'negative' converter
label_converters = {
    'multilingual': star2bin,
    'textattack_bert': ta_label2bin,
    'textattack_roberta': ta_label2bin,
    'twitter_roberta': tw_roberta3bin,
}
for model_key, to_binary in label_converters.items():
    df[f'pred_{model_key}'] = df[f'label_{model_key}'].apply(to_binary)

# True labels
y_true = df['sentiment']

# Iterate over each model and print metrics (same order as the pred_* columns above)
model_keys = lowercase_only_models + list(label_converters)

for key in model_keys:
    y_pred = df[f'pred_{key}']
    heading = key.replace('_', ' ').title()
    print(f"=== {heading} ===")
    print("Accuracy:", accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred, zero_division=0))
    print("Confusion Matrix:\n", confusion_matrix(y_true, y_pred))
    print()

=== Distilbert ===
Accuracy: 0.86
              precision    recall  f1-score   support

    negative       0.87      0.90      0.88        58
    positive       0.85      0.81      0.83        42

    accuracy                           0.86       100
   macro avg       0.86      0.85      0.86       100
weighted avg       0.86      0.86      0.86       100

Confusion Matrix:
 [[52  6]
 [ 8 34]]

=== Roberta Large ===
Accuracy: 0.93
              precision    recall  f1-score   support

    negative       0.96      0.91      0.94        58
    positive       0.89      0.95      0.92        42

    accuracy                           0.93       100
   macro avg       0.93      0.93      0.93       100
weighted avg       0.93      0.93      0.93       100

Confusion Matrix:
 [[53  5]
 [ 2 40]]

=== Multilingual ===
Accuracy: 0.81
              precision    recall  f1-score   support

    negative       0.91      0.74      0.82        58
    positive       0.72      0.90      0.80        4

# Save benchmark results

In [7]:
# Mapping from model key to human-readable name
model_names = {
    'distilbert': "DistilBERT-SST2",
    'roberta_large': "Siebert RoBERTa-large",
    'multilingual': "nlptown 1-5 stars",
    'textattack_bert': "TextAttack BERT-SST2",
    'textattack_roberta': "TextAttack RoBERTa-SST2",
    'twitter_roberta': "CardiffNLP Twitter RoBERTa",
}

# Write consolidated benchmark report: one section per model with its accuracy
# and classification report, separated by a dashed rule.
report_path = Path("result") / "benchmark_report.txt"
# Opening a file for writing does not create missing folders, so ensure result/
# exists; this keeps the cell runnable even when nothing has been saved there yet.
report_path.parent.mkdir(parents=True, exist_ok=True)
with report_path.open("w", encoding="utf-8") as f:
    for key, name in model_names.items():
        y_pred = df[f"pred_{key}"]
        f.write(f"{name}\n")
        f.write(f"Accuracy: {accuracy_score(y_true, y_pred):.4f}\n\n")
        f.write(classification_report(y_true, y_pred, zero_division=0))
        f.write("\n" + "-" * 50 + "\n\n")

# Interactive demo with Gradio

In [None]:
# Mapping human-readable names to your pipelines
# Display name -> loaded pipeline; the insertion order is the order shown in the model selector
pipeline_map = dict(
    [
        ("DistilBERT-SST2", distilbert_pipeline),
        ("RoBERTa-large SST2", roberta_large_pipeline),
        ("nlptown Multilingual", multilingual_pipeline),
        ("TextAttack BERT-SST2", textattack_bert_pipeline),
        ("TextAttack RoBERTa-SST2", textattack_roberta_pipeline),
        ("Twitter RoBERTa", twitter_roberta_pipeline),
    ]
)

# Unified label mappers for each model
def _capitalized(raw):
    """Return the raw label with only its first letter upper-cased (e.g. 'POSITIVE' -> 'Positive')."""
    return raw.capitalize()


def _stars_to_sentiment(raw):
    """Map an 'N star(s)' label to "Positive" when N >= 3, otherwise "Negative"."""
    return "Positive" if int(raw.split()[0]) >= 3 else "Negative"


def _positive_when(positive_label):
    """Build a mapper returning "Positive" only for `positive_label`, "Negative" otherwise."""
    def _mapper(raw):
        return "Positive" if raw == positive_label else "Negative"
    return _mapper


# Display name -> function turning that model's raw label into "Positive" / "Negative"
label_mappers = {
    "DistilBERT-SST2": _capitalized,
    "RoBERTa-large SST2": _capitalized,
    "nlptown Multilingual": _stars_to_sentiment,
    "TextAttack BERT-SST2": _positive_when("LABEL_1"),
    "TextAttack RoBERTa-SST2": _positive_when("LABEL_1"),
    "Twitter RoBERTa": _positive_when("LABEL_2"),
}

def classify(text: str, model_name: str):
    """Classify one review with the selected model (callback for the Gradio demo).

    Args:
        text: Review text; only its first 512 characters are passed to the model.
        model_name: Key into `pipeline_map` and `label_mappers`.

    Returns:
        A `(label, score)` tuple: the label produced by `label_mappers[model_name]`
        from the raw prediction, and the raw prediction's score as a float.

    Raises:
        KeyError: If `model_name` is not a key of `pipeline_map`.
    """
    pipe = pipeline_map[model_name]
    # Run the model once and reuse the result for both label and score;
    # calling the pipeline separately for each field doubles the inference cost.
    prediction = pipe(text[:512])[0]
    label = label_mappers[model_name](prediction["label"])
    score = float(prediction["score"])
    return label, score

# Input widgets: free-text review plus a model selector that defaults to DistilBERT
review_input = gr.Textbox(lines=5, placeholder="Type a movie review…", label="Review")
model_input = gr.Radio(
    choices=list(pipeline_map),
    value="DistilBERT-SST2",
    label="Model",
)

# Output widgets, listed in the same order as classify's (label, score) return value
sentiment_output = gr.Label(num_top_classes=1, label="Predicted Sentiment")
confidence_output = gr.Number(label="Confidence")

iface = gr.Interface(
    fn=classify,
    inputs=[review_input, model_input],
    outputs=[sentiment_output, confidence_output],
    title="IMDB Review Sentiment Demo",
    description="Pick a model and see Positive vs. Negative",
)

iface.launch()

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.




INFO:     127.0.0.1:38034 - "GET / HTTP/1.1" 200 OK
INFO:     127.0.0.1:38034 - "GET /theme.css?v=63194d3741d384f9f85db890247b6c0ef9e7abac0f297f40a15c59fe4baba916 HTTP/1.1" 200 OK


# Predict API with FastAPI

In [None]:
# NOTE(review): presumably applied so that uvicorn can run its event loop inside the
# notebook kernel, which already has one running — confirm against nest_asyncio docs.
nest_asyncio.apply()

# Reuse your pipeline_map and label_mappers from above
# (the /predict endpoint below looks models up in both dictionaries).
app = FastAPI(title="IMDB Sentiment API")

class ReviewRequest(BaseModel):
    # Request body accepted by POST /predict.
    # Documented with `#` comments on purpose: a class docstring would presumably
    # be picked up as the model's description in the generated schema.

    # Review text to classify (the endpoint only uses its first 512 characters).
    text: str
    # Display name of the model to use; must be a key of pipeline_map.
    model: str  = "DistilBERT-SST2"

@app.post("/predict")
def predict(req: ReviewRequest):
    # Classify req.text with the model named in req.model and return
    # {"model", "label", "score"}, with the score rounded to 4 decimals.
    # (Plain comments are used here because an endpoint docstring would presumably
    # become part of the generated API description.)
    #
    # NOTE(review): an unknown model name yields a regular response carrying an
    # "error" key rather than an HTTP error status — confirm this is intended.
    model_is_known = req.model in pipeline_map
    if not model_is_known:
        return {
            "error": f"Unknown model '{req.model}'. "
                     f"Choose from: {list(pipeline_map.keys())}"
        }

    # Only the first 512 characters of the text are sent to the model.
    prediction = pipeline_map[req.model](req.text[:512])[0]
    response = {
        "model": req.model,
        "label": label_mappers[req.model](prediction["label"]),
        "score": round(float(prediction["score"]), 4),
    }
    return response

# Launch FastAPI in a background thread
def _run_api():
    """Start uvicorn serving `app` on 127.0.0.1:8000 with info-level logging."""
    server_options = {
        "host": "127.0.0.1",
        "port": 8000,
        "log_level": "info",
    }
    uvicorn.run(app, **server_options)

# Run the server from a daemon thread so this cell returns immediately
# (daemon=True presumably also keeps the thread from blocking kernel shutdown).
thread = Thread(target=_run_api, daemon=True)
thread.start()

# NOTE(review): this prints right after start(); the server may still be starting up
# and not yet accepting connections at this point.
print("🚀 API running at http://127.0.0.1:8000/docs")

🚀 API running at http://127.0.0.1:8000/docs


INFO:     Started server process [17468]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://127.0.0.1:8000 (Press CTRL+C to quit)
