# qeustion 5 - sentinemnt analysis


In [1]:
%pip install transformers datasets torch

[0mNote: you may need to restart the kernel to use updated packages.


## Load Model and Tokenizer

In [2]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

#the nodel used
model_urI = "michelecafagna26/t5-base-finetuned-sst2-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_urI)
model = AutoModelForSeq2SeqLM.from_pretrained(model_urI)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)

print(f"Successfully loaded model: {model_urI} on device {device}")

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Successfully loaded model: michelecafagna26/t5-base-finetuned-sst2-sentiment on device cuda


## Sentiment Prediction Function

In [3]:
def sentiment(text, eval=False):
    if eval:
        model.eval()

    inputs = tokenizer("sentiment: " + text, max_length=128, truncation=True, return_tensors="pt").input_ids.to(next(model.parameters()).device)
    preds = model.generate(inputs)
    decoded_preds = tokenizer.batch_decode(sequences=preds, skip_special_tokens=True)
    prediction = decoded_preds[0]
    
    if prediction == 'p':
        if not eval:
            print(f"{text} has positive Sentiment ")
        return 1
    elif prediction == 'n':
        if not eval:
            print(f"{text} has negative Sentiment ")
        return 0
    print(f"Error")
    return -1

## Predict Sentiment

In [4]:
text = "This movie is awesome"
_ = sentiment(text)


This movie is awesome has positive Sentiment 


In [5]:
text = "I didn’t like the movie so much"
_ = sentiment(text)

I didn’t like the movie so much has negative Sentiment 


In [6]:
text = "I’m not sure what I think about this movie."
_ = sentiment(text)

I’m not sure what I think about this movie. has negative Sentiment 


In [7]:
text = "Did you like the movie?"
_ = sentiment(text)

Did you like the movie? has positive Sentiment 


## Load SST2 

In [8]:
from datasets import load_dataset

dataset = load_dataset("SetFit/sst2")
test_dataset = dataset["test"]



Repo card metadata block was not found. Setting CardData to empty.


## Evaluate Model

creates a data loader for faster and bathced evalutaion

In [9]:
from torch.utils.data import DataLoader
def preprocess(batch):
    return tokenizer(batch["text"], truncation=True, padding='max_length', max_length=128)
test_dataset = test_dataset.map(preprocess, batched=True)
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
test_loader = DataLoader(test_dataset, batch_size=32)

for every batch generetes predictions -> transfering the predictions to their respective integers -> does a comparrision with the dataset's labels

In [14]:
from tqdm.auto import tqdm

bullseye = 0
total_examples = 0

for batch  in tqdm(test_loader, desc="Evaluating batches"):
    input_ids = batch['input_ids'].to(device)
    attention_mask = batch['attention_mask'].to(device)
    labels = batch['label'].to(device)
    preds = model.generate(input_ids=input_ids, attention_mask=attention_mask)
    decoded_preds = tokenizer.batch_decode(preds, skip_special_tokens=True)
    pred_labels = [1 if p == 'p' else 0 for p in decoded_preds]
    bullseye += sum([int(pred == label.item()) for pred, label in zip(pred_labels, labels)])
    total_examples += len(labels)

accuracy = bullseye / total_examples
print(f"\nAccuracy is {accuracy*100:.1f}%")

Evaluating batches: 100%|██████████| 57/57 [00:18<00:00,  3.08it/s]


Accuracy is 94.7%





##  SST2 Balance

simply goes over all the example in the dataset and saves their labels

In [11]:
label_counts = {}

for example in test_dataset:
    label = int(example["label"])
    label_counts[label] = label_counts.get(label, 0) + 1

label_map_print = {0: "negative (n)", 1: "positive (p)"}
for label_id, count in label_counts.items():
    print(f"Label {label_id} ({label_map_print[label_id]}): {count} examples\n")

count_n = label_counts.get(0, 0)
count_p = label_counts.get(1, 0)
total = count_n + count_p
proportion_n = count_n / total
proportion_p = count_p / total
print(f"Proportion of Negative (0): {proportion_n:.4f}\n")
print(f"Proportion of Positive (1): {proportion_p:.4f}\n")


Label 0 (negative (n)): 912 examples

Label 1 (positive (p)): 909 examples

Proportion of Negative (0): 0.5008

Proportion of Positive (1): 0.4992

