In [7]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import nltk 
import pandas as pd
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
import matplotlib.pyplot as plt
from transformers import DistilBertForSequenceClassification


In [None]:

input_file = "twitter-2016train-A.txt"
output_file = "twitter-2016train-A.csv"

data = pd.read_csv(input_file, sep="\t", header=None, quoting=3)  

data.columns = ["ID", "Sentiment", "Text"]

# Save to CSV
#data.to_csv(output_file, index=False)

print(f"File has been converted and saved as {output_file}")


File has been converted and saved as twitter-2016train-A.csv


In [11]:
data = pd.read_csv('twitter-2016train-A.csv')
data.head()
data['Sentiment'].value_counts()

Sentiment
positive    3094
neutral     2043
negative     863
Name: count, dtype: int64

In [None]:

def calculate_accuracy(dataset):
    sia = SIA()
    correct = 0
    for index, row in dataset.iterrows():
        text = row['Text']
        true_label = row['Sentiment']

        sentiment_scores = sia.polarity_scores(text)
        predicted_label = 'positive' if sentiment_scores['compound'] > 0.05 else \
                          'negative' if sentiment_scores['compound'] < -0.05 else 'neutral'

        if predicted_label == true_label:
            correct += 1
    
    return correct / len(dataset)

calculate_accuracy(data)


0.5065

In [None]:
testdf = pd.read_csv('Reddit_Data.csv')
testdf = testdf.head(500)
testdf.columns = ["Text", "Sentiment"]

sentiment_mapping = {-1: "negative", 0: "neutral", 1: "positive"}
testdf["Sentiment"] = testdf["Sentiment"].map(sentiment_mapping)

testdf["Text"] = testdf["Text"].fillna("").astype(str)

testdf

# NLTK Model

In [None]:
from sklearn.metrics import classification_report
from nltk.sentiment import SentimentIntensityAnalyzer as SIA


def calculate_classification_report(dataset):
    sia = SIA()
    true_labels = []
    predicted_labels = []

    for index, row in dataset.iterrows():
        text = row['Text']
        true_label = row['Sentiment']

        sentiment_scores = sia.polarity_scores(text)
        predicted_label = 'positive' if sentiment_scores['compound'] > 0.05 else \
                          'negative' if sentiment_scores['compound'] < -0.05 else 'neutral'

        true_labels.append(true_label)
        predicted_labels.append(predicted_label)

    report = classification_report(true_labels, predicted_labels, target_names=['negative', 'neutral', 'positive'])
    print(report)
    

calculate_classification_report(data)


              precision    recall  f1-score   support

    negative       0.41      0.51      0.45       391
     neutral       0.46      0.34      0.39       765
    positive       0.55      0.62      0.58       843

    accuracy                           0.49      1999
   macro avg       0.47      0.49      0.47      1999
weighted avg       0.49      0.49      0.48      1999



# TextBlob Model

In [None]:
from sklearn.metrics import classification_report
from textblob import TextBlob
import pandas as pd

def calculate_classification_report(dataset):
    true_labels = []
    predicted_labels = []

    for index, row in dataset.iterrows():
        text = row['Text']
        true_label = row['Sentiment']

        blob = TextBlob(text)
        polarity = blob.sentiment.polarity

        if polarity > 0:
            predicted_label = 'positive'
        elif polarity < 0:
            predicted_label = 'negative'
        else:
            predicted_label = 'neutral'

        true_labels.append(true_label)
        predicted_labels.append(predicted_label)

    report = classification_report(true_labels, predicted_labels, target_names=['neagative', 'neutral', 'positve'], zero_division=0)
    print(report)

calculate_classification_report(data)


              precision    recall  f1-score   support

   neagative       0.28      0.31      0.30       391
     neutral       0.43      0.39      0.41       765
     positve       0.52      0.54      0.53       843

    accuracy                           0.44      1999
   macro avg       0.41      0.41      0.41      1999
weighted avg       0.44      0.44      0.44      1999



# distilbert-base-uncased-finetuned-sst-2-english

In [None]:
from sklearn.metrics import classification_report
from transformers import pipeline
import pandas as pd

classifier = pipeline('sentiment-analysis')

def calculate_classification_report(dataset):
    true_labels = []
    predicted_labels = []

    for index, row in dataset.iterrows():
        text = row['Text']
        true_label = row['Sentiment']
        sentiment_prediction = classifier(text)[0]
        predicted_label = sentiment_prediction['label'].lower() 

        if predicted_label == 'negative': 
            predicted_label = 'negative'
        elif predicted_label == 'positive':
            predicted_label = 'positive'
        else:
            predicted_label = 'neutral'  

        true_labels.append(true_label)
        predicted_labels.append(predicted_label)

    report = classification_report(true_labels, predicted_labels, target_names=['negative', 'neutral', 'positive'], zero_division=0)
    print(report)

calculate_classification_report(data)


No model was supplied, defaulted to distilbert/distilbert-base-uncased-finetuned-sst-2-english and revision 714eb0f (https://huggingface.co/distilbert/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


              precision    recall  f1-score   support

    negative       0.26      0.87      0.40       391
     neutral       0.00      0.00      0.00       765
    positive       0.66      0.54      0.60       843

    accuracy                           0.40      1999
   macro avg       0.31      0.47      0.33      1999
weighted avg       0.33      0.40      0.33      1999



# twitter-roberta-base-sentiment

In [6]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import pipeline
from sklearn.metrics import classification_report
import numpy as np
from scipy.special import softmax
import pandas as pd

def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

def calculate_classification_report(dataset):
    MODEL = "cardiffnlp/twitter-roberta-base-sentiment"

    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    true_labels = []
    predicted_labels = []
    for index, row in dataset.iterrows():
        text = row['Text']
        true_label = row['Sentiment']

        text = preprocess(text)

        encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)

        ranking = np.argsort(scores)
        ranking = ranking[::-1]
        predicted_label = 'positive' if ranking[0] == 2 else 'negative' if ranking[0] == 0 else 'neutral'

        true_labels.append(true_label)
        predicted_labels.append(predicted_label)

    report = classification_report(true_labels, predicted_labels, target_names=['negative', 'neutral', 'positive'], zero_division=0)
    print(report)


calculate_classification_report(data)


NameError: name 'data' is not defined

# twitter-roberta-base-sentiment-latest

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AutoConfig
from transformers import pipeline
from sklearn.metrics import classification_report
import numpy as np
from scipy.special import softmax
import pandas as pd

def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

def calculate_classification_report(dataset):
    MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
    tokenizer = AutoTokenizer.from_pretrained(MODEL)
    config = AutoConfig.from_pretrained(MODEL)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL)

    true_labels = []
    predicted_labels = []

    for index, row in dataset.iterrows():
        text = row['Text']
        true_label = row['Sentiment']

        text = preprocess(text)

        encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)

        ranking = np.argsort(scores)
        ranking = ranking[::-1]
        predicted_label = config.id2label[ranking[0]]  

        if predicted_label == 'positive':
            predicted_label = 'positive'
        elif predicted_label == 'negative':
            predicted_label = 'negative'
        else:
            predicted_label = 'neutral'

        true_labels.append(true_label)
        predicted_labels.append(predicted_label)

    report = classification_report(true_labels, predicted_labels, target_names=['negative', 'neutral', 'postive'], zero_division=0)
    print(report)


calculate_classification_report(data)


Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


              precision    recall  f1-score   support

    negative       0.62      0.75      0.68       391
     neutral       0.58      0.69      0.63       765
     postive       0.85      0.63      0.72       843

    accuracy                           0.67      1999
   macro avg       0.69      0.69      0.68      1999
weighted avg       0.70      0.67      0.68      1999



In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

data = {
    "Model": ["NLTK", "TextBlob", "Roberta (58M)", "Roberta (124M)"],
    "Accuracy": [0.48, 0.44, 0.74, 0.67],
    "Recall": [0.49, 0.41, 0.75, 0.69],
    "F1-Score": [0.47, 0.41, 0.74, 0.68],
}

df = pd.DataFrame(data)

df_melted = df.melt(id_vars="Model", var_name="Metric", value_name="Score")

pivot_df = df_melted.pivot(index="Model", columns="Metric", values="Score")

plt.figure(figsize=(8, 6))
heatmap = sns.heatmap(
    pivot_df,
    annot=True,
    cmap="coolwarm",
    fmt=".2f",
    linewidths=0.5,
    cbar_kws={"label": "Score"},
)

plt.title("Model Performance Metrics", fontsize=16)
plt.ylabel("Model", fontsize=12)
plt.xlabel("Metric", fontsize=12)
plt.tight_layout()
plt.show()


## Fine Tuning

In [3]:
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer
import torch
import time

model_name = "cardiffnlp/twitter-roberta-base-sentiment"

def read_twitter_data(file_path):
    data = pd.read_csv(file_path)
    texts = data['Text'].tolist()
    label_mapping = {
        "negative": 0,
        "neutral": 1,
        "positive": 2
    }
    labels = data['Sentiment'].map(label_mapping).tolist()
    return texts, labels

train_file = "twitter-2016train-A.csv"
test_file = "twitter-2016dev-A.csv"

train_texts, train_labels = read_twitter_data(train_file)
test_texts, test_labels = read_twitter_data(test_file)

train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=.2)

tokenizer = AutoTokenizer.from_pretrained(model_name)

train_encodings = tokenizer(train_texts, truncation=True, padding=True)
val_encodings = tokenizer(val_texts, truncation=True, padding=True)
test_encodings = tokenizer(test_texts, truncation=True, padding=True)

class TwitterSentimentDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = TwitterSentimentDataset(train_encodings, train_labels)
val_dataset = TwitterSentimentDataset(val_encodings, val_labels)
test_dataset = TwitterSentimentDataset(test_encodings, test_labels)

from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
import torch

model = AutoModelForSequenceClassification.from_pretrained("cardiffnlp/twitter-roberta-base-sentiment", num_labels=3)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

start_time = time.time()
print(f"Starting training at: {time.ctime(start_time)}")

trainer.train()

end_time = time.time()
print(f"Finished training at: {time.ctime(end_time)}")
print(f"Training duration: {end_time - start_time} seconds")


Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.
  trainer = Trainer(


Starting training at: Thu Dec  5 07:33:48 2024


  0%|          | 0/900 [00:00<?, ?it/s]

{'loss': 0.6495, 'grad_norm': 6.784781455993652, 'learning_rate': 1.0000000000000002e-06, 'epoch': 0.03}
{'loss': 0.634, 'grad_norm': 11.83465576171875, 'learning_rate': 2.0000000000000003e-06, 'epoch': 0.07}
{'loss': 0.5725, 'grad_norm': 13.066274642944336, 'learning_rate': 3e-06, 'epoch': 0.1}
{'loss': 0.5455, 'grad_norm': 7.430876731872559, 'learning_rate': 4.000000000000001e-06, 'epoch': 0.13}
{'loss': 0.62, 'grad_norm': 7.372413158416748, 'learning_rate': 5e-06, 'epoch': 0.17}
{'loss': 0.5535, 'grad_norm': 11.339559555053711, 'learning_rate': 6e-06, 'epoch': 0.2}
{'loss': 0.5296, 'grad_norm': 18.505552291870117, 'learning_rate': 7.000000000000001e-06, 'epoch': 0.23}
{'loss': 0.5131, 'grad_norm': 15.720605850219727, 'learning_rate': 8.000000000000001e-06, 'epoch': 0.27}
{'loss': 0.5514, 'grad_norm': 10.298125267028809, 'learning_rate': 9e-06, 'epoch': 0.3}
{'loss': 0.5404, 'grad_norm': 18.28573989868164, 'learning_rate': 1e-05, 'epoch': 0.33}
{'loss': 0.4385, 'grad_norm': 10.277826

  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.5294369459152222, 'eval_runtime': 65.7169, 'eval_samples_per_second': 18.26, 'eval_steps_per_second': 0.289, 'epoch': 1.0}
{'loss': 0.445, 'grad_norm': 12.2139892578125, 'learning_rate': 3.1e-05, 'epoch': 1.03}
{'loss': 0.4654, 'grad_norm': 25.845762252807617, 'learning_rate': 3.2000000000000005e-05, 'epoch': 1.07}
{'loss': 0.4453, 'grad_norm': 18.3277530670166, 'learning_rate': 3.3e-05, 'epoch': 1.1}
{'loss': 0.4632, 'grad_norm': 22.22279167175293, 'learning_rate': 3.4000000000000007e-05, 'epoch': 1.13}
{'loss': 0.4648, 'grad_norm': 20.698711395263672, 'learning_rate': 3.5e-05, 'epoch': 1.17}
{'loss': 0.5635, 'grad_norm': 20.40741729736328, 'learning_rate': 3.6e-05, 'epoch': 1.2}
{'loss': 0.3939, 'grad_norm': 11.87649154663086, 'learning_rate': 3.7e-05, 'epoch': 1.23}
{'loss': 0.4427, 'grad_norm': 21.463041305541992, 'learning_rate': 3.8e-05, 'epoch': 1.27}
{'loss': 0.4069, 'grad_norm': 23.146318435668945, 'learning_rate': 3.9000000000000006e-05, 'epoch': 1.3}
{'loss':

  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.6037917733192444, 'eval_runtime': 65.0387, 'eval_samples_per_second': 18.451, 'eval_steps_per_second': 0.292, 'epoch': 2.0}
{'loss': 0.2843, 'grad_norm': 10.6403169631958, 'learning_rate': 3.625e-05, 'epoch': 2.03}
{'loss': 0.3025, 'grad_norm': 12.859021186828613, 'learning_rate': 3.5e-05, 'epoch': 2.07}
{'loss': 0.2402, 'grad_norm': 10.841034889221191, 'learning_rate': 3.375000000000001e-05, 'epoch': 2.1}
{'loss': 0.2027, 'grad_norm': 17.02640724182129, 'learning_rate': 3.2500000000000004e-05, 'epoch': 2.13}
{'loss': 0.2991, 'grad_norm': 6.704336643218994, 'learning_rate': 3.125e-05, 'epoch': 2.17}
{'loss': 0.23, 'grad_norm': 13.587874412536621, 'learning_rate': 3e-05, 'epoch': 2.2}
{'loss': 0.2638, 'grad_norm': 11.648451805114746, 'learning_rate': 2.8749999999999997e-05, 'epoch': 2.23}
{'loss': 0.3658, 'grad_norm': 18.21412467956543, 'learning_rate': 2.7500000000000004e-05, 'epoch': 2.27}
{'loss': 0.2988, 'grad_norm': 6.200451850891113, 'learning_rate': 2.625e-05, 'ep

  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.6798734664916992, 'eval_runtime': 62.986, 'eval_samples_per_second': 19.052, 'eval_steps_per_second': 0.302, 'epoch': 3.0}
{'train_runtime': 3534.7372, 'train_samples_per_second': 4.074, 'train_steps_per_second': 0.255, 'train_loss': 0.44507627103063796, 'epoch': 3.0}
Finished training at: Thu Dec  5 08:32:43 2024
Training duration: 3535.0471205711365 seconds


In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from sklearn.metrics import classification_report
import numpy as np
from scipy.special import softmax
import pandas as pd

def preprocess(text):
    new_text = []
    for t in text.split(" "):
        t = '@user' if t.startswith('@') and len(t) > 1 else t
        t = 'http' if t.startswith('http') else t
        new_text.append(t)
    return " ".join(new_text)

def calculate_classification_report(dataset):
   
    model_path = "./results/checkpoint-900"  

    tokenizer = AutoTokenizer.from_pretrained(model_path)
    model = AutoModelForSequenceClassification.from_pretrained(model_path)

    true_labels = []
    predicted_labels = []

    for index, row in dataset.iterrows():
        text = row['Text']
        true_label = row['Sentiment']

        text = preprocess(text)

        encoded_input = tokenizer(text, return_tensors='pt', truncation=True, max_length=512)
        output = model(**encoded_input)
        scores = output[0][0].detach().numpy()
        scores = softmax(scores)

        ranking = np.argsort(scores)
        ranking = ranking[::-1]
        predicted_label = 'positive' if ranking[0] == 2 else 'negative' if ranking[0] == 0 else 'neutral'

        true_labels.append(true_label)
        predicted_labels.append(predicted_label)
    report = classification_report(true_labels, predicted_labels, target_names=['negative', 'neutral', 'positive'], zero_division=0)
    print(report)


calculate_classification_report(data)


              precision    recall  f1-score   support

    negative       0.93      0.89      0.91       863
     neutral       0.90      0.89      0.90      2043
    positive       0.95      0.96      0.95      3094

    accuracy                           0.93      6000
   macro avg       0.92      0.91      0.92      6000
weighted avg       0.93      0.93      0.93      6000

