# Fine-tune Twitter-RoBERTa on Sentiment140

This notebook fine-tunes `cardiffnlp/twitter-roberta-base-sentiment` on the Sentiment140 dataset using the same preprocessing as classic models.

In [None]:
# Install required libraries
!pip install datasets emoji
























In [45]:
# Upgrade transformers to latest version for TrainingArguments compatibility
%pip install transformers==4.40.0

















In [46]:
import transformers
print(transformers.__version__)

4.40.0



In [47]:
# Import libraries
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
import emoji
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\queri\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\queri\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Preprocessing Function (same as classic models)

In [48]:
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def clean_tweet(tweet):
    tweet = re.sub(r'http\S+|www\S+|https\S+', '', tweet, flags=re.MULTILINE)
    tweet = re.sub(r'@\w+', '', tweet)
    tweet = re.sub(r'\W', ' ', tweet)
    tweet = re.sub(r'\d', '', tweet)
    tweet = re.sub(r'\s+', ' ', tweet).strip()
    tokens = tweet.split()
    tokens = [lemmatizer.lemmatize(stemmer.stem(word)) for word in tokens if word.lower() not in stop_words]
    tweet = ' '.join(tokens)
    return tweet

## Load and Prepare Sentiment140 Data

In [49]:
df = pd.read_csv('sentiment140.csv', encoding='ISO-8859-1')
df.columns = ['target', 'id', 'date', 'flag', 'user', 'text']
df['cleaned_text'] = df['text'].apply(clean_tweet)

# Map targets to 0 (negative), 1 (neutral), 2 (positive) for RoBERTa
def map_target(val):
    if val == 0:
        return 0  # negative
    elif val == 2:
        return 1  # neutral
    else:
        return 2  # positive

df['label'] = df['target'].apply(map_target)

# Reduce dataset size for demo/training speed (optional)
#f = df.sample(200000, random_state=42)

## Train/Validation Split

In [50]:
train_df, val_df = train_test_split(df, test_size=0.1, stratify=df['label'], random_state=42)
print(f"Train size: {len(train_df)}, Validation size: {len(val_df)}")

Train size: 1439999, Validation size: 160000



## Tokenization

In [51]:
model_name = "cardiffnlp/twitter-roberta-base-sentiment"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    return tokenizer(examples['cleaned_text'], truncation=True, padding='max_length', max_length=128)



## Convert to HuggingFace Dataset

In [52]:
from datasets import Dataset

train_dataset = Dataset.from_pandas(train_df[['cleaned_text', 'label']].reset_index(drop=True))
val_dataset = Dataset.from_pandas(val_df[['cleaned_text', 'label']].reset_index(drop=True))

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'label'])

Map:   0%|          | 0/1439999 [00:00<?, ? examples/s]

Map:   0%|          | 0/160000 [00:00<?, ? examples/s]

## Model Setup

In [53]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3)



## Training Arguments

In [54]:
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=2,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    evaluation_strategy='epoch',
    save_strategy='epoch',
    logging_dir='./logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    save_total_limit=1
)

## Define Metrics

In [55]:
%pip install evaluate

import evaluate
accuracy = evaluate.load('accuracy')

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)







## Trainer Setup and Training

In [56]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

trainer.train()

  0%|          | 0/180000 [00:00<?, ?it/s]

{'loss': 0.6504, 'grad_norm': 5.710131645202637, 'learning_rate': 4.997222222222223e-05, 'epoch': 0.0}

{'loss': 0.5493, 'grad_norm': 7.0283589363098145, 'learning_rate': 4.994444444444445e-05, 'epoch': 0.0}
{'loss': 0.5493, 'grad_norm': 7.0283589363098145, 'learning_rate': 4.994444444444445e-05, 'epoch': 0.0}
{'loss': 0.5503, 'grad_norm': 5.387299537658691, 'learning_rate': 4.991666666666667e-05, 'epoch': 0.0}
{'loss': 0.5503, 'grad_norm': 5.387299537658691, 'learning_rate': 4.991666666666667e-05, 'epoch': 0.0}
{'loss': 0.5517, 'grad_norm': 4.560728073120117, 'learning_rate': 4.9888888888888894e-05, 'epoch': 0.0}
{'loss': 0.5517, 'grad_norm': 4.560728073120117, 'learning_rate': 4.9888888888888894e-05, 'epoch': 0.0}
{'loss': 0.5412, 'grad_norm': 3.863149881362915, 'learning_rate': 4.986111111111111e-05, 'epoch': 0.01}
{'loss': 0.5412, 'grad_norm': 3.863149881362915, 'learning_rate': 4.986111111111111e-05, 'epoch': 0.01}
{'loss': 0.5507, 'grad_norm': 7.912822246551514, 'learning_rate': 

  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.6931484937667847, 'eval_accuracy': 0.5, 'eval_runtime': 575.0376, 'eval_samples_per_second': 278.243, 'eval_steps_per_second': 8.695, 'epoch': 1.0}

{'loss': 0.693, 'grad_norm': 0.5016804337501526, 'learning_rate': 2.4972222222222226e-05, 'epoch': 1.0}
{'loss': 0.693, 'grad_norm': 0.5016804337501526, 'learning_rate': 2.4972222222222226e-05, 'epoch': 1.0}
{'loss': 0.6933, 'grad_norm': 0.5654802322387695, 'learning_rate': 2.4944444444444447e-05, 'epoch': 1.0}
{'loss': 0.6933, 'grad_norm': 0.5654802322387695, 'learning_rate': 2.4944444444444447e-05, 'epoch': 1.0}
{'loss': 0.6934, 'grad_norm': 0.43717437982559204, 'learning_rate': 2.4916666666666668e-05, 'epoch': 1.0}
{'loss': 0.6934, 'grad_norm': 0.43717437982559204, 'learning_rate': 2.4916666666666668e-05, 'epoch': 1.0}
{'loss': 0.693, 'grad_norm': 0.4623045027256012, 'learning_rate': 2.488888888888889e-05, 'epoch': 1.0}
{'loss': 0.693, 'grad_norm': 0.4623045027256012, 'learning_rate': 2.488888888888889e-05, 'epoch': 1.0}

  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.5019493699073792, 'eval_accuracy': 0.75809375, 'eval_runtime': 575.3246, 'eval_samples_per_second': 278.104, 'eval_steps_per_second': 8.691, 'epoch': 2.0}

{'train_runtime': 36855.8779, 'train_samples_per_second': 78.142, 'train_steps_per_second': 4.884, 'train_loss': 0.660599854681227, 'epoch': 2.0}
{'train_runtime': 36855.8779, 'train_samples_per_second': 78.142, 'train_steps_per_second': 4.884, 'train_loss': 0.660599854681227, 'epoch': 2.0}


TrainOutput(global_step=180000, training_loss=0.660599854681227, metrics={'train_runtime': 36855.8779, 'train_samples_per_second': 78.142, 'train_steps_per_second': 4.884, 'total_flos': 1.8944152920745114e+17, 'train_loss': 0.660599854681227, 'epoch': 2.0})

## Evaluate on Validation Set

In [57]:
eval_results = trainer.evaluate()
print(eval_results)

  0%|          | 0/5000 [00:00<?, ?it/s]

{'eval_loss': 0.5019493699073792, 'eval_accuracy': 0.75809375, 'eval_runtime': 575.1808, 'eval_samples_per_second': 278.173, 'eval_steps_per_second': 8.693, 'epoch': 2.0}



## Inference Example

In [58]:
from transformers import pipeline

finetuned_pipe = pipeline("sentiment-analysis", model=trainer.model, tokenizer=tokenizer)
texts = ["I love this product!", "This is the worst experience ever."]
results = finetuned_pipe(texts)
for text, result in zip(texts, results):
    print(f"Text: {text}")
    print(f"Sentiment: {result['label']}, Score: {result['score']:.4f}\n")

Text: I love this product!
Sentiment: LABEL_2, Score: 0.9452

Text: This is the worst experience ever.
Sentiment: LABEL_0, Score: 0.9074


Sentiment: LABEL_2, Score: 0.9452

Text: This is the worst experience ever.
Sentiment: LABEL_0, Score: 0.9074



In [None]:
import os
import joblib

os.makedirs('./saved_models', exist_ok=True)

# Save the model and tokenizer using Hugging Face's save_pretrained
model_path = './saved_models/roberta_sentiment140'
trainer.model.save_pretrained(model_path)
tokenizer.save_pretrained(model_path)

# Also save the preprocessing function for deployment
joblib.dump(clean_tweet, './saved_models/tweet_cleaner.pkl')

print(f"Model and tokenizer saved to {model_path}")

Model and tokenizer saved to ./saved_models/roberta_sentiment140



In [None]:
from transformers import pipeline
import numpy as np
from scipy.special import softmax

model_name = "cardiffnlp/twitter-roberta-base-sentiment"  
sentiment_analyzer = pipeline("sentiment-analysis", model=model_name, tokenizer=model_name, return_all_scores=True)
## Using  fine-tuned model
#sentiment_analyzer = pipeline("sentiment-analysis", 
#                             model=trainer.model, 
#                            tokenizer=tokenizer, 
#                             return_all_scores=True)


def analyze_sentiment(text):
    
    result = sentiment_analyzer(text)[0]
    
    result.sort(key=lambda x: x['score'], reverse=True)
    
    print(f"Text: {text}")
    for i, score_data in enumerate(result):
        label = score_data['label']
        score = score_data['score']
        print(f"{i+1}) {label} {score:.4f}")
    print()

texts = ["I love this product!", "This is the worst experience ever.", "Good night 😊"]

for text in texts:
    analyze_sentiment(text)

Text: I love this product!
1) LABEL_2 0.9916
2) LABEL_1 0.0063
3) LABEL_0 0.0021

Text: This is the worst experience ever.
1) LABEL_0 0.9766
2) LABEL_1 0.0199
3) LABEL_2 0.0036

Text: Good night 😊
1) LABEL_2 0.8466
2) LABEL_1 0.1458
3) LABEL_0 0.0076



In [75]:
from transformers import pipeline
import numpy as np
from scipy.special import softmax

model_name = "cardiffnlp/twitter-roberta-base-sentiment"  
#sentiment_analyzer = pipeline("sentiment-analysis", model=model_name, tokenizer=model_name, return_all_scores=True)
## Using  fine-tuned model
sentiment_analyzer = pipeline("sentiment-analysis", 
                            model=trainer.model, 
                           tokenizer=tokenizer, 
                             return_all_scores=True)


def analyze_sentiment(text):
    
    result = sentiment_analyzer(text)[0]
    
    result.sort(key=lambda x: x['score'], reverse=True)
    
    print(f"Text: {text}")
    for i, score_data in enumerate(result):
        label = score_data['label']
        score = score_data['score']
        print(f"{i+1}) {label} {score:.4f}")
    print()

texts = ["I love this product!", "This is the worst experience ever.", "Good night 😊"]

for text in texts:
    analyze_sentiment(text)

Text: I love this product!
1) LABEL_2 0.9452
2) LABEL_0 0.0548
3) LABEL_1 0.0000

Text: This is the worst experience ever.
1) LABEL_0 0.9074
2) LABEL_2 0.0926
3) LABEL_1 0.0000

Text: Good night 😊
1) LABEL_2 0.9270
2) LABEL_0 0.0730
3) LABEL_1 0.0000

