<a href="https://colab.research.google.com/github/MAHESH20L/Sentiment_analysis_of_tweets/blob/main/Sentiment_analysis_of_tweets_DistilRoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import re
import torch
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    Trainer,
    TrainingArguments
)

from torch.utils.data import Dataset


In [None]:
# Load the example tweet export; the path is relative to the current working
# directory.
df = pd.read_csv("BTC_tweets_daily_example.csv")
# Peek at the first five rows to confirm the columns were parsed.
print(df.head(n=5))


In [None]:
def clean_text(text):
    """Normalise a raw tweet into lowercase, letters-only text.

    Steps, in order: lowercase, strip URLs (anything starting with ``http``),
    strip ``@mentions`` and ``#hashtags``, drop every character that is not
    ``a-z`` or whitespace, then collapse whitespace runs to single spaces and
    trim both ends.

    Args:
        text: Raw tweet. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as missing.

    Returns:
        str: The cleaned text, or ``""`` when the input is missing or nothing
        survives the cleaning.
    """
    # Missing values would otherwise be stringified to the words "none"/"nan".
    # `text != text` is true only for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)
    text = re.sub(r"@\w+", "", text)
    text = re.sub(r"#\w+", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    # The removals above leave gaps such as "check    great"; squeeze them so
    # the tokenizer does not see runs of spaces or leading/trailing padding.
    return " ".join(text.split())


In [None]:
# Drop rows that lack either the tweet text or its label *before* casting to
# str: astype(str) can turn a missing tweet into the literal word "nan", which
# would then be trained on as if it were real text.
# .copy() makes the result an independent frame so the column assignments
# below and in later cells operate on their own data.
df = df.dropna(subset=["Tweet", "New_Sentiment_State"]).copy()

# Normalise every remaining tweet with clean_text.
df["Tweet"] = df["Tweet"].astype(str).apply(clean_text)


In [None]:
# Give each distinct sentiment value an integer id, assigned in sorted order
# so the mapping does not depend on row order.
sentiment_classes = sorted(df["New_Sentiment_State"].unique())
label_mapping = dict(zip(sentiment_classes, range(len(sentiment_classes))))

# Integer target column derived from the raw sentiment values.
df["label"] = df["New_Sentiment_State"].map(label_mapping)

num_labels = len(label_mapping)
print("Label mapping:", label_mapping)


In [None]:
# Hold out 20% of the tweets for testing. Stratifying on the label keeps the
# class proportions the same in both splits; the fixed random_state makes the
# split repeatable.
tweet_texts = df["Tweet"]
tweet_labels = df["label"]

X_train, X_test, y_train, y_test = train_test_split(
    tweet_texts,
    tweet_labels,
    test_size=0.2,
    random_state=42,
    stratify=tweet_labels,
)


In [None]:
MODEL_NAME = "distilroberta-base"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Use the GPU when one is available, otherwise fall back to CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The sequence-classification head is not part of the pretrained checkpoint
# and is initialised randomly. Seed torch first so that initialisation (and
# therefore the reported metrics) is repeatable across kernel restarts.
torch.manual_seed(42)

model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels
).to(device)

In [None]:
class TweetDataset(Dataset):
    """Torch dataset that tokenizes all texts up front and serves tensor items.

    Args:
        texts: Tweets to encode; a pandas Series, NumPy array, or any other
            iterable of strings.
        labels: Integer class ids, aligned one-to-one with ``texts``.
        tokenizer_fn: Callable used to encode the texts. It is invoked as
            ``tokenizer_fn(list_of_texts, truncation=True, padding=True,
            max_length=max_length)`` and must return a mapping of field name
            to per-example sequences. Defaults to the notebook-level
            ``tokenizer``.
        max_length: Maximum sequence length passed to the tokenizer
            (default 128).

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer_fn=None, max_length=128):
        text_list = self._as_list(texts)
        label_list = self._as_list(labels)
        if len(text_list) != len(label_list):
            raise ValueError(
                f"texts and labels must have the same length "
                f"(got {len(text_list)} and {len(label_list)})"
            )

        if tokenizer_fn is None:
            # Fall back to the tokenizer defined at notebook level.
            tokenizer_fn = tokenizer

        # All texts are encoded in a single call, so padding=True is applied
        # across the whole split rather than per batch.
        self.encodings = tokenizer_fn(
            text_list,
            truncation=True,
            padding=True,
            max_length=max_length
        )
        self.labels = label_list

    @staticmethod
    def _as_list(values):
        """Return ``values`` as a plain list (Series/ndarray via ``tolist``)."""
        return values.tolist() if hasattr(values, "tolist") else list(values)

    def __getitem__(self, idx):
        """Return the encoded fields plus ``labels`` for example ``idx`` as tensors."""
        item = {k: torch.tensor(v[idx]) for k, v in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        """Number of examples in the dataset."""
        return len(self.labels)


In [None]:
# Encode each split once up front: training split first, then the held-out
# test split.
train_dataset, test_dataset = (
    TweetDataset(split_texts, split_labels)
    for split_texts, split_labels in ((X_train, y_train), (X_test, y_test))
)


In [None]:
training_args = TrainingArguments(
    output_dir="./results",
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=1,        # single epoch keeps the demo fast
    learning_rate=2e-5,
    weight_decay=0.01,
    logging_steps=100,
    save_strategy="no",
    eval_strategy="no",
    # Mixed precision speeds up training on GPU but is rejected on CPU-only
    # runtimes, so enable it only when CUDA is actually available.
    fp16=torch.cuda.is_available(),
    report_to="none"
)

In [None]:
# Bundle the model, the training arguments and the training split. No eval
# dataset is attached; the test split is scored separately via predict().
trainer_kwargs = dict(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
trainer = Trainer(**trainer_kwargs)

In [None]:
# Fine-tune the model on train_dataset using the settings in training_args.
trainer.train()

In [None]:
# Run the model over the held-out split and take the highest-scoring class
# for each tweet as the predicted label.
predictions = trainer.predict(test_dataset)
class_scores = predictions.predictions
y_pred = class_scores.argmax(axis=1)

# Report rows are named after the original sentiment values, in mapping order.
class_names = [str(name) for name in label_mapping]

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=class_names))

In [None]:
# Pin the class order explicitly so every class gets a row and a column even
# if it never occurs in y_test or y_pred, and so the tick labels below are
# guaranteed to line up with the matrix.
class_ids = list(label_mapping.values())
class_names = [str(name) for name in label_mapping]

cm = confusion_matrix(y_test, y_pred, labels=class_ids)

# Show the original sentiment values on the axes instead of bare class ids.
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(cmap="Blues")
plt.title("DistilRoBERTa Confusion Matrix")
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Count how many test tweets fall in each class, once for the true labels and
# once for the model's predictions, ordered by class id.
actual_counts = pd.Series(y_test).value_counts().sort_index()
pred_counts = pd.Series(y_pred).value_counts().sort_index()

# Building the frame from the two Series aligns them on class id; a class that
# is absent from one side appears as NaN and is replaced with a zero count.
count_columns = {"Actual": actual_counts, "Predicted": pred_counts}
df_plot = pd.DataFrame(count_columns).fillna(0)

# Side-by-side bars per class, with horizontal tick labels.
df_plot.plot.bar(figsize=(6, 4), rot=0)

plt.title("Actual vs Predicted Sentiment Distribution")
plt.xlabel("Sentiment Class")
plt.ylabel("Count")
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Compare how many tweets were predicted negative vs positive.
# NOTE(review): class ids 0 and 2 are hard-coded as negative/positive; confirm
# against the printed label mapping. A class that was never predicted counts
# as 0.
neg_pos = {
    name: pred_counts.get(class_id, 0)
    for name, class_id in (("Negative", 0), ("Positive", 2))
}

bar_names = neg_pos.keys()
bar_heights = neg_pos.values()
plt.bar(bar_names, bar_heights)
plt.title("Negative vs Positive Predictions")
plt.ylabel("Count")
plt.tight_layout()
plt.show()


In [None]:
# Overall fraction of test tweets whose predicted class matches the true one.
acc = accuracy_score(y_test, y_pred)

# Single bar on a fixed 0-1 axis so the height reads directly as accuracy.
plt.bar(x=["Accuracy"], height=[acc])
plt.ylim(bottom=0, top=1)
plt.title("Model Accuracy")
plt.show()


In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Classify a single sentence with the fine-tuned model.
# `model` is the instance handed to the Trainer and updated by trainer.train().
# Reloading "distilroberta-base" here would attach a freshly initialised,
# untrained classification head and make the prediction meaningless.
model.eval()

# Input sentence
sentence = "I just love how Bitcoin destroyed my savings today"

# Apply the same cleaning and truncation length used for the training tweets,
# and place the tensors on the device the model lives on.
inputs = tokenizer(
    clean_text(sentence),
    return_tensors="pt",
    truncation=True,
    max_length=128
).to(model.device)

# Predict
with torch.no_grad():
    outputs = model(**inputs)
    probs = torch.softmax(outputs.logits, dim=1)

# Class id -> sentiment name, derived from the mapping used to build the
# training labels so names can never drift from what the model was trained on.
labels = {idx: str(name) for name, idx in label_mapping.items()}

# Max-probability decision along the class dimension (batch of one).
pred_id = torch.argmax(probs, dim=1).item()

# Output
print("Sentence:", sentence)
print("Probabilities:")
for idx in sorted(labels):
    print(f"{labels[idx]}:", round(probs[0][idx].item(), 3))
print("\nFinal Prediction (max prob):", labels[pred_id])
