In [None]:
import os
import re
import nltk
import torch
import pickle
import pandas as pd
import numpy as np
import streamlit as st
import matplotlib.pyplot as plt
import seaborn as sns
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from torch.utils.data import Dataset
from transformers import (BertTokenizer, BertForSequenceClassification,
                          Trainer, TrainingArguments)

In [None]:
# Fetch the NLTK resources that preprocess_text relies on: tokenizer models,
# the English stop-word list and the WordNet data used by the lemmatizer.
nltk.download('punkt')
# Recent NLTK releases load the word_tokenize models from 'punkt_tab' rather
# than 'punkt'; without it word_tokenize raises LookupError on those versions.
# NOTE(review): on older releases that do not know this package the request is
# expected to fail without raising - confirm against the pinned NLTK version.
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Lazily-populated cache so the stop-word list and lemmatizer are built once
# instead of on every call (preprocess_text is applied row by row).
_TEXT_RESOURCES = {}


def _get_text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, creating it on first use."""
    if not _TEXT_RESOURCES:
        _TEXT_RESOURCES['stop_words'] = frozenset(stopwords.words('english'))
        _TEXT_RESOURCES['lemmatizer'] = WordNetLemmatizer()
    return _TEXT_RESOURCES['stop_words'], _TEXT_RESOURCES['lemmatizer']


def preprocess_text(text):
    """Normalise a raw tweet into a space-joined string of lemmatized tokens.

    Steps: lowercase, strip http/https URLs, drop every character that is not
    an ASCII letter or whitespace, tokenize, remove English stop words and
    lemmatize the remaining tokens.

    Args:
        text: The raw text. ``None`` and float NaN (missing values) are
            treated as empty text; any other non-string value is converted
            with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces ('' when nothing survives).
    """
    if not isinstance(text, str):
        # `text != text` is only true for NaN.
        if text is None or (isinstance(text, float) and text != text):
            return ''
        text = str(text)

    text = text.lower()
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    stop_words, lemmatizer = _get_text_resources()
    # Stop words are filtered on the surface form, before lemmatization.
    tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text)
        if token not in stop_words
    ]
    return ' '.join(tokens)

In [None]:
# Load the raw sheet. Column 'row1' is split into a text part and an emotion
# part on ';' (presumably "<tweet>;<emotion>" - verify against the workbook).
df = pd.read_excel('dataset_2_2.xlsx', sheet_name='Sheet1')

# Split on the LAST ';' only: a tweet that itself contains ';' would otherwise
# produce more than two columns and make the two-column assignment fail.
df[['text', 'emotion']] = df['row1'].str.rsplit(';', n=1, expand=True)

emotion_map = {'joy': 0, 'sadness': 1, 'anger': 2, 'fear': 3, 'love': 4, 'surprise': 5}
# Surrounding whitespace would make an otherwise valid emotion miss the map.
df['label'] = df['emotion'].str.strip().map(emotion_map)

# Rows without text or with an emotion outside emotion_map have no usable
# label; a NaN label breaks the stratified split and the tensor conversion.
n_rows_before = len(df)
df = (
    df.dropna(subset=['text', 'label'])
      .reset_index(drop=True)
      .assign(
          label=lambda frame: frame['label'].astype(int),
          cleaned_text=lambda frame: frame['text'].apply(preprocess_text),
      )
)
if len(df) != n_rows_before:
    print(f"Dropped {n_rows_before - len(df)} rows with missing text or unknown emotion")

In [None]:
class TweetDataset(Dataset):
    """Map-style dataset that tokenizes one text per item for sequence classification.

    Args:
        texts: Iterable of texts (list, NumPy array or pandas Series).
        labels: Iterable of integer class ids, aligned with ``texts``.
        tokenizer: Callable tokenizer; it is invoked with the keyword
            arguments used in ``__getitem__`` and must return a mapping with
            'input_ids' and 'attention_mask' tensors.
        max_len: Length every sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_len=128):
        # Materialise as lists so that [idx] is always positional; a pandas
        # Series with a non-default index would otherwise do label lookups.
        self.texts = list(texts)
        self.labels = list(labels)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the 'input_ids', 'attention_mask' and 'labels' tensors for item ``idx``."""
        text = str(self.texts[idx])
        label = self.labels[idx]
        # Call the tokenizer directly instead of the deprecated encode_plus.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            return_token_type_ids=False,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt',
        )
        return {
            # flatten() removes the leading batch dimension added by return_tensors='pt'.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            'labels': torch.tensor(label, dtype=torch.long),
        }

In [None]:
# Stratified 80/20 hold-out on the cleaned text; the fixed random_state keeps
# the partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    df['cleaned_text'],
    df['label'],
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# One tokenizer instance is shared by both splits.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# .values hands plain arrays to the dataset so items are addressed by position.
train_dataset, test_dataset = (
    TweetDataset(split_texts.values, split_labels.values, tokenizer)
    for split_texts, split_labels in ((X_train, y_train), (X_test, y_test))
)

In [None]:
# `device` is read below and again by the Streamlit cell, but was never
# assigned anywhere, so this cell raised NameError on a fresh kernel.
# It is pinned to the CPU to agree with `no_cuda=True` in TrainingArguments.
device = torch.device('cpu')

# bert-base-uncased with a classification head sized to the emotion set.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased', num_labels=len(emotion_map)
).to(device)

training_args = TrainingArguments(
    output_dir='./results',
    # Schedule and batch sizes.
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    # Evaluate and checkpoint after every epoch; both strategies are set to
    # the same value together with load_best_model_at_end.
    evaluation_strategy='epoch',
    save_strategy='epoch',
    load_best_model_at_end=True,
    # 'f1' is the key returned by compute_metrics.
    metric_for_best_model='f1',
    # Train on CPU.
    no_cuda=True,
)

def compute_metrics(pred):
    """Compute the macro-averaged F1 score for a Trainer evaluation pass.

    Args:
        pred: Object exposing ``predictions`` (per-class scores, reduced with
            argmax over the last axis) and ``label_ids`` (true class ids).

    Returns:
        A dict with the single key 'f1'.
    """
    predicted_classes = pred.predictions.argmax(axis=-1)
    macro_f1 = f1_score(pred.label_ids, predicted_classes, average='macro')
    return {'f1': macro_f1}

# NOTE(review): the test split doubles as the evaluation set, so with
# load_best_model_at_end the checkpoint is selected on the same data that is
# later reported as test performance.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)
trainer.train()

In [None]:
# Score the fine-tuned model on the held-out split.
predictions = trainer.predict(test_dataset)
y_pred = predictions.predictions.argmax(-1)
y_true = y_test.values
print("Macro F1:", f1_score(y_true, y_pred, average='macro'))
# Pass the class ids explicitly together with their names: without `labels`
# the report infers the classes from the data, so the names only line up if
# every class happens to occur. Lists are used instead of dict views.
print("Classification Report:\n", classification_report(
    y_true,
    y_pred,
    labels=list(emotion_map.values()),
    target_names=list(emotion_map.keys()),
))

In [None]:
# Baseline features: TF-IDF limited to 5000 terms, with the vocabulary fitted
# on the training split only and then reused for the test split.
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# Baseline classifier; fit() returns the estimator itself.
lr_model = LogisticRegression(max_iter=1000, random_state=42).fit(X_train_tfidf, y_train)

# Score the baseline against y_true, the same targets used for the BERT report.
y_pred_lr = lr_model.predict(X_test_tfidf)
print("Baseline Macro F1:", f1_score(y_true, y_pred_lr, average='macro'))
# Explicit `labels` keeps each name attached to its class id even when a class
# is missing from the data; lists are used instead of dict views.
print("Baseline Report:\n", classification_report(
    y_true,
    y_pred_lr,
    labels=list(emotion_map.values()),
    target_names=list(emotion_map.keys()),
))

# Save models
# The BERT weights and the tokenizer files go into the same directory.
model.save_pretrained('./emotion_bert_model')
tokenizer.save_pretrained('./emotion_bert_model')

# The baseline needs both the fitted vectorizer and the classifier.
with open('tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(tfidf, f)
with open('lr_model.pkl', 'wb') as f:
    pickle.dump(lr_model, f)

In [None]:
# Streamlit front end: classify one tweet with the fine-tuned BERT model and
# with the TF-IDF + Logistic Regression baseline, showing both probability sets.
st.title("Tweet Emotion Classifier")
st.write("Enter a tweet to predict its emotion using fine-tuned BERT and compare with Logistic Regression baseline.")

user_input = st.text_area("Enter your tweet:")
if user_input:
    cleaned_input = preprocess_text(user_input)
    # Class id -> emotion name, built once and shared by both models.
    id_to_emotion = {v: k for k, v in emotion_map.items()}

    # BERT
    encoding = tokenizer(cleaned_input, return_tensors='pt', max_length=128, padding='max_length', truncation=True)
    # Read the device from the model itself so the inputs always land where
    # the weights are, instead of relying on a separately maintained global.
    model_device = next(model.parameters()).device
    model.eval()
    with torch.no_grad():
        outputs = model(
            encoding['input_ids'].to(model_device),
            attention_mask=encoding['attention_mask'].to(model_device),
        )
        # Move to CPU before converting; only CPU tensors can be read out
        # as plain Python values.
        probs = torch.nn.functional.softmax(outputs.logits, dim=-1)[0].cpu()
    pred_label = int(torch.argmax(probs))
    emotion = id_to_emotion[pred_label]
    st.write(f"**Predicted Emotion (BERT):** {emotion}")
    st.write("**Probabilities (BERT):**")
    # Position i of the output corresponds to class id i.
    for class_id, prob in enumerate(probs.tolist()):
        st.write(f"{id_to_emotion[class_id]}: {prob:.4f}")

    # Logistic Regression
    input_tfidf = tfidf.transform([cleaned_input])
    lr_pred = lr_model.predict(input_tfidf)[0]
    lr_emotion = id_to_emotion[lr_pred]
    lr_probs = lr_model.predict_proba(input_tfidf)[0]
    st.write(f"**Predicted Emotion (TF-IDF + Logistic Regression):** {lr_emotion}")
    st.write("**Probabilities (Logistic Regression):**")
    # predict_proba columns follow lr_model.classes_, not the dict order.
    for class_id, prob in zip(lr_model.classes_, lr_probs):
        st.write(f"{id_to_emotion[class_id]}: {prob:.4f}")