<a href="https://colab.research.google.com/github/Mahalakshmi-Telidevara/Sentiment-Classification-Using-RoBERTa/blob/main/sentiment_Classification_Using_RoBERTa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pandas numpy joblib matplotlib scikit-learn datasets

In [None]:
# Pin transformers to one exact release so the Trainer / TrainingArguments
# keyword arguments used later in this notebook keep their meaning.
!pip install -U transformers==4.51.3

In [3]:
# 1. Import Libraries
import pandas as pd
import numpy as np
import joblib
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from imblearn.over_sampling import SMOTE
import torch
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

In [None]:
# Load Dataset
# Every later cell needs `df`, so a failed load is reported and then re-raised.
# Swallowing it here would only postpone the failure to an unrelated-looking
# NameError in the preprocessing cell.
try:
    df = pd.read_csv('twitter_multi_class_sentiment.csv')
except FileNotFoundError:
    print("Dataset upload unsuccessful: File not found.")
    raise
except Exception as e:
    print("Dataset upload unsuccessful:", str(e))
    raise

# A CSV with only a header parses fine but leaves nothing to train on.
if df.empty:
    raise ValueError("Dataset upload unsuccessful: File is empty.")

print("Dataset uploaded successfully!\n")
print("First few rows:\n", df.head())
print("\nLast few rows:\n", df.tail())
print("\nUnique sentiment labels:", df['label_name'].unique())

In [None]:
# 3. Preprocess the data
# Drop rows that miss either the label or the text. A missing text would make
# both the TF-IDF vectorizer and the RoBERTa tokenizer fail further down.
df.dropna(subset=['text', 'label_name'], inplace=True)

# Map the string sentiment names to integer class ids (kept for decoding later).
label_encoder = LabelEncoder()
df['encoded_label'] = label_encoder.fit_transform(df['label_name'])

# Hold out 20% of the rows for testing; the seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['encoded_label'], test_size=0.2, random_state=42)

# Check class distribution in the dataset
print("Original dataset class distribution:")
print(df['encoded_label'].value_counts())
print("\nTraining set class distribution:")
print(y_train.value_counts())
print("\nTesting set class distribution:")
print(y_test.value_counts())
print("Preprocessing completed successfully!")

In [None]:
# 4. TF-IDF Vectorization
# The vocabulary and IDF weights are learned from the training texts only;
# both splits are then projected with that same fitted vectorizer.
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(X_train)
X_train_tfidf, X_test_tfidf = (
    vectorizer.transform(split) for split in (X_train, X_test)
)

In [None]:
# 5. Train SVM Model
print("Training SVM model...")
# probability=True adds an internal cross-validated calibration step that
# shuffles the data; seeding it keeps the saved model's probability outputs
# reproducible across Restart & Run All.
svm_model = SVC(kernel='linear', probability=True, random_state=42)
svm_model.fit(X_train_tfidf, y_train)

In [None]:
# 6. Predictions
# Predict encoded class ids for the held-out TF-IDF features with the fitted SVM.
y_pred_svm = svm_model.predict(X_test_tfidf)

In [None]:
# 7. Classification report of SVM
# Rows of the report are named after the original sentiment strings.
svm_report = classification_report(
    y_test,
    y_pred_svm,
    target_names=label_encoder.classes_,
)
print("SVM Classification Report:", svm_report, sep="\n")

In [None]:
# 8. Calculate SVM performance
# Accuracy has no averaging mode; precision, recall and F1 are support-weighted
# averages over the classes.
svm_eval_results = dict(accuracy=accuracy_score(y_test, y_pred_svm))
svm_eval_results.update(
    (metric_name, scorer(y_test, y_pred_svm, average='weighted'))
    for metric_name, scorer in (
        ('precision', precision_score),
        ('recall', recall_score),
        ('f1-score', f1_score),
    )
)

In [None]:
# 9. Save model and vectorizer
# Persist the fitted SVM together with the TF-IDF vectorizer it was trained on;
# both files are written to the current working directory.
joblib.dump(svm_model, "svm_model.pkl")
joblib.dump(vectorizer, "tfidf_vectorizer.pkl")

In [None]:
# 10. Train Random Forest Model
# Fits a 100-tree forest on the same TF-IDF features the SVM used;
# random_state pins the seed used when building the trees.
print("Training Random Forest model...")
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

In [None]:
# 11. Predictions
# Predict encoded class ids for the held-out TF-IDF features with the fitted forest.
y_pred_rf = rf_model.predict(X_test_tfidf)

In [None]:
# 12. Classification report
# Per-class precision/recall/F1 for the Random Forest, labelled with the
# original sentiment names.
rf_report = classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_)
print(f"Random Forest Classification Report:\n{rf_report}")

In [None]:
# 13. Calculate Random Forest performance
# Each entry pairs a scorer with the extra keyword arguments it needs:
# accuracy takes none, the rest use support-weighted averaging.
rf_scorers = {
    'accuracy': (accuracy_score, {}),
    'precision': (precision_score, {'average': 'weighted'}),
    'recall': (recall_score, {'average': 'weighted'}),
    'f1-score': (f1_score, {'average': 'weighted'}),
}
rf_eval_results = {
    metric_name: scorer(y_test, y_pred_rf, **scorer_kwargs)
    for metric_name, (scorer, scorer_kwargs) in rf_scorers.items()
}

In [None]:
# 14. Save Random Forest model
# Persist the fitted forest to the current working directory.
joblib.dump(rf_model, "rf_model.pkl")

In [None]:
# 15. Tokenization for RoBERTa
# Load tokenizer
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')

# Define tokenization function
def tokenize_function(examples):
    """Tokenize a batch of examples for RoBERTa.

    Args:
        examples: a batch mapping that exposes the raw strings under "text"
            (this is what `Dataset.map(..., batched=True)` passes in).

    Returns:
        The tokenizer output for the batch, with every sequence padded or
        truncated to exactly 128 tokens.
    """
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=128)

# Convert Pandas DataFrame to Hugging Face Dataset
train_df = pd.DataFrame({'text': X_train, 'label': y_train})
test_df = pd.DataFrame({'text': X_test, 'label': y_test})

# The split frames carry the shuffled original row index. preserve_index=False
# stops from_pandas from storing it as a stray `__index_level_0__` column.
train_dataset = Dataset.from_pandas(train_df, preserve_index=False).map(
    tokenize_function, batched=True, remove_columns=["text"]
)
test_dataset = Dataset.from_pandas(test_df, preserve_index=False).map(
    tokenize_function, batched=True, remove_columns=["text"]
)

In [None]:
# 16. Load pre-trained RoBERTa model with sequence classification head
# Prefer the GPU when one is visible, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# One output unit per sentiment class known to the label encoder.
num_classes = len(label_encoder.classes_)
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=num_classes)
model = model.to(device)


In [None]:
# 17. Define training arguments
training_args = TrainingArguments(
    output_dir='./results',
    # Evaluate and checkpoint once per epoch; load_best_model_at_end needs the
    # two strategies to line up so a checkpoint exists for every evaluation.
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
    # save_steps was dropped: it only applies with save_strategy="steps", so it
    # had no effect and suggested a checkpoint cadence that never happened.
    load_best_model_at_end=True,
    report_to="none",  # Disable W&B logging if not needed
    push_to_hub=False  # Ensure no API interactions
)

# NOTE(review): the test split doubles as the evaluation set that picks the best
# checkpoint, so the reported test metrics are optimistic. Consider carving a
# separate validation split out of the training data.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    # `tokenizer=` is deprecated in the pinned transformers release;
    # `processing_class=` is its replacement.
    processing_class=tokenizer
)



In [None]:
# 18. Start training
# Runs the fine-tuning loop configured by `training_args` above; with
# load_best_model_at_end=True the best checkpoint is loaded back when training ends.
print("Training RoBERTa model...")
trainer.train()

In [None]:
# 19. Evaluate RoBERTa model
# Run the Trainer's evaluation loop over the tokenized test split and print
# whatever metrics it returns.
eval_results = trainer.evaluate(test_dataset)
print(eval_results)

In [None]:
# 20. Get predictions for RoBERTa
# `.predictions` holds one row of scores per test example; the index of the
# largest score in each row is taken as the predicted class id.
roberta_logits = trainer.predict(test_dataset).predictions
y_pred_roberta = np.argmax(roberta_logits, axis=1)

In [None]:
# 21. Classification report of RoBERTa
# Per-class precision/recall/F1, labelled with the original sentiment names.
roberta_report = classification_report(
    y_test, y_pred_roberta, target_names=label_encoder.classes_
)
print("RoBERTa Classification Report:")
print(roberta_report)

In [None]:
# 22. Calculate RoBERTa performance
# Same four summary metrics as for the SVM and Random Forest, so the three
# result dicts can be plotted side by side.
roberta_eval_results = {}
roberta_eval_results['accuracy'] = accuracy_score(y_test, y_pred_roberta)
roberta_eval_results['precision'] = precision_score(y_test, y_pred_roberta, average='weighted')
roberta_eval_results['recall'] = recall_score(y_test, y_pred_roberta, average='weighted')
roberta_eval_results['f1-score'] = f1_score(y_test, y_pred_roberta, average='weighted')

In [None]:
# 23. Save RoBERTa model
# Write the model, the tokenizer and the fitted label encoder into one directory.
# The Streamlit app written further down loads all three from `saved_model`
# (it looks for `label_encoder.pkl` inside that directory).
model.save_pretrained('./saved_model')
tokenizer.save_pretrained('./saved_model')
joblib.dump(label_encoder, "./saved_model/label_encoder.pkl")

In [None]:
# 24. Plot comparison graph
# Grouped bar chart: one group per metric, one bar per model.
metrics = ['accuracy', 'precision', 'recall', 'f1-score']
roberta_scores, svm_scores, rf_scores = (
    [results[name] for name in metrics]
    for results in (roberta_eval_results, svm_eval_results, rf_eval_results)
)

x = np.arange(len(metrics))
width = 0.25

fig, ax = plt.subplots()
# Shift each model's bars by a whole bar width so the three sit side by side.
rects1, rects2, rects3 = (
    ax.bar(x + shift * width, scores, width, label=series_label)
    for shift, scores, series_label in (
        (-1, roberta_scores, 'RoBERTa'),
        (0, svm_scores, 'SVM'),
        (1, rf_scores, 'Random Forest'),
    )
)

ax.set_ylabel('Scores')
ax.set_title('Model Performance Comparison')
ax.set_xticks(x)
ax.set_xticklabels(metrics)
ax.legend()

# All four metrics live in [0, 1].
ax.set_ylim(0, 1)
plt.show()

In [None]:
# 25. Confusion Matrices
# One heatmap per model, rows = actual class, columns = predicted class.
class_names = list(label_encoder.classes_)
class_ids = np.arange(len(class_names))

fig, axes = plt.subplots(1, 3, figsize=(18, 5))
models = ['Random Forest', 'SVM', 'RoBERTa']
preds = [y_pred_rf, y_pred_svm, y_pred_roberta]
# The loop variable is deliberately NOT called `model`: that name holds the
# fine-tuned RoBERTa network, and rebinding it to a string here would break any
# later re-run of the cell that saves the model.
for i, (model_name, y_pred) in enumerate(zip(models, preds)):
    # Fix the label order so every matrix has the same shape and axis meaning,
    # even if some class never shows up in a model's predictions.
    cm = confusion_matrix(y_test, y_pred, labels=class_ids)
    sns.heatmap(
        cm, annot=True, fmt='d', cmap='Blues', ax=axes[i],
        xticklabels=class_names, yticklabels=class_names,
    )
    axes[i].set_title(f'{model_name} Confusion Matrix')
    axes[i].set_xlabel('Predicted')
    axes[i].set_ylabel('Actual')
plt.show()

In [None]:
# Install Streamlit quietly (-q); it is needed by the app.py written below.
# NOTE(review): the version is not pinned, unlike transformers above.
! pip install streamlit -q

In [None]:
# Fetch ipv4.icanhazip.com and write the response to stdout (-O -), i.e. show
# this machine's public IPv4 address.
# NOTE(review): presumably needed as the access password for the localtunnel
# page opened by the last cell. Confirm.
! wget -q -O - ipv4.icanhazip.com

In [None]:
%%writefile app.py
import streamlit as st
from transformers import RobertaTokenizer, RobertaForSequenceClassification, pipeline
import os
import joblib
import torch

# Ensure the model is correctly located (Colab path handling)
model_path = "/content/saved_model"  # Adjust this if the model is in another directory

# Streamlit UI
st.title("Sentiment Classification Using RoBERTa")
st.write("Enter a text below, and the app will predict its sentiment.")

if not os.path.exists(model_path):
    st.error("Model directory not found! Ensure you have trained and saved the model.")
else:
    try:
        # Load model and tokenizer
        model = RobertaForSequenceClassification.from_pretrained(model_path)
        tokenizer = RobertaTokenizer.from_pretrained(model_path)

        # Load LabelEncoder to decode sentiment labels
        label_encoder_path = os.path.join(model_path, "label_encoder.pkl")
        if os.path.exists(label_encoder_path):
            label_encoder = joblib.load(label_encoder_path)
        else:
            st.error("Label encoder not found!")
            label_encoder = None  # Prevent crash if missing

        # Create sentiment analysis pipeline (ensuring GPU is used if available)
        device = 0 if torch.cuda.is_available() else -1  # Use GPU if available
        predictor = pipeline("sentiment-analysis", model=model, tokenizer=tokenizer, device=device)

        # Text input
        text_input = st.text_area("Enter text:", "")

        if st.button("Analyze"):
            if text_input.strip():
                result = predictor(text_input)
                predicted_label = result[0]['label']
                confidence = result[0]['score']

                # Convert "LABEL_X" to actual sentiment if label_encoder is loaded
                if label_encoder:
                    label_index = int(predicted_label.split('_')[-1])
                    decoded_label = label_encoder.inverse_transform([label_index])[0]
                    st.success(f"**Prediction:** {decoded_label}")
                else:
                    st.success(f"**Prediction (Raw):** {predicted_label}")

                st.write(f"**Confidence:** {confidence:.4f}")

            else:
                st.warning("Please enter some text.")

    except Exception as e:
        st.error(f"Error loading model: {str(e)}")

In [None]:
# Start the Streamlit app in the background (`&`) and, in the foreground, open a
# localtunnel to local port 8501 so the app is reachable from outside.
# NOTE(review): nothing here sets Streamlit's port. This assumes 8501 is the
# port Streamlit serves on by default. Confirm.
! streamlit run app.py & npx localtunnel --port 8501