<a href="https://colab.research.google.com/github/KaifAhmad1/sentiment-analyzer/blob/main/NEXA_Sentiment_Analysis_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Sentiment Analysis System using Twitter Tweets.**

In [23]:
!pip install -qU dash xgboost

In [58]:
# ---------------------------
# 1. Import Libraries
# ---------------------------
import os
import re
import numpy as np
import pandas as pd
from wordcloud import WordCloud

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

import plotly.express as px
import plotly.graph_objects as go

# For deep learning model (optional)
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D, Dropout
from tensorflow.keras.utils import to_categorical

import xgboost as xgb

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [26]:
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')

# Load Dataset
train_path = '/content/drive/MyDrive/Twitter Sentiment Data/twitter_training.csv'
test_path = '/content/drive/MyDrive/Twitter Sentiment Data/twitter_validation.csv'
data = pd.read_csv(train_path)
data.head(25)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
5,2402,Borderlands,Positive,So I spent a few hours making something for fu...
6,2402,Borderlands,Positive,So I spent a couple of hours doing something f...
7,2402,Borderlands,Positive,So I spent a few hours doing something for fun...
8,2402,Borderlands,Positive,So I spent a few hours making something for fu...
9,2402,Borderlands,Positive,2010 So I spent a few hours making something f...


In [27]:
data = pd.read_csv(train_path, header=None)

# Assign descriptive column names based on the expected structure:
data.columns = ['id', 'topic', 'sentiment', 'text']

In [28]:
print("\nData Information:")
data.info()


Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74682 non-null  int64 
 1   topic      74682 non-null  object
 2   sentiment  74682 non-null  object
 3   text       73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [29]:
print("\nStatistical Summary for Numeric Columns:")
print(data.describe())


Statistical Summary for Numeric Columns:
                 id
count  74682.000000
mean    6432.586165
std     3740.427870
min        1.000000
25%     3195.000000
50%     6422.000000
75%     9601.000000
max    13200.000000


In [30]:
print("\nSentiment Distribution:")
print(data['sentiment'].value_counts())


Sentiment Distribution:
sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64


In [38]:
# ---------------------------
# 3. Data Cleaning
# ---------------------------
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\d+', '', text)  # Remove numbers
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in stop_words]
        return " ".join(tokens)
    return text

In [39]:
# Drop rows with missing text and create a new column with cleaned text
data = data.dropna(subset=['text'])
data['clean_text'] = data['text'].apply(clean_text)

In [40]:
# ---------------------------
# 4. Filter Data (Keep Only Three Sentiments)
# ---------------------------
data_ml = data[data['sentiment'].isin(['Positive', 'Negative', 'Neutral'])].copy()

# Create a simulated date column (for sentiment trend analysis)
data_ml['date'] = pd.date_range(start='2021-01-01', periods=len(data_ml), freq='h')
data_ml['day'] = data_ml['date'].dt.date

In [41]:
# ---------------------------
# 5. Exploratory Data Analysis (EDA)
# ---------------------------
# 5.1 Sentiment Distribution Plot
sentiment_counts = data_ml['sentiment'].value_counts().reindex(['Positive', 'Neutral', 'Negative']).reset_index()
sentiment_counts.columns = ['sentiment', 'count']
fig1 = px.bar(sentiment_counts, x='sentiment', y='count',
              title="Sentiment Distribution",
              labels={'sentiment': 'Sentiment', 'count': 'Count'},
              text='count')
fig1.update_traces(texttemplate='%{text}', textposition='outside')
fig1.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig1.show()

In [42]:
# 5.2 Word Clouds for Each Sentiment
sentiments = ['Positive', 'Neutral', 'Negative']
for sentiment in sentiments:
    text_data = " ".join(data_ml[data_ml['sentiment'] == sentiment]['clean_text'].tolist())
    wc = WordCloud(width=800, height=400, background_color='white').generate(text_data)
    fig_wc = px.imshow(wc.to_array(), title=f"Word Cloud for {sentiment} Tweets")
    fig_wc.update_xaxes(visible=False)
    fig_wc.update_yaxes(visible=False)
    fig_wc.update_layout(coloraxis_showscale=False)
    fig_wc.show()

In [43]:
# 5.3 Sentiment Trends Over Time
sentiment_trends = data_ml.groupby(['day', 'sentiment']).size().unstack(fill_value=0).reset_index()
fig3 = px.line(sentiment_trends, x='day', y=['Positive', 'Neutral', 'Negative'],
               title='Sentiment Trends Over Time',
               labels={'value': 'Number of Tweets', 'day': 'Date'})
fig3.update_layout(xaxis_title="Date", yaxis_title="Number of Tweets")
fig3.show()

In [51]:
# ---------------------------
# 6. Preprocessing for Machine Learning Models
# ---------------------------
# Use the filtered data for training and testing
X = data_ml['clean_text']
y = data_ml['sentiment']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [52]:
# For the deep learning model, use Tokenizer and pad sequences
max_words = 5000   # vocabulary size
max_len = 100      # maximum sequence length

In [53]:
tokenizer_dl = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer_dl.fit_on_texts(X_train)
X_train_seq = tokenizer_dl.texts_to_sequences(X_train)
X_test_seq = tokenizer_dl.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

In [54]:
# Encode labels for deep learning (and get number of classes)
y_train_encoded, label_mapping = pd.factorize(y_train)
y_test_encoded, _ = pd.factorize(y_test)
num_classes = len(np.unique(y_train_encoded))
y_train_cat = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)

In [55]:
# ---------------------------
# 3. Classical Machine Learning Models with Hyperparameter Tuning
# ---------------------------

# Model 1: Logistic Regression (baseline)
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)
print("Logistic Regression Performance:")
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))

Logistic Regression Performance:
              precision    recall  f1-score   support

    Negative       0.79      0.81      0.80      4427
     Neutral       0.71      0.69      0.70      3678
    Positive       0.76      0.76      0.76      4120

    accuracy                           0.75     12225
   macro avg       0.75      0.75      0.75     12225
weighted avg       0.75      0.75      0.75     12225

Accuracy: 0.7539468302658486


In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  GridSearchCV
# Model 2: Random Forest with GridSearchCV for hyperparameter tuning
rf_model = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [20, 40],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
grid_rf = GridSearchCV(rf_model, param_grid_rf, cv=3, n_jobs=-1, scoring='accuracy')
grid_rf.fit(X_train_tfidf, y_train)
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test_tfidf)
print("Random Forest Best Params:", grid_rf.best_params_)
print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))

Random Forest Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 40}
Random Forest Performance:
              precision    recall  f1-score   support

    Negative       0.92      0.90      0.91      4427
     Neutral       0.91      0.86      0.88      3678
    Positive       0.85      0.92      0.88      4120

    accuracy                           0.89     12225
   macro avg       0.89      0.89      0.89     12225
weighted avg       0.90      0.89      0.89     12225

Accuracy: 0.8936605316973415


In [None]:

# Build a deeper LSTM model with additional layers and increased epochs.
embedding_dim = 128

def build_lstm_model(embedding_dim=128, lstm_units1=128, lstm_units2=64, dropout_rate=0.5):
    model = Sequential([
        Embedding(max_words, embedding_dim, input_length=max_len),
        SpatialDropout1D(0.2),
        # First LSTM layer with return_sequences=True to stack another LSTM layer
        LSTM(lstm_units1, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
        # Second LSTM layer (no return_sequences)
        LSTM(lstm_units2, dropout=0.2, recurrent_dropout=0.2),
        Dense(64, activation='relu'),
        Dropout(dropout_rate),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

lstm_model = build_lstm_model()
lstm_model.summary()

# Train the LSTM model for more epochs (e.g., 10 epochs; adjust as needed)
history = lstm_model.fit(X_train_pad, y_train_cat,
                         epochs=10,
                         batch_size=128,
                         validation_split=0.2,
                         verbose=1)

# Evaluate on test data
loss, lstm_accuracy = lstm_model.evaluate(X_test_pad, y_test_cat, verbose=0)
print("\nDeep LSTM Model Accuracy:", lstm_accuracy)

Epoch 1/10
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m238s[0m 725ms/step - accuracy: 0.3650 - loss: 1.0961 - val_accuracy: 0.3672 - val_loss: 1.0947
Epoch 2/10
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m248s[0m 708ms/step - accuracy: 0.3649 - loss: 1.0958 - val_accuracy: 0.3672 - val_loss: 1.0944
Epoch 3/10
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 702ms/step - accuracy: 0.3627 - loss: 1.0958 - val_accuracy: 0.3672 - val_loss: 1.0947
Epoch 4/10
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m263s[0m 705ms/step - accuracy: 0.3708 - loss: 1.0950 - val_accuracy: 0.3672 - val_loss: 1.0946
Epoch 5/10
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m260s[0m 699ms/step - accuracy: 0.3670 - loss: 1.0948 - val_accuracy: 0.3672 - val_loss: 1.0945
Epoch 6/10
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m261s[0m 697ms/step - accuracy: 0.3674 - loss: 1.0950 - val_accuracy: 0.3672 - val_loss: 1.0944
Epoc

In [None]:
# ---------------------------
# 5. Model Comparison and Evaluation
# ---------------------------
# Collect accuracies from all models
models_accuracy = {
    'Logistic Regression': accuracy_score(y_test, y_pred_lr),
    'Random Forest': accuracy_score(y_test, y_pred_rf),
    'Deep LSTM': lstm_accuracy
}

print("\nOverall Model Comparison (Accuracy):")
for model_name, acc in models_accuracy.items():
    print(f"{model_name}: {acc:.4f}")

# Plot confusion matrices for the classical models
def plot_confusion_matrix(y_true, y_pred, title):
    cm = confusion_matrix(y_true, y_pred, labels=['Positive','Neutral','Negative'])
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
                xticklabels=['Positive','Neutral','Negative'],
                yticklabels=['Positive','Neutral','Negative'])
    plt.xlabel('Predicted')
    plt.ylabel('True')
    plt.title(title)
    plt.show()

plot_confusion_matrix(y_test, y_pred_lr, "Logistic Regression Confusion Matrix")
plot_confusion_matrix(y_test, y_pred_rf, "Random Forest Confusion Matrix")
# For the deep learning model, predictions need to be extracted from probabilities.
y_pred_lstm = np.argmax(lstm_model.predict(X_test_pad), axis=1)
# Map encoded predictions back to sentiment labels
labels = label_mapping
y_test_labels = y_test.values  # original labels
# You can compare using classification_report if you map predictions back properly.
print("\nDeep LSTM Classification Report:")
from sklearn.metrics import classification_report
# Assuming the order of label mapping is the same:
print(classification_report(y_test_encoded, y_pred_lstm, target_names=labels))