<a href="https://colab.research.google.com/github/KaifAhmad1/sentiment-analyzer/blob/main/NEXA_Sentiment_Analysis_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Sentiment Analysis System using Twitter Tweets.**

In [1]:
!pip install -q dash

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m5.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.0/228.0 kB[0m [31m9.9 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import re
import numpy as np
import pandas as pd
from wordcloud import WordCloud

# NLTK imports for text preprocessing
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sklearn for ML models
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# For deep learning model (optional, not integrated into dash below)
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from tensorflow.keras.utils import to_categorical

# Plotly for interactive visualizations
import plotly.express as px
import plotly.graph_objects as go

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [3]:
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')

# Load Dataset
train_path = '/content/drive/MyDrive/Twitter Sentiment Data/twitter_training.csv'
test_path = '/content/drive/MyDrive/Twitter Sentiment Data/twitter_validation.csv'
data = pd.read_csv(train_path)
data.head(25)

Mounted at /content/drive


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
5,2402,Borderlands,Positive,So I spent a few hours making something for fu...
6,2402,Borderlands,Positive,So I spent a couple of hours doing something f...
7,2402,Borderlands,Positive,So I spent a few hours doing something for fun...
8,2402,Borderlands,Positive,So I spent a few hours making something for fu...
9,2402,Borderlands,Positive,2010 So I spent a few hours making something f...


In [4]:
data = pd.read_csv(train_path, header=None)

# Assign descriptive column names based on the expected structure:
data.columns = ['id', 'topic', 'sentiment', 'text']

In [5]:
print("\nData Information:")
data.info()


Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74682 non-null  int64 
 1   topic      74682 non-null  object
 2   sentiment  74682 non-null  object
 3   text       73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [6]:
print("\nStatistical Summary for Numeric Columns:")
print(data.describe())


Statistical Summary for Numeric Columns:
                 id
count  74682.000000
mean    6432.586165
std     3740.427870
min        1.000000
25%     3195.000000
50%     6422.000000
75%     9601.000000
max    13200.000000


In [7]:
print("\nSentiment Distribution:")
print(data['sentiment'].value_counts())


Sentiment Distribution:
sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64


In [8]:
# -------------------------------
# Data Cleaning Function
# -------------------------------
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\d+', '', text)  # Remove numbers
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in stop_words]
        return " ".join(tokens)
    return text

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


True

In [9]:
# Drop rows with missing text and create a clean text column
data = data.dropna(subset=['text'])
data['clean_text'] = data['text'].apply(clean_text)

In [10]:
# Consider only Positive, Negative, and Neutral sentiments (drop others, e.g., 'Irrelevant')
data_ml = data[data['sentiment'].isin(['Positive', 'Negative', 'Neutral'])].copy()

In [11]:
# Create a simulated date column (for sentiment trend analysis)
data_ml['date'] = pd.date_range(start='2021-01-01', periods=len(data_ml), freq='h')
data_ml['day'] = data_ml['date'].dt.date

In [12]:
# =============================================
# STEP 2: EXPLORATORY DATA ANALYSIS (EDA) using Plotly
# =============================================

# 1. Interactive Sentiment Distribution Plot
sentiment_counts = data_ml['sentiment'].value_counts().reindex(['Positive', 'Neutral', 'Negative']).reset_index()
sentiment_counts.columns = ['sentiment', 'count']
fig1 = px.bar(sentiment_counts, x='sentiment', y='count',
              title="Sentiment Distribution",
              labels={'sentiment': 'Sentiment', 'count': 'Count'},
              text='count')
fig1.update_traces(texttemplate='%{text}', textposition='outside')
fig1.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig1.show()

In [13]:
# 2. Interactive Word Clouds for each sentiment category
sentiments = ['Positive', 'Neutral', 'Negative']
for sentiment in sentiments:
    text_data = " ".join(data_ml[data_ml['sentiment'] == sentiment]['clean_text'].tolist())
    wc = WordCloud(width=800, height=400, background_color='white').generate(text_data)
    # Convert the wordcloud to an image array and plot with Plotly
    fig_wc = px.imshow(wc.to_array(), title=f"Word Cloud for {sentiment} Reviews")
    fig_wc.update_xaxes(visible=False)
    fig_wc.update_yaxes(visible=False)
    fig_wc.update_layout(coloraxis_showscale=False)
    fig_wc.show()

In [14]:
# 3. Interactive Sentiment Trends Over Time
# Group by day and sentiment, then unstack for plotting
sentiment_trends = data_ml.groupby(['day', 'sentiment']).size().unstack(fill_value=0).reset_index()
fig3 = px.line(sentiment_trends, x='day', y=['Positive', 'Neutral', 'Negative'],
               title='Sentiment Trends Over Time',
               labels={'value': 'Number of Reviews', 'day': 'Date'})
fig3.update_layout(xaxis_title="Date", yaxis_title="Number of Reviews")
fig3.show()

In [15]:
# Split dataset for classical ML models
X = data['clean_text']
y = data['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [16]:
# TF-IDF vectorization (for classical ML models)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [17]:
# For the deep learning model, use Tokenizer and pad sequences
max_words = 5000   # vocabulary size
max_len = 100      # maximum sequence length

In [18]:
tokenizer_dl = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer_dl.fit_on_texts(X_train)
X_train_seq = tokenizer_dl.texts_to_sequences(X_train)
X_test_seq = tokenizer_dl.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

In [19]:
# Encode labels for deep learning (and get number of classes)
y_train_encoded, label_mapping = pd.factorize(y_train)
y_test_encoded, _ = pd.factorize(y_test)
num_classes = len(np.unique(y_train_encoded))
y_train_cat = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)

In [20]:
# ---------------------------
# 3. Classical Machine Learning Models with Hyperparameter Tuning
# ---------------------------

# Model 1: Logistic Regression (baseline)
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)
print("Logistic Regression Performance:")
print(classification_report(y_test, y_pred_lr))
print("Accuracy:", accuracy_score(y_test, y_pred_lr))

Logistic Regression Performance:
              precision    recall  f1-score   support

  Irrelevant       0.69      0.50      0.58      2696
    Negative       0.72      0.77      0.75      4380
     Neutral       0.62      0.64      0.63      3605
    Positive       0.67      0.72      0.69      4119

    accuracy                           0.68     14800
   macro avg       0.68      0.66      0.66     14800
weighted avg       0.68      0.68      0.67     14800

Accuracy: 0.6766216216216216


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import  GridSearchCV
# Model 2: Random Forest with GridSearchCV for hyperparameter tuning
rf_model = RandomForestClassifier(random_state=42)
param_grid_rf = {
    'n_estimators': [100, 200],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5]
}
grid_rf = GridSearchCV(rf_model, param_grid_rf, cv=3, n_jobs=-1, scoring='accuracy')
grid_rf.fit(X_train_tfidf, y_train)
best_rf = grid_rf.best_estimator_
y_pred_rf = best_rf.predict(X_test_tfidf)
print("Random Forest Best Params:", grid_rf.best_params_)
print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_pred_rf))


A worker stopped while some jobs were given to the executor. This can be caused by a too short worker timeout or by a memory leak.



In [None]:
import xgboost as xgb