<a href="https://colab.research.google.com/github/KaifAhmad1/sentiment-analyzer/blob/main/NEXA_Sentiment_Analysis_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Sentiment Analysis System Using Tweets**

In [None]:
!pip install -q dash

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m45.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m101.7/101.7 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m228.0/228.0 kB[0m [31m16.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
import os
import re
import numpy as np
import pandas as pd
from wordcloud import WordCloud

# NLTK imports for text preprocessing
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Sklearn for ML models
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# For deep learning model (optional, not integrated into dash below)
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D
from tensorflow.keras.utils import to_categorical

# Plotly for interactive visualizations
import plotly.express as px
import plotly.graph_objects as go

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
from google.colab import drive

# Mount Google Drive so the CSV files stored under MyDrive are readable from Colab.
drive.mount('/content/drive')

# Dataset locations. test_path points at the validation CSV; it is only defined
# here, not read in this cell.
train_path = '/content/drive/MyDrive/Twitter Sentiment Data/twitter_training.csv'
test_path = '/content/drive/MyDrive/Twitter Sentiment Data/twitter_validation.csv'

# The training CSV has no header row. With the default header=0 pandas would
# silently turn the first tweet into the column names (and drop it from the
# data), so read every line as data and name the four columns explicitly.
data = pd.read_csv(train_path, header=None, names=['id', 'topic', 'sentiment', 'text'])
data.head(25)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
5,2402,Borderlands,Positive,So I spent a few hours making something for fu...
6,2402,Borderlands,Positive,So I spent a couple of hours doing something f...
7,2402,Borderlands,Positive,So I spent a few hours doing something for fun...
8,2402,Borderlands,Positive,So I spent a few hours making something for fu...
9,2402,Borderlands,Positive,2010 So I spent a few hours making something f...


In [None]:
# Read the training CSV treating every line as data (the file has no header
# row), then label the four columns positionally.
column_names = ['id', 'topic', 'sentiment', 'text']
data = pd.read_csv(train_path, header=None).set_axis(column_names, axis=1)

In [None]:
# Column dtypes and non-null counts; DataFrame.info() prints its report itself,
# so it is not wrapped in print().
print("\nData Information:")
data.info()


Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74682 non-null  int64 
 1   topic      74682 non-null  object
 2   sentiment  74682 non-null  object
 3   text       73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [None]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns.
print("\nStatistical Summary for Numeric Columns:")
print(data.describe())


Statistical Summary for Numeric Columns:
                 id
count  74682.000000
mean    6432.586165
std     3740.427870
min        1.000000
25%     3195.000000
50%     6422.000000
75%     9601.000000
max    13200.000000


In [None]:
# Number of rows per sentiment label, most frequent first.
print("\nSentiment Distribution:")
print(data['sentiment'].value_counts())


Sentiment Distribution:
sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64


In [None]:
# -------------------------------
# Data Cleaning Function
# -------------------------------
# Stop-word sets keyed by language. Filled on first use so the NLTK corpus is
# read once, not on every clean_text() call (the function is applied per row).
_STOP_WORD_CACHE = {}


def clean_text(text):
    """Normalize one raw text for the downstream vectorizers.

    Steps, in order: lowercase, strip digits, strip punctuation, collapse
    whitespace, tokenize with NLTK's ``word_tokenize`` and drop English
    stop words.

    Args:
        text: The raw text. Non-string values (e.g. ``None`` or NaN) are
            returned unchanged.

    Returns:
        The remaining tokens joined by single spaces when ``text`` is a
        string; otherwise ``text`` itself.
    """
    if not isinstance(text, str):
        return text

    stop_words = _STOP_WORD_CACHE.get('english')
    if stop_words is None:
        stop_words = set(stopwords.words('english'))
        _STOP_WORD_CACHE['english'] = stop_words

    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace

    # Remove stopwords
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    return " ".join(tokens)

nltk.download('punkt_tab')

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [None]:
# Keep only rows that have text, then derive the cleaned column from it
data = (
    data
    .dropna(subset=['text'])
    .assign(clean_text=lambda frame: frame['text'].apply(clean_text))
)

In [None]:
# Restrict the modelling frame to the three sentiment classes of interest;
# any other label (e.g., 'Irrelevant') is left out.
kept_sentiments = ['Positive', 'Negative', 'Neutral']
is_kept = data['sentiment'].isin(kept_sentiments)
data_ml = data.loc[is_kept].copy()

In [None]:
# The dataset has no timestamps, so fabricate one hourly timestamp per row
# (starting 2021-01-01) purely to drive the sentiment trend plots.
hourly_stamps = pd.date_range(start='2021-01-01', periods=len(data_ml), freq='h')
data_ml['date'] = hourly_stamps
data_ml['day'] = hourly_stamps.date

In [None]:
# =============================================
# STEP 2: EXPLORATORY DATA ANALYSIS (EDA) using Plotly
# =============================================

# 1. Interactive Sentiment Distribution Plot
# Count rows per class, put the bars in a fixed display order, and flatten the
# result into a two-column frame for Plotly.
display_order = ['Positive', 'Neutral', 'Negative']
sentiment_counts = (
    data_ml['sentiment']
    .value_counts()
    .reindex(display_order)
    .rename_axis('sentiment')
    .reset_index(name='count')
)

fig1 = px.bar(
    sentiment_counts,
    x='sentiment',
    y='count',
    text='count',
    title="Sentiment Distribution",
    labels={'sentiment': 'Sentiment', 'count': 'Count'},
)
# Print each bar's count above it; hide labels that would need a font below 8.
fig1.update_traces(texttemplate='%{text}', textposition='outside')
fig1.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig1.show()

In [None]:
# 2. Interactive Word Clouds for each sentiment category
sentiments = ['Positive', 'Neutral', 'Negative']
for sentiment in sentiments:
    # One big document per class: every cleaned text of that class, space-joined.
    rows_for_sentiment = data_ml['sentiment'] == sentiment
    text_data = " ".join(data_ml.loc[rows_for_sentiment, 'clean_text'].tolist())

    wc = WordCloud(width=800, height=400, background_color='white').generate(text_data)

    # Render the cloud as an image through Plotly, with axes and colour bar hidden.
    fig_wc = px.imshow(wc.to_array(), title=f"Word Cloud for {sentiment} Reviews")
    fig_wc.update_xaxes(visible=False).update_yaxes(visible=False)
    fig_wc.update_layout(coloraxis_showscale=False)
    fig_wc.show()

In [None]:
# 3. Interactive Sentiment Trends Over Time
# Rows per (day, sentiment) pair, pivoted to one column per sentiment; days on
# which a class has no rows get 0. 'day' ends up as a regular column.
daily_counts = data_ml.groupby(['day', 'sentiment']).size()
sentiment_trends = daily_counts.unstack(fill_value=0).reset_index()

fig3 = px.line(
    sentiment_trends,
    x='day',
    y=['Positive', 'Neutral', 'Negative'],
    title='Sentiment Trends Over Time',
    labels={'value': 'Number of Reviews', 'day': 'Date'},
)
fig3.update_layout(xaxis_title="Date", yaxis_title="Number of Reviews")
fig3.show()

In [None]:
# =============================================
# STEP 3A: MACHINE LEARNING MODEL DEVELOPMENT (Logistic Regression)
# =============================================

# Prepare data for classical ML
# Features are the cleaned texts, targets the sentiment labels (as strings).
X = data_ml['clean_text']
y = data_ml['sentiment']

# 80/20 train/test split with a fixed seed so the split is repeatable.
# X_train / X_test / y_train / y_test are reused by the LSTM cell below.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization
# The vocabulary (capped at 5000 terms) is learned from the training split only
# and then applied unchanged to the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Train a Logistic Regression model
# tfidf_vectorizer and lr_model are also used by predict_sentiment() in the
# dashboard cell.
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)

# Predictions and evaluation
# Per-class precision/recall/F1 followed by overall accuracy on the test split.
y_pred_lr = lr_model.predict(X_test_tfidf)
print("\nLogistic Regression Classification Report:")
print(classification_report(y_test, y_pred_lr))
print("Logistic Regression Accuracy:", accuracy_score(y_test, y_pred_lr))


Logistic Regression Classification Report:
              precision    recall  f1-score   support

    Negative       0.79      0.81      0.80      4427
     Neutral       0.71      0.69      0.70      3678
    Positive       0.76      0.76      0.76      4120

    accuracy                           0.75     12225
   macro avg       0.75      0.75      0.75     12225
weighted avg       0.75      0.75      0.75     12225

Logistic Regression Accuracy: 0.7539468302658486


In [None]:
# =============================================
# STEP 3B: DEEP LEARNING MODEL DEVELOPMENT (LSTM)
# =============================================

# For the LSTM model, we use a Tokenizer to vectorize text and pad sequences.
max_words = 5000  # Vocabulary size
max_len = 100     # Maximum review length

# Seed TensorFlow so weight initialisation and dropout are repeatable.
tf.random.set_seed(42)

# Tokenize the clean text. Words outside the max_words vocabulary (or unseen at
# fit time) map to an explicit out-of-vocabulary marker.
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)

# Pad sequences so that they have equal length (zeros appended at the end)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Encode labels as integers. The class -> integer mapping is learned from the
# training labels ONLY and reused for the test labels: pd.factorize numbers
# classes by order of first appearance, so factorizing each split separately
# can assign different integers to the same sentiment and make the test
# evaluation meaningless.
y_train_encoded, label_mapping = pd.factorize(y_train)
y_test_encoded = label_mapping.get_indexer(y_test)
if (y_test_encoded < 0).any():
    raise ValueError("y_test contains a label that never appears in y_train")
num_classes = len(label_mapping)

# Convert labels to categorical (one-hot) for Keras; num_classes is passed
# explicitly so both splits get the same width.
y_train_cat = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)

# Build the LSTM model
embedding_dim = 64
lstm_model = Sequential([
    # Explicit input shape replaces the deprecated Embedding(input_length=...)
    # and lets summary() report parameter counts before training.
    tf.keras.Input(shape=(max_len,)),
    # mask_zero=True makes the LSTM ignore the trailing padding zeros. Without
    # a mask the recurrent state is pushed through up to max_len padding steps
    # after the last real token; the earlier run of this cell stayed at chance
    # level (loss ~ ln 3).
    Embedding(max_words, embedding_dim, mask_zero=True),
    SpatialDropout1D(0.2),
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dense(num_classes, activation='softmax')
])

lstm_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
lstm_model.summary()

# Train the LSTM model (the last 20% of the training data is held out for validation)
history = lstm_model.fit(X_train_pad, y_train_cat,
                         epochs=5,
                         batch_size=128,
                         validation_split=0.2,
                         verbose=1)

# Evaluate on test set
loss, accuracy = lstm_model.evaluate(X_test_pad, y_test_cat, verbose=0)
print("\nLSTM Model Accuracy:", accuracy)


Argument `input_length` is deprecated. Just remove it.



Epoch 1/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m112s[0m 306ms/step - accuracy: 0.3633 - loss: 1.0954 - val_accuracy: 0.3672 - val_loss: 1.0950
Epoch 2/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m118s[0m 230ms/step - accuracy: 0.3654 - loss: 1.0948 - val_accuracy: 0.3672 - val_loss: 1.0944
Epoch 3/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 228ms/step - accuracy: 0.3691 - loss: 1.0951 - val_accuracy: 0.3672 - val_loss: 1.0946
Epoch 4/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 227ms/step - accuracy: 0.3704 - loss: 1.0944 - val_accuracy: 0.3672 - val_loss: 1.0948
Epoch 5/5
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 221ms/step - accuracy: 0.3642 - loss: 1.0953 - val_accuracy: 0.3672 - val_loss: 1.0944

LSTM Model Accuracy: 0.30085888504981995


In [None]:
# =============================================
# STEP 4: DASHBOARD FOR VISUALIZATION & INSIGHTS
# =============================================
import dash
from dash import dcc, html, Input, Output, State
import plotly.express as px

# Prepare data for top words analysis
from collections import Counter

def get_top_words(sentiment_label, top_n=10, df=None):
    """Return the most frequent cleaned words for one sentiment class.

    Args:
        sentiment_label: Value of the ``sentiment`` column to select.
        top_n: Maximum number of words to return.
        df: Frame with ``sentiment`` and ``clean_text`` columns. Defaults to
            the notebook-level ``data_ml`` frame, looked up at call time.

    Returns:
        A DataFrame with columns ``word`` and ``count``, most frequent first;
        ties keep the order in which the words were first seen. Empty when no
        row matches ``sentiment_label``.
    """
    frame = data_ml if df is None else df
    texts = frame.loc[frame['sentiment'] == sentiment_label, 'clean_text']

    # Count document by document; skip missing entries and coerce anything
    # non-string so a stray value cannot break the word split.
    word_counts = Counter(
        word
        for doc in texts.dropna().astype(str)
        for word in doc.split()
    )
    return pd.DataFrame(word_counts.most_common(top_n), columns=['word', 'count'])

top_positive = get_top_words('Positive')
top_negative = get_top_words('Negative')

# Prepare sentiment trend data (daily aggregation)
# sentiment_trends already has 'day' as a regular column, so resetting the
# index again would only add a stray 'index' column. Work on a copy so the
# frame used by the EDA plot is not modified, and only reset the index if
# 'day' is still the index.
sentiment_trend_df = sentiment_trends.copy()
if 'day' not in sentiment_trend_df.columns:
    sentiment_trend_df = sentiment_trend_df.reset_index()
# 'day' holds datetime.date objects; convert to proper datetimes for the time axis.
sentiment_trend_df['day'] = pd.to_datetime(sentiment_trend_df['day'])

# Define a function for sentiment prediction from raw text using the LR model
def predict_sentiment(review_text):
    """Classify a single raw text with the TF-IDF + logistic-regression model.

    The text goes through the same ``clean_text`` preprocessing used for
    training, is vectorized with the fitted ``tfidf_vectorizer`` and scored by
    ``lr_model``.

    Args:
        review_text: The raw text to classify.

    Returns:
        The predicted label for that one text.
    """
    features = tfidf_vectorizer.transform([clean_text(review_text)])
    predictions = lr_model.predict(features)
    return predictions[0]

# Initialize the Dash app
app = dash.Dash(__name__)


def _graph_panel(heading, graph_id, figure):
    """Wrap one figure in a bordered panel with a heading above it."""
    return html.Div(
        [html.H3(heading), dcc.Graph(id=graph_id, figure=figure)],
        style={'padding': '20px', 'border': '1px solid #ccc', 'marginTop': 20},
    )


# Free-text box + button; the callback writes its answer into 'prediction-output'.
_prediction_panel = html.Div(
    [
        html.H3("Real-time Sentiment Prediction"),
        dcc.Input(id='input-text', type='text', placeholder='Enter a review...', style={'width': '60%'}),
        html.Button('Predict', id='predict-button', n_clicks=0),
        html.Div(id='prediction-output', style={'marginTop': 20, 'fontSize': 20}),
    ],
    style={'padding': '20px', 'border': '1px solid #ccc'},
)

# Page structure: title, prediction panel, then three chart panels.
app.layout = html.Div([
    html.H1("Customer Reviews Sentiment Dashboard"),
    _prediction_panel,
    _graph_panel(
        "Top Words in Positive Reviews",
        'positive-words',
        px.bar(top_positive, x='word', y='count', title='Positive Reviews Top Words'),
    ),
    _graph_panel(
        "Top Words in Negative Reviews",
        'negative-words',
        px.bar(top_negative, x='word', y='count', title='Negative Reviews Top Words'),
    ),
    _graph_panel(
        "Sentiment Trends Over Time",
        'sentiment-trends',
        px.line(sentiment_trend_df, x='day', y=['Positive', 'Neutral', 'Negative'],
                title='Daily Sentiment Trends', labels={'value': 'Count', 'day': 'Date'}),
    ),
])

# Callback for real-time sentiment prediction
@app.callback(
    Output('prediction-output', 'children'),
    Input('predict-button', 'n_clicks'),
    State('input-text', 'value')
)
def update_prediction(n_clicks, input_text):
    """Dash callback: show the predicted sentiment for the text in the input box.

    Fires when the Predict button's click count changes; the input box value
    is read as state, so typing alone does not trigger it.

    Args:
        n_clicks: Click count of the Predict button (may be ``None``).
        input_text: Current content of the input box (may be ``None``).

    Returns:
        The text to display in the 'prediction-output' element.
    """
    # Treat a missing click count as "not clicked" instead of comparing None > 0.
    has_clicked = (n_clicks or 0) > 0
    # Whitespace-only input carries nothing to classify, so ask for real text.
    has_text = isinstance(input_text, str) and input_text.strip() != ""

    if has_clicked and has_text:
        sentiment = predict_sentiment(input_text)
        return f"Predicted Sentiment: {sentiment}"
    return "Enter a review and click 'Predict'"

if __name__ == '__main__':
    # Start the Dash development server. app.run() replaces app.run_server(),
    # which newer Dash releases no longer provide; this notebook installs dash
    # unpinned, so it gets whatever release is current.
    app.run(debug=True)

<IPython.core.display.Javascript object>