<a href="https://colab.research.google.com/github/KaifAhmad1/sentiment-analyzer/blob/main/NEXA_Sentiment_Analysis_System.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Sentiment Analysis System using Twitter Tweets.**

In [4]:
!pip install -qU dash xgboost

In [5]:
import os
import re
import numpy as np
import pandas as pd
from wordcloud import WordCloud
import nltk
import joblib
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D, Dropout
from tensorflow.keras.utils import to_categorical
import xgboost as xgb

# Download NLTK data
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [6]:
from google.colab import drive
# Mount Google Drive
drive.mount('/content/drive')

# Load Dataset
train_path = '/content/drive/MyDrive/Twitter Sentiment Data/twitter_training.csv'
test_path = '/content/drive/MyDrive/Twitter Sentiment Data/twitter_validation.csv'
data = pd.read_csv(train_path)
data.head(25)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Unnamed: 0,2401,Borderlands,Positive,"im getting on borderlands and i will murder you all ,"
0,2401,Borderlands,Positive,I am coming to the borders and I will kill you...
1,2401,Borderlands,Positive,im getting on borderlands and i will kill you ...
2,2401,Borderlands,Positive,im coming on borderlands and i will murder you...
3,2401,Borderlands,Positive,im getting on borderlands 2 and i will murder ...
4,2401,Borderlands,Positive,im getting into borderlands and i can murder y...
5,2402,Borderlands,Positive,So I spent a few hours making something for fu...
6,2402,Borderlands,Positive,So I spent a couple of hours doing something f...
7,2402,Borderlands,Positive,So I spent a few hours doing something for fun...
8,2402,Borderlands,Positive,So I spent a few hours making something for fu...
9,2402,Borderlands,Positive,2010 So I spent a few hours making something f...


In [7]:
data = pd.read_csv(train_path, header=None)

# Assign descriptive column names based on the expected structure:
data.columns = ['id', 'topic', 'sentiment', 'text']

In [8]:
print("\nData Information:")
data.info()


Data Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 74682 entries, 0 to 74681
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   id         74682 non-null  int64 
 1   topic      74682 non-null  object
 2   sentiment  74682 non-null  object
 3   text       73996 non-null  object
dtypes: int64(1), object(3)
memory usage: 2.3+ MB


In [9]:
print("\nStatistical Summary for Numeric Columns:")
print(data.describe())


Statistical Summary for Numeric Columns:
                 id
count  74682.000000
mean    6432.586165
std     3740.427870
min        1.000000
25%     3195.000000
50%     6422.000000
75%     9601.000000
max    13200.000000


In [10]:
print("\nSentiment Distribution:")
print(data['sentiment'].value_counts())


Sentiment Distribution:
sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64


In [11]:
#  Data Cleaning
def clean_text(text):
    if isinstance(text, str):
        text = text.lower()  # Convert to lowercase
        text = re.sub(r'\d+', '', text)  # Remove numbers
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace
        # Remove stopwords
        stop_words = set(stopwords.words('english'))
        tokens = word_tokenize(text)
        tokens = [word for word in tokens if word not in stop_words]
        return " ".join(tokens)
    return text

In [12]:
# Drop rows with missing text and create a new column with cleaned text
data = data.dropna(subset=['text'])
data['clean_text'] = data['text'].apply(clean_text)

In [13]:
# Filter Data (Keep Only Three Sentiments)
data_ml = data[data['sentiment'].isin(['Positive', 'Negative', 'Neutral'])].copy()

# Create a simulated date column (for sentiment trend analysis)
data_ml['date'] = pd.date_range(start='2021-01-01', periods=len(data_ml), freq='h')
data_ml['day'] = data_ml['date'].dt.date

In [14]:
# Exploratory Data Analysis (EDA)
# Sentiment Distribution Plot
sentiment_counts = data_ml['sentiment'].value_counts().reindex(['Positive', 'Neutral', 'Negative']).reset_index()
sentiment_counts.columns = ['sentiment', 'count']
fig1 = px.bar(sentiment_counts, x='sentiment', y='count',
              title="Sentiment Distribution",
              labels={'sentiment': 'Sentiment', 'count': 'Count'},
              text='count')
fig1.update_traces(texttemplate='%{text}', textposition='outside')
fig1.update_layout(uniformtext_minsize=8, uniformtext_mode='hide')
fig1.show()

In [15]:
# Word Clouds for Each Sentiment
sentiments = ['Positive', 'Neutral', 'Negative']
for sentiment in sentiments:
    text_data = " ".join(data_ml[data_ml['sentiment'] == sentiment]['clean_text'].tolist())
    wc = WordCloud(width=800, height=400, background_color='white').generate(text_data)
    fig_wc = px.imshow(wc.to_array(), title=f"Word Cloud for {sentiment} Tweets")
    fig_wc.update_xaxes(visible=False)
    fig_wc.update_yaxes(visible=False)
    fig_wc.update_layout(coloraxis_showscale=False)
    fig_wc.show()

In [16]:
# Sentiment Trends Over Time
sentiment_trends = data_ml.groupby(['day', 'sentiment']).size().unstack(fill_value=0).reset_index()
fig3 = px.line(sentiment_trends, x='day', y=['Positive', 'Neutral', 'Negative'],
               title='Sentiment Trends Over Time',
               labels={'value': 'Number of Tweets', 'day': 'Date'})
fig3.update_layout(xaxis_title="Date", yaxis_title="Number of Tweets")
fig3.show()

In [17]:
# Split Data
X = data_ml['clean_text']
y = data_ml['sentiment']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [18]:
# Encoding for Classical Models (TF-IDF)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [19]:
# Logistic Regression
print("----- Logistic Regression -----")
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)
y_pred_lr = lr_model.predict(X_test_tfidf)
print("Logistic Regression Performance:")
print(classification_report(y_test, y_pred_lr))
lr_accuracy = accuracy_score(y_test, y_pred_lr)
lr_precision = precision_score(y_test, y_pred_lr, average='weighted')
lr_recall = recall_score(y_test, y_pred_lr, average='weighted')
lr_f1 = f1_score(y_test, y_pred_lr, average='weighted')

----- Logistic Regression -----
Logistic Regression Performance:
              precision    recall  f1-score   support

    Negative       0.79      0.81      0.80      4427
     Neutral       0.71      0.69      0.70      3678
    Positive       0.76      0.76      0.76      4120

    accuracy                           0.75     12225
   macro avg       0.75      0.75      0.75     12225
weighted avg       0.75      0.75      0.75     12225



In [20]:
# Random Forest
print("\n----- Random Forest -----")
rf_model = RandomForestClassifier(n_estimators=25, random_state=42)
rf_model.fit(X_train_tfidf, y_train)
y_pred_rf = rf_model.predict(X_test_tfidf)
print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))
rf_accuracy = accuracy_score(y_test, y_pred_rf)
rf_precision = precision_score(y_test, y_pred_rf, average='weighted')
rf_recall = recall_score(y_test, y_pred_rf, average='weighted')
rf_f1 = f1_score(y_test, y_pred_rf, average='weighted')


----- Random Forest -----
Random Forest Performance:
              precision    recall  f1-score   support

    Negative       0.92      0.90      0.91      4427
     Neutral       0.90      0.85      0.88      3678
    Positive       0.85      0.91      0.88      4120

    accuracy                           0.89     12225
   macro avg       0.89      0.89      0.89     12225
weighted avg       0.89      0.89      0.89     12225



In [21]:
# Model Training - Deep LSTM Model
max_words = 5000   # Vocabulary size
max_len = 100      # Maximum sequence length

# Tokenization and Padding
tokenizer_dl = Tokenizer(num_words=max_words, oov_token="")
tokenizer_dl.fit_on_texts(X_train)
X_train_seq = tokenizer_dl.texts_to_sequences(X_train)
X_test_seq = tokenizer_dl.texts_to_sequences(X_test)
X_train_pad = pad_sequences(X_train_seq, maxlen=max_len, padding='post', truncating='post')
X_test_pad = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')

# Encode labels for categorical training
y_train_encoded, label_mapping = pd.factorize(y_train)
y_test_encoded, _ = pd.factorize(y_test)
num_classes = len(np.unique(y_train_encoded))
y_train_cat = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_cat = to_categorical(y_test_encoded, num_classes=num_classes)

In [None]:
# Define and build the LSTM model
embedding_dim = 128
def build_lstm_model(embedding_dim=128, lstm_units1=128, lstm_units2=64, dropout_rate=0.5):
    model = Sequential([
        Embedding(max_words, embedding_dim, input_length=max_len),
        SpatialDropout1D(0.2),
        LSTM(lstm_units1, dropout=0.2, recurrent_dropout=0.2, return_sequences=True),
        LSTM(lstm_units2, dropout=0.2, recurrent_dropout=0.2),
        Dense(64, activation='relu'),
        Dropout(dropout_rate),
        Dense(num_classes, activation='softmax')
    ])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

lstm_model = build_lstm_model(embedding_dim)
lstm_model.summary()

# Train the LSTM model
history = lstm_model.fit(X_train_pad, y_train_cat,
                         epochs=10,
                         batch_size=128,
                         validation_split=0.2,
                         verbose=1)


Argument `input_length` is deprecated. Just remove it.



Epoch 1/10
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m343s[0m 1s/step - accuracy: 0.3534 - loss: 1.0963 - val_accuracy: 0.3672 - val_loss: 1.0946
Epoch 2/10
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m380s[0m 1s/step - accuracy: 0.3664 - loss: 1.0952 - val_accuracy: 0.3672 - val_loss: 1.0944
Epoch 3/10
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m379s[0m 1s/step - accuracy: 0.3660 - loss: 1.0955 - val_accuracy: 0.3672 - val_loss: 1.0955
Epoch 4/10
[1m306/306[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m381s[0m 1s/step - accuracy: 0.3706 - loss: 1.0948 - val_accuracy: 0.3672 - val_loss: 1.0944
Epoch 5/10


In [None]:
# Evaluate the LSTM model
loss, lstm_accuracy = lstm_model.evaluate(X_test_pad, y_test_cat, verbose=0)
y_pred_lstm = np.argmax(lstm_model.predict(X_test_pad), axis=1)
y_test_lstm = np.argmax(y_test_cat, axis=1)
print("\nDeep LSTM Model Performance:")
print(classification_report(y_test_lstm, y_pred_lstm))
lstm_precision = precision_score(y_test_lstm, y_pred_lstm, average='weighted')
lstm_recall = recall_score(y_test_lstm, y_pred_lstm, average='weighted')
lstm_f1 = f1_score(y_test_lstm, y_pred_lstm, average='weighted')

In [None]:
# Model Performance Comparison and Saving Best Model
# Create a DataFrame to compare performance
comparison_df = pd.DataFrame({
    'Model': ['Logistic Regression', 'Random Forest', 'LSTM'],
    'Accuracy': [lr_accuracy, rf_accuracy, lstm_accuracy],
    'Precision': [lr_precision, rf_precision, lstm_precision],
    'Recall': [lr_recall, rf_recall, lstm_recall],
    'F1 Score': [lr_f1, rf_f1, lstm_f1]
})

print("\nModel Comparison:")
print(comparison_df)

fig_comparison = px.bar(comparison_df, x='Model', y=['Accuracy', 'Precision', 'Recall', 'F1 Score'],
                        title='Model Performance Comparison', barmode='group')
fig_comparison.show()

# Ensure the "models" directory exists
os.makedirs('models', exist_ok=True)

# Map each model to its F1 score
f1_scores = {
    'Logistic Regression': lr_f1,
    'Random Forest': rf_f1,
    'LSTM': lstm_f1
}

# Identify the best model based on F1 score
best_model_name = max(f1_scores, key=f1_scores.get)
best_f1 = f1_scores[best_model_name]

print(f"\nBest performing model: {best_model_name} with F1 Score: {best_f1:.4f}")

In [None]:
# Save the best model accordingly
if best_model_name == 'Logistic Regression':
    joblib.dump(lr_model, 'models/best_lr_model.joblib')
    joblib.dump(tfidf_vectorizer, 'models/tfidf_vectorizer.joblib')
    print("Saved Logistic Regression model and TF-IDF vectorizer to 'models/'")
elif best_model_name == 'Random Forest':
    joblib.dump(rf_model, 'models/best_rf_model.joblib')
    joblib.dump(tfidf_vectorizer, 'models/tfidf_vectorizer.joblib')
    print("Saved Random Forest model and TF-IDF vectorizer to 'models/'")
elif best_model_name == 'LSTM':
    lstm_model.save('models/best_lstm_model.h5')
    print("Saved LSTM model to 'models/'")