In [35]:
# Import necessary libraries
import pandas as pd
import numpy as np
import nltk
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.metrics import classification_report

In [37]:
# Read the data from the given CSV file
data = pd.read_csv(filepath_or_buffer='Twitter_Data.csv')  # relative path, resolved against the current working directory
data.head(n=5)  # last expression of the cell: previews the first five rows

Unnamed: 0,clean_text,category
0,when modi promised “minimum government maximum...,-1.0
1,talk all the nonsense and continue all the dra...,0.0
2,what did just say vote for modi welcome bjp t...,1.0
3,asking his supporters prefix chowkidar their n...,1.0
4,answer who among these the most powerful world...,1.0


In [38]:
# Change the dependent variable to categorical labels.
# Series.map turns every value missing from the mapping into NaN, so running
# this cell a second time on the already-mapped strings would blank out every
# label (and the following dropna would then remove every row). Guarding on the
# dtype makes the cell safe to re-run: once the column holds strings it is no
# longer numeric and the mapping is skipped.
if pd.api.types.is_numeric_dtype(data['category']):
    data['category'] = data['category'].map({-1: 'Negative', 0: 'Neutral', 1: 'Positive'})


In [39]:
# Drop every row that has a missing value in any column
# (axis=0 and how='any' are the pandas defaults, spelled out here).
data.dropna(axis=0, how='any', inplace=True)

In [42]:
# Text cleaning
def clean_text(text):
    """Lower-case ``text`` and drop every character that is neither
    alphanumeric nor whitespace.

    Removed characters are deleted, not replaced by a space, so words joined
    only by punctuation are merged (e.g. "a-b" becomes "ab").
    """
    lowered = text.lower()
    kept_chars = (ch for ch in lowered if ch.isspace() or ch.isalnum())
    return ''.join(kept_chars)


# Run the cleaner over every tweet and overwrite the column with the result
data['clean_text'] = data['clean_text'].map(clean_text)



In [43]:
# Add a column holding each tweet's word count
def _count_words(sentence):
    """Return the number of whitespace-separated tokens in ``sentence``."""
    return len(sentence.split())


data['sentence_length'] = data['clean_text'].map(_count_words)

In [44]:
# Separate the independent variable (X: the cleaned tweet text)
# from the dependent variable (y: the sentiment category)
X, y = data['clean_text'], data['category']

In [45]:
# Text preprocessing: turn each tweet into a sequence of integer word ids, then pad
# NOTE(review): the tokenizer is fitted on all of X before the train/test split
# is made further down, so the vocabulary also reflects test-set text — confirm
# this is acceptable.
max_words = 10_000  # vocabulary size handed to the Tokenizer; also used as the Embedding's input_dim
tokenizer = Tokenizer(oov_token="<OOV>", num_words=max_words)
tokenizer.fit_on_texts(texts=X)
X_sequences = tokenizer.texts_to_sequences(texts=X)
# No maxlen is given: per the Keras docs, sequences are then padded to the longest one
X_padded = pad_sequences(sequences=X_sequences)

In [46]:
# Encode the string labels in y as integer class ids
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [47]:
# Hold out 20% of the samples as a test set (fixed seed keeps the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_encoded,
    random_state=42,
    test_size=0.2,
)

In [48]:
# Build and compile the LSTM classifier
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=X_padded.shape[1]),
    LSTM(128, dropout=0.2, recurrent_dropout=0.2),
    Dense(3, activation='softmax'),  # one output per class: Negative, Neutral, Positive
])
# sparse_categorical_crossentropy: the targets are integer class ids, not one-hot vectors
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [49]:
# Train the model.
# Validation uses a slice carved out of the training data (validation_split)
# rather than (X_test, y_test), so the test set is not consulted at all during
# training and the metrics computed on it afterwards remain a hold-out
# estimate. The History object is kept in a variable so the per-epoch curves
# can be inspected and its repr is not echoed as the cell output.
history = model.fit(
    X_train,
    y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.1,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x23e0c936700>

In [50]:
# Predict class probabilities for the test set, then keep the most probable class per row
y_pred = model.predict(X_test)
y_pred_normalized = y_pred.argmax(axis=1)



In [51]:
# Accuracy = share of test samples whose predicted class equals the true class
is_correct = y_pred_normalized == y_test
accuracy = np.mean(is_correct)
print(f"Accuracy: {accuracy:.2f}")


Accuracy: 0.97


In [52]:
# Per-class precision / recall / F1, with rows labelled by the original class names
target_names = label_encoder.classes_
report = classification_report(y_true=y_test, y_pred=y_pred_normalized, target_names=target_names)
print(report)

              precision    recall  f1-score   support

    Negative       0.94      0.95      0.95      7152
     Neutral       0.98      0.98      0.98     11067
    Positive       0.98      0.97      0.97     14375

    accuracy                           0.97     32594
   macro avg       0.97      0.97      0.97     32594
weighted avg       0.97      0.97      0.97     32594

