# Sentiment Analysis of Twitter and Reddit Dataset using LSTM

### Importing all libraries

In [None]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, BatchNormalization
import pandas
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
import matplotlib.pyplot as plt
from tensorflow.keras.callbacks import LearningRateScheduler
import numpy as np
from tensorflow.keras.callbacks import ReduceLROnPlateau
from google.colab import drive
from tensorflow.keras import regularizers

In [None]:
# Fetch the NLTK resources this notebook relies on: 'punkt' is the tokenizer
# data behind word_tokenize, and 'stopwords' supplies stopwords.words('english').
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Reading the downloaded Dataset from Kaggle using pandas
(https://www.kaggle.com/datasets/cosmos98/twitter-and-reddit-sentimental-analysis-dataset)



In [None]:
# Load both corpora and give their text columns one shared name ('text') so
# the two frames line up column-for-column when they are stacked.
data_reddit = pandas.read_csv("/content/Reddit_Data.csv").rename(
    columns={'clean_comment': 'text'}
)
data_twitter = pandas.read_csv("/content/Twitter_Data.csv").rename(
    columns={'clean_text': 'text'}
)

# pandas.concat already returns a DataFrame, so no re-wrapping is needed;
# ignore_index=True renumbers the combined rows from 0.
data = pandas.concat([data_reddit, data_twitter], ignore_index=True)

### Removing all Null values

In [None]:
# Discard every row that has a missing value in any of its columns.
data = data.dropna(axis=0)

### Defining functions to preprocess and tokenize the dataset


In [None]:
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def preprocess_text(text):
    """Clean one raw text so it is ready for tokenization.

    The text is lower-cased and split with NLTK's ``word_tokenize``. Tokens
    that are not purely alphanumeric (``str.isalnum``) and English stop words
    are dropped, and the remaining tokens are joined with single spaces.

    Args:
        text: The value to clean. Anything that is not a ``str`` (for
            example ``None`` or a float NaN) is treated as missing.

    Returns:
        The cleaned text, or an empty string when ``text`` is not a ``str``.
    """
    if not isinstance(text, str):
        return ""  # NaN or other non-string values carry no text

    # The stop-word set is loaded once and reused; this function is applied
    # row by row, so rebuilding the set on every call would be wasted work.
    stop_words = _english_stop_words()

    tokens = [
        token
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    return ' '.join(tokens)

def tokenize(item, max_length=65):
    """Fit a Keras ``Tokenizer`` on ``item`` and return fixed-length sequences.

    Args:
        item: The collection of texts to fit the tokenizer on and encode.
        max_length: Length every sequence is padded or truncated to.
            Defaults to 65, the value that used to be hard-coded here.

    Returns:
        A ``(padded_sequences, vocab_size)`` tuple: the integer-encoded texts
        padded/truncated at the end to ``max_length``, and the number of
        entries in the tokenizer's word index plus one.

    Note:
        The fitted tokenizer is local to this call and is not returned, so
        the same word-to-index mapping cannot be reused on other texts.
    """
    tokenizer = Tokenizer(oov_token="<OOV>")
    tokenizer.fit_on_texts(item)
    sequences = tokenizer.texts_to_sequences(item)
    padded_sequences = pad_sequences(
        sequences, maxlen=max_length, padding='post', truncating='post'
    )
    # +1 because Keras word indices start at 1; index 0 is the padding value.
    vocab_size = len(tokenizer.word_index) + 1
    return padded_sequences, vocab_size

def lr_schedule(epoch, lr):
    """Scale the incoming learning rate by ``np.exp(1)``.

    Args:
        epoch: Epoch index; accepted but not used in the computation.
        lr: The current learning rate.

    Returns:
        ``lr`` multiplied by ``np.exp(1)``.
    """
    # NOTE(review): this factor is greater than 1, so the rate grows on every
    # call -- confirm that a decay (negative exponent) was not intended.
    growth_factor = np.exp(1)
    return lr * growth_factor

### Ordinally mapping the sentiment labels as **(-1,0,1)-->(0,1,2)**
*This step is optional*

In [None]:
# Shift the sentiment labels from {-1, 0, 1} to the class indices {0, 1, 2}.
ordinal_mapping = {-1: 0, 0: 1, 1: 2}

# Look every label up directly in the mapping; a label that is not one of the
# three keys raises KeyError.
data['encoded_sentiment'] = [ordinal_mapping[label] for label in data['category']]

# Clean each text with preprocess_text and keep the result in its own column.
data['preprocessed_text'] = data['text'].map(preprocess_text)

### Tokenizing, Splitting and Converting all data to tensors

In [None]:
# Pull the model inputs and targets out of the frame (pop also removes the
# columns from `data`), then encode them: texts become padded integer
# sequences and labels become one-hot vectors over the 3 classes.
x, vocab_size = tokenize(data.pop("preprocessed_text"))
y = to_categorical(data.pop('encoded_sentiment'), 3)

# Hold out 15% of the samples for testing (fixed seed for repeatability) and
# convert each of the four resulting arrays to a TensorFlow tensor.
X_train, X_test, y_train, y_test = (
    tf.convert_to_tensor(part)
    for part in train_test_split(x, y, test_size=0.15, random_state=42)
)

### Defining all Callbacks for early stopping and learning rate reduction

In [None]:
# Both sizes are 65: the embedding vector width and the input sequence length
# handed to the Embedding layer.
embedding_dim = max_length = 65

# Stop training once val_loss has failed to improve for 3 epochs and roll the
# model back to its best weights.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Multiply the learning rate by 0.6 as soon as val_loss stops improving
# (patience=0), never going below 1e-6; verbose=1 logs each reduction.
lr_scheduler = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.6,
    patience=0,
    verbose=1,
    min_lr=1e-6,
)

### Initializing a sequential model using a recurrent neural network (LSTM)


In [None]:
# Fix TensorFlow's random seed before any layer is created.
tf.random.set_seed(42)

# Embedding -> four stacked 8-unit LSTMs with dropout between them (batch
# normalisation after the first) -> 3-way softmax. Only the last LSTM
# collapses the sequence to a single vector for the classifier head.
model = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(8, return_sequences=True),
    BatchNormalization(),
    Dropout(rate=0.4),
    LSTM(8, return_sequences=True),
    Dropout(rate=0.4),
    LSTM(8, return_sequences=True),
    Dropout(rate=0.2),
    LSTM(8),
    Dropout(rate=0.2),
    Dense(3, activation='softmax'),
])

# Categorical cross-entropy pairs with the softmax output; accuracy is tracked
# as an additional metric.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.004)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Train for up to 25 epochs, validating on 20% of the training data; the
# callbacks may lower the learning rate or end training early.
history = model.fit(
    X_train,
    y_train,
    epochs=25,
    validation_split=0.2,
    callbacks=[early_stopping, lr_scheduler],
)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 4: ReduceLROnPlateau reducing learning rate to 0.002400000113993883.
Epoch 5/25
Epoch 5: ReduceLROnPlateau reducing learning rate to 0.00144000006839633.
Epoch 6/25
Epoch 6: ReduceLROnPlateau reducing learning rate to 0.0008640000130981206.


### Evaluating my model

In [None]:
# Score the trained model on the held-out test tensors; the result is the
# loss followed by the metrics the model was compiled with.
model.evaluate(x=X_test, y=y_test)



[0.3028303384780884, 0.9109201431274414]

### Saving it to my Google Drive

In [None]:
# Mount Google Drive inside the Colab runtime, then write the trained model
# to the Colab Notebooks folder as an HDF5 (.h5) file.
drive.mount('/content/drive')
model_path = '/content/drive/My Drive/Colab Notebooks/sentiment_analysis_model_combined2.h5'
model.save(model_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
