<a href="https://colab.research.google.com/github/Mahendran180923/News_Senticonomy/blob/main/Sentiment_Analysis_%7C%7C_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
import kagglehub
jp797498e_twitter_entity_sentiment_analysis_path = kagglehub.dataset_download('jp797498e/twitter-entity-sentiment-analysis')

print('Data source import complete.')


# Howdy 🤠
## In this notebook I'll be doing **Sentiment Analysis** with an **LSTM (Long Short-Term Memory)** network
### If you aren't familiar with **Sentiment Analysis**: it's the process of identifying and categorizing the opinions expressed in a piece of text

### Anyway enough with the yapping and let's cook 😗

<img src='https://uploads.dailydot.com/2024/04/let-him-cook-meme-.jpg?q=65&auto=format&w=1600&ar=2:1&fit=crop' height = 420 width = 620></img>

## Agenda
1. [Introduction](#Introduction)
2. [Data Preprocessing](#Pre-Processing)
3. [Model Training](#Model-Building)
4. [Evaluation](#Evaluation)

## Importing libs and getting the dataset
<a id="Introduction"></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Load the training split of the Twitter entity sentiment dataset.
import os

# On Kaggle the dataset is mounted at this fixed location.
file_path = r"/kaggle/input/twitter-entity-sentiment-analysis/twitter_training.csv"

# Outside Kaggle (e.g. Colab) that mount does not exist, so fall back to the
# directory returned by kagglehub.dataset_download() in the first cell.
if not os.path.exists(file_path):
    file_path = os.path.join(
        jp797498e_twitter_entity_sentiment_analysis_path, "twitter_training.csv"
    )

# The file is read with header=None, so the four columns are named explicitly.
df = pd.read_csv(file_path, header=None, names=['number', 'Border', 'label', 'message'])

### In this dataset, the goal is to build a model that predicts whether a user's message expresses negative (-ve), positive (+ve), or neutral sentiment!

## Now let's discover the data and get to know it !

In [None]:
display(df.head())

In [None]:
display(df['label'].value_counts())

## Okay let's drop the Useless columns

In [None]:
df.drop(['Border' , 'number'] , axis=1 , inplace = True)

In [None]:
# Shape ?

df.shape

In [None]:
# Null values ?

df.isnull().sum()

## Let's Drop the null rows !

In [None]:
df.dropna(inplace = True)

In [None]:
df.shape

<img src='https://media1.tenor.com/m/UGLkFpDi-vsAAAAC/avada-kedavra.gif' height = 420 width = 620></img>

### That's good

## Pre-Processing
### Now let's start with the **Pre-processing** ⚙️
### 1st - LowerCasing text

In [None]:
df['message'] = df['message'].str.lower()

df.head()

### 2 - Removing HTML tags !

In [None]:
from bs4 import BeautifulSoup

def remove_html(text):
    """Parse ``text`` with BeautifulSoup's 'html.parser' and return only its text content."""
    return BeautifulSoup(text, 'html.parser').get_text()

In [None]:
df['message'] = df['message'].apply(remove_html)

display(df['message'].head(2))

### 3 - Removing URLs (using regular expressions, i.e. regex)

In [None]:
import re

def clean_url(text):
    """Remove URLs from ``text``.

    A URL is a whitespace-delimited token that starts with ``http://``,
    ``https://`` or ``www.``. The prefixes are anchored at a word boundary so
    ordinary words that merely contain those letters (for example "httpd" or
    "awwww") are left alone; the previous unanchored pattern deleted them.

    Args:
        text (str): Input text.

    Returns:
        str: ``text`` with every URL replaced by the empty string. Surrounding
        whitespace is kept as-is.
    """
    return re.sub(r'\bhttps?://\S+|\bwww\.\S+', '', text, flags=re.IGNORECASE)

In [None]:
df['message'] = df['message'].apply(clean_url)

df.head()

### 4 - Removing punctuation

In [None]:
def remove_punctuation(text):
    """Drop every character that is neither a word character nor whitespace.

    Word characters (``\\w``) include letters, digits and the underscore, so
    underscores are kept.
    """
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', text)

In [None]:
df['message'] = df['message'].apply(remove_punctuation)

df.head()

### 5 - Removing Stop words

#### Stop words like : "the," "is," "and," "in," "on" don't add much of a value to the model in this task

In [None]:
import spacy

nlp = spacy.load("en_core_web_sm")

def remove_stopwords(text):
    """Return ``text`` without the tokens spaCy flags as stop words.

    The text is tokenized with the module-level ``nlp`` pipeline and the
    remaining tokens are joined back with single spaces. Non-string values
    are returned unchanged.
    """
    if isinstance(text, str):
        kept_tokens = []
        for token in nlp(text):
            if token.is_stop:
                continue
            kept_tokens.append(token.text)
        return " ".join(kept_tokens)

    # Anything that is not a string is passed through untouched.
    return text

In [None]:
df['message'] = df['message'].apply(remove_stopwords)

In [None]:
df.head()

### 6 - Removing Emojis !

In [None]:
import emoji

def remove_emojis(text):
    """Return ``text`` with every emoji removed.

    The previous implementation called ``emoji.demojize``, which does not
    remove anything: it rewrites each emoji as a textual ``:alias:``. That
    contradicts this function's name and injects extra tokens into the text.

    Args:
        text (str): Input text.

    Returns:
        str: ``text`` with each emoji replaced by the empty string.
    """
    # replace_emoji substitutes every emoji found in the string with `replace`.
    return emoji.replace_emoji(text, replace='')

df['message'] = df['message'].apply(remove_emojis)

### 7 - Lemmatization
#### changing the word back to its roots like : playing -> play , ate -> eat

In [None]:
def lemmatize_text(text):
    """Replace each token of ``text`` with its spaCy lemma.

    The text is run through the module-level ``nlp`` pipeline and the lemmas
    are joined back together with single spaces.
    """
    lemmas = []
    for token in nlp(text):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

df['message_lemmatized'] = df['message'].apply(lemmatize_text)

In [None]:
df.head()

## We finished the cleaning part ! Hooray 🥳
### Now let's combine all the steps above into a single function, so we can process new incoming text in the future!

In [None]:
def clean_text(text):
    """Run the full cleaning pipeline on one raw text.

    The steps are applied in the same order they were applied to the
    DataFrame column: lowercase, strip HTML, strip URLs, strip punctuation,
    drop stop words, handle emojis, lemmatize.

    Non-string inputs are returned unchanged.
    """
    # Guard clause: only strings are cleaned.
    if not isinstance(text, str):
        return text

    cleaned = text.lower()

    # Each step takes a string and returns a string; order matters.
    pipeline = (
        remove_html,
        clean_url,
        remove_punctuation,
        remove_stopwords,
        remove_emojis,
        lemmatize_text,
    )
    for step in pipeline:
        cleaned = step(cleaned)

    return cleaned

In [None]:
# let's give it a test !

new_text = "Heyyyy!!! 😊 Check this out: https://example.com <b>Awesome!</b>"
cleaned_text = clean_text(new_text)
print(cleaned_text)

## Word Cloud

In [None]:
from wordcloud import WordCloud

# Convert the column to a single string
text = df['message_lemmatized'].astype(str).str.cat(sep=" ")

# Generate word cloud
wordcloud = WordCloud(width=800, height=400, background_color='white').generate(text)

# Plot the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.show()

### Now let's draw a word cloud for each class in the data (positive, negative, neutral and irrelevant)

In [None]:
# Join all lemmatized messages of each sentiment class into one long string
positive_text = " ".join(df.loc[df["label"] == "Positive", "message_lemmatized"])
negative_text = " ".join(df.loc[df["label"] == "Negative", "message_lemmatized"])
neutral_text = " ".join(df.loc[df["label"] == "Neutral", "message_lemmatized"])
irrelevant_text = " ".join(df.loc[df["label"] == "Irrelevant", "message_lemmatized"])

# One word cloud per class, each with its own background so the panels are easy to tell apart
positive_wordcloud = WordCloud(width=800, height=400, background_color='white').generate(positive_text)
negative_wordcloud = WordCloud(width=800, height=400, background_color='black', colormap='Reds').generate(negative_text)
neutral_wordcloud = WordCloud(width=800, height=400, background_color='gray').generate(neutral_text)
irrelevant_wordcloud = WordCloud(width=800, height=400, background_color='lightgray').generate(irrelevant_text)

# Draw the four clouds on a 2x2 grid
fig, ax = plt.subplots(2, 2, figsize=(15, 12))

# (axes, cloud, title) for each panel, listed row by row
wordcloud_panels = (
    (ax[0, 0], positive_wordcloud, "Positive Reviews"),
    (ax[0, 1], negative_wordcloud, "Negative Reviews"),
    (ax[1, 0], neutral_wordcloud, "Neutral Reviews"),
    (ax[1, 1], irrelevant_wordcloud, "Irrelevant Reviews"),
)

for panel_axis, panel_cloud, panel_title in wordcloud_panels:
    panel_axis.imshow(panel_cloud, interpolation='bilinear')
    panel_axis.set_title(panel_title)
    panel_axis.axis("off")

plt.tight_layout()
plt.show()

## From these word clouds we can see that the word "game" appears in every class, so it carries no class-specific signal and we can remove it!

In [None]:
df["message_lemmatized"] = df["message_lemmatized"].str.replace(r'\bgame\b', '', regex=True)

## Pre-Processing for the Target

In [None]:
# Encode the string labels as integer class ids: Negative -> 0, Positive -> 1,
# and both Neutral and Irrelevant -> 2 (the two classes are merged into one).
# NOTE: this overwrites 'label' in place, so running the cell a second time maps
# the already-numeric values to NaN (Series.map yields NaN for values missing from the dict).
df['label'] = df['label'].map({'Positive' : 1 ,  'Negative' : 0 ,'Neutral':2 , 'Irrelevant' : 2 })

In [None]:
df['label'].value_counts()

## Split the data
#### into train, validation and test sets
#### Since the target classes are imbalanced, I'm going to use stratified sampling

In [None]:
from sklearn.model_selection import train_test_split

X = df['message_lemmatized']  # features: a single text column, so a vector rather than a matrix
y = df['label']  # target column

# stratify=... keeps the class proportions of the imbalanced target roughly the
# same in every split; without it the splits are plain random samples.

# Step 1: hold out 20% of all rows as the test set.
X_train1, X_test, y_train1, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2, shuffle=True, stratify=y
)

# Step 2: take 15% of the remaining rows as the validation set.
X_train, X_val, y_train, y_val = train_test_split(
    X_train1, y_train1, random_state=42, test_size=0.15, shuffle=True, stratify=y_train1
)

In [None]:
X_train.shape , X_val.shape , X_test.shape , y_train.shape , y_val.shape , y_test.shape

## Tokenization
#### Converting words into numerical tokens

In [None]:
import tensorflow as tf

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(oov_token = 'nothing')
tokenizer.fit_on_texts(X_train) # we call this method to build the tokenizer on the train data only to avoid data leakage !

In [None]:
# document_count is the number of texts (sentences) the tokenizer processed in fit_on_texts().
# It should equal the number of training samples, confirming that every training
# sentence was used to build the vocabulary.

tokenizer.document_count

### Conversion to sequence
#### used to convert each text in the data to sequences of integers based on the tokenizer's vocabulary

In [None]:
X_train_seq = tokenizer.texts_to_sequences(X_train)
X_test_seq = tokenizer.texts_to_sequences(X_test)
X_val_seq = tokenizer.texts_to_sequences(X_val)

### Padding
#### we apply padding on the data to ensure that each sequence is the same length !

In [None]:
# finding the max length
max_len = max(len(tokens) for tokens in X_train_seq)
print("Maximum sequence length (maxlen):", max_len)


In [None]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Perform padding on X_train and X_test sequences and X_val

X_train_padded = pad_sequences(X_train_seq, maxlen=max_len, padding='post')
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post')
X_val_padded = pad_sequences(X_val_seq, maxlen=max_len, padding='post')

In [None]:
# Print the padded sequences for X_train and X_test
print("X_train_padded:")
print(X_train_padded[:1])
print("\nX_test_padded:")
print(X_test_padded[:1])
print("\nX_val_padded:")
print(X_val_padded[:1])

### I was planning to use GloVe as a pretrained model for the word embeddings and then fine-tune it on my data,
### but let's try training an embedding layer from scratch first!
### With what we have so far we could feed the token ids to the model directly, but LSTMs work better with vector embeddings

In [None]:
# Define vocab size based on the tokenizer
vocab_size = len(tokenizer.word_index) + 1

print(vocab_size)

## Model Building

In [None]:
# Stacked bidirectional LSTM classifier, assembled layer by layer.
model = tf.keras.models.Sequential()

# Embedding trained from scratch: one 100-dimensional vector per vocabulary index.
model.add(tf.keras.layers.Embedding(input_dim=vocab_size, output_dim=100))

# First recurrent block returns the full sequence so it can feed the next LSTM.
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(128, return_sequences=True, dropout=0.2, recurrent_dropout=0.2)
    )
)

# Second recurrent block returns only its final output.
model.add(
    tf.keras.layers.Bidirectional(
        tf.keras.layers.LSTM(64, dropout=0.2, recurrent_dropout=0.2)
    )
)

model.add(tf.keras.layers.Dense(64, activation='relu', kernel_initializer='he_normal'))

# Three softmax units, one per encoded class (0, 1, 2).
model.add(tf.keras.layers.Dense(3, activation='softmax'))

In [None]:
# Training callbacks, both driven by the validation loss:
#   - early_stopping: stop when val_loss has not improved for 8 epochs and
#     restore the weights from the best epoch.
#   - reduce_lr: multiply the learning rate by 0.5 when val_loss has not
#     improved for 3 epochs.

early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=8, restore_best_weights=True, verbose=1)
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=3, verbose=1)

In [None]:
# compiling the model

# Compile Model
model.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    metrics=["accuracy"]
)

In [None]:
# Request that training runs on the first GPU.
# NOTE(review): behaviour when no GPU is available depends on TensorFlow's
# device-placement settings - confirm before running on a CPU-only runtime.
with tf.device('/device:GPU:0'):
  # Train the Model
  # `history` keeps the per-epoch metrics that the loss/accuracy plots read.
  history = model.fit(
      X_train_padded,
      y_train,
      validation_data=(X_val_padded, y_val),  # Validation set
      batch_size=32,
      epochs=30,  # upper bound; early_stopping may end training sooner
      callbacks=[early_stopping , reduce_lr],  # to Prevent overfitting
      verbose=1
  )

In [None]:
model.summary()

In [None]:

plt.figure(figsize=(12, 4), dpi=150)
plt.plot(history.history['loss'], label='train_loss')
plt.plot(history.history['val_loss'], label='val_loss')
plt.title('Model Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.legend(loc='upper right')
plt.show()


In [None]:
plt.figure(figsize=(12, 4), dpi=150)
plt.plot(history.history['accuracy'], label='train_accuracy')
plt.plot(history.history['val_accuracy'], label='val_accuracy')
plt.title('Model Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
plt.show()

## Evaluation

In [None]:
# Predicted class probabilities for every test sample
# (one column per class, from the model's 3-unit softmax output layer)
y_probs = model.predict(X_test_padded)

# Take the index of the most probable class for each sample; these indices are
# the integer class labels (0, 1 or 2), not binary values
y_pred = np.argmax(y_probs, axis=1)

In [None]:
# Evaluate the model
loss, accuracy = model.evaluate(X_test_padded, y_test)
print(f"Loss: {loss}, Accuracy: {accuracy}")

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix , classification_report

cfm = confusion_matrix(y_test , y_pred)

In [None]:
# classification report
report = classification_report(y_test, y_pred)

# the report
print("Classification Report:")
print(report)

In [None]:
plt.figure(figsize=(6, 5))
sns.heatmap(cfm, annot=True, fmt="d", cmap="Blues", xticklabels=['Negative' , 'Positive' , 'Neutral'], yticklabels=['Negative' , 'Positive','Neutral'])
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix")
plt.show()

### From the Confusion matrix we can conclude that the model is Okay !

<img src='https://media1.tenor.com/m/xZUiiLfAwzQAAAAC/walter-white-let-him-cook.gif' height = 420 width = 620></img>

## Saving the model and the tokenizer

In [None]:
import pickle

with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)

In [None]:
# now the model

model.save("LSTM_Sentiment_analysis.h5")

## Pre-Processing Pipeline for predictions

In [None]:
def preprocess_text(texts, tokenizer, max_len=None):
    """
    Turn already-cleaned texts into padded integer sequences for the model.

    Args:
        texts (list of str): List of text inputs.
        tokenizer (Tokenizer): Tokenizer fitted on the training data.
        max_len (int, optional): Length every sequence is padded/truncated to.
            When omitted, the notebook-level ``max_len`` variable (computed
            from the training sequences) is used, which keeps existing
            two-argument calls working.

    Returns:
        np.array: Padded sequences ready for prediction.

    Raises:
        NameError: If ``max_len`` is not given and no notebook-level
            ``max_len`` exists (e.g. in a fresh kernel that only loaded the
            saved model and tokenizer).
    """
    if max_len is None:
        # The parameter shadows the global of the same name, so look the
        # global up explicitly.
        try:
            max_len = globals()["max_len"]
        except KeyError:
            raise NameError(
                "max_len is not defined; pass max_len=... explicitly"
            ) from None

    # Convert text to sequences of vocabulary indices
    text_seq = tokenizer.texts_to_sequences(texts)

    # Pad at the end ('post') to match how the training data was padded
    text_padded = pad_sequences(text_seq, maxlen=max_len, padding="post")

    return text_padded

In [None]:
def Predict(text , model , tokenizer):
    """
    Predict the sentiment class of a single raw text.

    Args:
        text (str): Raw, uncleaned input text.
        model: Trained Keras model; ``model.predict`` must return one row of
            class probabilities per input sequence.
        tokenizer (Tokenizer): Tokenizer fitted on the training data.

    Returns:
        tuple: ``(pred_class, pred_prob)`` - the predicted class name and the
        probability the model assigned to that class.

    Raises:
        TypeError: If ``text`` is not a string.
    """
    if not isinstance(text, str):
        raise TypeError(f"text must be a str, got {type(text).__name__}")

    # Clean the raw string BEFORE wrapping it in a list. clean_text returns
    # non-string inputs unchanged, so cleaning the list (as was done before)
    # silently skipped every preprocessing step.
    cleaned = clean_text(text)
    text_padded = preprocess_text([cleaned] , tokenizer)

    y_prob = model.predict(text_padded)

    y_pred = np.argmax(y_prob, axis=1)

    # Index order follows the label encoding used for training:
    # 0 = Negative, 1 = Positive, 2 = Neutral (which also covers 'Irrelevant').
    classes = ['Negative' , 'Positive' , 'Neutral']

    pred_class = classes[y_pred[0]]  # Get predicted class label
    pred_prob = y_prob[0][y_pred[0]]  # Probability of the predicted class

    return pred_class, pred_prob

## Loading the model
## Now, if you want to use the model again without retraining it,
### you can simply load it together with the tokenizer

In [None]:
# loading the tokenizer
# NOTE: pickle.load can execute arbitrary code - only load a pickle file you created yourself.
with open("tokenizer.pkl", "rb") as f:
    tokenizer = pickle.load(f)


# Load the saved model from the same relative path model.save() wrote it to.
# (The previous absolute '/kaggle/working/...' path only resolves on Kaggle.)
loaded_model = tf.keras.models.load_model("LSTM_Sentiment_analysis.h5")

In [None]:
new_text = "I'm Sad"

pred_class  , prob  = Predict(new_text , loaded_model , tokenizer)


In [None]:
print(f"Class Prediction is : {pred_class} with Probabilty {prob}")