<a href="https://colab.research.google.com/github/Harshkotkar/Deep-Learning/blob/main/Emotion_Recognition.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# IMPORTANT: RUN THIS CELL IN ORDER TO IMPORT YOUR KAGGLE DATA SOURCES,
# THEN FEEL FREE TO DELETE THIS CELL.
# NOTE: THIS NOTEBOOK ENVIRONMENT DIFFERS FROM KAGGLE'S PYTHON
# ENVIRONMENT SO THERE MAY BE MISSING LIBRARIES USED BY YOUR
# NOTEBOOK.
#
# Each call below asks kagglehub for one dataset and keeps the returned value
# in a *_path variable.
# NOTE(review): the rest of this notebook only reads CSV files from the
# 'emotion-dataset' source; the other four *_path variables are never
# referenced again. Presumably they are leftovers from the Kaggle export --
# confirm, then consider dropping them to shorten setup.
import kagglehub
kazanova_sentiment140_path = kagglehub.dataset_download('kazanova/sentiment140')
crowdflower_twitter_airline_sentiment_path = kagglehub.dataset_download('crowdflower/twitter-airline-sentiment')
bittlingmayer_amazonreviews_path = kagglehub.dataset_download('bittlingmayer/amazonreviews')
jp797498e_twitter_entity_sentiment_analysis_path = kagglehub.dataset_download('jp797498e/twitter-entity-sentiment-analysis')
parulpandey_emotion_dataset_path = kagglehub.dataset_download('parulpandey/emotion-dataset')

print('Data source import complete.')


![SENTIMENT-09-1.png](attachment:504e696e-f893-4788-8962-2c574d2841f7.png)

* <b> Overview :</b> Explore text-based emotion recognition, a dynamic field in <span style="background-color: red; padding: 4px; border-radius:5px;">NLP</span>, focusing on deciphering diverse emotional states in textual content.

* <b> Objective :</b> Build a system for automatic categorization of text into six emotions
( <span style="color: #F8DE22;">joy</span>  ,
 <span style="color: #0c0d49;">sadness</span> ,
 <span style="color: #b82f2f;">fear</span> ,
 <span style="color: #331e1e;">anger</span > ,
 <span style="color: red;">love</span> ,
 <span style="color: #00fff7;">surprise</span>)
* <b> Model Choice : </b> Utilize <span style="background-color: #F8DE22; padding: 4px; border-radius:5px;">LSTM</span>
 (Long Short-Term Memory) networks, a type of <span style="background-color: #F8DE22; padding: 4px; border-radius:5px;">RNN</span>.

* <b> Implementation : </b>
Implemented with <span style="background-color: #F8DE22; padding: 4px; border-radius:5px;">TensorFlow</span>.



In [None]:
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
from wordcloud import WordCloud
from nltk.stem import PorterStemmer
import numpy as np
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.regularizers import l2


# 1 | The Dataset

* Many datasets involving sentiment analysis are <span style="background-color: #F8DE22; padding: 2px; border-radius:5px;">binary classification</span> problems.
* In this dataset we have <span style="background-color: #F8DE22; padding: 2px; border-radius:5px;">6 different sentiments</span> , which means we'll be treating this problem as a <span style="background-color: #F8DE22; padding: 2px; border-radius:5px;">multiclass classification</span> problem

   ### 1. 1 | Loading Data

In [None]:
# Directory holding the three emotion CSV files. Prefer the location reported
# by kagglehub in the data-import cell; if that cell was deleted or skipped
# (for example when running on Kaggle itself), fall back to the Kaggle input
# mount that this notebook originally hard-coded.
emotion_data_dir = globals().get('parulpandey_emotion_dataset_path', '/kaggle/input/emotion-dataset')

val_data = pd.read_csv(f'{emotion_data_dir}/validation.csv')
train_data = pd.read_csv(f'{emotion_data_dir}/training.csv')
test_data = pd.read_csv(f'{emotion_data_dir}/test.csv')

In [None]:
# Report the (rows, columns) shape of every split right after loading.
for caption, frame in (
    ("Validation data :", val_data),
    ("Train data :", train_data),
    ("Test data :", test_data),
):
    print(caption, frame.shape)

* The test split is large, so I keep only its first 1000 rows for testing and move the rest into val_data.

In [None]:
# Keep the first 1000 test rows as the held-out test set and move every
# remaining test row into the validation set.
half_test_data = test_data.iloc[1000:]
test_data = test_data.iloc[:1000]

# ignore_index=True renumbers the merged frame 0..n-1. Without it the moved
# rows keep the index labels they had in test_data, which can duplicate labels
# already used by val_data and make label-based lookups ambiguous.
val_data = pd.concat([val_data, half_test_data], axis=0, ignore_index=True)

print("new Vald data :",val_data.shape)
print("new Test data :",test_data.shape)

In [None]:
# Show the first ten training rows to inspect the columns and raw text.
train_data.head(10)

### 1. 2 | Adding Label Data

In [None]:
# Lookup table from the integer class id stored in the 'label' column to the
# emotion it stands for.
labels_dict = {
    0: 'sadness',
    1: 'joy',
    2: 'love',
    3: 'anger',
    4: 'fear',
    5: 'surprise',
}

# Add a human-readable emotion column next to the numeric label.
emotion_names = train_data['label'].map(labels_dict)
train_data['label_name'] = emotion_names
train_data.head()

In [None]:
# Count the rows for every (label_name, label) pair: this shows the class
# balance and doubles as a check that each id maps to exactly one name.
train_data.groupby(["label_name","label"]).size()

### 1. 3 | Data Visualization

In [None]:
# One fixed colour per emotion, looked up by name. Passing a bare colour list
# assigns colours by bar position, and value_counts() orders the bars by
# frequency, so a positional list does not reliably tie a colour to an emotion.
emotion_colors = {
    'joy': 'yellow',
    'sadness': '#0c0d49',
    'fear': '#b82f2f',
    'anger': '#331e1e',
    'love': 'red',
    'surprise': '#00fff7',
}

label_counts = train_data["label_name"].value_counts()
ax = label_counts.plot(
    kind='bar',
    # Unknown names fall back to gray instead of raising a KeyError.
    color=[emotion_colors.get(name, 'gray') for name in label_counts.index],
)
ax.set(title='Training samples per emotion', xlabel='Emotion', ylabel='Number of samples');

# 2 | Data Cleaning

In [None]:
# Per-column count of missing values for each split, in the order
# train, validation, test.
for frame in (train_data, val_data, test_data):
    missing_per_column = frame.isnull().sum()
    print(missing_per_column)

# 3 | Tokenisation & Stemming

* <span style="background-color: #F8DE22; padding: 2px; border-radius:5px;">Tokenization</span> assigns unique IDs to words, creating a word index or vocabulary.
* <b>Example Sentence :</b> "Tokenization is essential for NLP tasks."
* <b>Tokenized Output : </b>['Tokenization', 'is', 'essential', 'for', 'NLP', 'tasks', '.']


* <span style="background-color: #F8DE22; padding: 2px; border-radius:5px;">Stemming</span> is a technique used to reduce an inflected word down to its word stem.
* <b>Example :</b>
* <b>Original Words :</b> running , programming , swimming , happiness , programmer <span style="background-color: #F8DE22; padding: 2px; border-radius:5px;">  (5 words)</span>
* <b>Stemmed Words :</b> run , program , swim , happi   <span style="background-color: #F8DE22; padding: 2px; border-radius:5px;">  (4 words)</span>

In [None]:
# Gather every sentence from the train, test and validation splits (in that
# order) into one flat list that the tokenizer can be fitted on.
all_list = [
    sentence
    for frame in (train_data, test_data, val_data)
    for sentence in frame['text'].tolist()
]

In [None]:
# Stemmer shared by the vocabulary step below and by the later preprocessing.
stemmer = PorterStemmer()

# Pass 1: word index built from the raw sentences.
tokenizer1 = Tokenizer()
tokenizer1.fit_on_texts(all_list)
word_index1 = tokenizer1.word_index
raw_vocabulary_size = len(word_index1)
print("Nombre of words without Stemming:", raw_vocabulary_size)

# Pass 2: stem every distinct word found in pass 1 and build a second word
# index from the stems. tokenizer2 is the one used to encode sentences later.
stemmed_words = list(map(stemmer.stem, word_index1))

tokenizer2 = Tokenizer()
tokenizer2.fit_on_texts(stemmed_words)
word_index2 = tokenizer2.word_index
stemmed_vocabulary_size = len(word_index2)
print("Nombre of words with Stemming:", stemmed_vocabulary_size)

* load all data to list : <b>[ [ Tokenised_Data ] , label ] </b>

In [None]:
def preprocess_data(data):
    """Encode every row of ``data`` as ``[token_ids, label]``.

    Each sentence in the 'text' column is split on whitespace, every word is
    stemmed with the notebook-level ``stemmer``, and the stems are converted to
    integer ids by ``tokenizer2``. The value of the 'label' column is carried
    over unchanged.

    Args:
        data: DataFrame providing a 'text' column and a 'label' column.

    Returns:
        A list with one ``[token_ids, label]`` entry per row, in row order.
    """
    encoded_rows = []
    for sentence, label in zip(data['text'], data['label']):
        stems = [stemmer.stem(token) for token in sentence.split()]
        # texts_to_sequences works on a batch, so wrap the single sentence and
        # unwrap the single result.
        token_ids = tokenizer2.texts_to_sequences([stems])[0]
        encoded_rows.append([token_ids, label])
    return encoded_rows

In [None]:
# Encode the training split as [token_ids, label] rows.
new_train_data = preprocess_data(train_data)
# Sanity check: the same sentence before and after encoding.
print(train_data['text'][0])
print(new_train_data[0])

In [None]:
# Encode the validation split as [token_ids, label] rows.
new_val_data = preprocess_data(val_data)
# Sanity check: the same sentence before and after encoding.
print(val_data['text'][0])
print(new_val_data[0])

In [None]:
# Separate the encoded training rows into features (token-id lists) and
# targets (integer labels).
train_X, train_y = [], []
for token_ids, label in new_train_data:
    train_X.append(token_ids)
    train_y.append(label)

# Show the first example of each as a quick check.
print("train_X:", train_X[0])
print("train_y:", train_y[0])

In [None]:
# Separate the encoded validation rows into features (token-id lists) and
# targets (integer labels).
val_X = [row[0] for row in new_val_data]
val_y = [row[1] for row in new_val_data]

# These values come from the validation split, so label them as such (the
# messages previously said "train_X"/"train_y").
print("val_X:", val_X[0])
print("val_y:", val_y[0])

### 3. 2 | Add Padding

In [None]:
# Find the training sentence with the most tokens; its length becomes the
# common length every sequence is padded to.
longest_sentence = max(train_X, key=len)
length_of_longest_sentence = len(longest_sentence)

print(length_of_longest_sentence)
print(longest_sentence)

In [None]:
# Bring every encoded sentence to exactly length_of_longest_sentence tokens:
# shorter ones are right-padded with 0, longer ones (possible in the validation
# split, whose sentences were not used to compute the maximum) are cut at the
# end. pad_sequences returns a new 2-D integer array, so the cell can be re-run
# safely and the token lists stored in new_train_data / new_val_data are no
# longer modified in place.
train_X = pad_sequences(train_X, maxlen=length_of_longest_sentence, padding='post', truncating='post')
val_X = pad_sequences(val_X, maxlen=length_of_longest_sentence, padding='post', truncating='post')

### 3. 3 | List to Array (numpy)

In [None]:
# Turn the Python lists into NumPy arrays, the format the model is fed with.
train_X, train_y = np.array(train_X), np.array(train_y)
val_X, val_y = np.array(val_X), np.array(val_y)

# Features and targets of a split must agree on their first dimension.
for features, targets in ((train_X, train_y), (val_X, val_y)):
    print(features.shape, targets.shape)


In [None]:
# Convert labels to one-hot encoding
# NOTE(review): labels_dict defines only 6 classes, yet num_classes=16000 makes
# every target vector 16000 entries wide. The width has to agree with the
# output layer of whatever model is trained on these arrays, so it cannot be
# changed here in isolation -- presumably 6 was intended; confirm and change
# both places together.
train_y_one_hot = to_categorical(train_y, num_classes=16000)
val_y_one_hot = to_categorical(val_y, num_classes=16000)

# 4 | Create model (LSTM)

### 4. 1 | Architecture of Bidirectional LSTM Neural Network

![download.png](attachment:e3218baf-2240-45f3-8aac-e6956af5346e.png)

### 4. 2 | Bi- LSTM Neural Network Model training

In [None]:
# Model dimensions are derived from the data instead of being hard-coded.
# +1 because token ids start at 1: id 0 is never assigned to a word by the
# Keras Tokenizer and is the value used for padding.
vocab_size = len(tokenizer2.word_index) + 1
# One output unit per emotion (the notebook previously used 16000 units).
num_classes = len(labels_dict)

model = Sequential()
# No input_length: the layer accepts any sequence length, and newer Keras
# versions deprecate that argument.
model.add(Embedding(vocab_size, 100))
model.add(Bidirectional(LSTM(150)))
model.add(Dense(num_classes, activation='softmax'))

# `learning_rate` is the supported keyword; `lr` is deprecated and rejected by
# recent Keras releases.
adam = Adam(learning_rate=0.01)

# The sparse loss takes the integer class ids directly, so training does not
# depend on the width of any one-hot encoding. The history keys stay
# 'accuracy' / 'val_accuracy' / 'loss' / 'val_loss'.
model.compile(loss='sparse_categorical_crossentropy', optimizer=adam, metrics=['accuracy'])
history = model.fit(train_X, train_y, epochs=25, verbose=1, validation_data=(val_X, val_y))

# Layer-by-layer overview (print(model) only shows the object's repr).
model.summary()

# 5 | Results And Test

In [None]:
# Training accuracy recorded for the final epoch, formatted to three decimals.
final_epoch_accuracy = history.history['accuracy'][-1]
last_accuracy = f"{final_epoch_accuracy:.3f}"
print("Training Accuracy:", last_accuracy)

### 5. 1 | Plotting Model Accuracy And Loss

In [None]:
# Accuracy per epoch for the training and validation sets on one set of axes.
fig, ax = plt.subplots()
for metric_key in ('accuracy', 'val_accuracy'):
    ax.plot(history.history[metric_key])
ax.set_title('Training Accuracy vs Validation Accuracy')
ax.set_ylabel('Accuracy')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper left')
plt.show()

In [None]:
# Loss per epoch for the training and validation sets on one set of axes.
fig, ax = plt.subplots()
for metric_key in ('loss', 'val_loss'):
    ax.plot(history.history[metric_key])
ax.set_title('Training Loss vs Validation Loss')
ax.set_ylabel('Loss')
ax.set_xlabel('Epoch')
ax.legend(['Train', 'Validation'], loc='upper left')
plt.show()

### 5. 2 | Test The Model

In [None]:
def get_text(text):
    """Encode raw text into a fixed-length list of token ids for prediction.

    The text goes through the same steps as ``preprocess_data``: split on
    whitespace, stem each word with ``stemmer``, map the stems to ids with
    ``tokenizer2``. Word order and repeated words are therefore preserved
    (fitting a throw-away tokenizer and reading its ``word_index`` would
    reorder the words and drop repeats).

    Args:
        text: A single string, or a list of strings whose words are encoded
            together as one sequence.

    Returns:
        A list of exactly ``length_of_longest_sentence`` ints: the token ids,
        right-padded with 0 and cut at the end if the input is longer. Empty
        input yields a list of zeros.
    """
    if isinstance(text, str):
        text = [text]

    words = [word for sentence in text for word in sentence.split()]
    stemmed_words = [stemmer.stem(word) for word in words]
    tokens_list = tokenizer2.texts_to_sequences([stemmed_words])[0]

    # Force the fixed length the model was trained with.
    tokens_list = tokens_list[:length_of_longest_sentence]
    tokens_list.extend([0] * (length_of_longest_sentence - len(tokens_list)))
    return tokens_list


In [None]:
 # Spot-check the model on five randomly chosen test sentences.
 for _ in range(5):
     # randrange excludes its upper bound, so the position is always valid;
     # randint(0, 1000) could return 1000, one past the last of 1000 rows.
     random_number = random.randrange(len(test_data))

     # Positional lookups (.iloc) do not depend on the frame's index labels.
     sentence = test_data['text'].iloc[random_number]
     actual_class = test_data['label'].iloc[random_number]

     # Encode the sentence and give it the (1, sequence_length) batch shape.
     test = np.array([get_text([sentence])])

     # Make predictions
     predictions = model.predict(test)

     predicted_class = int(np.argmax(predictions))
     print()
     print('Random value = ',random_number)
     print("Predicted Class:", predicted_class,labels_dict.get(predicted_class))
     print("Actual Class:", actual_class,labels_dict.get(actual_class))
     print()

### 5. 3 | Confusion Matrix

In [None]:
# Encode the held-out test split as [token_ids, label] rows.
new_test_data = preprocess_data(test_data)

# Build the evaluation arrays from the TEST rows (they were previously taken
# from new_train_data, so the confusion matrix described training data).
test_X = [row[0] for row in new_test_data]
test_y = [row[1] for row in new_test_data]

# Right-pad with 0 (and cut over-long sentences at the end) so every row has
# the length used for the training sequences.
test_X = pad_sequences(test_X, maxlen=length_of_longest_sentence, padding='post', truncating='post')
test_y = np.array(test_y)

# One column per emotion class.
test_y_one_hot = to_categorical(test_y, num_classes=len(labels_dict))

In [None]:
# Model output for every test sentence: one row of class scores per sentence.
y_pred = model.predict(test_X)
# The predicted class is the position of the highest score in each row.
y_pred_classes = y_pred.argmax(axis=1)

In [None]:
# Recover the integer class id of every test row from its one-hot encoding.
y_true_labels = np.argmax(test_y_one_hot, axis=1)

# Emotion names ordered by class id, so row/column i of the matrix is class i.
labels = [labels_dict[class_id] for class_id in sorted(labels_dict)]
class_ids = list(range(len(labels)))

# Passing labels= fixes the matrix to one row/column per known class even when
# a class is missing from the data or is never predicted; without it the
# matrix shape follows the values that happen to occur and may not match the
# name list handed to the DataFrame.
cm = confusion_matrix(y_true_labels, y_pred_classes, labels=class_ids)
df_cm = pd.DataFrame(cm, labels, labels)

ax = sns.heatmap(df_cm, annot=True, annot_kws={'size': 16}, square=True, cbar=False, fmt='g')
# Explicit limits keep the first and last rows fully visible, with the first
# class at the top.
ax.set_ylim(len(labels), 0)
ax.set_title('Confusion matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()
