#  Emotion Detection from Text

# Load the dataset

In [3]:
import pandas as pd
df = pd.read_csv("Emotion_classify_Data.csv")  
print(df.head())  


                                             Comment Emotion
0  i seriously hate one subject to death but now ...    fear
1                 im so full of life i feel appalled   anger
2  i sit here to write i start to dig out my feel...    fear
3  ive been really angry with r and i feel like a...     joy
4  i feel suspicious if there is no one outside l...    fear


# Data Cleaning


In [4]:
import re

def clean_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'\s+', ' ', text)  # Remove extra spaces
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    return text.strip()

# Apply text cleaning
df["Cleaned_Comment"] = df["Comment"].apply(clean_text)

# Display cleaned text
print(df[["Comment", "Cleaned_Comment"]].head())


                                             Comment  \
0  i seriously hate one subject to death but now ...   
1                 im so full of life i feel appalled   
2  i sit here to write i start to dig out my feel...   
3  ive been really angry with r and i feel like a...   
4  i feel suspicious if there is no one outside l...   

                                     Cleaned_Comment  
0  i seriously hate one subject to death but now ...  
1                 im so full of life i feel appalled  
2  i sit here to write i start to dig out my feel...  
3  ive been really angry with r and i feel like a...  
4  i feel suspicious if there is no one outside l...  


# Tokenization

In [5]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\R.MUNIRANJANI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
pip install spacy

Collecting spacy
  Downloading spacy-3.8.4-cp312-cp312-win_amd64.whl.metadata (27 kB)
Collecting spacy-legacy<3.1.0,>=3.0.11 (from spacy)
  Downloading spacy_legacy-3.0.12-py2.py3-none-any.whl.metadata (2.8 kB)
Collecting spacy-loggers<2.0.0,>=1.0.0 (from spacy)
  Downloading spacy_loggers-1.0.5-py3-none-any.whl.metadata (23 kB)
Collecting murmurhash<1.1.0,>=0.28.0 (from spacy)
  Downloading murmurhash-1.0.12-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting cymem<2.1.0,>=2.0.2 (from spacy)
  Downloading cymem-2.0.11-cp312-cp312-win_amd64.whl.metadata (8.8 kB)
Collecting preshed<3.1.0,>=3.0.2 (from spacy)
  Downloading preshed-3.0.9-cp312-cp312-win_amd64.whl.metadata (2.2 kB)
Collecting thinc<8.4.0,>=8.3.4 (from spacy)
  Downloading thinc-8.3.4-cp312-cp312-win_amd64.whl.metadata (15 kB)
Collecting wasabi<1.2.0,>=0.9.1 (from spacy)
  Downloading wasabi-1.1.3-py3-none-any.whl.metadata (28 kB)
Collecting srsly<3.0.0,>=2.4.3 (from spacy)
  Downloading srsly-2.5.1-cp312-cp312-win_amd64

In [11]:
import spacy

# Load English tokenizer model
nlp = spacy.load("en_core_web_sm")

def spacy_tokenizer(text):
    doc = nlp(text)
    return [token.text for token in doc]

df["Tokenized_Text"] = df["Cleaned_Comment"].apply(spacy_tokenizer)

print(df[["Cleaned_Comment", "Tokenized_Text"]].head())


                                     Cleaned_Comment  \
0  i seriously hate one subject to death but now ...   
1                 im so full of life i feel appalled   
2  i sit here to write i start to dig out my feel...   
3  ive been really angry with r and i feel like a...   
4  i feel suspicious if there is no one outside l...   

                                      Tokenized_Text  
0  [i, seriously, hate, one, subject, to, death, ...  
1      [i, m, so, full, of, life, i, feel, appalled]  
2  [i, sit, here, to, write, i, start, to, dig, o...  
3  [i, ve, been, really, angry, with, r, and, i, ...  
4  [i, feel, suspicious, if, there, is, no, one, ...  


# Stop word removal

In [12]:
from nltk.corpus import stopwords

nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

df["Filtered_Text"] = df["Tokenized_Text"].apply(remove_stopwords)

# Display text after stopword removal
print(df[["Tokenized_Text", "Filtered_Text"]].head())


                                      Tokenized_Text  \
0  [i, seriously, hate, one, subject, to, death, ...   
1      [i, m, so, full, of, life, i, feel, appalled]   
2  [i, sit, here, to, write, i, start, to, dig, o...   
3  [i, ve, been, really, angry, with, r, and, i, ...   
4  [i, feel, suspicious, if, there, is, no, one, ...   

                                       Filtered_Text  
0  [seriously, hate, one, subject, death, feel, r...  
1                       [full, life, feel, appalled]  
2  [sit, write, start, dig, feelings, think, afra...  
3  [really, angry, r, feel, like, idiot, trusting...  
4  [feel, suspicious, one, outside, like, rapture...  


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\R.MUNIRANJANI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Lemmatization

In [13]:
from nltk.stem import WordNetLemmatizer

nltk.download("wordnet")
lemmatizer = WordNetLemmatizer()

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

df["Lemmatized_Text"] = df["Filtered_Text"].apply(lemmatize_tokens)

# Display text after lemmatization
print(df[["Filtered_Text", "Lemmatized_Text"]].head())


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\R.MUNIRANJANI\AppData\Roaming\nltk_data...


                                       Filtered_Text  \
0  [seriously, hate, one, subject, death, feel, r...   
1                       [full, life, feel, appalled]   
2  [sit, write, start, dig, feelings, think, afra...   
3  [really, angry, r, feel, like, idiot, trusting...   
4  [feel, suspicious, one, outside, like, rapture...   

                                     Lemmatized_Text  
0  [seriously, hate, one, subject, death, feel, r...  
1                       [full, life, feel, appalled]  
2  [sit, write, start, dig, feeling, think, afrai...  
3  [really, angry, r, feel, like, idiot, trusting...  
4  [feel, suspicious, one, outside, like, rapture...  


# Convert text into numerical representation

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features
X_tfidf = tfidf_vectorizer.fit_transform(df["Cleaned_Comment"])

print(X_tfidf.shape)  # (num_samples, num_features)


(5937, 5000)


In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Limit to 5000 features
X = tfidf_vectorizer.fit_transform(df["Cleaned_Comment"])

# Convert labels into numerical form
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["Emotion"])  # Converts categorical labels to numbers

print("Feature matrix shape:", X.shape)
print("Target vector shape:", y.shape)


Feature matrix shape: (5937, 5000)
Target vector shape: (5937,)


# Split the data

In [17]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)


Training set size: (4749, 5000)
Testing set size: (1188, 5000)


# Naive bayes Model

In [18]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report

# Train the model
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

# Predict on test data
y_pred_nb = nb_model.predict(X_test)

# Evaluate performance
print("Naïve Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, target_names=label_encoder.classes_))


Naïve Bayes Accuracy: 0.9023569023569024
              precision    recall  f1-score   support

       anger       0.87      0.94      0.90       392
        fear       0.92      0.89      0.91       416
         joy       0.91      0.88      0.90       380

    accuracy                           0.90      1188
   macro avg       0.90      0.90      0.90      1188
weighted avg       0.90      0.90      0.90      1188



# SVM

In [19]:
from sklearn.svm import SVC

# Train the model
svm_model = SVC(kernel="linear")
svm_model.fit(X_train, y_train)

# Predict on test data
y_pred_svm = svm_model.predict(X_test)

# Evaluate performance
print("SVM Accuracy:", accuracy_score(y_test, y_pred_svm))
print(classification_report(y_test, y_pred_svm, target_names=label_encoder.classes_))


SVM Accuracy: 0.9402356902356902
              precision    recall  f1-score   support

       anger       0.92      0.94      0.93       392
        fear       0.97      0.92      0.95       416
         joy       0.94      0.96      0.95       380

    accuracy                           0.94      1188
   macro avg       0.94      0.94      0.94      1188
weighted avg       0.94      0.94      0.94      1188



# Random Forest

In [20]:
from sklearn.ensemble import RandomForestClassifier

# Train the model
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)

# Predict on test data
y_pred_rf = rf_model.predict(X_test)

# Evaluate performance
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print(classification_report(y_test, y_pred_rf, target_names=label_encoder.classes_))


Random Forest Accuracy: 0.9191919191919192
              precision    recall  f1-score   support

       anger       0.93      0.90      0.92       392
        fear       0.95      0.90      0.92       416
         joy       0.88      0.96      0.92       380

    accuracy                           0.92      1188
   macro avg       0.92      0.92      0.92      1188
weighted avg       0.92      0.92      0.92      1188



# Deep learning model-LSTM

In [21]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize the text data
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["Cleaned_Comment"])
X_sequences = tokenizer.texts_to_sequences(df["Cleaned_Comment"])

# Pad sequences to ensure equal length
X_padded = pad_sequences(X_sequences, maxlen=50, padding="post", truncating="post")

# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)


In [22]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Define model architecture
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=50),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(32, activation="relu"),
    Dense(len(label_encoder.classes_), activation="softmax")  # Output layer
])

# Compile model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))




Epoch 1/10
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 33ms/step - accuracy: 0.3363 - loss: 1.0993 - val_accuracy: 0.3300 - val_loss: 1.1030
Epoch 2/10
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 30ms/step - accuracy: 0.3242 - loss: 1.1000 - val_accuracy: 0.3199 - val_loss: 1.0997
Epoch 3/10
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.3414 - loss: 1.0984 - val_accuracy: 0.3173 - val_loss: 1.0997
Epoch 4/10
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.3616 - loss: 1.0927 - val_accuracy: 0.3350 - val_loss: 1.0985
Epoch 5/10
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 30ms/step - accuracy: 0.3391 - loss: 1.0981 - val_accuracy: 0.3199 - val_loss: 1.1008
Epoch 6/10
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 31ms/step - accuracy: 0.3368 - loss: 1.0973 - val_accuracy: 0.3316 - val_loss: 1.0993
Epoch 7/10
[1m149/14

<keras.src.callbacks.history.History at 0x1baf6ec5b80>

In [23]:
y_pred_lstm = model.predict(X_test)
y_pred_lstm_classes = y_pred_lstm.argmax(axis=1)

print("LSTM Accuracy:", accuracy_score(y_test, y_pred_lstm_classes))
print(classification_report(y_test, y_pred_lstm_classes, target_names=label_encoder.classes_))


[1m38/38[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 25ms/step
LSTM Accuracy: 0.3341750841750842
              precision    recall  f1-score   support

       anger       0.33      1.00      0.50       392
        fear       0.60      0.01      0.03       416
         joy       0.00      0.00      0.00       380

    accuracy                           0.33      1188
   macro avg       0.31      0.34      0.18      1188
weighted avg       0.32      0.33      0.17      1188



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Train and save the LSTM model

In [1]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import pickle
import numpy as np
import pandas as pd

# Load dataset
file_path = "Emotion_classify_Data.csv"
df = pd.read_csv(file_path)

# Tokenization & Padding
tokenizer = Tokenizer(num_words=5000, oov_token="<OOV>")
tokenizer.fit_on_texts(df["Comment"])
X_sequences = tokenizer.texts_to_sequences(df["Comment"])
X_padded = pad_sequences(X_sequences, maxlen=50, padding="post", truncating="post")

# Encode labels
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["Emotion"])

# Split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, test_size=0.2, random_state=42)

# Define LSTM model
model = Sequential([
    Embedding(input_dim=5000, output_dim=64, input_length=50),
    LSTM(64, return_sequences=True),
    LSTM(32),
    Dense(32, activation="relu"),
    Dense(len(label_encoder.classes_), activation="softmax")
])

# Compile model
model.compile(loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train model
model.fit(X_train, y_train, epochs=10, batch_size=32, validation_data=(X_test, y_test))

# Save model and tokenizer
model.save("lstm_emotion_model.h5")
with open("tokenizer.pkl", "wb") as f:
    pickle.dump(tokenizer, f)
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)

print("Model and Tokenizer saved successfully!")


Epoch 1/10




[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 31ms/step - accuracy: 0.3376 - loss: 1.0992 - val_accuracy: 0.3190 - val_loss: 1.0986
Epoch 2/10
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 28ms/step - accuracy: 0.3483 - loss: 1.0985 - val_accuracy: 0.3199 - val_loss: 1.1087
Epoch 3/10
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step - accuracy: 0.3603 - loss: 1.0977 - val_accuracy: 0.3569 - val_loss: 1.0983
Epoch 4/10
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step - accuracy: 0.3331 - loss: 1.0985 - val_accuracy: 0.3325 - val_loss: 1.1007
Epoch 5/10
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 25ms/step - accuracy: 0.3380 - loss: 1.0944 - val_accuracy: 0.3215 - val_loss: 1.1003
Epoch 6/10
[1m149/149[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 27ms/step - accuracy: 0.3487 - loss: 1.0873 - val_accuracy: 0.3232 - val_loss: 1.0999
Epoch 7/10
[1m149/149[0m [32m━



Model and Tokenizer saved successfully!


In [1]:
from keras.initializers import Orthogonal
import tensorflow as tf

model = tf.keras.models.load_model("lstm_emotion_model.h5", custom_objects={"Orthogonal": Orthogonal})




In [2]:
import tensorflow as tf

# Load the original model
model = tf.keras.models.load_model("lstm_emotion_model.h5")

# Re-save using latest Keras format
model.save("lstm_emotion_model_new.h5", save_format="h5")

print("Model re-saved successfully!")


ValueError: Could not interpret initializer identifier: {'module': 'keras.initializers', 'class_name': 'Orthogonal', 'config': {'seed': None, 'gain': 1.0}, 'registered_name': None}