In [1]:
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.utils import to_categorical

In [2]:
# Location of the combined emotion dataset (CSV) read into the working DataFrame.
DATA_PATH = "/content/sample_data/combined_emotion.csv"
df = pd.read_csv(DATA_PATH)

In [3]:
# Preprocessing function
def clean_text(text):
    """Normalize a raw sentence for tokenization.

    Steps, in order: lowercase, strip URLs (anything starting with ``http``
    up to the next whitespace), drop every character that is not an ASCII
    letter or whitespace, then collapse whitespace runs and trim the ends.

    Args:
        text: The raw sentence. Non-string values (e.g. ``None`` or the
            float ``NaN`` pandas uses for missing cells) are treated as
            empty text instead of raising ``AttributeError``.

    Returns:
        The cleaned, lowercase string; ``""`` for missing/non-string input.
    """
    # Missing cells reach this function as None/NaN when it is applied to a
    # DataFrame column; `.lower()` would fail on them, so map them to "".
    if not isinstance(text, str):
        return ""
    text = text.lower()
    text = re.sub(r"http\S+", "", text)                    # Remove URLs
    text = re.sub(r"[^a-zA-Z\s]", "", text)                # Remove punctuation and digits
    text = re.sub(r"\s+", " ", text).strip()               # Remove extra spaces
    return text

# Run every raw sentence through clean_text and keep the result in a new column.
raw_sentences = df['sentence']
df['clean_text'] = raw_sentences.apply(clean_text)

In [13]:
# Discard rows that have no emotion label (modifies df in place).
df.dropna(subset=['emotion'], inplace=True)

# Map each emotion name to an integer class id and record the class count.
label_encoder = LabelEncoder()
emotion_names = df['emotion']
df['emotion_label'] = label_encoder.fit_transform(emotion_names)
num_classes = len(label_encoder.classes_)

In [5]:
# Tokenization and Padding
max_words = 10000  # vocabulary size handed to the Tokenizer and the Embedding layer
max_len = 30       # every sequence is padded (at the end) / cut to this many tokens

# NOTE(review): the tokenizer is fitted on the whole corpus, i.e. before the
# train/test split below — confirm this is intended.
corpus = df['clean_text']
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(corpus)

# Convert texts to integer id lists, then to a fixed-width matrix.
sequences = tokenizer.texts_to_sequences(corpus)
X = pad_sequences(sequences, maxlen=max_len, padding='post')

# One-hot encode the integer class ids for categorical cross-entropy.
y = to_categorical(df['emotion_label'], num_classes=num_classes)


In [6]:
# Train/Test Split: hold out 20% of the rows for testing, with a fixed seed
# so the same partition is produced on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [7]:
# Build LSTM Model
# Seed the Python, NumPy and TensorFlow random generators so weight
# initialisation and dropout masks are repeatable from run to run.
tf.keras.utils.set_random_seed(42)

model = Sequential([
    # Declare the input shape explicitly. This replaces the Embedding
    # `input_length` argument, which is deprecated in Keras 3, and still lets
    # the model be built up front so model.summary() can report shapes.
    tf.keras.Input(shape=(max_len,)),
    # Token ids -> 64-dimensional dense vectors.
    Embedding(input_dim=max_words, output_dim=64),
    # Single recurrent layer with dropout on both the inputs and the recurrent state.
    LSTM(64, dropout=0.2, recurrent_dropout=0.2),
    Dense(32, activation='relu'),
    Dropout(0.2),
    # One probability per emotion class.
    Dense(num_classes, activation='softmax')
])

# Categorical cross-entropy matches the one-hot targets and softmax output.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [8]:
# Train the Model: 3 epochs, mini-batches of 16, with 10% of the training
# data set aside for validation; per-epoch progress is printed.
fit_options = dict(epochs=3, batch_size=16, validation_split=0.1, verbose=1)
history = model.fit(X_train, y_train, **fit_options)

Epoch 1/3
[1m6820/6820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m807s[0m 117ms/step - accuracy: 0.4557 - loss: 1.2850 - val_accuracy: 0.9079 - val_loss: 0.2482
Epoch 2/3
[1m6820/6820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m830s[0m 113ms/step - accuracy: 0.9118 - loss: 0.2285 - val_accuracy: 0.9142 - val_loss: 0.1794
Epoch 3/3
[1m6820/6820[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m761s[0m 107ms/step - accuracy: 0.9210 - loss: 0.1731 - val_accuracy: 0.9119 - val_loss: 0.1834


In [14]:
# Evaluate the Model
# Predicted class = index of the highest softmax probability; the true class
# is recovered the same way from the one-hot test targets.
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test, axis=1)

# Report over every class the encoder knows, not only those that happen to
# occur in y_true. LabelEncoder assigns ids 0..n-1 in the order of
# `classes_`, so this keeps each report row paired with the right name even
# if a class is missing from the test labels.
class_ids = np.arange(len(label_encoder.classes_))

print("\nAccuracy:", accuracy_score(y_true, y_pred))
print("\nClassification Report:")
print(classification_report(
    y_true,
    y_pred,
    labels=class_ids,
    target_names=label_encoder.classes_,
    zero_division=0,  # classes with no samples/predictions score 0 without warnings
))

[1m948/948[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 19ms/step

Accuracy: 0.9142225594668603

Classification Report:
              precision    recall  f1-score   support

       anger       0.96      0.86      0.91      4124
        fear       0.86      0.87      0.86      3420
         joy       0.91      0.96      0.93     10359
        love       0.90      0.74      0.81      2480
         sad       0.94      0.97      0.96      8813
     suprise       0.81      0.77      0.79      1115

    accuracy                           0.91     30311
   macro avg       0.90      0.86      0.88     30311
weighted avg       0.91      0.91      0.91     30311



In [25]:
# Testing on a New Sentence
def predict_emotion(sentence):
    """Return the emotion name the trained model predicts for one sentence.

    The sentence goes through the same pipeline as the training data:
    clean_text, the fitted tokenizer, and post-padding to max_len. The class
    with the highest predicted probability is mapped back to its original
    name through label_encoder.
    """
    token_ids = tokenizer.texts_to_sequences([clean_text(sentence)])
    model_input = pad_sequences(token_ids, maxlen=max_len, padding='post')
    probabilities = model.predict(model_input)
    best_class = np.argmax(probabilities)
    return label_encoder.inverse_transform([best_class])[0]

# Example: classify one hand-written sentence and print the result.
test_sentence = (
    "I would think that whomever would be lucky enough to stay in this suite "
    "must feel like it is the most romantic place on earth"
)
predicted_emotion = predict_emotion(test_sentence)

print(f"\nTest Sentence: '{test_sentence}'")
print(f"Predicted Emotion: {predicted_emotion}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step

Test Sentence: 'I would think that whomever would be lucky enough to stay in this suite must feel like it is the most romantic place on earth'
Predicted Emotion: love
