In [45]:
import numpy as np
import pandas as pd

# Load data

In [46]:
# Read the emotions dataset from the Kaggle input directory.
# The CSV carries a saved index column that shows up as 'Unnamed: 0';
# only the 'text' and 'label' columns are used further down.
df = pd.read_csv('/kaggle/input/emotions/text.csv')

In [47]:
print('Six categories: sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5)')

Six categories: sadness (0), joy (1), love (2), anger (3), fear (4), and surprise (5)


In [48]:
df.head()

Unnamed: 0.1,Unnamed: 0,text,label
0,0,i just feel really helpless and heavy hearted,4
1,1,ive enjoyed being able to slouch about relax a...,0
2,2,i gave up my internship with the dmrg and am f...,4
3,3,i dont know i feel so lost,0
4,4,i am a kindergarten teacher and i am thoroughl...,4


In [49]:
df.tail()

Unnamed: 0.1,Unnamed: 0,text,label
416804,416804,i feel like telling these horny devils to find...,2
416805,416805,i began to realize that when i was feeling agi...,3
416806,416806,i feel very curious be why previous early dawn...,5
416807,416807,i feel that becuase of the tyranical nature of...,3
416808,416808,i think that after i had spent some time inves...,5


In [50]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416809 entries, 0 to 416808
Data columns (total 3 columns):
 #   Column      Non-Null Count   Dtype 
---  ------      --------------   ----- 
 0   Unnamed: 0  416809 non-null  int64 
 1   text        416809 non-null  object
 2   label       416809 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 9.5+ MB


In [51]:
# Features are the raw sentences, targets are the integer emotion ids.
X, y = df['text'], df['label']
# Sanity check: both should be pandas Series at this point.
print(type(X), type(y), sep='\n')

<class 'pandas.core.series.Series'>
<class 'pandas.core.series.Series'>


# Data processing

In [52]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [53]:
from nltk.corpus import stopwords
# Keep the stop words in a set: the filter below does one membership test per
# word over the whole corpus, and a list would be rescanned on every lookup.
stop_words = set(stopwords.words('english'))
# Negation words are always kept, even where the stop list contains them,
# because dropping them would flip the meaning of a sentence.
important_words = {'no', 'not', 'nor', 'never'}
# Precompute the exact set of words to drop so the per-word test is a single lookup.
removable_words = stop_words - important_words

# Whitespace-split each text, drop removable words (case-insensitively),
# and glue the survivors back into a single space-separated string.
X1 = X.apply(
    lambda x: ' '.join(word for word in x.split() if word.lower() not in removable_words)
)

In [54]:
print(X1.head())
print(type(X1))

0                   feel really helpless heavy hearted
1    ive enjoyed able slouch relax unwind frankly n...
2              gave internship dmrg feeling distraught
3                                  dont know feel lost
4    kindergarten teacher thoroughly weary job take...
Name: text, dtype: object
<class 'pandas.core.series.Series'>


In [55]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Build the word -> integer index from the stop-word-filtered texts.
# num_words caps the vocabulary used when texts are converted to sequences;
# no oov_token is configured here.
# NOTE(review): without an oov_token, words outside the vocabulary are
# presumably dropped from the sequences -- confirm against the Keras docs.
tokenizer = Tokenizer(num_words = 50000)
tokenizer.fit_on_texts(X1)
# Each text becomes a list of word indices; lengths vary until padding is applied.
X_sequences = tokenizer.texts_to_sequences(X1)

In [68]:
import pickle
# Serialize the fitted tokenizer to tokenizer.pkl in the working directory.
# Pickle files should only ever be loaded back from a trusted source.
with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [56]:
X_sequences[0]

[1, 6, 181, 1370, 2981]

In [57]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# The padded width is the token count of the longest sequence in the corpus.
max_len = len(max(X_sequences, key=len))
# padding='post' places the padding after the real tokens.
X_padded = pad_sequences(X_sequences, maxlen=max_len, padding='post')

In [59]:
max_len

82

In [60]:
print(type(X_padded))
print(type(y))

<class 'numpy.ndarray'>
<class 'pandas.core.series.Series'>


In [61]:
from sklearn.model_selection import train_test_split
# Keep 85% of the rows for training and hold out the rest for the final
# evaluation; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X_padded, y, train_size=0.85, random_state=42)

In [62]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(354287, 82)
(354287,)
(62522, 82)
(62522,)


# Build model

In [63]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import *

In [64]:
# Stacked bidirectional-LSTM classifier over the padded index sequences.
# `input_length` is deliberately not passed to Embedding: Keras 3 deprecates
# the argument (it only triggers a warning) and the sequence length is
# inferred from the data the first time the model is called.
model = Sequential([
    # 50001 rows cover every index the tokenizer can emit (num_words=50000)
    # plus the padding index 0.
    Embedding(input_dim=50001, output_dim=128),
    Bidirectional(LSTM(128, return_sequences=True)),
    Bidirectional(LSTM(64, return_sequences=True)),
    BatchNormalization(),
    # Last recurrent layer collapses the sequence to a single vector.
    Bidirectional(LSTM(64, return_sequences=False)),
    Dropout(0.2),
    # One output per emotion class (labels 0-5).
    Dense(6, activation='softmax')
])



In [65]:
# Labels are integer class ids and the output layer is a softmax, hence the
# sparse categorical cross-entropy loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
# 5% of the training rows are set aside for validation while training.
model.fit(
    X_train,
    y_train,
    batch_size=128,
    epochs=10,
    validation_split=0.05,
)

Epoch 1/10


I0000 00:00:1751650717.412531     108 cuda_dnn.cc:529] Loaded cuDNN version 90300


[1m2630/2630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m115s[0m 39ms/step - accuracy: 0.8440 - loss: 0.3791 - val_accuracy: 0.9410 - val_loss: 0.0971
Epoch 2/10
[1m2630/2630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 40ms/step - accuracy: 0.9411 - loss: 0.0963 - val_accuracy: 0.9403 - val_loss: 0.0929
Epoch 3/10
[1m2630/2630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 40ms/step - accuracy: 0.9425 - loss: 0.0900 - val_accuracy: 0.9347 - val_loss: 0.1113
Epoch 4/10
[1m2630/2630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 40ms/step - accuracy: 0.9435 - loss: 0.0857 - val_accuracy: 0.9418 - val_loss: 0.0920
Epoch 5/10
[1m2630/2630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 40ms/step - accuracy: 0.9442 - loss: 0.0838 - val_accuracy: 0.9414 - val_loss: 0.0951
Epoch 6/10
[1m2630/2630[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m105s[0m 40ms/step - accuracy: 0.9442 - loss: 0.0829 - val_accuracy: 0.9415 - val_loss: 0.0937
Epoch 7/1

<keras.src.callbacks.history.History at 0x7f0f804bf4d0>

# Evaluate on the held-out test set

In [66]:
model.evaluate(X_test, y_test)

[1m1954/1954[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 9ms/step - accuracy: 0.9399 - loss: 0.1031


[0.10324002802371979, 0.9395572543144226]

# Save model

In [67]:
# Write the trained model to a single HDF5 file in the working directory.
# NOTE(review): newer Keras releases appear to treat `.h5` as a legacy format
# and suggest `.keras` -- confirm what downstream loaders expect before renaming.
model.save('emotions_model.h5')


# Test the model on a sample sentence

In [69]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from nltk.tokenize import word_tokenize
import nltk

nltk.download('punkt')
nltk.download('stopwords')

from nltk.corpus import stopwords
def preprocess_text(text, tokenizer, max_len):
    """Turn one raw sentence into a padded index array for ``model.predict``.

    The steps mirror the training pipeline: whitespace split, stop-word
    removal that keeps negation words, tokenizer lookup, then post-padding.
    Whitespace splitting is deliberate. The training texts were split with
    ``str.split()``, whereas ``word_tokenize`` breaks words such as
    "cannot" or "gonna" into pieces, so the model would receive different
    tokens at inference time than it was trained on.

    Args:
        text: Raw input sentence.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences``.
        max_len: Target sequence length, passed to ``pad_sequences`` as ``maxlen``.

    Returns:
        The result of ``pad_sequences`` for a one-element batch containing
        the processed sentence.
    """
    stop_words = set(stopwords.words('english'))
    important_words = {'no', 'not', 'nor', 'never'}
    # Words to drop: stop words, except the negations that carry meaning.
    removable_words = stop_words - important_words

    # Same split-and-filter rule that produced the training texts.
    kept_words = [word for word in text.split() if word.lower() not in removable_words]

    seq = tokenizer.texts_to_sequences([' '.join(kept_words)])
    return pad_sequences(seq, maxlen=max_len, padding='post')


[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [70]:
# Use a dedicated name for the sample: rebinding `X` would overwrite the
# training text Series that the stop-word filtering cell reads, so re-running
# that cell afterwards would fail.
sample_text = "I’m completely fed up with this situation, and I’m absolutely furious right now!"
x_input = preprocess_text(sample_text, tokenizer, max_len)

# Predict the class probabilities for the single sample
y_pred = model.predict(x_input)

# Pick the id of the most probable class
predicted_class = y_pred.argmax(axis=1)[0]
print("Predicted class:", predicted_class)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 718ms/step
Predicted class: 3
