In [1]:
# Importing necessary libraries
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

In [2]:
# Load the three dataset splits. Each file is read as semicolon-separated text
# with no header row; the two columns are named "text" and "emotions".
train_data, val_data, test_data = (
    pd.read_csv(split_path, sep=';', header=None, names=["text", "emotions"])
    for split_path in ('train.txt', 'val.txt', 'test.txt')
)

In [3]:
# Encoding emotion labels into numerical values using LabelEncoder.
# The encoder is fitted on the training split ONLY; validation and test labels
# are mapped with the already-fitted encoder via `transform`. Re-fitting on each
# split (as `fit_transform` does) would rebuild the label -> integer mapping per
# split, so the same emotion could receive different integers whenever a split
# is missing a class, and `label_encoder.classes_` would end up describing the
# last split fitted rather than the training data.
# `transform` raises ValueError if a split contains a label never seen in train.
from sklearn.preprocessing import LabelEncoder
label_encoder = LabelEncoder()
train_data['encoded_label'] = label_encoder.fit_transform(train_data['emotions'])
val_data['encoded_label'] = label_encoder.transform(val_data['emotions'])
test_data['encoded_label'] = label_encoder.transform(test_data['emotions'])

In [4]:
# Pull the raw text and the integer-encoded emotion label out of the training
# frame as two plain Python lists.
train_sentences, train_labels = (
    train_data[column_name].tolist() for column_name in ('text', 'encoded_label')
)

In [5]:
# Same extraction for the validation frame: text and encoded label as lists.
val_sentences, val_labels = (
    val_data[column_name].tolist() for column_name in ('text', 'encoded_label')
)

In [6]:
# Same extraction for the test frame: text and encoded label as lists.
test_sentences, test_labels = (
    test_data[column_name].tolist() for column_name in ('text', 'encoded_label')
)

In [7]:
# Shared preprocessing resources used by the tokenization cells below:
# the English stop-word list (as a set for fast membership tests) and a
# WordNet lemmatizer instance.
stop_words = {*stopwords.words('english')}
lemmatizer = WordNetLemmatizer()

In [8]:
# Build one token list per training sentence: lowercase the sentence, tokenize
# it, keep only alphanumeric tokens that are not stop words, and lemmatize
# each token that survives the filter.
train_tokens = [
    [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    for text in train_sentences
]

In [9]:
# Validation split: lowercase, tokenize, drop non-alphanumeric tokens and
# stop words, then lemmatize what remains (one token list per sentence).
val_tokens = [
    [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    for text in val_sentences
]

In [10]:
# Test split: lowercase, tokenize, drop non-alphanumeric tokens and
# stop words, then lemmatize what remains (one token list per sentence).
test_tokens = [
    [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(text.lower())
        if token.isalnum() and token not in stop_words
    ]
    for text in test_sentences
]

In [11]:
# Using Keras Tokenizer for converting words to sequences and padding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [12]:
# Sequence-encoding hyperparameters
max_sequence_length = 100  # every sequence is padded/truncated to this length
max_vocab_size = 10000  # cap on the number of words kept in the vocabulary

# Build the word index from the training tokens only; "<oov>" is the
# placeholder token used for out-of-vocabulary words.
tokenizer = Tokenizer(
    oov_token="<oov>",
    num_words=max_vocab_size,
)
tokenizer.fit_on_texts(train_tokens)

In [13]:
# Convert each split's token lists to integer sequences with the fitted
# tokenizer, then pad them at the end ('post') to max_sequence_length.
X_train, X_val, X_test = (
    pad_sequences(
        tokenizer.texts_to_sequences(split_tokens),
        maxlen=max_sequence_length,
        padding='post',
    )
    for split_tokens in (train_tokens, val_tokens, test_tokens)
)

In [14]:
# Target vectors: the encoded_label column of each split as a numpy array
import numpy as np
y_train, y_val, y_test = (
    np.array(split_frame['encoded_label'])
    for split_frame in (train_data, val_data, test_data)
)

In [15]:
# Importing necessary Keras modules for building the model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Bidirectional, Dense, Dropout, LSTM, Embedding

# Number of output units: one per class known to the label encoder
num_classes = len(label_encoder.classes_)

# Model architecture: embedding -> bidirectional LSTM -> softmax classifier.
# The embedding table size is tied to `max_vocab_size` (the same value passed
# to the Tokenizer as `num_words`) instead of a repeated literal, so changing
# the vocabulary cap in one place cannot leave the embedding too small for the
# token indices the tokenizer emits.
model = Sequential([
    Embedding(input_dim=max_vocab_size, output_dim=128),
    Bidirectional(LSTM(128, dropout=0.2, recurrent_dropout=0.2)),
    Dense(num_classes, activation='softmax')
])

In [16]:
# Configure training: Adam optimizer, sparse categorical cross-entropy loss
# (targets are integer class ids), and accuracy as the tracked metric.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)

In [17]:
# Fit on the training split for 10 epochs in mini-batches of 32, reporting
# loss/accuracy on the validation split after every epoch.
model.fit(
    X_train,
    y_train,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

Epoch 1/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 74ms/step - accuracy: 0.4792 - loss: 1.3365 - val_accuracy: 0.8770 - val_loss: 0.3445
Epoch 2/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 75ms/step - accuracy: 0.9269 - loss: 0.2098 - val_accuracy: 0.9100 - val_loss: 0.2378
Epoch 3/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 75ms/step - accuracy: 0.9621 - loss: 0.1050 - val_accuracy: 0.9045 - val_loss: 0.2505
Epoch 4/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 76ms/step - accuracy: 0.9747 - loss: 0.0701 - val_accuracy: 0.9140 - val_loss: 0.2448
Epoch 5/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 77ms/step - accuracy: 0.9820 - loss: 0.0509 - val_accuracy: 0.9095 - val_loss: 0.3063
Epoch 6/10
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 81ms/step - accuracy: 0.9862 - loss: 0.0405 - val_accuracy: 0.9120 - val_loss: 0.2864
Epoch 7/10
[1m5

<keras.src.callbacks.history.History at 0x22ec3a819d0>

In [18]:
# Score the trained model on the test split; evaluate() yields the loss
# followed by the compiled metric (accuracy).
test_metrics = model.evaluate(X_test, y_test)
test_loss, test_accuracy = test_metrics
print(f"Test Accuracy: {test_accuracy:.2f}")

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 26ms/step - accuracy: 0.9001 - loss: 0.3779
Test Accuracy: 0.90


In [19]:
# Turn the per-class scores into emotion names: take the index of the highest
# score in each row and look that index up in the encoder's class list.
predictions = model.predict(X_test)
predicted_class_ids = np.argmax(predictions, axis=1)
predicted_labels = list(label_encoder.classes_[predicted_class_ids])

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 33ms/step


In [20]:
# Generating a classification report (per-class precision / recall / F1 and
# support) comparing the true emotion names with the predicted ones.
# `labels` is passed explicitly alongside `target_names`: without it sklearn
# infers the label set from the data, and if any class is absent from both the
# ground truth and the predictions the inferred set no longer matches the
# length of `target_names`, which makes the report fail.
from sklearn.metrics import classification_report
print(classification_report(
    test_data['emotions'],
    predicted_labels,
    labels=label_encoder.classes_,
    target_names=label_encoder.classes_,
))

              precision    recall  f1-score   support

       anger       0.91      0.87      0.89       275
        fear       0.81      0.92      0.86       224
         joy       0.92      0.95      0.93       695
        love       0.84      0.68      0.75       159
     sadness       0.94      0.94      0.94       581
    surprise       0.88      0.67      0.76        66

    accuracy                           0.90      2000
   macro avg       0.88      0.84      0.86      2000
weighted avg       0.90      0.90      0.90      2000

