In [1]:
# Importing the required libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, Bidirectional, LSTM, Flatten, GlobalMaxPool1D, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras import backend as K

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

import seaborn as sns

2024-04-03 14:20:36.147066: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-03 14:20:36.147128: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-03 14:20:36.148882: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-04-03 14:20:36.157208: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
train = pd.read_csv('emotions_all.csv') # My dataset

train.head() # Display the first 5 rows of the train dataset

Unnamed: 0,sentence,emotion
0,That game hurt.,sadness
1,Man I love reddit.,happiness
2,Right? Considering its such an important docum...,happiness
3,"He isn't as big, but he's still quite popular....",disgust
4,That's crazy; I went to a super [RELIGION] hig...,happiness


In [3]:
# Checking for the amount of values in the dataset
train['emotion'].value_counts()

emotion
happiness    210501
sadness      125218
anger         63873
fear          46478
surprise      19845
disgust        5702
Name: count, dtype: int64

In [4]:
# Drop the last 100k happiness emotions in the emotion column
train = train.drop(train[train['emotion'] == 'happiness'].index[100000:])

# Reset the index
train = train.reset_index(drop=True)

# Display the first 5 rows
train.head()

Unnamed: 0,sentence,emotion
0,That game hurt.,sadness
1,Man I love reddit.,happiness
2,Right? Considering its such an important docum...,happiness
3,"He isn't as big, but he's still quite popular....",disgust
4,That's crazy; I went to a super [RELIGION] hig...,happiness


In [5]:
# Amount of emotions in emotion column
train['emotion'].value_counts()

emotion
sadness      125218
happiness    100000
anger         63873
fear          46478
surprise      19845
disgust        5702
Name: count, dtype: int64

In [6]:
# Importing the "test" dataset
test = pd.read_csv('test_group.csv', sep='\t')

test.head() #Display the first 5 rows of the test dataset

Unnamed: 0,id,sentence
0,0,Girls are happy when they get flowers
1,1,His jaw dropped in disbelief when he saw the p...
2,2,Sometimes the ugly stench makes me wanna throw...
3,3,The foul odor from the garbage bin was disgust...
4,4,"I can’t believe it, they lost the game in the ..."


In [8]:
# Extracting the sentences and labels from the training and testing datasets.
train_sentences = train['sentence'].values
train_labels = train['emotion'].values
test_sentences = test['sentence'].values

In [9]:
# Preprocess labels
label_encoder = LabelEncoder()
train_labels_encoded = label_encoder.fit_transform(train_labels)
train_labels_one_hot = tf.keras.utils.to_categorical(train_labels_encoded)

In [10]:
# Tokenize sentences
tokenizer = Tokenizer(oov_token='<OOV>', num_words=1000000)
tokenizer.fit_on_texts(train_sentences)
tokenizer.fit_on_texts(test_sentences)
train_sequences = tokenizer.texts_to_sequences(train_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

In [11]:
# Pad sequences for training data
max_length = max(len(x) for x in train_sequences)
train_padded = pad_sequences(train_sequences, maxlen=max_length, padding='post')

# Pad sequences for testing data using the same max_length
test_padded = pad_sequences(test_sequences, maxlen=max_length, padding='post')

In [12]:
# Split the dataset into training and testing sets
X_train, X_val, y_train, y_val = train_test_split(train_padded, train_labels_one_hot, test_size=0.2, random_state=42)

In [13]:
# Showing the unique values in the emotion column
unique_emotions = train['emotion'].unique()
num_emotions = len(unique_emotions)
print("Unique emotions:", unique_emotions)
print("Number of unique emotions:", num_emotions)

Unique emotions: ['sadness' 'happiness' 'disgust' 'surprise' 'anger' 'fear']
Number of unique emotions: 6


In [18]:
# Define the parameters
vocab_size = len(tokenizer.word_index) + 1  # Vocabulary size
embedding_dim = 64  # Dimensionality of the embedding layer
num_emotions = len(unique_emotions)  # Number of unique emotions

# Defining the RNN model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=231),  # Embedding layer
    SimpleRNN(516, return_sequences=True),  # Use SimpleRNN instead of LSTM
    Dropout(0.5),  # Dropout for regularization
    Dense(256, activation='relu'),
    GlobalMaxPool1D(),  # Global max pooling
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dropout(0.5),  # Additional dropout for regularization
    Dense(num_emotions, activation='softmax')  # Output layer for emotion classification
])

model.summary() # Display the model summary

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 231, 64)           4913600   
                                                                 
 simple_rnn_1 (SimpleRNN)    (None, 231, 516)          299796    
                                                                 
 dropout_2 (Dropout)         (None, 231, 516)          0         
                                                                 
 dense_4 (Dense)             (None, 231, 256)          132352    
                                                                 
 global_max_pooling1d_1 (Gl  (None, 256)               0         
 obalMaxPooling1D)                                               
                                                                 
 dense_5 (Dense)             (None, 128)               32896     
                                                      

In [19]:
def f1_score(y_true, y_pred):
    # Calculate Precision and Recall
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))

    precision = true_positives / (predicted_positives + K.epsilon())
    recall = true_positives / (possible_positives + K.epsilon())

    # Calculate F1 score
    f1_val = 2*(precision*recall)/(precision+recall+K.epsilon())
    return f1_val

In [20]:
# Compiling the model
model.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=[f1_score])

In [21]:
# Training the model with early stopping
history = model.fit(X_train, y_train, 
                    epochs=100, 
                    batch_size=128, 
                    validation_data=(X_val, y_val),
                    verbose=1, 
                    callbacks=[tf.keras.callbacks.EarlyStopping(patience=5)])

Epoch 1/100


2024-04-03 14:21:23.006275: I external/local_xla/xla/service/service.cc:168] XLA service 0x7f09b43cf140 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-04-03 14:21:23.006309: I external/local_xla/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA RTX A6000, Compute Capability 8.6
2024-04-03 14:21:23.014030: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:269] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-04-03 14:21:23.268051: I external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:454] Loaded cuDNN version 8906
I0000 00:00:1712154083.376981  226905 device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of the process.


Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100


In [27]:
y_pred = model.predict(test_padded) # For Kaggle submission



In [29]:
# Convert predictions to label indices
predicted_class_indices = np.argmax(y_pred, axis=1)
true_classes = np.argmax(y_val, axis=1)

# Use the LabelEncoder to decode the predictions to original labels
predicted_emotions = label_encoder.inverse_transform(predicted_class_indices)

In [30]:
# Add predictions to my test dataset
test['emotion'] = predicted_emotions

In [31]:
test.head()

Unnamed: 0,id,sentence,emotion
0,0,Girls are happy when they get flowers,sadness
1,1,His jaw dropped in disbelief when he saw the p...,sadness
2,2,Sometimes the ugly stench makes me wanna throw...,sadness
3,3,The foul odor from the garbage bin was disgust...,sadness
4,4,"I can’t believe it, they lost the game in the ...",sadness


In [32]:
# Drop sentence column
test = test.drop(columns=['sentence'])

test.head()

Unnamed: 0,id,emotion
0,0,sadness
1,1,sadness
2,2,sadness
3,3,sadness
4,4,sadness


In [33]:
test.to_csv('rnn_model_max.csv', index=False)