In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, RNN, SimpleRNN, Flatten, Dropout
from tensorflow.keras.optimizers import Adam, AdamW, SGD
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences



In [None]:
# Read the pre-split train and test CSV files.
train_data = pd.read_csv('train_split.csv')
test_data = pd.read_csv('test_split.csv')

# Fit the Keras tokenizer on the training texts only; the test texts are
# encoded below with the vocabulary learned here.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_data['text'])

# Encode every text as a list of word indices and pad/truncate each list
# to exactly max_len entries (pad_sequences defaults are used).
max_len = 128
X_train = pad_sequences(tokenizer.texts_to_sequences(train_data['text']), maxlen=max_len)
X_test = pad_sequences(tokenizer.texts_to_sequences(test_data['text']), maxlen=max_len)

# Multi-label targets: the five emotion columns are taken from the frames
# unchanged, one column per label.
emotion_columns = ['Joy', 'Fear', 'Anger', 'Sadness', 'Surprise']
y_train = train_data[emotion_columns].values
y_test = test_data[emotion_columns].values


In [None]:
from gensim.models import Word2Vec

# Dimensionality of the Word2Vec vectors and of the Keras embedding layer.
embedding_dim = 100

# Build the Word2Vec corpus from the fitted Keras tokenizer so that Word2Vec
# is trained on exactly the tokens stored in tokenizer.word_index.
# A plain str.split() keeps case and attached punctuation (e.g. "Happy!"),
# whereas the Keras Tokenizer lower-cases and strips punctuation by default
# (e.g. "happy"); with mismatched tokens the lookup below misses and the
# corresponding embedding rows silently stay all-zero.
w2v_sentences = [
    [tokenizer.index_word[idx] for idx in seq]
    for seq in tokenizer.texts_to_sequences(train_data['text'])
]

# Create Word2Vec embeddings
word2vec = Word2Vec(sentences=w2v_sentences, vector_size=embedding_dim, window=5, min_count=1, workers=4)

# word_index is 1-based; row 0 is left for the padding index.
vocab_size = len(tokenizer.word_index) + 1

# Create an embedding matrix: row i holds the Word2Vec vector of the word
# whose tokenizer index is i. Words missing from Word2Vec keep a zero row.
embedding_matrix = np.zeros((vocab_size, embedding_dim))
for word, i in tokenizer.word_index.items():
    if word in word2vec.wv:
        embedding_matrix[i] = word2vec.wv[word]

# Define embedding layer (frozen: the Word2Vec vectors are not fine-tuned)
embedding_layer = Embedding(input_dim=vocab_size, output_dim=embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=False)




In [None]:
from gensim.models import Word2Vec

# Train Word2Vec model on the dataset (if not pretrained).
# The corpus is produced by the fitted Keras tokenizer rather than by
# str.split(): the Keras Tokenizer lower-cases and strips punctuation by
# default, so splitting on whitespace would give Word2Vec tokens such as
# "Happy!" that never match the tokenizer's "happy" when the embedding
# matrix is filled from tokenizer.word_index.
sentences = [
    [tokenizer.index_word[idx] for idx in seq]
    for seq in tokenizer.texts_to_sequences(train_data['text'])
]
word2vec = Word2Vec(sentences=sentences, vector_size=100, window=5, min_count=1, workers=4)

# Save the Word2Vec model
word2vec.save("word2vec_model.model")


In [None]:
# One row per tokenizer index, plus one extra row for index 0.
vocab_size = len(tokenizer.word_index) + 1

# Start from an all-zero (vocab_size x 100) matrix; rows of words that
# Word2Vec does not know are left as zeros.
embedding_matrix = np.zeros((vocab_size, 100))

# Copy the Word2Vec vector of every known word into the row given by the
# word's tokenizer index.
known_words = [
    (row_idx, token)
    for token, row_idx in tokenizer.word_index.items()
    if token in word2vec.wv
]
for row_idx, token in known_words:
    embedding_matrix[row_idx] = word2vec.wv[token]


In [None]:
# Persist the embedding matrix to disk in NumPy's .npy format.
np.save(file='embedding_matrix.npy', arr=embedding_matrix)


In [None]:
# Load the embedding matrix from .npy file
embedding_matrix = np.load('embedding_matrix.npy')

# Take the layer dimensions from the loaded matrix itself instead of from
# kernel globals / a hard-coded 100, so the layer always agrees with the
# file on disk even if vocab_size is stale or undefined in this session.
loaded_vocab_size, loaded_embedding_dim = embedding_matrix.shape

# Create an Embedding layer in Keras using the loaded embedding matrix.
# trainable=False keeps the pretrained vectors frozen during training.
embedding_layer = Embedding(input_dim=loaded_vocab_size,
                            output_dim=loaded_embedding_dim,
                            weights=[embedding_matrix],
                            input_length=max_len,
                            trainable=False)




In [None]:
def create_ffnn():
    """Return an uncompiled feed-forward classifier on the shared embedding.

    Layer stack: ``embedding_layer`` -> Flatten -> Dense(64, relu) ->
    Dropout(0.5) -> Dense(5, sigmoid).
    """
    layer_stack = [
        embedding_layer,
        Flatten(),
        Dense(64, activation='relu'),
        Dropout(0.5),
        Dense(5, activation='sigmoid'),
    ]
    return Sequential(layer_stack)


In [None]:
def create_rnn():
    """Return an uncompiled SimpleRNN classifier on the shared embedding.

    Layer stack: ``embedding_layer`` -> SimpleRNN(64, last state only) ->
    Dropout(0.5) -> Dense(5, sigmoid).
    """
    layer_stack = [
        embedding_layer,
        SimpleRNN(64, return_sequences=False),
        Dropout(0.5),
        Dense(5, activation='sigmoid'),
    ]
    return Sequential(layer_stack)



In [None]:
def create_lstm():
    """Return an uncompiled LSTM classifier on the shared embedding.

    Layer stack: ``embedding_layer`` -> LSTM(64, last state only) ->
    Dropout(0.5) -> Dense(5, sigmoid).
    """
    layer_stack = [
        embedding_layer,
        LSTM(64, return_sequences=False),
        Dropout(0.5),
        Dense(5, activation='sigmoid'),
    ]
    return Sequential(layer_stack)


In [None]:
# Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
# batch shuffling are as repeatable as the backend allows.
tf.keras.utils.set_random_seed(42)

# Candidate architectures, keyed by the name used in the final report.
models = {
    'FFNN': create_ffnn(),
    'RNN': create_rnn(),
    'LSTM': create_lstm()
}

# Hold out an explicit validation split from the training data. Model
# selection is done on this split, NOT on the test set: picking the best of
# several models by test score and then reporting that same test score
# gives an optimistically biased estimate.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, random_state=42
)

# Start below any attainable F1 so the first trained model is always
# recorded; with a start value of 0 and a strict '>' comparison, a run in
# which every model scores 0 would leave best_model as None and crash on save.
best_f1 = float('-inf')
best_model_name = None
best_model = None

for name, model in models.items():
    # 'binary_accuracy' is requested explicitly: with several sigmoid outputs
    # the generic 'accuracy' alias can resolve to categorical (argmax)
    # accuracy, which is not meaningful for multi-label targets.
    model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=['binary_accuracy'])

    # Stop once validation loss has not improved for 5 epochs and roll back
    # to the best weights, instead of always running all 100 epochs.
    early_stopping = tf.keras.callbacks.EarlyStopping(
        monitor='val_loss', patience=5, restore_best_weights=True
    )
    history = model.fit(
        X_fit, y_fit,
        epochs=100,
        batch_size=32,
        validation_data=(X_val, y_val),
        callbacks=[early_stopping],
    )

    # Threshold the sigmoid outputs at 0.5 and score on the validation split.
    # zero_division=0 scores labels with no predicted positives as 0 without
    # emitting an UndefinedMetricWarning.
    y_val_pred = (model.predict(X_val) > 0.5).astype(int)
    f1_macro = f1_score(y_val, y_val_pred, average='macro', zero_division=0)

    if f1_macro > best_f1:
        best_f1 = f1_macro
        best_model_name = name
        best_model = model

# Save the best model
best_model.save('best_model.h5')
print(f"Best model is {best_model_name} with validation F1-macro score: {best_f1}")


Epoch 1/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step - accuracy: 0.3424 - loss: 0.6390 - val_accuracy: 0.5437 - val_loss: 0.5802
Epoch 2/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4799 - loss: 0.5977 - val_accuracy: 0.5688 - val_loss: 0.5724
Epoch 3/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.4865 - loss: 0.5945 - val_accuracy: 0.5656 - val_loss: 0.5681
Epoch 4/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 7ms/step - accuracy: 0.5109 - loss: 0.5784 - val_accuracy: 0.5656 - val_loss: 0.5678
Epoch 5/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.5225 - loss: 0.5793 - val_accuracy: 0.5656 - val_loss: 0.5654
Epoch 6/100
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.5094 - loss: 0.5801 - val_accuracy: 0.5656 - val_loss: 0.5685
Epoch 7/100
[1m40/40[0m [32m━━



Best model is RNN with F1-macro score: 0.2253645571967104


In [None]:
# Load the best model from disk. compile=False skips restoring the
# optimizer/loss/metrics, which are not needed for inference-only use.
best_model = tf.keras.models.load_model('best_model.h5', compile=False)

# Evaluate on test data: threshold the sigmoid outputs at 0.5 to obtain
# 0/1 predictions for every label.
y_pred_test = (best_model.predict(X_test) > 0.5).astype(int)

# zero_division=0 scores labels that were never predicted as 0 explicitly,
# instead of falling back to the warn-and-return-0 default.
f1_test = f1_score(y_test, y_pred_test, average='macro', zero_division=0)
print(f"F1-macro score on test data: {f1_test}")




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step
F1-macro score on test data: 0.2253645571967104


In [None]:
from sklearn.metrics import classification_report


In [None]:
# Load the best model from disk. compile=False skips restoring the
# optimizer/loss/metrics, which are not needed for inference-only use.
best_model = tf.keras.models.load_model('best_model.h5', compile=False)

# Predict on the test dataset and threshold the sigmoid outputs at 0.5 to
# obtain 0/1 predictions for multi-label classification.
y_pred_test = (best_model.predict(X_test) > 0.5).astype(int)

# Generate classification report. zero_division=0 reports precision/F1 as 0
# for labels with no predicted positives without raising the
# UndefinedMetricWarning that the default setting emits.
emotion_labels = ['Joy', 'Fear', 'Anger', 'Sadness', 'Surprise']
report = classification_report(
    y_test,
    y_pred_test,
    target_names=emotion_labels,
    zero_division=0,
)
print(report)




[1m13/13[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
              precision    recall  f1-score   support

         Joy       0.36      0.10      0.15        94
        Fear       0.57      0.72      0.64       232
       Anger       0.00      0.00      0.00        52
     Sadness       0.50      0.12      0.19       126
    Surprise       0.31      0.10      0.15       124

   micro avg       0.52      0.32      0.40       628
   macro avg       0.35      0.21      0.23       628
weighted avg       0.43      0.32      0.33       628
 samples avg       0.42      0.31      0.33       628



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
