In [26]:
import random
import pandas as pd
import spacy
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SpatialDropout1D, Bidirectional, LSTM, Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.metrics import classification_report


In [14]:
test_df = pd.read_csv('../task4/NLP_features 2.csv')
train_df = pd.read_csv('../datasets/Train_Test_data/Training_dataset.csv')
# Parameters
MAX_NUM_WORDS = 5000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 128


In [23]:
# Print count of each class in the training set
print(train_df['Emotion'].value_counts())
# print count of each class in the test set
print(test_df['Emotion'].value_counts())

Emotion
neutral      1209
surprise     1082
sadness      1047
fear          988
happiness     950
disgust       376
anger         243
Name: count, dtype: int64
Emotion
neutral      255
happiness    250
surprise     147
anger         47
sadness       31
fear          17
disgust        2
Name: count, dtype: int64


---

# Create first RNN model attempt

In [7]:
# Encode labels
label_encoder = LabelEncoder()
train_df['Encoded_Emotion'] = label_encoder.fit_transform(train_df['Emotion'])
test_df['Encoded_Emotion'] = label_encoder.transform(test_df['Emotion'])
num_classes = len(label_encoder.classes_)

# Tokenizer (fit only on training data)
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df['Sentence'])

# Preprocessing
def preprocess_text(texts):
    sequences = tokenizer.texts_to_sequences(texts)
    padded = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    return padded

X_train = preprocess_text(train_df['Sentence'])
X_test = preprocess_text(test_df['Sentence'])

y_train = train_df['Encoded_Emotion'].values
y_test = test_df['Encoded_Emotion'].values

# Build the model
def create_rnn_model():
    model = Sequential()
    model.add(Embedding(input_dim=MAX_NUM_WORDS, output_dim=EMBEDDING_DIM, input_length=MAX_SEQUENCE_LENGTH))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

# Train the model
model = create_rnn_model()
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_split=0.2)

# Evaluate on test set
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {accuracy:.2f}")

# Predict on test data
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

# Print detailed classification report
class_names = label_encoder.classes_
report = classification_report(y_test, y_pred, target_names=class_names)
print("\nClassification Report:\n", report)


Epoch 1/5




[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 58ms/step - accuracy: 0.2384 - loss: 1.8739 - val_accuracy: 0.2273 - val_loss: 1.8513
Epoch 2/5
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - accuracy: 0.4476 - loss: 1.5171 - val_accuracy: 0.3376 - val_loss: 1.5560
Epoch 3/5
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 52ms/step - accuracy: 0.6935 - loss: 0.9031 - val_accuracy: 0.5208 - val_loss: 1.1667
Epoch 4/5
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 51ms/step - accuracy: 0.8322 - loss: 0.5268 - val_accuracy: 0.5530 - val_loss: 1.1645
Epoch 5/5
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 50ms/step - accuracy: 0.8955 - loss: 0.3569 - val_accuracy: 0.5403 - val_loss: 1.2301
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.3441 - loss: 2.3475

Test Accuracy: 0.33
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step

Classif

---

# Second Iteration with Bidirectional LSTM with Dutch Lemmatization, FastText Embeddings, and Class Weights

In [None]:
# Load spaCy Dutch model
nlp = spacy.load('nl_core_news_sm')

# Parameters
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 300
EPOCHS = 10
BATCH_SIZE = 64

# Lemmatization function for Dutch using spaCy
def lemmatize_text(text):
    doc = nlp(text.lower())
    return ' '.join([token.lemma_ for token in doc])

# Apply lemmatization
train_df['Lemmatized_Sentence'] = train_df['Sentence'].apply(lemmatize_text)
test_df['Lemmatized_Sentence'] = test_df['Sentence'].apply(lemmatize_text)

# Encode labels
label_encoder = LabelEncoder()
train_df['Encoded_Emotion'] = label_encoder.fit_transform(train_df['Emotion'])
test_df['Encoded_Emotion'] = label_encoder.transform(test_df['Emotion'])
num_classes = len(label_encoder.classes_)

# Tokenizer
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df['Lemmatized_Sentence'])

def preprocess_text(texts):
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

X_train = preprocess_text(train_df['Lemmatized_Sentence'])
X_test = preprocess_text(test_df['Lemmatized_Sentence'])

y_train = train_df['Encoded_Emotion'].values
y_test = test_df['Encoded_Emotion'].values

# Load FastText Dutch embeddings
embedding_index = {}
embedding_path = 'cc.nl.300.vec'

with open(embedding_path, encoding='utf8') as f:
    next(f)
    for line in f:
        values = line.rstrip().split(' ')
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embedding_index[word] = coefs

# Create embedding matrix
word_index = tokenizer.word_index
embedding_matrix = np.zeros((MAX_NUM_WORDS, EMBEDDING_DIM))
for word, i in word_index.items():
    if i >= MAX_NUM_WORDS:
        continue
    embedding_vector = embedding_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))

# Build the model
def create_bidirectional_lstm():
    model = Sequential()
    model.add(Embedding(input_dim=MAX_NUM_WORDS,
                        output_dim=EMBEDDING_DIM,
                        input_length=MAX_SEQUENCE_LENGTH,
                        weights=[embedding_matrix],
                        trainable=True))
    model.add(SpatialDropout1D(0.3))
    model.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)))
    model.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                optimizer=Adam(learning_rate=0.001),
                metrics=['accuracy'])

    early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

    model.fit(X_train, y_train,
            epochs=50,
            batch_size=BATCH_SIZE,
            validation_split=0.2,
            class_weight=class_weight_dict,
            callbacks=[early_stop])


# Evaluate
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {accuracy:.2f}")

y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print("\nClassification Report:\n", report)


[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - accuracy: 0.2434 - loss: 1.9872

Test Accuracy: 0.22
[1m24/24[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 55ms/step

Classification Report:
               precision    recall  f1-score   support

       anger       0.09      0.04      0.06        47
     disgust       0.00      0.00      0.00         2
        fear       0.03      0.12      0.04        17
   happiness       0.67      0.16      0.25       250
     neutral       0.46      0.31      0.37       255
     sadness       0.05      0.29      0.09        31
    surprise       0.24      0.26      0.25       147

    accuracy                           0.22       749
   macro avg       0.22      0.17      0.15       749
weighted avg       0.44      0.22      0.27       749



---

# Third iteration on RNN model with Data Augmentation via Random Swap & FastText Embeddings"

In [30]:
# Augmentation for underrepresented classes
underrepresented = ['disgust', 'fear', 'anger', 'sadness']
min_samples = 150

def random_swap(text, n_swaps=1):
    words = text.split()
    for _ in range(n_swaps):
        if len(words) < 2:
            break
        idx1, idx2 = random.sample(range(len(words)), 2)
        words[idx1], words[idx2] = words[idx2], words[idx1]
    return ' '.join(words)

def augment_class_swap(df, emotion_label, target_count):
    subset = df[df['Emotion'] == emotion_label]
    augmented_texts = []
    current_count = len(subset)
    
    while current_count < target_count:
        for text in subset['Sentence']:
            augmented_text = random_swap(text)
            augmented_texts.append(augmented_text)
            current_count += 1
            if current_count >= target_count:
                break
    
    new_df = pd.DataFrame({'Sentence': augmented_texts, 'Emotion': [emotion_label]*len(augmented_texts)})
    return pd.concat([df, new_df], ignore_index=True)

for emotion in underrepresented:
    count = test_df[test_df['Emotion'] == emotion].shape[0]
    if count < min_samples:
        test_df = augment_class_swap(test_df, emotion, min_samples)

print("Augmented test data distribution:\n", test_df['Emotion'].value_counts())

# Lemmatization using spaCy
nlp = spacy.load('nl_core_news_sm')

def lemmatize_text(text):
    doc = nlp(text.lower())
    return ' '.join([token.lemma_ for token in doc])

train_df['Lemmatized_Sentence'] = train_df['Sentence'].apply(lemmatize_text)
test_df['Lemmatized_Sentence'] = test_df['Sentence'].apply(lemmatize_text)

# Label encoding
label_encoder = LabelEncoder()
train_df['Encoded_Emotion'] = label_encoder.fit_transform(train_df['Emotion'])
test_df['Encoded_Emotion'] = label_encoder.transform(test_df['Emotion'])
num_classes = len(label_encoder.classes_)

# Constants
MAX_NUM_WORDS = 10000
MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 300
EPOCHS = 10
BATCH_SIZE = 64

# Tokenizer
tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
tokenizer.fit_on_texts(train_df['Lemmatized_Sentence'])

def preprocess_text(texts):
    sequences = tokenizer.texts_to_sequences(texts)
    return pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

X_train = preprocess_text(train_df['Lemmatized_Sentence'])
X_test = preprocess_text(test_df['Lemmatized_Sentence'])

y_train = train_df['Encoded_Emotion'].values
y_test = test_df['Encoded_Emotion'].values

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(enumerate(class_weights))

# Build model without pre-trained embeddings
def create_bidirectional_lstm():
    model = Sequential()
    model.add(Embedding(input_dim=MAX_NUM_WORDS,
                        output_dim=EMBEDDING_DIM,
                        input_length=MAX_SEQUENCE_LENGTH,
                        trainable=True))  # <-- learned from scratch
    model.add(SpatialDropout1D(0.3))
    model.add(Bidirectional(LSTM(128, return_sequences=True, dropout=0.3, recurrent_dropout=0.3)))
    model.add(Bidirectional(LSTM(64, dropout=0.3, recurrent_dropout=0.3)))
    model.add(Dense(64, activation='relu'))
    model.add(Dropout(0.3))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy',
                  optimizer=Adam(learning_rate=0.001),
                  metrics=['accuracy'])
    return model

# Train model
model = create_bidirectional_lstm()

early_stop = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

model.fit(X_train, y_train,
          epochs=EPOCHS,
          batch_size=BATCH_SIZE,
          validation_split=0.2,
          class_weight=class_weight_dict,
          callbacks=[early_stop])

# Evaluate
loss, accuracy = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {accuracy:.2f}")

# Predictions and report
y_pred_probs = model.predict(X_test)
y_pred = np.argmax(y_pred_probs, axis=1)

report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
print("\nClassification Report:\n", report)


Augmented test data distribution:
 Emotion
neutral      255
happiness    250
fear         150
sadness      150
anger        150
disgust      150
surprise     147
Name: count, dtype: int64
Epoch 1/10




[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m97s[0m 1s/step - accuracy: 0.1501 - loss: 1.9912 - val_accuracy: 0.0280 - val_loss: 2.1064
Epoch 2/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m119s[0m 2s/step - accuracy: 0.2902 - loss: 1.6483 - val_accuracy: 0.1433 - val_loss: 1.7862
Epoch 3/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m99s[0m 1s/step - accuracy: 0.5783 - loss: 1.0121 - val_accuracy: 0.3562 - val_loss: 1.4845
Epoch 4/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m131s[0m 2s/step - accuracy: 0.7177 - loss: 0.6803 - val_accuracy: 0.3978 - val_loss: 1.3365
Epoch 5/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m100s[0m 1s/step - accuracy: 0.7813 - loss: 0.4939 - val_accuracy: 0.4224 - val_loss: 1.4792
Epoch 6/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m128s[0m 2s/step - accuracy: 0.8057 - loss: 0.4419 - val_accuracy: 0.4249 - val_loss: 1.5159
Epoch 7/10
[1m74/74[0m [32m━━━━━━━━━━━━━━━━━