## Step 1: Import necessary libraries

This step involves importing the required libraries for data manipulation, visualization, machine learning utilities, and TensorFlow for building the GRU model.


In [None]:
# Step 1: Import necessary libraries
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Importing machine learning utilities
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler

# Importing TensorFlow libraries for building the GRU model
import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, GRU, Reshape
from tensorflow.keras.models import Sequential

# Silence every Python warning for the rest of the session.
# NOTE(review): this is a blanket filter, so library deprecation and pandas
# chained-assignment warnings raised by later cells are hidden as well.
warnings.filterwarnings("ignore")

## Step 2: Load Data

In this step, we load the training, testing, and validation datasets from their respective CSV files using Pandas.


In [None]:
# Locations of the three dataset splits, relative to the notebook.
train_file_path = "./data/train.csv"
test_file_path = "./data/test.csv"
val_file_path = "./data/val.csv"

# Read every split the same way. on_bad_lines="skip" makes pandas drop
# malformed CSV rows instead of raising, so row counts may be lower than the
# raw line counts of the files.
train_df, test_df, val_df = (
    pd.read_csv(csv_path, on_bad_lines="skip")
    for csv_path in (train_file_path, test_file_path, val_file_path)
)

## Preview the Training Data


In [None]:
# Show only the first rows; .info() in the next cell reports the full size.
train_df.head()

In [None]:
# Column names, dtypes and non-null counts of the training split as loaded.
train_df.info()

In [None]:
# Summary statistics of the training split.
train_df.describe()

In [None]:
# Number of missing values in each column of the training split.
train_df.isnull().sum()

## Preview the Test Data


In [None]:
# Show only the first rows; .info() in the next cell reports the full size.
test_df.head()

In [None]:
# Column names, dtypes and non-null counts of the test split as loaded.
test_df.info()

In [None]:
# Summary statistics of the test split.
test_df.describe()

In [None]:
# Number of missing values in each column of the test split.
test_df.isnull().sum()

## Preview the Validation Data


In [None]:
# Show only the first rows; .info() below reports the full size.
val_df.head()

In [None]:
# Per-column dtypes of the validation split as loaded.
val_df.dtypes

In [None]:
# Column names, dtypes and non-null counts of the validation split as loaded.
val_df.info()

In [None]:
# Summary statistics of the validation split.
val_df.describe()

In [None]:
# Number of missing values in each column of the validation split.
val_df.isnull().sum()

## Step 3: Data Preprocessing

In this step, we remove any rows with missing values from the training, testing, and validation datasets.


### Handling Missing Values


In [None]:
# Discard every row that has at least one missing value, in all three splits.
train_df, test_df, val_df = (
    split.dropna() for split in (train_df, test_df, val_df)
)

### Verify the Missing-Value Handling


In [None]:
# After dropna() every per-column count here should be zero.
train_df.isnull().sum()

In [None]:
# After dropna() every per-column count here should be zero.
test_df.isnull().sum()

In [None]:
# After dropna() every per-column count here should be zero.
val_df.isnull().sum()

In [None]:
# Per-column dtypes of the training split after removing incomplete rows.
train_df.dtypes

In [None]:
# Show only the first rows of the cleaned training split.
train_df.head()

## Step 4: Data Filtering

In this step, we filter the rows of the datasets to include only those with valid gender values. We define the set of valid gender values as "Male", "Female", and "Non-binary".


In [None]:
# Gender values accepted as valid; rows holding anything else are discarded.
valid_genders = {"Male", "Female", "Non-binary"}

# Keep only the rows whose 'Gender' is one of the valid values. The explicit
# .copy() gives each split its own independent frame, so the column
# assignments performed in the next step write to real data instead of to a
# slice of the previous frame (the chained-assignment hazard pandas warns about).
train_df = train_df[train_df["Gender"].isin(valid_genders)].copy()
test_df = test_df[test_df["Gender"].isin(valid_genders)].copy()
val_df = val_df[val_df["Gender"].isin(valid_genders)].copy()

# Display the dtypes of the filtered training split.
train_df.dtypes

## Step 5: Data Type Conversion

In this step, we convert the data types of certain columns to ensure consistency and suitability for analysis:

- Convert 'User_ID' to integer.
- Convert 'Age' to numeric, replacing non-numeric values with NaN, then to integer.
- Drop rows with NaN values in the 'Age' column.
- Convert 'Gender', 'Platform', and 'Dominant_Emotion' columns to string.


In [None]:
def _coerce_column_types(df: pd.DataFrame) -> pd.DataFrame:
    """Return a new frame with the dataset's columns cast to their working dtypes.

    Steps, in order:
      1. 'User_ID' is cast to int.
      2. 'Age' is parsed as a number; values that cannot be parsed become NaN.
      3. Rows whose 'Age' is NaN are dropped.
      4. 'Age' is cast to int and 'Gender', 'Platform', 'Dominant_Emotion' to str.

    The input frame is not modified; every step produces a new object, which
    avoids assigning columns on a filtered slice.

    Raises:
        ValueError: if a 'User_ID' value cannot be converted to int.
    """
    return (
        df.assign(
            User_ID=lambda frame: frame["User_ID"].astype(int),
            Age=lambda frame: pd.to_numeric(frame["Age"], errors="coerce"),
        )
        .dropna(subset=["Age"])
        .astype(
            {
                "Age": int,
                "Gender": str,
                "Platform": str,
                "Dominant_Emotion": str,
            }
        )
    )


# Apply the identical conversion to every split.
train_df = _coerce_column_types(train_df)
test_df = _coerce_column_types(test_df)
val_df = _coerce_column_types(val_df)

In [None]:
# Confirm the converted dtypes of the training split.
train_df.dtypes

In [None]:
# Confirm the converted dtypes of the test split.
test_df.dtypes

In [None]:
# Confirm the converted dtypes of the validation split.
val_df.dtypes

## Step 6: Label Encoding

In this step, we ensure consistent label encoding for the target variable ('Dominant_Emotion') across all data splits:

- We use a LabelEncoder to encode the target variable.
- Labels from the training, testing, and validation sets are combined to create a unified label encoder.


In [None]:
# A single encoder is shared by all splits so that a given emotion is always
# mapped to the same integer.
label_encoder = LabelEncoder()

# Gather the target labels of the three splits (train, then test, then
# validation) so the encoder knows every class that occurs in any of them.
all_labels = [
    *train_df["Dominant_Emotion"],
    *test_df["Dominant_Emotion"],
    *val_df["Dominant_Emotion"],
]
label_encoder.fit(all_labels)

In [None]:
# First five raw (string) target labels of the test split.
test_df["Dominant_Emotion"].tolist()[:5]

## Step 7: Encoding Target Variable and Splitting Features

In this step, we encode the target variable ('Dominant_Emotion') into numerical values using the previously fitted label encoder. Then, we split the datasets into features (X) and target (y):

- Encode the target variable into numerical values for training, testing, and validation sets.
- Split the datasets into features (X) and target (y) by dropping the 'Dominant_Emotion' column.
- Identify the numeric columns in the feature datasets.


In [None]:
# Turn the string emotion labels of each split into integer class ids using
# the shared, already-fitted encoder.
y_train_encoded, y_test_encoded, y_val_encoded = (
    label_encoder.transform(split["Dominant_Emotion"])
    for split in (train_df, test_df, val_df)
)

In [None]:
# First five encoded test labels; compare with the raw labels shown earlier.
y_test_encoded[:5]

### Define features and target variable for each dataset


In [None]:
# Feature frames: every column except the target 'Dominant_Emotion'.
X_train = train_df.drop(columns="Dominant_Emotion")
X_test = test_df.drop(columns="Dominant_Emotion")
X_val = val_df.drop(columns="Dominant_Emotion")

# Preview the first rows only instead of rendering the whole frame.
X_test.head()

## Step 8: Feature Scaling

In this step, we standardize the numeric features using StandardScaler:

- Identify the numeric columns in the feature datasets.
- Fit the StandardScaler on the training features only, then apply that same fitted scaler to the training, testing, and validation sets.
- Verify the scaling by displaying a sample of the scaled features from the training data.


In [None]:
# Columns that are numeric only because they are identifiers, not measurements.
# 'User_ID' is cast to int during type conversion, so without this exclusion it
# would be selected below, standardized and fed to the model as a feature.
# NOTE(review): assumes User_ID is an arbitrary row identifier — confirm.
identifier_columns = {"User_ID"}

# Numeric feature columns used for scaling and as model input.
numeric_columns = [
    column
    for column in X_train.select_dtypes(include=[np.number]).columns
    if column not in identifier_columns
]
numeric_columns

In [None]:
# Learn the per-column mean and standard deviation from the training split
# only, then apply that same transformation to all three splits so the test
# and validation data never influence the scaling statistics.
scaler = StandardScaler()
scaler.fit(X_train[numeric_columns])

X_train_scaled = scaler.transform(X_train[numeric_columns])
X_test_scaled = scaler.transform(X_test[numeric_columns])
X_val_scaled = scaler.transform(X_val[numeric_columns])

In [None]:
# Sanity check: print the first five standardized training rows.
print(
    "Scaled feature sample (first 5 rows of the training data):",
    X_train_scaled[:5],
    sep="\n",
)

## Step 9: Build the GRU Model

In this step, we build the GRU (Gated Recurrent Unit) model using TensorFlow's Keras API:

- Define a Sequential model.
- Reshape the input data to fit the GRU layer.
- Add a GRU layer with 128 units and return sequences.
- Apply a dropout layer with a dropout rate of 0.2.
- Add a dense layer with 64 units and ReLU activation.
- Finally, add a dense output layer with the number of units equal to the number of classes in the target variable and softmax activation.


In [None]:
model = Sequential()

# Each flat feature vector is presented to the GRU as a sequence of length 1.
model.add(
    Reshape((1, X_train_scaled.shape[1]), input_shape=(X_train_scaled.shape[1],))
)

# return_sequences=True keeps the (length-1) time axis, so every following
# layer - including the softmax output - produces a 3-D tensor of shape
# (batch, 1, units). The prediction-decoding cell relies on this layout.
model.add(GRU(units=128, return_sequences=True))
model.add(Dropout(0.2))
model.add(Dense(64, activation="relu"))

# One output unit per emotion class known to the label encoder.
model.add(Dense(len(label_encoder.classes_), activation="softmax"))

### Model Summary

Below is the summary of the GRU model architecture:


In [None]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

## Step 10: Compile the Model

In this step, we compile the GRU model:

- We use the Adam optimizer.
- The loss function is set to sparse categorical crossentropy, suitable for multi-class classification tasks.
- We track accuracy as a metric.


In [None]:
# Step 10: Compile the model.
# The sparse loss variant is used because the targets are integer class ids
# produced by the label encoder, not one-hot vectors.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)

## Step 11: Set up TensorBoard Callback and Train the Model

In this step, we set up TensorBoard callback to visualize the training process:

- Define the directory where TensorBoard logs will be stored.
- Set up the TensorBoard callback to monitor training progress and visualize it using TensorBoard.
- Train the GRU model using the training data and validate it on the validation data.
- We train the model for 200 epochs with a batch size of 32.


In [None]:
# Step 11a: TensorBoard logging. Event files are written under ./logs, with
# histograms recorded every epoch (histogram_freq=1).
log_dir = "./logs"
tensorboard_callback = tf.keras.callbacks.TensorBoard(
    histogram_freq=1,
    log_dir=log_dir,
)

# Step 11b: Train on the scaled training split; the validation split is
# evaluated at the end of every epoch and recorded in `history`.
history = model.fit(
    x=X_train_scaled,
    y=y_train_encoded,
    batch_size=32,
    epochs=200,
    callbacks=[tensorboard_callback],
    validation_data=(X_val_scaled, y_val_encoded),
)

## Step 12: Evaluate the Model

In this step, we evaluate the trained GRU model on the test dataset to assess its performance:

- Compute the test loss and accuracy using the test dataset.
- Print out the test loss and accuracy metrics.


In [None]:
# Loss and accuracy of the trained model on the held-out test split.
test_loss, test_accuracy = model.evaluate(x=X_test_scaled, y=y_test_encoded)

print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

## Step 13: Final Results Visualization and Analysis

In this step, we make predictions on the validation dataset using the trained model and visualize the confusion matrix:

- Predict the labels for the validation dataset using the trained GRU model.
- Decode the predicted labels back to their original emotion labels using the label encoder.
- Compute the confusion matrix using the true and predicted labels.
- Visualize the confusion matrix using a heatmap.


### Predictions on Validation Data

In this step, we make predictions on the validation dataset using the trained GRU model.


In [None]:
# Class probabilities for every validation sample. Because the model reshapes
# its input to a length-1 sequence and the GRU returns sequences, the result
# is expected to be 3-D, (n_samples, 1, n_classes) - the decoding cell below
# takes the argmax over that last axis.
y_pred = model.predict(X_val_scaled)
y_pred

### Decode Predicted Labels

In this step, we decode the predicted labels back to their original emotion labels using the label encoder.


In [None]:
# Most probable class id for each validation sample, as a flat 1-D array.
# The class scores live on the LAST axis of `y_pred`, so axis=-1 is used
# instead of a hard-coded axis=2: it yields the same result for the current
# (n_samples, 1, n_classes) output and keeps working if the model is changed
# to emit (n_samples, n_classes).
y_pred_val_encoded = np.argmax(y_pred, axis=-1).reshape(-1)

# Translate the integer class ids back into emotion names, shaped as one row
# per validation sample.
y_pred_labels = label_encoder.inverse_transform(y_pred_val_encoded).reshape(
    y_pred.shape[0], -1
)

### Get Unique Labels

In this step, we obtain the unique emotion labels from the validation dataset.


In [None]:
# True emotion names of the validation split.
y_val = list(val_df["Dominant_Emotion"])

# Class names used to label the confusion matrix and the classification report.
# scikit-learn builds both from the sorted union of class ids that appear in
# the true OR the predicted labels, so the names must be derived from that same
# union. Using only the validation truth would mislabel the matrix (and make
# classification_report raise) whenever the model predicts a class that does
# not occur in the validation targets.
observed_class_ids = np.unique(np.concatenate([y_val_encoded, y_pred_val_encoded]))
unique_labels = label_encoder.inverse_transform(observed_class_ids)
unique_labels

### Confusion Matrix

In this step, we compute the confusion matrix using the true and predicted labels.


In [None]:
# Rows are the true classes, columns the predicted classes (integer ids).
cf_matrix = confusion_matrix(
    y_true=y_val_encoded,
    y_pred=y_pred_val_encoded,
)
cf_matrix

### Visualization: Confusion Matrix Heatmap

In this step, we visualize the confusion matrix using a heatmap.


In [None]:
# Heatmap of the validation confusion matrix with integer cell annotations.
ax = sns.heatmap(
    cf_matrix,
    annot=True,
    fmt="d",
    xticklabels=unique_labels,
    yticklabels=unique_labels,
    cmap="Blues",
)

# confusion_matrix puts true classes on rows and predictions on columns, so
# label the axes accordingly; the trailing ';' hides the returned repr.
ax.set(
    xlabel="Predicted emotion",
    ylabel="True emotion",
    title="Validation Confusion Matrix",
);

### Classification Report

In this step, we calculate and display a classification report, which includes precision, recall, and F1-score for each class.


In [None]:
# Per-class precision, recall and F1 on the validation split.
# (classification_report is imported with the other libraries in the first cell.)
classification_rep = classification_report(
    y_val_encoded, y_pred_val_encoded, target_names=unique_labels
)

# Print the classification report
print("Classification Report:")
print(classification_rep)

### Accuracy Plot Over Epochs


In [None]:
# Training vs. validation accuracy per epoch, taken from the History object
# returned by model.fit. (matplotlib is imported in the first cell.)
fig, ax = plt.subplots()
ax.plot(history.history["accuracy"], label="Training Accuracy")
ax.plot(history.history["val_accuracy"], label="Validation Accuracy")
ax.set(
    xlabel="Epoch",
    ylabel="Accuracy",
    title="Training and Validation Accuracy Over Epochs",
)
ax.legend()
plt.show()

## Step 14: Hyperparameter Tuning

In this step, we experiment with different hyperparameters (e.g., dropout rates) to optimize the model's performance.


In [None]:
# Example: Experimenting with different dropout rates
dropout_rates = [0.1, 0.2, 0.3]

# Validation accuracy obtained for each dropout rate.
tuning_results = {}

for rate in dropout_rates:
    # A fresh model per candidate. Dedicated names are used so the model and
    # training history produced by the main training step are not overwritten.
    tuning_model = Sequential(
        [
            Reshape(
                (1, X_train_scaled.shape[1]), input_shape=(X_train_scaled.shape[1],)
            ),
            GRU(units=128, return_sequences=True),
            Dropout(rate),  # Try different dropout rates
            Dense(64, activation="relu"),
            Dense(len(label_encoder.classes_), activation="softmax"),
        ]
    )

    tuning_model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

    # One TensorBoard run directory per candidate; sharing a single callback
    # would write every run into the same folder and overlay their curves.
    run_callback = tf.keras.callbacks.TensorBoard(
        log_dir=f"{log_dir}/dropout_{rate}", histogram_freq=1
    )

    # Train the model
    tuning_model.fit(
        X_train_scaled,
        y_train_encoded,
        validation_data=(X_val_scaled, y_val_encoded),
        epochs=200,
        batch_size=32,
        callbacks=[run_callback],
        verbose=0,  # Suppress output for brevity
    )

    # Compare candidates on the VALIDATION split. Choosing a hyperparameter by
    # its test score would leak the test set into model selection and make the
    # reported test accuracy optimistic.
    _, val_accuracy = tuning_model.evaluate(X_val_scaled, y_val_encoded, verbose=0)
    tuning_results[rate] = val_accuracy
    print(f"Dropout Rate: {rate}, Validation Accuracy: {val_accuracy}")

# Dropout rate with the highest validation accuracy.
best_dropout_rate = max(tuning_results, key=tuning_results.get)
print(f"Best Dropout Rate: {best_dropout_rate}")

## Step 15: Cross-Validation

In this step, we perform k-fold cross-validation to obtain a more robust estimate of the model's performance and assess its generalization ability.


In [None]:
# Example: Perform 5-fold cross-validation
# (StratifiedKFold is imported with the other libraries in the first cell.)
kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Start from the UNSCALED numeric training features: the scaler is refitted
# inside every fold so that the held-out fold never contributes to the mean
# and standard deviation used to standardize it.
X_train_numeric = X_train[numeric_columns].to_numpy()

accuracies = []

for train_index, val_index in kfold.split(X_train_numeric, y_train_encoded):
    # Split the data and scale it with statistics from the training part only
    fold_scaler = StandardScaler()
    X_train_fold = fold_scaler.fit_transform(X_train_numeric[train_index])
    X_val_fold = fold_scaler.transform(X_train_numeric[val_index])
    y_train_fold, y_val_fold = y_train_encoded[train_index], y_train_encoded[val_index]

    # Define and compile a fresh model for this fold (same architecture as
    # before); a dedicated name keeps the main trained `model` intact.
    fold_model = Sequential(
        [
            Reshape((1, X_train_fold.shape[1]), input_shape=(X_train_fold.shape[1],)),
            GRU(units=128, return_sequences=True),
            Dropout(0.2),
            Dense(64, activation="relu"),
            Dense(len(label_encoder.classes_), activation="softmax"),
        ]
    )
    fold_model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

    # Train the model
    fold_model.fit(X_train_fold, y_train_fold, epochs=200, batch_size=32, verbose=0)

    # Evaluate the model on validation fold
    _, accuracy = fold_model.evaluate(X_val_fold, y_val_fold, verbose=0)
    accuracies.append(accuracy)

# Calculate mean and standard deviation of accuracies
mean_accuracy = np.mean(accuracies)
std_accuracy = np.std(accuracies)
print(f"Mean Accuracy: {mean_accuracy}, Standard Deviation: {std_accuracy}")