# Install Weights & Biases package

In [1]:
! pip install wandb




[notice] A new release of pip is available: 24.0 -> 24.1.2
[notice] To update, run: C:\Users\panko\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.10_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


# Import & log into Weights and Biases

In [2]:
import wandb
from wandb.integration.keras import WandbCallback



In [3]:
wandb.login()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\panko\_netrc


True

# Initialize the project

In [4]:
wandb.init(project="poleval-2024-emotion-recognition")

[34m[1mwandb[0m: Currently logged in as: [33mkosternaj[0m. Use [1m`wandb login --relogin`[0m to force relogin


# Configure hyperparameters

In [5]:
config = wandb.config
config.learning_rate = 0.001
config.epochs = 200
config.batch_size = 10
config.max_iter = 2000

# Load Libraries

In [6]:
import pandas as pd
import numpy as np

In [7]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score

In [8]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

In [9]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten

In [10]:
from joblib import dump, load
from scipy.stats import mode

In [11]:
from joblib import dump, load
from scipy.stats import mode

# Change the directory

In [12]:
import os
os.chdir("../../")
# from google.colab import drive
# drive.mount('/content/drive/')
# %cd drive/MyDrive/poleval_emotion/

# Load the data

In [13]:
X = pd.read_csv('data/train/concated_for_ensemble_final.csv')
y = pd.read_csv('data/train/expected.tsv', sep='\t')

# Split the data

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1613)

# Model definitions (CNN & Naive Bayes: later))

In [15]:
models = {
    'random_forest': RandomForestClassifier(),
    'xgboost': XGBClassifier(),
    'mlp': MLPClassifier(max_iter=2000)
}

# Hyperparameters for tuning

In [16]:
'''
# For testing
param_grids = {
    'random_forest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10],
        'bootstrap': [True]
    },
    'xgboost': {
        'n_estimators': [100],
        'learning_rate': [0.01],
        'max_depth': [3, 5],
        'gamma': [0]
    },
    'mlp': {
    'hidden_layer_sizes': [(100,)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001]
    }
}
'''
param_grids = {
    'random_forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'bootstrap': [True, False]
    },
    'xgboost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7, 9],
        'gamma': [0, 0.1]
    },
    'mlp': {
        'hidden_layer_sizes': [(100,), (50, 50), (100, 50), (50, 50, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001]
    }
}

# Define the CNN model

In [17]:
def create_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(11, activation='sigmoid'))  # Assuming 11 classes
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Directory to save models

In [18]:
model_dir = 'models/ensemble_final/'
os.makedirs(model_dir, exist_ok=True)

# Function to perform hyperparameter tuning

In [19]:
def tune_model(model, param_grid):
    if param_grid:
        grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)
        return grid_search.best_estimator_
    else:
        model.fit(X_train, y_train)
        return model

# Train models with standard multi-label classification approach

In [20]:
trained_models = {}
for model_name in ['random_forest', 'xgboost', 'mlp']:
    print(f"Training model: {model_name}")
    best_estimator = tune_model(models[model_name], param_grids[model_name])
    multi_target_model = MultiOutputClassifier(best_estimator)
    multi_target_model.fit(X_train, y_train)
    y_pred = multi_target_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    
    # Log metrics into wandb
    wandb.log({f"accuracy_{model_name}": accuracy})
    
    print(f"Accuracy for {model_name}: {accuracy}\n")
    trained_models[model_name] = multi_target_model
    dump(multi_target_model, os.path.join(model_dir, f'{model_name}.joblib'))

Training model: random_forest
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Accuracy for random_forest: 0.9191073919107392

Training model: xgboost
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Accuracy for xgboost: 0.9149232914923291

Training model: mlp
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Accuracy for mlp: 0.9135285913528591



# Train the CNN model

In [26]:
print("Training model: cnn")
cnn_model = create_cnn_model((X_train.shape[1], 1))
cnn_model.fit(X_train.values[..., np.newaxis], y_train.values, 
              epochs=config.epochs, batch_size=config.batch_size, 
              verbose=1)
cnn_y_pred = cnn_model.predict(X_test.values[..., np.newaxis])
cnn_y_pred = (cnn_y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions
cnn_accuracy = accuracy_score(y_test, cnn_y_pred)

# Log metrics into wandb
wandb.log({"accuracy_cnn": cnn_accuracy})
print(f"Accuracy for cnn: {cnn_accuracy}\n")
cnn_model.save(os.path.join(model_dir, 'cnn.h5'))
trained_models['cnn'] = cnn_model

Training model: cnn
Epoch 1/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - accuracy: 0.1430 - loss: 0.1296
Epoch 2/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.1972 - loss: 0.0493
Epoch 3/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.2437 - loss: 0.0455
Epoch 4/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.2216 - loss: 0.0489
Epoch 5/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.2582 - loss: 0.0419
Epoch 6/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3118 - loss: 0.0449
Epoch 7/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.3196 - loss: 0.0402
Epoch 8/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 3ms/step - accuracy: 0.2979 - loss: 0.0413
Epoch 9/200




Accuracy for cnn: 0.895397489539749



# Train the Naive Bayes models independently for each label

In [27]:
print("Training model: naive_bayes")
nb_models = []
nb_accuracies = []
for i in range(y_train.shape[1]):
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train.iloc[:, i])
    nb_models.append(nb_model)
    y_pred = nb_model.predict(X_test)
    accuracy = accuracy_score(y_test.iloc[:, i], y_pred)
    nb_accuracies.append(accuracy)
    
    # Log metrics into wandb
    wandb.log({f"accuracy_naive_bayes_{y_train.columns[i]}": accuracy})
    print(f"Accuracy for label {y_train.columns[i]} with Naive Bayes: {accuracy}")

mean_nb_accuracy = np.mean(nb_accuracies)

# # Log mean accuracy into wandb
wandb.log({"mean_accuracy_naive_bayes": mean_nb_accuracy})
print(f"\nMean Accuracy for Naive Bayes: {mean_nb_accuracy}")
dump(nb_models, os.path.join(model_dir, 'naive_bayes.joblib'))
trained_models['naive_bayes'] = nb_models

Training model: naive_bayes
Accuracy for label Joy with Naive Bayes: 0.9609483960948396
Accuracy for label Trust with Naive Bayes: 0.7580195258019525
Accuracy for label Anticipation with Naive Bayes: 0.99302649930265
Accuracy for label Surprise with Naive Bayes: 0.7580195258019525
Accuracy for label Fear with Naive Bayes: 0.9693165969316597
Accuracy for label Sadness with Naive Bayes: 0.9567642956764296
Accuracy for label Disgust with Naive Bayes: 0.8521617852161785
Accuracy for label Anger with Naive Bayes: 0.8486750348675035
Accuracy for label Positive with Naive Bayes: 0.9665271966527197
Accuracy for label Negative with Naive Bayes: 0.9288702928870293
Accuracy for label Neutral with Naive Bayes: 0.9532775453277545

Mean Accuracy for Naive Bayes: 0.9041460631418791


# Super ensemble model using majority voting

In [28]:
def super_ensemble_predict(models, X):
    predictions = []
    for model_name, model in models.items():
        if model_name == 'cnn':
            pred = model.predict(X.values[..., np.newaxis])
            pred = (pred > 0.5).astype(int)  # Convert probabilities to binary predictions
        elif model_name == 'naive_bayes':
            pred = np.column_stack([nb_model.predict(X) for nb_model in model])
        else:
            pred = model.predict(X)
        predictions.append(pred)
    predictions = np.array(predictions)
    majority_vote = mode(predictions, axis=0).mode[0]
    return majority_vote

# Save the super ensemble model function

In [29]:
dump(super_ensemble_predict, os.path.join(model_dir, 'super_ensemble_model.joblib'))

['models/ensemble_final/super_ensemble_model.joblib']

# Load test datasets

In [30]:
X_testA = pd.read_csv('data/testA/concated_for_ensemble_final.csv')
X_testB = pd.read_csv('data/testB/concated_for_ensemble_final.csv')

# Predict and save the results for testA

In [31]:
predictions_testA = super_ensemble_predict(trained_models, X_testA)
predictions_testA_df = pd.DataFrame(predictions_testA, columns=y.columns)
os.makedirs('predictions/testA/ensemble_final/', exist_ok=True)
predictions_testA_df.to_csv('predictions/testA/ensemble_final/predictions.csv', index=False)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  majority_vote = mode(predictions, axis=0).mode[0]


In [33]:
wandb.save('predictions/testA/ensemble_final/predictions_wandb.csv')

[]

# Predict and save the results for testB

In [34]:
predictions_testB = super_ensemble_predict(trained_models, X_testB)
predictions_testB_df = pd.DataFrame(predictions_testB, columns=y.columns)
os.makedirs('predictions/testB/ensemble_final/', exist_ok=True)
predictions_testB_df.to_csv('predictions/testB/ensemble_final/predictions.csv', index=False)

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  majority_vote = mode(predictions, axis=0).mode[0]


In [35]:
wandb.save('predictions/testB/ensemble_final/predictions_wandb.csv')

[]

In [36]:
print("Model training, saving, and predictions complete.")

Model training, saving, and predictions complete.
