# Load Libraries

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import accuracy_score



In [3]:
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB

In [4]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Flatten

In [5]:
from joblib import dump, load
from scipy.stats import mode

# Set the random seed

In [6]:
from joblib import dump, load
from scipy.stats import mode

# Change the directory

In [7]:
import os
os.chdir("../../")
# from google.colab import drive
# drive.mount('/content/drive/')
# %cd drive/MyDrive/poleval_emotion/

# Load the data

In [8]:
X = pd.read_csv('data/train/concated_for_ensemble_final.csv')
y = pd.read_csv('data/train/expected.tsv', sep='\t')

# Split the data

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1613)

# Model definitions (CNN & Naive Bayes: later))

In [10]:
models = {
    'random_forest': RandomForestClassifier(),
    'xgboost': XGBClassifier(),
    'mlp': MLPClassifier(max_iter=2000)
}

# Hyperparameters for tuning

In [11]:
'''
# For testing
param_grids = {
    'random_forest': {
        'n_estimators': [100, 200],
        'max_depth': [None, 10],
        'bootstrap': [True]
    },
    'xgboost': {
        'n_estimators': [100],
        'learning_rate': [0.01],
        'max_depth': [3, 5],
        'gamma': [0]
    },
    'mlp': {
    'hidden_layer_sizes': [(100,)],
    'activation': ['relu'],
    'solver': ['adam'],
    'alpha': [0.0001, 0.001]
    }
}
'''
param_grids = {
    'random_forest': {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'bootstrap': [True, False]
    },
    'xgboost': {
        'n_estimators': [100, 200, 300],
        'learning_rate': [0.01, 0.05, 0.1],
        'max_depth': [3, 5, 7, 9],
        'gamma': [0, 0.1]
    },
    'mlp': {
        'hidden_layer_sizes': [(100,), (50, 50), (100, 50), (50, 50, 50)],
        'activation': ['relu', 'tanh'],
        'solver': ['adam', 'sgd'],
        'alpha': [0.0001, 0.001]
    }
}

# Define the CNN model

In [12]:
def create_cnn_model(input_shape):
    model = Sequential()
    model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=input_shape))
    model.add(Flatten())
    model.add(Dense(100, activation='relu'))
    model.add(Dense(11, activation='sigmoid'))  # Assuming 11 classes
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model

# Directory to save models

In [13]:
model_dir = 'models/ensemble_final/'
os.makedirs(model_dir, exist_ok=True)

# Function to perform hyperparameter tuning

In [14]:
def tune_model(model, param_grid):
    if param_grid:
        grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=-1, verbose=1)
        grid_search.fit(X_train, y_train)
        return grid_search.best_estimator_
    else:
        model.fit(X_train, y_train)
        return model

# Train models with standard multi-label classification approach

In [15]:
trained_models = {}
for model_name in ['random_forest', 'xgboost', 'mlp']:
    print(f"Training model: {model_name}")
    best_estimator = tune_model(models[model_name], param_grids[model_name])
    multi_target_model = MultiOutputClassifier(best_estimator)
    multi_target_model.fit(X_train, y_train)
    y_pred = multi_target_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f"Accuracy for {model_name}: {accuracy}\n")
    trained_models[model_name] = multi_target_model
    dump(multi_target_model, os.path.join(model_dir, f'{model_name}.joblib'))

Training model: random_forest
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Accuracy for random_forest: 0.9205020920502092

Training model: xgboost
Fitting 5 folds for each of 72 candidates, totalling 360 fits
Accuracy for xgboost: 0.9149232914923291

Training model: mlp
Fitting 5 folds for each of 32 candidates, totalling 160 fits
Accuracy for mlp: 0.9149232914923291



# Train the CNN model

In [16]:
print("Training model: cnn")
cnn_model = create_cnn_model((X_train.shape[1], 1))
cnn_model.fit(X_train.values[..., np.newaxis], y_train.values, epochs=200, batch_size=10, verbose=1)
cnn_y_pred = cnn_model.predict(X_test.values[..., np.newaxis])
cnn_y_pred = (cnn_y_pred > 0.5).astype(int)  # Convert probabilities to binary predictions
cnn_accuracy = accuracy_score(y_test, cnn_y_pred)
print(f"Accuracy for cnn: {cnn_accuracy}\n")
cnn_model.save(os.path.join(model_dir, 'cnn.h5'))
trained_models['cnn'] = cnn_model

Training model: cnn
Epoch 1/200


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 6ms/step - accuracy: 0.2662 - loss: 0.1422
Epoch 2/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.2262 - loss: 0.0482
Epoch 3/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.2648 - loss: 0.0436
Epoch 4/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.2677 - loss: 0.0460
Epoch 5/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.3077 - loss: 0.0413
Epoch 6/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.3145 - loss: 0.0424
Epoch 7/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.3663 - loss: 0.0401
Epoch 8/200
[1m574/574[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 5ms/step - accuracy: 0.3977 - loss: 0.0427
Epoch 9/200
[1m574/574[0m [32m━━━━━━━━━━━



Accuracy for cnn: 0.8919107391910739



# Train the Naive Bayes models independently for each label

In [17]:
print("Training model: naive_bayes")
nb_models = []
nb_accuracies = []
for i in range(y_train.shape[1]):
    nb_model = GaussianNB()
    nb_model.fit(X_train, y_train.iloc[:, i])
    nb_models.append(nb_model)
    y_pred = nb_model.predict(X_test)
    accuracy = accuracy_score(y_test.iloc[:, i], y_pred)
    nb_accuracies.append(accuracy)
    print(f"Accuracy for label {y_train.columns[i]} with Naive Bayes: {accuracy}")
mean_nb_accuracy = np.mean(nb_accuracies)
print(f"\nMean Accuracy for Naive Bayes: {mean_nb_accuracy}")
dump(nb_models, os.path.join(model_dir, 'naive_bayes.joblib'))
trained_models['naive_bayes'] = nb_models

Training model: naive_bayes
Accuracy for label Joy with Naive Bayes: 0.9609483960948396
Accuracy for label Trust with Naive Bayes: 0.7580195258019525
Accuracy for label Anticipation with Naive Bayes: 0.99302649930265
Accuracy for label Surprise with Naive Bayes: 0.7580195258019525
Accuracy for label Fear with Naive Bayes: 0.9693165969316597
Accuracy for label Sadness with Naive Bayes: 0.9567642956764296
Accuracy for label Disgust with Naive Bayes: 0.8521617852161785
Accuracy for label Anger with Naive Bayes: 0.8486750348675035
Accuracy for label Positive with Naive Bayes: 0.9665271966527197
Accuracy for label Negative with Naive Bayes: 0.9288702928870293
Accuracy for label Neutral with Naive Bayes: 0.9532775453277545

Mean Accuracy for Naive Bayes: 0.9041460631418791


# Super ensemble model using majority voting

In [18]:
def super_ensemble_predict(models, X):
    predictions = []
    for model_name, model in models.items():
        if model_name == 'cnn':
            pred = model.predict(X.values[..., np.newaxis])
            pred = (pred > 0.5).astype(int)  # Convert probabilities to binary predictions
        elif model_name == 'naive_bayes':
            pred = np.column_stack([nb_model.predict(X) for nb_model in model])
        else:
            pred = model.predict(X)
        predictions.append(pred)
    predictions = np.array(predictions)
    majority_vote = mode(predictions, axis=0).mode[0]
    return majority_vote

# Save the super ensemble model function

In [19]:
dump(super_ensemble_predict, os.path.join(model_dir, 'super_ensemble_model.joblib'))

['models/ensemble_final/super_ensemble_model.joblib']

# Load test datasets

In [20]:
X_testA = pd.read_csv('data/testA/concated_for_ensemble_final.csv')
X_testB = pd.read_csv('data/testB/concated_for_ensemble_final.csv')

# Predict and save the results for testA

In [21]:
predictions_testA = super_ensemble_predict(trained_models, X_testA)
predictions_testA_df = pd.DataFrame(predictions_testA, columns=y.columns)
os.makedirs('predictions/testA/ensemble_final/', exist_ok=True)
predictions_testA_df.to_csv('predictions/testA/ensemble_final/predictions.csv', index=False)

[1m44/44[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  majority_vote = mode(predictions, axis=0).mode[0]


# Predict and save the results for testB

In [22]:
predictions_testB = super_ensemble_predict(trained_models, X_testB)
predictions_testB_df = pd.DataFrame(predictions_testB, columns=y.columns)
os.makedirs('predictions/testB/ensemble_final/', exist_ok=True)
predictions_testB_df.to_csv('predictions/testB/ensemble_final/predictions.csv', index=False)

[1m45/45[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


  majority_vote = mode(predictions, axis=0).mode[0]


In [23]:
print("Model training, saving, and predictions complete.")

Model training, saving, and predictions complete.
