# Ensemble Learning with Traditional ML Models

In [2]:
import pandas  as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.metrics import f1_score
import optuna

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas  as pd
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
X_train = np.load("data/Regular_processed/Without_Ordinal_Encoding/X_train.npy")
X_valid = np.load("data/Regular_processed/Without_Ordinal_Encoding/X_valid.npy")
y_train = np.load("data/Regular_processed/Without_Ordinal_Encoding/y_train.npy")
y_valid = np.load("data/Regular_processed/Without_Ordinal_Encoding/y_valid.npy")
X_test = np.load("data/Regular_Processed/Without_Ordinal_Encoding/X_test.npy")

for arr in [X_train, X_valid, y_train, y_valid, X_test]:
    print(arr.shape)

(988, 66)
(247, 66)
(988,)
(247,)
(824, 66)


# Loading Traditional ML Models

### Random Forest

In [4]:
from sklearn.ensemble import RandomForestClassifier

random_forest_classifier = RandomForestClassifier(max_depth=23, random_state=123, criterion='entropy', n_estimators=28)
random_forest_classifier.fit(X_train, y_train)
train_preds = random_forest_classifier.predict(X_train)
train_f1_score = f1_score(y_true=y_train, y_pred=train_preds, average='micro')
valid_preds = random_forest_classifier.predict(X_valid)
valid_f1_score = f1_score(y_true=y_valid, y_pred=valid_preds, average='micro')

print(f"Train F1 Score: {train_f1_score}\nValid F1 Score: {valid_f1_score}")

Train F1 Score: 1.0
Valid F1 Score: 0.7449392712550608


In [44]:
def objective(trial):
    # Suggest values for the hyperparameters using a trial object.
    rf_criterion = trial.suggest_categorical('rf_criterion', ['gini', 'entropy', 'log_loss'])
    rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32, log=True)
    rf_n_estimators = trial.suggest_int('rf_n_estimators', 2, 32, log=True)
    classifier_obj = RandomForestClassifier(max_depth=rf_max_depth, n_estimators=rf_n_estimators, random_state=123, criterion=rf_criterion)
    classifier_obj.fit(X_train, y_train)
    train_preds = classifier_obj.predict(X_train)
    train_f1_score = f1_score(y_true=y_train, y_pred=train_preds, average='micro')
    valid_preds_optuna = classifier_obj.predict(X_valid)
    valid_f1_score = f1_score(y_true=y_valid, y_pred=valid_preds_optuna, average='micro')
    
    return valid_f1_score

# Create a study object and optimize the objective function.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

[I 2024-04-02 16:53:50,739] A new study created in memory with name: no-name-72ad23b7-72ab-492b-932a-c8f45e46a98f
[I 2024-04-02 16:53:50,765] Trial 0 finished with value: 0.6680161943319838 and parameters: {'rf_criterion': 'log_loss', 'rf_max_depth': 3, 'rf_n_estimators': 16}. Best is trial 0 with value: 0.6680161943319838.
[I 2024-04-02 16:53:50,774] Trial 1 finished with value: 0.5870445344129555 and parameters: {'rf_criterion': 'gini', 'rf_max_depth': 10, 'rf_n_estimators': 4}. Best is trial 0 with value: 0.6680161943319838.
[I 2024-04-02 16:53:50,782] Trial 2 finished with value: 0.6558704453441295 and parameters: {'rf_criterion': 'gini', 'rf_max_depth': 2, 'rf_n_estimators': 7}. Best is trial 0 with value: 0.6680161943319838.
[I 2024-04-02 16:53:50,789] Trial 3 finished with value: 0.680161943319838 and parameters: {'rf_criterion': 'gini', 'rf_max_depth': 7, 'rf_n_estimators': 4}. Best is trial 3 with value: 0.680161943319838.
[I 2024-04-02 16:53:50,794] Trial 4 finished with valu

In [45]:
print(f"Best Trial\nValid Micro Averaged F1 Score: {study.best_trial.values}\nHyperParameters = {study.best_trial.params}")

Best Trial
Valid Micro Averaged F1 Score: [0.7449392712550608]
HyperParameters = {'rf_criterion': 'entropy', 'rf_max_depth': 23, 'rf_n_estimators': 28}


### Bernoulli Naive Bayes

In [11]:
from sklearn.naive_bayes import BernoulliNB

naive_bayes_classifier = BernoulliNB()
naive_bayes_classifier.fit(X_train, y_train)
train_preds = naive_bayes_classifier.predict(X_train)
train_f1_score = f1_score(y_true=y_train, y_pred=train_preds, average='micro')
valid_preds = naive_bayes_classifier.predict(X_valid)
valid_f1_score = f1_score(y_true=y_valid, y_pred=valid_preds, average='micro')

print(f"Train F1 Score: {train_f1_score}\nValid F1 Score: {valid_f1_score}")

Train F1 Score: 0.6973684210526315
Valid F1 Score: 0.6680161943319838


### Nearest Centroid

In [32]:
from sklearn.neighbors import NearestCentroid

nearest_centroid_classifier = NearestCentroid(metric='correlation')
nearest_centroid_classifier.fit(X_train, y_train)
train_preds = nearest_centroid_classifier.predict(X_train)
train_f1_score = f1_score(y_true=y_train, y_pred=train_preds, average='micro')
valid_preds = nearest_centroid_classifier.predict(X_valid)
valid_f1_score = f1_score(y_true=y_valid, y_pred=valid_preds, average='micro')

print(f"Train F1 Score: {train_f1_score}\nValid F1 Score: {valid_f1_score}")

Train F1 Score: 0.6589068825910931
Valid F1 Score: 0.6720647773279352




In [30]:
def objective(trial):
    # Suggest values for the hyperparameters using a trial object.
    rf_metric = trial.suggest_categorical('rf_metric', ['hamming', 'cosine', 'euclidean', 'yule', 'russellrao', 'jaccard', 'chebyshev',
                                                        'sokalmichener', 'sqeuclidean', 'matching', 'rogerstanimoto', 'nan_euclidean',
                                                        'dice', 'sokalsneath', 'correlation', 'canberra', 'manhattan', 'braycurtis'])
    classifier_obj = NearestCentroid(metric=rf_metric)
    classifier_obj.fit(X_train, y_train)
    train_preds = classifier_obj.predict(X_train)
    train_f1_score = f1_score(y_true=y_train, y_pred=train_preds, average='micro')
    valid_preds_optuna = classifier_obj.predict(X_valid)
    valid_f1_score = f1_score(y_true=y_valid, y_pred=valid_preds_optuna, average='micro')
    
    return valid_f1_score

# Create a study object and optimize the objective function.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[I 2024-04-02 23:31:53,106] A new study created in memory with name: no-name-571f008c-2f95-44bb-aae4-f9b1238fe200
[I 2024-04-02 23:31:53,116] Trial 0 finished with value: 0.3076923076923077 and parameters: {'rf_metric': 'jaccard'}. Best is trial 0 with value: 0.3076923076923077.
[I 2024-04-02 23:31:53,122] Trial 1 finished with value: 0.3076923076923077 and parameters: {'rf_metric': 'yule'}. Best is trial 0 with value: 0.3076923076923077.
[I 2024-04-02 23:31:53,128] Trial 2 finished with value: 0.6680161943319838 and parameters: {'rf_metric': 'nan_euclidean'}. Best is trial 2 with value: 0.6680161943319838.
[I 2024-04-02 23:31:53,132] Trial 3 finished with value: 0.6680161943319838 and parameters: {'rf_metric': 'nan_euclidean'}. Best is trial 2 with value: 0.6680161943319838.
[I 2024-04-02 23:31:53,136] Trial 4 finished with value: 0.3076923076923077 and parameters: {'rf_metric': 'sokalsneath'}. Best is trial 2 with value: 0.6680161943319838.
[I 2024-04-02 23:31:53,140] Trial 5 finishe

In [31]:
print(f"Best Trial\nValid Micro Averaged F1 Score: {study.best_trial.values}\nHyperParameters = {study.best_trial.params}")

Best Trial
Valid Micro Averaged F1 Score: [0.6720647773279352]
HyperParameters = {'rf_metric': 'correlation'}
