This notebook requires an installation of "lazypredict-nightly", which can be installed in a conda environment as follows:
- conda create -n ML python=3.9
- conda activate ML
- pip install lazypredict-nightly
- pip install openpyxl

In [1]:
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

In [2]:
# Load the spreadsheet. The first column is dropped; of the remaining columns,
# the last one holds the class label and the others are the features.
raw_table = pd.read_excel("../data/Training_Dataset_2.xlsx")
data = raw_table.values[:, 1:]

# Standardize every feature column to zero mean and unit variance.
# NOTE(review): mu/std are computed over all rows *before* the train/test split,
# so test rows influence the scaling. Left unchanged here because the saved
# grid-search results loaded further down were presumably produced with this
# scaling -- confirm before changing.
X = data[:, :-1].astype(float)
mu = np.mean(X, axis=0)
std = np.std(X, axis=0)
X = (X - mu) / std

# Encode the class letters as integers. Surrounding whitespace is stripped first
# (the raw data contains "L " as well as "L"), and any label outside the mapping
# raises instead of being passed through silently.
LABEL_TO_INT = {"E": 0, "L": 1, "R": 2, "S": 3, "W": 4}
labels = pd.Series(data[:, -1]).astype(str).str.strip()
y_encoded = labels.map(LABEL_TO_INT)
if y_encoded.isna().any():
    unknown_labels = sorted(set(labels[y_encoded.isna()]))
    raise ValueError(f"Unexpected class labels in the dataset: {unknown_labels}")
y = y_encoded.to_numpy().astype(int)

# Hold out 30% of the rows as the test split (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Lazy predict for model selection

In [4]:
from lazypredict import LazyClassifier

In [5]:
# Configure lazypredict's LazyClassifier: quiet output, warnings suppressed,
# no custom metric, and predictions requested in addition to the score table.
lazy_options = dict(
    verbose=0,
    ignore_warnings=True,
    custom_metric=None,
    predictions=True,
)
clf = LazyClassifier(**lazy_options)

In [6]:
# Carve a 30% hold-out out of the training split so that the model screening
# does not touch the real test split.
lazy_split = train_test_split(X_train, y_train, test_size=0.3, random_state=42)
X_train_lazy, X_test_lazy, y_train_lazy, y_test_lazy = lazy_split

# Fit every candidate classifier and display the resulting score table.
models, predictions = clf.fit(X_train_lazy, X_test_lazy, y_train_lazy, y_test_lazy)
models

100%|██████████| 29/29 [00:03<00:00,  9.46it/s]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2295
[LightGBM] [Info] Number of data points in the train set: 3171, number of used features: 9
[LightGBM] [Info] Start training from score -1.024775
[LightGBM] [Info] Start training from score -1.285295
[LightGBM] [Info] Start training from score -3.217615
[LightGBM] [Info] Start training from score -1.605033
[LightGBM] [Info] Start training from score -2.090540


Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
XGBClassifier,0.93,0.9,,0.93,0.33
LGBMClassifier,0.92,0.9,,0.92,0.19
ExtraTreesClassifier,0.93,0.9,,0.93,0.17
RandomForestClassifier,0.92,0.89,,0.92,0.55
BaggingClassifier,0.9,0.87,,0.9,0.15
LabelPropagation,0.89,0.85,,0.89,0.26
LabelSpreading,0.89,0.85,,0.89,0.35
DecisionTreeClassifier,0.87,0.83,,0.87,0.03
ExtraTreeClassifier,0.85,0.82,,0.85,0.01
KNeighborsClassifier,0.86,0.82,,0.86,0.04


# XGB Classifier

In [7]:
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [8]:
# Hyperparameter grid for XGBClassifier.
# The position of each value matters: the best configuration is later looked up
# by index in the saved accuracy array, so do not reorder or edit these lists
# without regenerating that array.
booster_list = ["gbtree"]
n_estimators_list = [25, 50, 100, 150]
eta_list = [0.01, 0.1, 1.0]
max_depth_list = [5, 10, 30]
lambda_list = [0.0, 5.0, 10.0]
max_leaves_list = [0, 1, 5]

# Reference implementation of the exhaustive search that produced the saved
# accuracy array (kept commented out; the results are loaded from disk instead).
# NOTE(review): every configuration is scored on (X_test, y_test), i.e. the same
# split that is later used to report the final accuracy, so the reported number
# is optimistically biased; a validation split or cross-validation would avoid this.
# NOTE(review): the array is saved to "accuracy_XGB.npy" in the working directory,
# whereas the loading cell reads "data/accuracy_XGB.npy" -- move the file (or
# adjust the path) after re-running.
#
# grid_axes = (booster_list, n_estimators_list, eta_list,
#              max_depth_list, lambda_list, max_leaves_list)
# accuracy_vector = np.full(tuple(len(axis) for axis in grid_axes), -100.0)
#
# for idx in np.ndindex(accuracy_vector.shape):
#     booster, n_estimators, eta, max_depth, reg_lambda, max_leaves = (
#         axis[i] for axis, i in zip(grid_axes, idx)
#     )
#     clf = XGBClassifier(booster=booster,
#                         n_estimators=n_estimators,
#                         eta=eta,
#                         max_depth=max_depth,
#                         reg_lambda=reg_lambda,
#                         max_leaves=max_leaves,
#                         random_state=42)
#     model = clf.fit(X_train, y_train)
#
#     # Predict the labels for the test set and record the accuracy
#     y_pred = model.predict(X_test)
#     accuracy_vector[idx] = accuracy_score(y_test, y_pred)
#
#     print("#####################")
#     print("booster ", booster)
#     print("n_estimators ", n_estimators)
#     print("eta ", eta)
#     print("max_depth ", max_depth)
#     print("lambda ", reg_lambda)
#     print("max_leaves ", max_leaves)
#     print(accuracy_vector[idx])
#
# np.save("accuracy_XGB.npy", accuracy_vector)

In [9]:
# Load the saved grid-search accuracies and find the grid position (one index
# per hyperparameter axis) of the highest accuracy.
accuracy_vector_XGB = np.load("data/accuracy_XGB.npy")
best_flat_position = accuracy_vector_XGB.argmax()
max_index = np.unravel_index(best_flat_position, accuracy_vector_XGB.shape)

In [13]:
# Refit XGBoost on the training split with the best grid-search configuration.
# random_state=42 matches the seed used in the grid search, so the refit is
# reproducible across kernel restarts and consistent with the selected score.
clf = XGBClassifier(
    booster=booster_list[max_index[0]],
    n_estimators=n_estimators_list[max_index[1]],
    eta=eta_list[max_index[2]],
    max_depth=max_depth_list[max_index[3]],
    reg_lambda=lambda_list[max_index[4]],
    max_leaves=max_leaves_list[max_index[5]],
    random_state=42,
)

XGB = clf.fit(X_train, y_train)

# Predict the labels for the test set and display the accuracy
y_pred = XGB.predict(X_test)
accuracy_score(y_test, y_pred)

0.946961894953656

# ExtraTreesClassifier	

In [14]:
from sklearn.ensemble import ExtraTreesClassifier

In [15]:
# Hyperparameter grid for ExtraTreesClassifier.
# The position of each value matters: the best configuration is later looked up
# by index in the saved accuracy array, so do not reorder or edit these lists
# without regenerating that array.
# NOTE(review): recent scikit-learn releases reject an integer min_samples_split
# below 2; the value 1 is kept only because list positions index the saved
# array -- confirm against the installed scikit-learn version.
n_estimators_list = [25, 50, 100, 150]
min_samples_split_list = [1, 2, 5]
max_depth_list = [5, 10, 20, 30]
min_samples_leaf_list = [1, 2, 5]
criterion_list = ["gini", "entropy"]

# Reference implementation of the exhaustive search that produced the saved
# accuracy array (kept commented out; the results are loaded from disk instead).
# NOTE(review): every configuration is scored on (X_test, y_test), i.e. the same
# split that is later used to report the final accuracy, so the reported number
# is optimistically biased; a validation split or cross-validation would avoid this.
# NOTE(review): the array is saved to "accuracy_ExtraTree.npy" in the working
# directory, whereas the loading cell reads "data/accuracy_ExtraTree.npy" --
# move the file (or adjust the path) after re-running.
#
# grid_axes = (n_estimators_list, min_samples_split_list, max_depth_list,
#              min_samples_leaf_list, criterion_list)
# accuracy_vector_extra_tree = np.full(tuple(len(axis) for axis in grid_axes), -100.0)
#
# for idx in np.ndindex(accuracy_vector_extra_tree.shape):
#     n_estimators, min_samples_split, max_depth, min_samples_leaf, criterion = (
#         axis[i] for axis, i in zip(grid_axes, idx)
#     )
#     clf = ExtraTreesClassifier(n_estimators=n_estimators,
#                                min_samples_split=min_samples_split,
#                                max_depth=max_depth,
#                                min_samples_leaf=min_samples_leaf,
#                                criterion=criterion,
#                                random_state=42)
#     model = clf.fit(X_train, y_train)
#
#     # Predict the labels for the test set and record the accuracy
#     y_pred = model.predict(X_test)
#     accuracy_vector_extra_tree[idx] = accuracy_score(y_test, y_pred)
#
#     print("#####################")
#     print("n_estimators", n_estimators)
#     print("min_samples_split", min_samples_split)
#     print("max_depth", max_depth)
#     print("min_samples_leaf", min_samples_leaf)
#     print("criterion", criterion)
#     print(accuracy_vector_extra_tree[idx])
#
# np.save("accuracy_ExtraTree.npy", accuracy_vector_extra_tree)

In [16]:
# Load the saved grid-search accuracies and find the grid position (one index
# per hyperparameter axis) of the highest accuracy.
accuracy_vector_ExtraTree = np.load("data/accuracy_ExtraTree.npy")
best_flat_position = accuracy_vector_ExtraTree.argmax()
max_index = np.unravel_index(best_flat_position, accuracy_vector_ExtraTree.shape)

In [24]:
# Refit ExtraTrees on the training split with the best grid-search configuration.
# random_state=42 matches the seed used in the grid search; without it the
# randomized trees differ on every run and the reported accuracy is not
# reproducible.
clf = ExtraTreesClassifier(
    n_estimators=n_estimators_list[max_index[0]],
    min_samples_split=min_samples_split_list[max_index[1]],
    max_depth=max_depth_list[max_index[2]],
    min_samples_leaf=min_samples_leaf_list[max_index[3]],
    criterion=criterion_list[max_index[4]],
    random_state=42,
)

ExtraTree = clf.fit(X_train, y_train)

# Predict the labels for the test set
y_pred = ExtraTree.predict(X_test)

# Calculate and display the accuracy
accuracy_score(y_test, y_pred)

0.9479917610710608

# LGBM Classifier

In [25]:
from lightgbm import LGBMClassifier
from sklearn.metrics import accuracy_score

In [26]:
# Hyperparameter grid for LGBMClassifier.
# The position of each value matters: the best configuration is later looked up
# by index in the saved accuracy array, so do not reorder or edit these lists
# without regenerating that array.
num_leaves_list = [10, 30, 50]
n_estimators_list = [25, 50, 100, 150]
learning_rate_list = [0.01, 0.1, 1]
max_depth_list = [5, 10, 20, 30]
lambda_list = [0.0, 1.0, 5.0]
boosting_type_list = ["gbdt"]

# Reference implementation of the exhaustive search that produced the saved
# accuracy array (kept commented out; the results are loaded from disk instead).
# NOTE(review): every configuration is scored on (X_test, y_test), i.e. the same
# split that is later used to report the final accuracy, so the reported number
# is optimistically biased; a validation split or cross-validation would avoid this.
# NOTE(review): the array is saved to "accuracy_LGBM.npy" in the working
# directory, whereas the loading cell reads "data/accuracy_LGBM.npy" -- move
# the file (or adjust the path) after re-running.
#
# grid_axes = (num_leaves_list, n_estimators_list, learning_rate_list,
#              max_depth_list, lambda_list, boosting_type_list)
# accuracy_vector = np.full(tuple(len(axis) for axis in grid_axes), -100.0)
#
# for idx in np.ndindex(accuracy_vector.shape):
#     num_leaves, n_estimators, learning_rate, max_depth, reg_lambda, boosting_type = (
#         axis[i] for axis, i in zip(grid_axes, idx)
#     )
#     clf = LGBMClassifier(num_leaves=num_leaves,
#                          n_estimators=n_estimators,
#                          learning_rate=learning_rate,
#                          max_depth=max_depth,
#                          reg_lambda=reg_lambda,
#                          boosting_type=boosting_type,
#                          random_state=42)
#     model = clf.fit(X_train, y_train)
#
#     # Predict the labels for the test set and record the accuracy
#     y_pred = model.predict(X_test)
#     accuracy_vector[idx] = accuracy_score(y_test, y_pred)
#
#     print("#####################")
#     print("booster ", boosting_type)
#     print("n_estimators ", n_estimators)
#     print("learning rate ", learning_rate)
#     print("max_depth ", max_depth)
#     print("lambda ", reg_lambda)
#     print("num_leaves ", num_leaves)
#     print(accuracy_vector[idx])
#
# np.save("accuracy_LGBM.npy", accuracy_vector)

In [27]:
# Load the saved grid-search accuracies and find the grid position (one index
# per hyperparameter axis) of the highest accuracy.
accuracy_vector_LGBM = np.load("data/accuracy_LGBM.npy")
best_flat_position = accuracy_vector_LGBM.argmax()
max_index = np.unravel_index(best_flat_position, accuracy_vector_LGBM.shape)

In [30]:
# Refit LightGBM on the training split with the best grid-search configuration.
# random_state=42 matches the seed used in the grid search so the refit is
# reproducible; force_col_wise/verbose only silence LightGBM's log output.
clf = LGBMClassifier(
    num_leaves=num_leaves_list[max_index[0]],
    n_estimators=n_estimators_list[max_index[1]],
    learning_rate=learning_rate_list[max_index[2]],
    max_depth=max_depth_list[max_index[3]],
    reg_lambda=lambda_list[max_index[4]],
    boosting_type=boosting_type_list[max_index[5]],
    force_col_wise=True,
    verbose=-1,
    random_state=42,
)

LGBM = clf.fit(X_train, y_train)

# Predict the labels for the test set and display the accuracy
y_pred = LGBM.predict(X_test)
accuracy_score(y_test, y_pred)

0.9495365602471678

## Random forest classifier

In [31]:
from sklearn.ensemble import RandomForestClassifier

In [32]:
# Hyperparameter grid for RandomForestClassifier.
# The position of each value matters: the best configuration is later looked up
# by index in the saved accuracy array, so do not reorder or edit these lists
# without regenerating that array.
# (A stray `data = []` assignment used to sit here; it was unused and overwrote
# the dataset array with an empty list, so it has been removed.)
n_estimators_list = [25, 50, 100, 150]
criterion_list = ["gini", "entropy"]
min_samples_leaf_list = [1, 2, 5, 10]
max_depth_list = [None, 5, 10, 20, 30, 50]

# Reference implementation of the exhaustive search that produced the saved
# accuracy array (kept commented out; the results are loaded from disk instead).
# NOTE(review): every configuration is scored on (X_test, y_test), i.e. the same
# split that is later used to report the final accuracy, so the reported number
# is optimistically biased; a validation split or cross-validation would avoid this.
# NOTE(review): the array is saved to "accuracy_RadomForest.npy" in the working
# directory, whereas the loading cell reads "data/accuracy_RadomForest.npy" --
# move the file (or adjust the path) after re-running.
#
# grid_axes = (n_estimators_list, criterion_list, min_samples_leaf_list, max_depth_list)
# accuracy_vector = np.full(tuple(len(axis) for axis in grid_axes), -100.0)
#
# for idx in np.ndindex(accuracy_vector.shape):
#     n_estimators, criterion, min_samples_leaf, max_depth = (
#         axis[i] for axis, i in zip(grid_axes, idx)
#     )
#     clf = RandomForestClassifier(n_estimators=n_estimators,
#                                  criterion=criterion,
#                                  min_samples_leaf=min_samples_leaf,
#                                  max_depth=max_depth,
#                                  random_state=42)
#     model = clf.fit(X_train, y_train)
#
#     # Predict the labels for the test set and record the accuracy
#     y_pred = model.predict(X_test)
#     accuracy_vector[idx] = accuracy_score(y_test, y_pred)
#
#     print("#####################")
#     print("n_estimators ", n_estimators)
#     print("criterion ", criterion)
#     print("min sample leaf ", min_samples_leaf)
#     print("max_depth ", max_depth)
#     print(accuracy_vector[idx])
#
# np.save("accuracy_RadomForest.npy", accuracy_vector)

In [33]:
# Load the saved grid-search accuracies and find the grid position (one index
# per hyperparameter axis) of the highest accuracy.
accuracy_vector_RandomForest = np.load("data/accuracy_RadomForest.npy")
best_flat_position = accuracy_vector_RandomForest.argmax()
max_index = np.unravel_index(best_flat_position, accuracy_vector_RandomForest.shape)

In [36]:
# Refit the random forest on the training split with the best grid-search
# configuration. random_state=42 matches the seed used in the grid search;
# without it the bootstrapped trees differ on every run and the reported
# accuracy is not reproducible.
clf = RandomForestClassifier(
    n_estimators=n_estimators_list[max_index[0]],
    criterion=criterion_list[max_index[1]],
    min_samples_leaf=min_samples_leaf_list[max_index[2]],
    max_depth=max_depth_list[max_index[3]],
    random_state=42,
)

RF = clf.fit(X_train, y_train)

# Predict the labels for the test set and display the accuracy
y_pred = RF.predict(X_test)
accuracy_score(y_test, y_pred)

0.9392378990731205

## Ensemble method

In [45]:
# Weighted majority vote over the four fitted classifiers.
XGB_pred = XGB.predict(X_test)
ExtraTree_pred = ExtraTree.predict(X_test)
LGBM_pred = LGBM.predict(X_test)
RF_pred = RF.predict(X_test)

# One column per model (assumes each predict() returns a 1-D label array).
predictions = np.stack((XGB_pred, ExtraTree_pred, LGBM_pred, RF_pred), axis=1)
# Sorted distinct labels that any model predicted.
values = np.unique(predictions)

# boolean_check[s, m, k] is True when model m predicted values[k] for sample s.
boolean_check = predictions[:, :, np.newaxis] == values[np.newaxis, np.newaxis, :]
# One weight per model (all equal here), shaped to broadcast over samples and labels.
weights = np.array([1., 1., 1., 1.])[np.newaxis, :, np.newaxis]
# Average the weighted votes over the model axis -> one score per (sample, label).
ensemble_prediction_prob = np.mean(weights * boolean_check.astype(float), axis=-2)

# argmax yields a *position* in `values`, not a class label; map it back so the
# result stays correct even when some class is never predicted by any model.
# Ties resolve to the smallest label, because argmax returns the first maximum.
Y_pred_ensemble = values[np.argmax(ensemble_prediction_prob, axis=1)]

accuracy_score(y_test, Y_pred_ensemble)

0.9505664263645726

# Neural Network

This part of the notebook requires an installation of TensorFlow; follow the instructions at https://www.tensorflow.org/install

In [2]:
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

import tensorflow as tf

In [3]:
# Load the spreadsheet. The first column is dropped; of the remaining columns,
# the last one holds the class label and the others are the features.
raw_table = pd.read_excel("../data/Training_Dataset_2.xlsx")
data = raw_table.values[:, 1:]

# Standardize every feature column to zero mean and unit variance.
# NOTE(review): mu/std are computed over all rows *before* the train/test split,
# so test rows influence the scaling. Left unchanged here because the saved
# network loaded below was presumably trained with this scaling -- confirm
# before changing.
X = data[:, :-1].astype(float)
mu = np.mean(X, axis=0)
std = np.std(X, axis=0)
X = (X - mu) / std

# Encode the class letters as integers. Surrounding whitespace is stripped first
# (the raw data contains "L " as well as "L"), and any label outside the mapping
# raises instead of being passed through silently.
LABEL_TO_INT = {"E": 0, "L": 1, "R": 2, "S": 3, "W": 4}
labels = pd.Series(data[:, -1]).astype(str).str.strip()
y_encoded = labels.map(LABEL_TO_INT)
if y_encoded.isna().any():
    unknown_labels = sorted(set(labels[y_encoded.isna()]))
    raise ValueError(f"Unexpected class labels in the dataset: {unknown_labels}")
y = y_encoded.to_numpy().astype(int)

# Hold out 30% of the rows as the test split (fixed seed for reproducibility).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [4]:
# Reference code for building and training the network (kept commented out;
# the saved model 'NeuralNet.keras' is loaded in a later cell instead).
#
# Build the model: two 4096-unit ReLU layers followed by a 5-way softmax output
# model = tf.keras.Sequential([
#     tf.keras.layers.Dense(4096, activation='relu'),
#     # tf.keras.layers.Dropout(0.3),  # Optional dropout with a rate of 0.3 (you can adjust this value)
#     tf.keras.layers.Dense(4096, activation='relu'),
#     # tf.keras.layers.Dropout(0.3),  # Optional dropout with a rate of 0.3 (you can adjust this value)
#     tf.keras.layers.Dense(5, activation='softmax')
# ])

# # Compile the model (integer class labels -> sparse categorical cross-entropy)
# model.compile(optimizer='adam',
#               loss='sparse_categorical_crossentropy',
#               metrics=['accuracy'])

# # Train the model on the training split only (validation on the test split is disabled)
# model.fit(X_train, y_train, epochs=500) #, validation_data=(X_test, y_test))

# model.save('NeuralNet.keras')

In [6]:
# The trained model file is too large to be uploaded with the notebook;
# contact the authors if you are interested in obtaining it.
model_path = 'NeuralNet.keras'
model = tf.keras.models.load_model(model_path)

In [7]:
# Score the loaded network on the held-out test split and report its accuracy.
evaluation = model.evaluate(X_test, y_test)
test_loss, test_acc = evaluation
print(f"Test accuracy: {test_acc:.4f}")




2024-04-07 17:24:03.622784: I tensorflow/tsl/platform/default/subprocess.cc:304] Start cannot spawn child process: No such file or directory


Test accuracy: 0.9413
