## Notebook to test an ensemble learning model built from the top 3-5 models from Algorithm_Test_3

In [2]:
#Imports
import numpy as np
import pandas as pd
import sys, os, random
import importlib
from sklearn.preprocessing import StandardScaler
from pathlib import Path
from datetime import datetime
import json
import joblib

#Warning Suppression
#Filters are installed once here so the evaluation cells below print only model metrics.
import warnings
warnings.filterwarnings("ignore", module="skopt") #Ignore scikit-optimize warning print lines
from scipy.linalg import LinAlgWarning
warnings.filterwarnings("ignore", category=LinAlgWarning) #For QDA
#NOTE(review): no module filter is given, so this hides every UserWarning raised in the
#kernel, not only the boosting-library ones it was added for.
warnings.filterwarnings("ignore", category=UserWarning) #For LightBoost (presumably LightGBM -- confirm)

#Class Import
#The directory two levels above the current working directory is added to sys.path
#so that the `src` package can be imported from this notebook.
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../..")))
from src.models import ML_Class_2
importlib.reload(ML_Class_2) #Re-import the module so edits made to the file are picked up without a kernel restart
from src.models.ML_Class_2 import Model_Tester_V2

#Utils Import
from src.models.model_artifacts import (get_artifact_dir, load_model, load_models, save_model, save_models,)
#Directory passed to load_model(...) in the model cells below to fetch the saved models.
ARTIFACT_DIR = get_artifact_dir("algorithm_test_3")
from src.models import model_specs
importlib.reload(model_specs) #Re-import the module so edits made to the file are picked up without a kernel restart
from src.models.model_specs import MODEL_SPECS
from src.models.perf_utils import track_performance

#Set Seed
#One constant feeds every seeding call so the value cannot drift between them.
SEED = 1945
random.seed(SEED)     #Python's built-in RNG
np.random.seed(SEED)  #NumPy's legacy global RNG
#NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so setting it here
#is only seen by processes launched afterwards, not by the running kernel -- confirm
#this is the intent.
os.environ["PYTHONHASHSEED"] = str(SEED)

In [3]:
# --- Data Loading and Preprocessing ---

# Compiled convoy data. Routes examined are HX, SC, OB, ON and ONS.
# The CSV is resolved relative to the project root (two levels above the working
# directory -- the same root this notebook appends to sys.path for the `src` imports)
# rather than through a machine-specific absolute path.
# NOTE(review): assumes data/processed/ sits next to src/ under that root -- confirm.
PROJECT_ROOT = Path(os.path.abspath(os.path.join(os.getcwd(), "../..")))
DATA_PATH = PROJECT_ROOT / "data" / "processed" / "Complete_Convoy_Data.csv"
df = pd.read_csv(DATA_PATH)
df = df.drop(columns=['Unnamed: 0'])  # index column left over from the CSV export

# Drop unnecessary/redundant columns (identifiers, dates and per-convoy loss figures).
df = df.drop(columns=['Convoy Number', 'Number of Ships Sunk', 'Depart_Date', 'Arrival/Dispersal Date', 'Number of Escorts Sunk',
                      'Number of Stragglers Sunk', 'Total Tons of Ships Sunk', 'Escort Sink Percentage', 'Straggler Sink Percentage'])

# Binary target derived from Overall Sink Percentage:
#   0 = no ships sunk, 1 = at least one ship sunk while in the convoy.
df['Risk'] = (df['Overall Sink Percentage'] > 0).astype(int)

# Feature matrix: everything except the target and the column it was derived from
# (Overall Sink Percentage would leak the label).
feature_frame = df.drop(columns=['Overall Sink Percentage', 'Risk'])

# Feature names for later feature analysis. They are taken from the exact frame that
# becomes X so that names and columns line up one-to-one; previously the list was
# captured before the target column was removed and the trimming slice was discarded.
feature_names = list(feature_frame.columns)

X = feature_frame.to_numpy()
y = df['Risk'].values  # Prediction value

# Guard against the names and the matrix drifting apart again.
assert len(feature_names) == X.shape[1], "feature_names must match the columns of X"

In [None]:
# --- Extra Trees ---

# Pull the Extra Trees entry (estimator, large parameter grid, config) from the shared spec table.
spec = MODEL_SPECS["et"]

# Wrap the estimator in the project's tester class.
et = Model_Tester_V2(
    model=spec["estimator"],
    model_config=spec["config"],
    parameter_grid=spec["grid_large"],
    feature_names=feature_names,
    cv_folds=5,
)

# Same split settings (train_size=0.8, seed 1945) as every other model cell in this notebook.
et.train_test_split(X, y, random_state=1945, train_size=0.8)

# Load the trained "et" model from ARTIFACT_DIR onto the tester, then evaluate without plots.
load_model("et", assign_to=et, directory=ARTIFACT_DIR)
et_results = et.evaluate(show_plots=False)

Applied decision threshold: 0.5968

ExtraTreesClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.92      0.91       185
           1       0.69      0.62      0.65        50

    accuracy                           0.86       235
   macro avg       0.79      0.77      0.78       235
weighted avg       0.86      0.86      0.86       235


ROC AUC Score: 0.8123
Matthews Correlation Coefficient (MCC): 0.5662
Balanced Accuracy: 0.7722
ExtraTreesClassifier Confusion Matrix:
[[171  14]
 [ 19  31]]


In [6]:
# --- Random Forest ---

# Pull the Random Forest entry (estimator, large parameter grid, config) from the shared spec table.
spec = MODEL_SPECS["rf"]

# Wrap the estimator in the project's tester class.
rf = Model_Tester_V2(
    model=spec["estimator"],
    model_config=spec["config"],
    parameter_grid=spec["grid_large"],
    feature_names=feature_names,
    cv_folds=5,
)

# Same split settings (train_size=0.8, seed 1945) as every other model cell in this notebook.
rf.train_test_split(X, y, random_state=1945, train_size=0.8)

# Load the trained "rf" model from ARTIFACT_DIR onto the tester, then evaluate without plots.
load_model("rf", assign_to=rf, directory=ARTIFACT_DIR)
rf_results = rf.evaluate(show_plots=False)

Applied decision threshold: 0.5057

RandomForestClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.91      0.90       185
           1       0.64      0.60      0.62        50

    accuracy                           0.84       235
   macro avg       0.77      0.75      0.76       235
weighted avg       0.84      0.84      0.84       235


ROC AUC Score: 0.8128
Matthews Correlation Coefficient (MCC): 0.5199
Balanced Accuracy: 0.7541
RandomForestClassifier Confusion Matrix:
[[168  17]
 [ 20  30]]


In [7]:
# --- GradientBoosting ---

# Pull the Gradient Boosting entry (estimator, large parameter grid, config) from the shared spec table.
spec = MODEL_SPECS["gb"]

# Wrap the estimator in the project's tester class.
gb = Model_Tester_V2(
    model=spec["estimator"],
    model_config=spec["config"],
    parameter_grid=spec["grid_large"],
    feature_names=feature_names,
    cv_folds=5,
)

# Same split settings (train_size=0.8, seed 1945) as every other model cell in this notebook.
gb.train_test_split(X, y, random_state=1945, train_size=0.8)

# Load the trained "gb" model from ARTIFACT_DIR onto the tester, then evaluate without plots.
# NOTE(review): the saved output of this cell reports an applied decision threshold of
# 0.9907 and a class-1 recall of 0.20 -- confirm the stored threshold is intended.
load_model("gb", assign_to=gb, directory=ARTIFACT_DIR)
gb_results = gb.evaluate(show_plots=False)

Applied decision threshold: 0.9907

GradientBoostingClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.98      0.89       185
           1       0.71      0.20      0.31        50

    accuracy                           0.81       235
   macro avg       0.77      0.59      0.60       235
weighted avg       0.80      0.81      0.77       235


ROC AUC Score: 0.7845
Matthews Correlation Coefficient (MCC): 0.3084
Balanced Accuracy: 0.5892
GradientBoostingClassifier Confusion Matrix:
[[181   4]
 [ 40  10]]


In [9]:
# --- AdaBoostClassifier ---

# Note: compare against GB once the new parameter grid has been fit.

# Pull the AdaBoost entry (estimator, large parameter grid, config) from the shared spec table.
spec = MODEL_SPECS["ada"]

# Wrap the estimator in the project's tester class.
ada = Model_Tester_V2(
    model=spec["estimator"],
    model_config=spec["config"],
    parameter_grid=spec["grid_large"],
    feature_names=feature_names,
    cv_folds=5,
)

# Same split settings (train_size=0.8, seed 1945) as every other model cell in this notebook.
ada.train_test_split(X, y, random_state=1945, train_size=0.8)

# Load the trained "ada" model from ARTIFACT_DIR onto the tester, then evaluate without plots.
load_model("ada", assign_to=ada, directory=ARTIFACT_DIR)
ada_results = ada.evaluate(show_plots=False)

Applied decision threshold: 0.6515

AdaBoostClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.86      0.88       185
           1       0.54      0.62      0.58        50

    accuracy                           0.81       235
   macro avg       0.72      0.74      0.73       235
weighted avg       0.82      0.81      0.81       235


ROC AUC Score: 0.7818
Matthews Correlation Coefficient (MCC): 0.4578
Balanced Accuracy: 0.7397
AdaBoostClassifier Confusion Matrix:
[[159  26]
 [ 19  31]]


In [8]:
# --- QuadraticDiscriminantAnalysis ---

# Pull the QDA entry (estimator, large parameter grid, config) from the shared spec table.
spec = MODEL_SPECS["qda"]

# Wrap the estimator in the project's tester class.
qda = Model_Tester_V2(
    model=spec["estimator"],
    model_config=spec["config"],
    parameter_grid=spec["grid_large"],
    feature_names=feature_names,
    cv_folds=5,
)

# Same split settings (train_size=0.8, seed 1945) as every other model cell in this notebook.
qda.train_test_split(X, y, random_state=1945, train_size=0.8)

# Load the trained "qda" model from ARTIFACT_DIR onto the tester, then evaluate without plots.
load_model("qda", assign_to=qda, directory=ARTIFACT_DIR)
qda_results = qda.evaluate(show_plots=False)

Applied decision threshold: 0.4781

QuadraticDiscriminantAnalysis Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.91      0.79      0.85       185
           1       0.48      0.70      0.57        50

    accuracy                           0.77       235
   macro avg       0.69      0.75      0.71       235
weighted avg       0.82      0.77      0.79       235


ROC AUC Score: 0.7897
Matthews Correlation Coefficient (MCC): 0.4374
Balanced Accuracy: 0.7473
QuadraticDiscriminantAnalysis Confusion Matrix:
[[147  38]
 [ 15  35]]


In [10]:
# --- XGBRFClassifier ---

# Pull the XGBRF entry (estimator, large parameter grid, config) from the shared spec table.
spec = MODEL_SPECS["xgbrf"]

# Wrap the estimator in the project's tester class.
xgbrf = Model_Tester_V2(
    model=spec["estimator"],
    model_config=spec["config"],
    parameter_grid=spec["grid_large"],
    feature_names=feature_names,
    cv_folds=5,
)

# Same split settings (train_size=0.8, seed 1945) as every other model cell in this notebook.
xgbrf.train_test_split(X, y, random_state=1945, train_size=0.8)

# Load the trained "xgbrf" model from ARTIFACT_DIR onto the tester, then evaluate without plots.
load_model("xgbrf", assign_to=xgbrf, directory=ARTIFACT_DIR)
xgbrf_results = xgbrf.evaluate(show_plots=False)

Applied decision threshold: 0.6253

XGBRFClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.85      0.87       185
           1       0.53      0.64      0.58        50

    accuracy                           0.80       235
   macro avg       0.72      0.74      0.73       235
weighted avg       0.82      0.80      0.81       235


ROC AUC Score: 0.7942
Matthews Correlation Coefficient (MCC): 0.4586
Balanced Accuracy: 0.7443
XGBRFClassifier Confusion Matrix:
[[157  28]
 [ 18  32]]


In [None]:
#TODO: Create an ensemble model from the top 3-5 models evaluated above