In [1]:
#Imports
import numpy as np
import pandas as pd
import sys, os, random
import importlib
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings("ignore", module="skopt") #Ignore scikit-optimize warning print lines
from scipy.linalg import LinAlgWarning
warnings.filterwarnings("ignore", category=LinAlgWarning) #For QDA

#Class Import
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../.."))) #Allow for imports from src
from src.models import ML_Class_2
importlib.reload(ML_Class_2) #Ensures file is uptodate!
from src.models.ML_Class_2 import Model_Tester_V2

#Utils Import
from src.models.model_specs import MODEL_SPECS
from src.models.perf_utils import track_performance

#Set Seed
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so assigning it here
# cannot change hashing in the already-running kernel; it is only inherited by
# processes launched after this point.
os.environ.update({"PYTHONHASHSEED": "1945"})
# Seed the stdlib and the legacy NumPy global generators with the same value.
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(1945)

### Models Tested:

| Key | Algorithm | Library |
|:----|:-----------|:---------|
| **dt** | Decision Tree Classifier | scikit-learn |
| **rf** | Random Forest Classifier | scikit-learn |
| **et** | Extra Trees Classifier | scikit-learn |
| **bag** | Bagging Classifier (Tree Base) | scikit-learn |
| **gb** | Gradient Boosting Classifier | scikit-learn |
| **ada** | AdaBoost Classifier | scikit-learn |
| **qda** | Quadratic Discriminant Analysis | scikit-learn |
| **xgb** | XGBoost Classifier | xgboost |
| **xgbrf** | XGBoost Random Forest | xgboost |
| **lgbm** | LightGBM Classifier | lightgbm |
| **cat** | CatBoost Classifier | catboost |

 **Note:**  
Preliminary algorithm tests were done in Algorithm_Test_2. In this notebook, further optimization and comparison are done!

In [2]:
# --- Data Loading and Preprocessing --- 

#Compiled data of convoys
#Routes examined are HX, SC, OB, ON, ONS
#The CSV location can be overridden with the CONVOY_DATA_PATH environment variable;
#the original absolute path remains the default so existing runs are unaffected.
DATA_PATH = os.environ.get(
    "CONVOY_DATA_PATH",
    '/Users/matthewplambeck/Desktop/Convoy Predictor/data/processed/Complete_Convoy_Data.csv')
df = pd.read_csv(DATA_PATH)
df = df.drop(columns=['Unnamed: 0'])
#Drop unnecessary/redundant features
df = df.drop(columns=['Convoy Number', 'Number of Ships Sunk', 'Depart_Date', 'Arrival/Dispersal Date', 'Number of Escorts Sunk', \
                         'Number of Stragglers Sunk', 'Total Tons of Ships Sunk', 'Escort Sink Percentage', 'Straggler Sink Percentage'])
#Convert Overall Sink Percentage to a binary target.
#Risk is binary based off whether a ship was sunk while in a convoy:  (0 = No Ships Sunk, 1 = At Least One Ship Sunk)
df['Risk'] = (df['Overall Sink Percentage'] > 0).astype(int)
#Feature frame: drop the target and Overall Sink Percentage (it leaks the target)
feature_df = df.drop(columns=['Overall Sink Percentage', 'Risk'])
#Feature names for later feature analysis.
#Taken from the same frame as X so the names line up one-to-one with X's columns
#(previously list(df) still contained 'Overall Sink Percentage', and the [:-1] slice was never assigned).
feature_names = list(feature_df.columns)
X = np.array(feature_df)
y = df['Risk'].values #Prediction value
assert len(feature_names) == X.shape[1], "feature_names must match the columns of X"

**Start of Algorithm Tests**

In [None]:
#Decision Tree
# Halving search over the large grid, scored on recall.

spec = MODEL_SPECS["dt"]
dt_settings = {
    "model": spec["estimator"],
    "parameter_grid": spec["grid_large"],
    "cv_folds": 5,
    "feature_names": feature_names,
    "model_config": spec["config"],
}
dt = Model_Tester_V2(**dt_settings)

# Hold out 20% of the rows; the seed keeps the split repeatable.
dt.train_test_split(X, y, train_size=0.8, random_state=1945)


def run_dt_opt():
    """Run the tester's halving hyperparameter search for the decision tree (recall)."""
    dt.optimize(scoring="recall", method='halving')


# Same effect as decorating the def: the call is reported under "dt_optimize".
run_dt_opt = track_performance("dt_optimize")(run_dt_opt)

run_dt_opt()
dt_results = dt.evaluate(show_plots=False)


Optimization Method: Halving
Best Hyperparameters Found:
{'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 4, 'max_features': 'sqrt', 'min_samples_leaf': 2, 'min_samples_split': 2}
Best Cross-Validation Recall: 0.6559

Performance Stats:
dt_optimize completed in 9.83s | ΔRSS -151.78 MB | CPU 74.6%

Applied decision threshold: 0.5949

DecisionTreeClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.73      0.80       185
           1       0.40      0.66      0.50        50

    accuracy                           0.71       235
   macro avg       0.64      0.69      0.65       235
weighted avg       0.78      0.71      0.74       235


ROC AUC Score: 0.7449
Matthews Correlation Coefficient (MCC): 0.3337
Balanced Accuracy: 0.6949
DecisionTreeClassifier Confusion Matrix:
[[135  50]
 [ 17  33]]


In [4]:
#Random Forest
# Halving search over the large grid, scored on recall.

spec = MODEL_SPECS["rf"]
rf_estimator, rf_grid, rf_config = spec["estimator"], spec["grid_large"], spec["config"]
rf = Model_Tester_V2(
    model=rf_estimator,
    parameter_grid=rf_grid,
    cv_folds=5,
    feature_names=feature_names,
    model_config=rf_config,
)

# 80/20 train/test split with a fixed seed.
rf.train_test_split(X, y, train_size=0.8, random_state=1945)


def run_rf_opt():
    """Run the tester's halving hyperparameter search for the random forest (recall)."""
    rf.optimize(scoring="recall", method='halving')


# Wrap after definition; equivalent to using @track_performance("rf_optimize").
run_rf_opt = track_performance("rf_optimize")(run_rf_opt)

run_rf_opt()
rf_results = rf.evaluate(show_plots=False)


Optimization Method: Halving
Best Hyperparameters Found:
{'class_weight': 'balanced_subsample', 'max_depth': None, 'max_features': None, 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 800}
Best Cross-Validation Recall: 0.5608

Performance Stats:
rf_optimize completed in 715.00s | ΔRSS -61.34 MB | CPU 88.2%

Applied decision threshold: 0.5057

RandomForestClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.91      0.90       185
           1       0.64      0.60      0.62        50

    accuracy                           0.84       235
   macro avg       0.77      0.75      0.76       235
weighted avg       0.84      0.84      0.84       235


ROC AUC Score: 0.8128
Matthews Correlation Coefficient (MCC): 0.5199
Balanced Accuracy: 0.7541
RandomForestClassifier Confusion Matrix:
[[168  17]
 [ 20  30]]


In [5]:
#Extra Trees
# Halving search over the large grid, scored on recall.

spec = MODEL_SPECS["et"]
et_settings = dict(
    model=spec["estimator"],
    parameter_grid=spec["grid_large"],
    cv_folds=5,
    feature_names=feature_names,
    model_config=spec["config"],
)
et = Model_Tester_V2(**et_settings)

# Hold out 20% of the rows; the seed keeps the split repeatable.
et.train_test_split(X, y, train_size=0.8, random_state=1945)


def run_et_opt():
    """Run the tester's halving hyperparameter search for extra trees (recall)."""
    et.optimize(scoring="recall", method='halving')


# Same effect as decorating the def: the call is reported under "et_optimize".
run_et_opt = track_performance("et_optimize")(run_et_opt)

run_et_opt()
et_results = et.evaluate(show_plots=False)


Optimization Method: Halving
Best Hyperparameters Found:
{'class_weight': 'balanced', 'max_depth': 14, 'max_features': None, 'min_samples_leaf': 4, 'n_estimators': 900}
Best Cross-Validation Recall: 0.6270

Performance Stats:
et_optimize completed in 343.88s | ΔRSS 111.30 MB | CPU 90.3%

Applied decision threshold: 0.5968

ExtraTreesClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.92      0.91       185
           1       0.69      0.62      0.65        50

    accuracy                           0.86       235
   macro avg       0.79      0.77      0.78       235
weighted avg       0.86      0.86      0.86       235


ROC AUC Score: 0.8123
Matthews Correlation Coefficient (MCC): 0.5662
Balanced Accuracy: 0.7722
ExtraTreesClassifier Confusion Matrix:
[[171  14]
 [ 19  31]]


In [6]:
#Bagging Classifier
# Halving search over the large grid, scored on recall.

spec = MODEL_SPECS["bag"]
bag_estimator, bag_grid, bag_config = spec["estimator"], spec["grid_large"], spec["config"]
bag = Model_Tester_V2(
    model=bag_estimator,
    parameter_grid=bag_grid,
    cv_folds=5,
    feature_names=feature_names,
    model_config=bag_config,
)

# 80/20 train/test split with a fixed seed.
bag.train_test_split(X, y, train_size=0.8, random_state=1945)


def run_bag_opt():
    """Run the tester's halving hyperparameter search for the bagging classifier (recall)."""
    bag.optimize(scoring="recall", method='halving')


# Wrap after definition; equivalent to using @track_performance("bag_optimize").
run_bag_opt = track_performance("bag_optimize")(run_bag_opt)

run_bag_opt()
bag_results = bag.evaluate(show_plots=False)


Optimization Method: Halving
Best Hyperparameters Found:
{'bootstrap': True, 'bootstrap_features': False, 'estimator__max_depth': 6, 'estimator__min_samples_leaf': 5, 'max_features': 0.6, 'max_samples': 0.9, 'n_estimators': 200}
Best Cross-Validation Recall: 0.5811

Performance Stats:
bag_optimize completed in 433.06s | ΔRSS -128.44 MB | CPU 78.1%

Applied decision threshold: 0.5320

BaggingClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.87      0.88       185
           1       0.56      0.62      0.59        50

    accuracy                           0.82       235
   macro avg       0.73      0.75      0.74       235
weighted avg       0.82      0.82      0.82       235


ROC AUC Score: 0.8214
Matthews Correlation Coefficient (MCC): 0.4739
Balanced Accuracy: 0.7451
BaggingClassifier Confusion Matrix:
[[161  24]
 [ 19  31]]


In [7]:
#Gradient Boosting Classifier
# Halving search over the large grid, scored on recall.

spec = MODEL_SPECS["gb"]
gb_settings = {
    "model": spec["estimator"],
    "parameter_grid": spec["grid_large"],
    "cv_folds": 5,
    "feature_names": feature_names,
    "model_config": spec["config"],
}
gb = Model_Tester_V2(**gb_settings)

# Hold out 20% of the rows; the seed keeps the split repeatable.
gb.train_test_split(X, y, train_size=0.8, random_state=1945)


def run_gb_opt():
    """Run the tester's halving hyperparameter search for gradient boosting (recall)."""
    gb.optimize(scoring="recall", method='halving')


# Same effect as decorating the def: the call is reported under "gb_optimize".
run_gb_opt = track_performance("gb_optimize")(run_gb_opt)

run_gb_opt()
gb_results = gb.evaluate(show_plots=False)




Optimization Method: Halving
Best Hyperparameters Found:
{'learning_rate': 0.08, 'max_depth': 4, 'max_features': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 1200, 'subsample': 1.0}
Best Cross-Validation Recall: 0.4355

Performance Stats:
gb_optimize completed in 3172.70s | ΔRSS 1.09 MB | CPU 86.5%

Applied decision threshold: 0.9907

GradientBoostingClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.82      0.98      0.89       185
           1       0.71      0.20      0.31        50

    accuracy                           0.81       235
   macro avg       0.77      0.59      0.60       235
weighted avg       0.80      0.81      0.77       235


ROC AUC Score: 0.7845
Matthews Correlation Coefficient (MCC): 0.3084
Balanced Accuracy: 0.5892
GradientBoostingClassifier Confusion Matrix:
[[181   4]
 [ 40  10]]


In [None]:
#AdaBoost
# Halving search over the large grid, scored on recall.

spec = MODEL_SPECS["ada"]
ada_estimator, ada_grid, ada_config = spec["estimator"], spec["grid_large"], spec["config"]
ada = Model_Tester_V2(
    model=ada_estimator,
    parameter_grid=ada_grid,
    cv_folds=5,
    feature_names=feature_names,
    model_config=ada_config,
)

# 80/20 train/test split with a fixed seed.
ada.train_test_split(X, y, train_size=0.8, random_state=1945)


def run_ada_opt():
    """Run the tester's halving hyperparameter search for AdaBoost (recall)."""
    ada.optimize(scoring="recall", method='halving')


# Wrap after definition; equivalent to using @track_performance("ada_optimize").
run_ada_opt = track_performance("ada_optimize")(run_ada_opt)

run_ada_opt()
ada_results = ada.evaluate(show_plots=False)


Optimization Method: Halving
Best Hyperparameters Found:
{'learning_rate': 0.1, 'n_estimators': 600}
Best Cross-Validation Recall: 0.1801
Performance Stats:
ada_optimize completed in 0.11mins | ΔRSS -109.12 MB | CPU 62.8%
Applied decision threshold: 0.3826

AdaBoostClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.88      0.88       185
           1       0.57      0.58      0.57        50

    accuracy                           0.82       235
   macro avg       0.73      0.73      0.73       235
weighted avg       0.82      0.82      0.82       235


ROC AUC Score: 0.7702
Matthews Correlation Coefficient (MCC): 0.4578
Balanced Accuracy: 0.7305
AdaBoostClassifier Confusion Matrix:
[[163  22]
 [ 21  29]]


In [None]:
#QuadraticDiscriminantAnalysis
# Halving search over the large grid, scored on recall.
# This is the only tester in the notebook that is also given a StandardScaler.

spec = MODEL_SPECS["qda"]
qda_settings = dict(
    model=spec["estimator"],
    parameter_grid=spec["grid_large"],
    scaler=StandardScaler(),
    cv_folds=5,
    feature_names=feature_names,
    model_config=spec["config"],
)
qda = Model_Tester_V2(**qda_settings)

# Hold out 20% of the rows; the seed keeps the split repeatable.
qda.train_test_split(X, y, train_size=0.8, random_state=1945)


def run_qda_opt():
    """Run the tester's halving hyperparameter search for QDA (recall)."""
    qda.optimize(scoring="recall", method='halving')


# Same effect as decorating the def: the call is reported under "qda_optimize".
run_qda_opt = track_performance("qda_optimize")(run_qda_opt)

run_qda_opt()
qda_results = qda.evaluate(show_plots=False)


Optimization Method: Halving
Best Hyperparameters Found:
{'model__reg_param': 0.01}
Best Cross-Validation Recall: 0.5549

Performance Stats:
qda_optimize completed in 3.56s | ΔRSS 4.98 MB | CPU 59.9%

Applied decision threshold: 0.4619

QuadraticDiscriminantAnalysis Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.82      0.86       185
           1       0.49      0.66      0.56        50

    accuracy                           0.78       235
   macro avg       0.70      0.74      0.71       235
weighted avg       0.81      0.78      0.79       235


ROC AUC Score: 0.7843
Matthews Correlation Coefficient (MCC): 0.4317
Balanced Accuracy: 0.7381
QuadraticDiscriminantAnalysis Confusion Matrix:
[[151  34]
 [ 17  33]]


In [11]:
#XGBoost
# Halving search over the large grid, scored on recall.

spec = MODEL_SPECS["xgb"]
xgb_estimator, xgb_grid_spec, xgb_config = spec["estimator"], spec["grid_large"], spec["config"]
xgb = Model_Tester_V2(
    model=xgb_estimator,
    parameter_grid=xgb_grid_spec,
    cv_folds=5,
    feature_names=feature_names,
    model_config=xgb_config,
)

# 80/20 train/test split with a fixed seed.
xgb.train_test_split(X, y, train_size=0.8, random_state=1945)

# The grid may be supplied as a callable; if so, build it from the training labels.
# (presumably this derives scale_pos_weight from the class balance -- confirm in model_specs)
xgb_grid = xgb.parameter_grid
if callable(xgb_grid):
    xgb.parameter_grid = xgb_grid(xgb.y_train)


def run_xgb_opt():
    """Run the tester's halving hyperparameter search for XGBoost (recall)."""
    xgb.optimize(scoring="recall", method='halving')


# Wrap after definition; equivalent to using @track_performance("xgb_optimize").
run_xgb_opt = track_performance("xgb_optimize")(run_xgb_opt)

run_xgb_opt()
xgb_results = xgb.evaluate(show_plots=False)


Optimization Method: Halving
Best Hyperparameters Found:
{'colsample_bytree': 0.85, 'gamma': 0.3, 'learning_rate': 0.03, 'max_depth': 4, 'min_child_weight': 1, 'reg_alpha': 0.001, 'reg_lambda': 0.5, 'scale_pos_weight': 5.507462686567164, 'subsample': 1.0}
Best Cross-Validation Recall: 0.5849

Performance Stats:
xgb_optimize completed in 4895.65s | ΔRSS 62.39 MB | CPU 89.8%

Applied decision threshold: 0.6786

XGBClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.92      0.88       185
           1       0.58      0.38      0.46        50

    accuracy                           0.81       235
   macro avg       0.71      0.65      0.67       235
weighted avg       0.79      0.81      0.79       235


ROC AUC Score: 0.7670
Matthews Correlation Coefficient (MCC): 0.3585
Balanced Accuracy: 0.6522
XGBClassifier Confusion Matrix:
[[171  14]
 [ 31  19]]


In [12]:
#XGBoost Random Forest
# Halving search over the large grid, scored on recall.
# XGBRFClassifier is not referenced in this cell; the import is kept in case later cells rely on it.
from xgboost import XGBRFClassifier

spec = MODEL_SPECS["xgbrf"]
xgbrf_settings = {
    "model": spec["estimator"],
    "parameter_grid": spec["grid_large"],
    "cv_folds": 5,
    "feature_names": feature_names,
    "model_config": spec["config"],
}
xgbrf = Model_Tester_V2(**xgbrf_settings)

# Hold out 20% of the rows; the seed keeps the split repeatable.
xgbrf.train_test_split(X, y, train_size=0.8, random_state=1945)

# The grid may be supplied as a callable; if so, build it from the training labels.
xgbrf_grid = xgbrf.parameter_grid
if callable(xgbrf_grid):
    xgbrf.parameter_grid = xgbrf_grid(xgbrf.y_train)


def run_xgbrf_opt():
    """Run the tester's halving hyperparameter search for the XGBoost random forest (recall)."""
    xgbrf.optimize(scoring="recall", method='halving')


# Same effect as decorating the def: the call is reported under "xgbrf_optimize".
run_xgbrf_opt = track_performance("xgbrf_optimize")(run_xgbrf_opt)

run_xgbrf_opt()
xgbrf_results = xgbrf.evaluate(show_plots=False)


Optimization Method: Halving
Best Hyperparameters Found:
{'colsample_bynode': 0.6, 'colsample_bytree': 0.6, 'max_depth': 3, 'min_child_weight': 6, 'n_estimators': 400, 'reg_alpha': 0.01, 'reg_lambda': 3.0, 'scale_pos_weight': 5.507462686567164, 'subsample': 0.6}
Best Cross-Validation Recall: 0.8730

Performance Stats:
xgbrf_optimize completed in 3400.64s | ΔRSS 18.23 MB | CPU 92.1%

Applied decision threshold: 0.6253

XGBRFClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.85      0.87       185
           1       0.53      0.64      0.58        50

    accuracy                           0.80       235
   macro avg       0.72      0.74      0.73       235
weighted avg       0.82      0.80      0.81       235


ROC AUC Score: 0.7942
Matthews Correlation Coefficient (MCC): 0.4586
Balanced Accuracy: 0.7443
XGBRFClassifier Confusion Matrix:
[[157  28]
 [ 18  32]]


In [13]:
#LightGBM
# Halving search over the large grid, scored on recall.
from sklearn.base import clone

spec = MODEL_SPECS["lgbm"]

# The last recorded run of this cell failed on every fit with
#   [LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
# LightGBM expects an integer verbosity, so a boolean `verbose` on the estimator
# is translated to its integer equivalent (-1 = silent, 1 = info) on a clone,
# leaving the shared MODEL_SPECS entry untouched.
# NOTE(review): if the boolean actually comes from spec["config"] or the grid,
# it must be fixed in src/models/model_specs.py instead -- confirm.
lgbm_estimator = spec["estimator"]
if hasattr(lgbm_estimator, "get_params") and not isinstance(lgbm_estimator, type):
    lgbm_verbose = lgbm_estimator.get_params().get("verbose")
    if isinstance(lgbm_verbose, bool):
        lgbm_estimator = clone(lgbm_estimator).set_params(verbose=1 if lgbm_verbose else -1)

lgbm = Model_Tester_V2(
    model=lgbm_estimator,
    parameter_grid=spec["grid_large"],
    cv_folds=5,
    feature_names=feature_names,
    model_config=spec["config"],)

# 80/20 train/test split with a fixed seed.
lgbm.train_test_split(X, y, train_size=0.8, random_state=1945)
# The grid may be supplied as a callable; if so, build it from the training labels.
if callable(lgbm.parameter_grid):
    lgbm.parameter_grid = lgbm.parameter_grid(lgbm.y_train)

@track_performance("lgbm_optimize")
def run_lgbm_opt():
    """Run the tester's halving hyperparameter search for LightGBM (recall)."""
    lgbm.optimize(scoring="recall", method='halving')

run_lgbm_opt()
lgbm_results = lgbm.evaluate(show_plots=False)

[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got "False"
[LightGBM] [Fatal] Parameter verbose should be of type int, got 

KeyboardInterrupt: 

In [14]:
#CatBoost
# Halving search over the large grid, scored on recall.

spec = MODEL_SPECS["cat"]
cat_estimator, cat_grid_spec, cat_config = spec["estimator"], spec["grid_large"], spec["config"]
cat = Model_Tester_V2(
    model=cat_estimator,
    parameter_grid=cat_grid_spec,
    cv_folds=5,
    feature_names=feature_names,
    model_config=cat_config,
)

# 80/20 train/test split with a fixed seed.
cat.train_test_split(X, y, train_size=0.8, random_state=1945)

# The grid may be supplied as a callable; if so, build it from the training labels.
cat_grid = cat.parameter_grid
if callable(cat_grid):
    cat.parameter_grid = cat_grid(cat.y_train)


def run_cat_opt():
    """Run the tester's halving hyperparameter search for CatBoost (recall)."""
    cat.optimize(scoring="recall", method='halving')


# Wrap after definition; equivalent to using @track_performance("cat_optimize").
run_cat_opt = track_performance("cat_optimize")(run_cat_opt)

run_cat_opt()
cat_results = cat.evaluate(show_plots=False)


Optimization Method: Halving
Best Hyperparameters Found:
{'bagging_temperature': 1.0, 'border_count': 254, 'depth': 8, 'l2_leaf_reg': 6.0, 'learning_rate': 0.08, 'scale_pos_weight': 5.507462686567164}
Best Cross-Validation Recall: 0.4402

Performance Stats:
cat_optimize completed in 4432.16s | ΔRSS -220.16 MB | CPU 88.5%

Applied decision threshold: 0.3898

CatBoostClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.91      0.88       185
           1       0.57      0.42      0.48        50

    accuracy                           0.81       235
   macro avg       0.71      0.67      0.68       235
weighted avg       0.79      0.81      0.80       235


ROC AUC Score: 0.7717
Matthews Correlation Coefficient (MCC): 0.3748
Balanced Accuracy: 0.6668
CatBoostClassifier Confusion Matrix:
[[169  16]
 [ 29  21]]
