In [None]:
#Imports
import numpy as np
import pandas as pd
import sys, os, random
import importlib
from sklearn.preprocessing import StandardScaler

#Class Import
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../.."))) #Allow for imports from src
from src.models import ML_Class_2
importlib.reload(ML_Class_2) #Ensures file is uptodate!
from src.models.ML_Class_2 import Model_Tester_V2

#Utils Import
from src.models.model_specs import MODEL_SPECS
from src.models.perf_utils import track_performance

# Reproducibility: one seed value shared by every seeding call below.
# Note: PYTHONHASHSEED is read at interpreter startup, so assigning it here
# does not change hashing in the already-running kernel; it is only inherited
# by child processes started afterwards.
SEED = 1945
os.environ["PYTHONHASHSEED"] = str(SEED)
for _seed_fn in (random.seed, np.random.seed):
    _seed_fn(SEED)

### Models Tested:

| Key | Algorithm | Library |
|:----|:-----------|:---------|
| **dt** | Decision Tree Classifier | scikit-learn |
| **rf** | Random Forest Classifier | scikit-learn |
| **et** | Extra Trees Classifier | scikit-learn |
| **bag** | Bagging Classifier (Tree Base) | scikit-learn |
| **gb** | Gradient Boosting Classifier | scikit-learn |
| **ada** | AdaBoost Classifier | scikit-learn |
| **qda** | Quadratic Discriminant Analysis | scikit-learn |
| **xgb** | XGBoost Classifier | xgboost |
| **lgbm** | LightGBM Classifier | lightgbm |
| **cat** | CatBoost Classifier | catboost |

 **Note:**  
Preliminary algorithm tests were run in Algorithm_Test_2. This notebook performs further optimization and comparison of the models listed above.

In [None]:
# --- Data Loading and Preprocessing ---

# Compiled convoy data; routes examined are HX, SC, OB, ON, ONS.
# NOTE(review): absolute, machine-specific path — consider deriving it from the
# project root so the notebook runs on other machines.
df = pd.read_csv('/Users/matthewplambeck/Desktop/Convoy Predictor/data/processed/Complete_Convoy_Data.csv')
# 'Unnamed: 0' is presumably the index column written when the CSV was saved.
df = df.drop(columns=['Unnamed: 0'])

# Drop unnecessary/redundant features.
df = df.drop(columns=['Convoy Number', 'Number of Ships Sunk', 'Depart_Date', 'Arrival/Dispersal Date', 'Number of Escorts Sunk', \
                         'Number of Stragglers Sunk', 'Total Tons of Ships Sunk', 'Escort Sink Percentage', 'Straggler Sink Percentage'])

# Binary target derived from Overall Sink Percentage:
# 0 = no ships sunk, 1 = at least one ship sunk while in the convoy.
df['Risk'] = (df['Overall Sink Percentage'] > 0).astype(int)

# Build the feature frame once so that feature_names and the columns of X
# cannot drift apart. Previously feature_names was taken from the full frame
# and the slice meant to trim it was never assigned, leaving the
# 'Overall Sink Percentage' column in the name list but not in X.
# Overall Sink Percentage is removed because it leaks the target.
feature_frame = df.drop(columns=['Overall Sink Percentage', 'Risk'])

# Feature names for later feature analysis (one name per column of X).
feature_names = list(feature_frame.columns)
X = np.array(feature_frame)
y = df['Risk'].values  # Prediction value

# Guard against the names and the matrix getting out of sync again.
assert len(feature_names) == X.shape[1]

**Start of Algorithm Tests**

In [None]:
# Decision Tree: build the tester from the "dt" spec, optimize with
# scoring="recall" on the spec's small grid, then evaluate without plots.

spec = MODEL_SPECS["dt"]
dt = Model_Tester_V2(
    model=spec["estimator"],
    parameter_grid=spec["grid_small"],
    model_config=spec["config"],
    feature_names=feature_names,
    cv_folds=5,
)
dt.train_test_split(X, y, train_size=0.8, random_state=1945)


@track_performance("dt_optimize")
def run_dt_opt():
    """Call dt.optimize with recall scoring; tracked under the label "dt_optimize"."""
    dt.optimize(scoring="recall")


run_dt_opt()
dt_results = dt.evaluate(show_plots=False)

In [None]:
# Random Forest: build the tester from the "rf" spec, optimize with
# scoring="recall" on the spec's small grid, then evaluate without plots.

spec = MODEL_SPECS["rf"]
rf = Model_Tester_V2(
    model=spec["estimator"],
    parameter_grid=spec["grid_small"],
    model_config=spec["config"],
    feature_names=feature_names,
    cv_folds=5,
)
rf.train_test_split(X, y, train_size=0.8, random_state=1945)


@track_performance("rf_optimize")
def run_rf_opt():
    """Call rf.optimize with recall scoring; tracked under the label "rf_optimize"."""
    rf.optimize(scoring="recall")


run_rf_opt()
rf_results = rf.evaluate(show_plots=False)

In [None]:
# Extra Trees: build the tester from the "et" spec, optimize with
# scoring="recall" on the spec's small grid, then evaluate without plots.

spec = MODEL_SPECS["et"]
et = Model_Tester_V2(
    model=spec["estimator"],
    parameter_grid=spec["grid_small"],
    model_config=spec["config"],
    feature_names=feature_names,
    cv_folds=5,
)
et.train_test_split(X, y, train_size=0.8, random_state=1945)


@track_performance("et_optimize")
def run_et_opt():
    """Call et.optimize with recall scoring; tracked under the label "et_optimize"."""
    et.optimize(scoring="recall")


run_et_opt()
et_results = et.evaluate(show_plots=False)

In [None]:
# Bagging Classifier: build the tester from the "bag" spec, optimize with
# scoring="recall" on the spec's small grid, then evaluate without plots.

spec = MODEL_SPECS["bag"]
bag = Model_Tester_V2(
    model=spec["estimator"],
    parameter_grid=spec["grid_small"],
    model_config=spec["config"],
    feature_names=feature_names,
    cv_folds=5,
)
bag.train_test_split(X, y, train_size=0.8, random_state=1945)


@track_performance("bag_optimize")
def run_bag_opt():
    """Call bag.optimize with recall scoring; tracked under the label "bag_optimize"."""
    bag.optimize(scoring="recall")


run_bag_opt()
bag_results = bag.evaluate(show_plots=False)

In [None]:
# Gradient Boosting Classifier: build the tester from the "gb" spec, optimize
# with scoring="recall" on the spec's small grid, then evaluate without plots.

spec = MODEL_SPECS["gb"]
gb = Model_Tester_V2(
    model=spec["estimator"],
    parameter_grid=spec["grid_small"],
    model_config=spec["config"],
    feature_names=feature_names,
    cv_folds=5,
)
gb.train_test_split(X, y, train_size=0.8, random_state=1945)


@track_performance("gb_optimize")
def run_gb_opt():
    """Call gb.optimize with recall scoring; tracked under the label "gb_optimize"."""
    gb.optimize(scoring="recall")


run_gb_opt()
gb_results = gb.evaluate(show_plots=False)



In [None]:
# AdaBoost: build the tester from the "ada" spec, optimize with
# scoring="recall" on the spec's small grid, then evaluate without plots.

spec = MODEL_SPECS["ada"]
ada = Model_Tester_V2(
    model=spec["estimator"],
    parameter_grid=spec["grid_small"],
    model_config=spec["config"],
    feature_names=feature_names,
    cv_folds=5,
)
ada.train_test_split(X, y, train_size=0.8, random_state=1945)


@track_performance("ada_optimize")
def run_ada_opt():
    """Call ada.optimize with recall scoring; tracked under the label "ada_optimize"."""
    ada.optimize(scoring="recall")


run_ada_opt()
ada_results = ada.evaluate(show_plots=False)

In [None]:
# Quadratic Discriminant Analysis: build the tester from the "qda" spec,
# optimize with scoring="recall" on the spec's small grid, then evaluate
# without plots. This is the only model in the notebook that is given a
# StandardScaler.

spec = MODEL_SPECS["qda"]
qda = Model_Tester_V2(
    model=spec["estimator"],
    parameter_grid=spec["grid_small"],
    scaler=StandardScaler(),  # the comma was missing here, making the cell a SyntaxError
    cv_folds=5,
    feature_names=feature_names,
    model_config=spec["config"],
)

qda.train_test_split(X, y, train_size=0.8, random_state=1945)


@track_performance("qda_optimize")
def run_qda_opt():
    """Call qda.optimize with recall scoring; tracked under the label "qda_optimize"."""
    qda.optimize(scoring="recall")


run_qda_opt()
qda_results = qda.evaluate(show_plots=False)

In [None]:
# XGBoost: build the tester from the "xgb" spec, optimize with
# scoring="recall" on the spec's small grid, then evaluate without plots.

spec = MODEL_SPECS["xgb"]
xgb = Model_Tester_V2(
    model=spec["estimator"],
    parameter_grid=spec["grid_small"],
    model_config=spec["config"],
    feature_names=feature_names,
    cv_folds=5,
)
xgb.train_test_split(X, y, train_size=0.8, random_state=1945)


@track_performance("xgb_optimize")
def run_xgb_opt():
    """Call xgb.optimize with recall scoring; tracked under the label "xgb_optimize"."""
    xgb.optimize(scoring="recall")


run_xgb_opt()
xgb_results = xgb.evaluate(show_plots=False)

In [None]:
# LightGBM: build the tester from the "lgbm" spec, optimize with
# scoring="recall" on the spec's small grid, then evaluate without plots.

spec = MODEL_SPECS["lgbm"]
lgbm = Model_Tester_V2(
    model=spec["estimator"],
    parameter_grid=spec["grid_small"],
    model_config=spec["config"],
    feature_names=feature_names,
    cv_folds=5,
)
lgbm.train_test_split(X, y, train_size=0.8, random_state=1945)


@track_performance("lgbm_optimize")
def run_lgbm_opt():
    """Call lgbm.optimize with recall scoring; tracked under the label "lgbm_optimize"."""
    lgbm.optimize(scoring="recall")


run_lgbm_opt()
lgbm_results = lgbm.evaluate(show_plots=False)

In [None]:
# CatBoost: build the tester from the "cat" spec, optimize with
# scoring="recall" on the spec's small grid, then evaluate without plots.

spec = MODEL_SPECS["cat"]
cat = Model_Tester_V2(
    model=spec["estimator"],
    parameter_grid=spec["grid_small"],
    model_config=spec["config"],
    feature_names=feature_names,
    cv_folds=5,
)
cat.train_test_split(X, y, train_size=0.8, random_state=1945)


@track_performance("cat_optimize")
def run_cat_opt():
    """Call cat.optimize with recall scoring; tracked under the label "cat_optimize"."""
    cat.optimize(scoring="recall")


run_cat_opt()
cat_results = cat.evaluate(show_plots=False)