In [2]:
#Imports
import numpy as np
import pandas as pd
import sys, os, random
import importlib
from sklearn.preprocessing import StandardScaler
from sklearn.base import clone
from pathlib import Path


#Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC, NuSVC
# from sklearn.calibration import CalibratedClassifierCV #Not used
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

#Class Import
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), "../.."))) #Allow for imports from src
from src.models import ML_Class_2
importlib.reload(ML_Class_2) #Ensures file is uptodate!
from src.models.ML_Class_2 import Model_Tester_V2

#Utils Import
from src.models.model_specs import MODEL_SPECS
from src.data.cache_preprocessor import build_cache
from src.models.perf_utils import track_performance

#Set Seed
os.environ["PYTHONHASHSEED"] = "1945"
random.seed(1945)
np.random.seed(1945)

In this notebook, ten algorithms are tested and optimized for comparison. Algorithms come from tests did in Algorithm_Test_2.ipynb 


In [3]:
#Complied data of convoys
#Routes examined are HX, SC, OB, ON, ONS
df = pd.read_csv('/Users/matthewplambeck/Desktop/Convoy Predictor/data/processed/Complete_Convoy_Data.csv')
df = df.drop(columns=['Unnamed: 0'])
df.shape #Test

(1174, 21)

In [4]:
#Drop unecessary/redundent features
df = df.drop(columns=['Convoy Number', 'Number of Ships Sunk', 'Depart_Date', 'Arrival/Dispersal Date', 'Number of Escorts Sunk', \
                         'Number of Stragglers Sunk', 'Total Tons of Ships Sunk', 'Escort Sink Percentage', 'Straggler Sink Percentage'])
df.reset_index(drop=True).head(3)

Unnamed: 0,Number of Ships,Number of Escort Ships,Number of Stragglers,Total Tons of Convoy,Overall Sink Percentage,Avg Number of U-Boats in Atlantic,Escort Ratio,Time At Sea (Days),Month,Year,Previous Month Avg Sink %,Approx. Sighting Range
0,5.0,2.0,0.0,22877.0,0.0,6.0,0.4,3.0,9.0,1939.0,0.0,12.1902
1,5.0,2.0,0.0,22967.0,0.0,6.0,0.4,5.0,9.0,1939.0,0.0,12.1902
2,7.0,4.0,0.0,21293.0,0.0,6.0,0.571429,5.0,9.0,1939.0,0.0,14.434062


In [5]:
#Feature Names for later feature analysis:
feature_names = list(df)
feature_names[:-1] #Drop Risk (y)

['Number of Ships',
 'Number of Escort Ships',
 'Number of Stragglers',
 'Total Tons of Convoy',
 'Overall Sink Percentage',
 'Avg Number of U-Boats in Atlantic',
 'Escort Ratio',
 'Time At Sea (Days)',
 'Month',
 'Year',
 'Previous Month Avg Sink %']

In [6]:
#Convert Overall Sink Percentage to binary 1( High)
df['Risk'] = (df['Overall Sink Percentage'] > 0).astype(int) 
#Risk is binary based off whether a ship was sunk while in a convoy:  (0 = No Ships Sunk, 1 = At Least One Ship Sunk)
X = np.array(df.drop(columns=['Overall Sink Percentage', 'Risk'])) #Remove Overall Sink Percentage as it leaks data
y = df['Risk'].values #Prediction value

In [10]:
spec = MODEL_SPECS["gb"]
gb = Model_Tester_V2(
    model=spec["estimator"],
    scaler=StandardScaler(),
    parameter_grid=spec["grid_small"],
    cv_folds=5,
    feature_names=feature_names,
    model_config=spec["config"],)

gb.train_test_split(X, y, train_size=0.8, random_state=1945)

@track_performance("gb_optimize")
def run_gb_opt():
    gb.optimize(scoring="recall")

run_gb_opt()
gb_results = gb.evaluate(show_plots=False)




Best Hyperparameters Found:
{'model__learning_rate': 0.07, 'model__max_depth': 4, 'model__max_features': None, 'model__min_samples_leaf': 4, 'model__min_samples_split': 2, 'model__n_estimators': 600, 'model__subsample': 1.0}
Best Cross-Validation Recall: 0.4383
gb_optimize completed in 73.32s | ΔRSS 1.55 MB | CPU 71.5%
Applied decision threshold: 0.7785

GradientBoostingClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.97      0.89       185
           1       0.68      0.26      0.38        50

    accuracy                           0.82       235
   macro avg       0.76      0.61      0.63       235
weighted avg       0.80      0.82      0.78       235


ROC AUC Score: 0.8004
Matthews Correlation Coefficient (MCC): 0.3416
Balanced Accuracy: 0.6138
GradientBoostingClassifier Confusion Matrix:
[[179   6]
 [ 37  13]]


In [None]:
CACHE_DIR = Path("data/cache")
X_train_path = CACHE_DIR / "X_train.parquet"
y_train_path = CACHE_DIR / "y_train.parquet"

if not X_train_path.exists():
    build_cache(
        input_path=Path("data/processed/features.parquet"),
        target_col="target",
        cache_dir=CACHE_DIR,
        test_size=0.2,
        val_size=0,
        sample_frac=1.0,
        random_state=1945,)

X_train = pd.read_parquet(X_train_path)
y_train = pd.read_parquet(y_train_path)["target"]

spec = MODEL_SPECS["gb"]
gb_model = Model_Tester_V2(
    model=spec["estimator"],
    scaler=StandardScaler(),
    parameter_grid=spec["grid_small"],
    cv_folds=5,
    feature_names=feature_names,
    model_config=spec["config"])

gb_model.X_train = X_train
gb_model.y_train = y_train
gb_model.X_test = pd.read_parquet(CACHE_DIR / "X_test.parquet")
gb_model.y_test = pd.read_parquet(CACHE_DIR / "y_test.parquet")["target"]

@track_performance("gb_optimize")
def run_optimize():
    gb_model.optimize(scoring="recall")

run_optimize()
gb_results = gb_model.evaluate(show_plots=False)


Best Hyperparameters Found:
{'model__learning_rate': 0.07, 'model__max_depth': 4, 'model__max_features': None, 'model__min_samples_leaf': 4, 'model__min_samples_split': 2, 'model__n_estimators': 600, 'model__subsample': 1.0}
Best Cross-Validation Recall: 0.4383
gb_optimize completed in 31.03s | ΔRSS -19.83 MB | CPU 0.0%
Applied decision threshold: 0.7785

GradientBoostingClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.97      0.89       185
           1       0.68      0.26      0.38        50

    accuracy                           0.82       235
   macro avg       0.76      0.61      0.63       235
weighted avg       0.80      0.82      0.78       235


ROC AUC Score: 0.8004
Matthews Correlation Coefficient (MCC): 0.3416
Balanced Accuracy: 0.6138
GradientBoostingClassifier Confusion Matrix:
[[179   6]
 [ 37  13]]


In [24]:
spec = MODEL_SPECS["dt"]
dt = Model_Tester_V2(
    model=spec["estimator"],
    scaler=StandardScaler(),
    parameter_grid=spec["grid_small"],
    cv_folds=5,
    feature_names=feature_names,
    model_config=spec["config"],)

dt.train_test_split(X, y, train_size=0.8, random_state=1945)

@track_perf("dt_optimize")
def run_dt_opt():
    dt.optimize(scoring="recall")

run_dt_opt()
dt_results = dt.evaluate(show_plots=False)


Best Hyperparameters Found:
{'model__class_weight': 'balanced', 'model__max_depth': 3, 'model__max_features': 'sqrt', 'model__min_samples_leaf': 3, 'model__min_samples_split': 2}
Best Cross-Validation Recall: 0.7706
dt_optimize completed in 0.11s | ΔRSS 3.48 MB | CPU 0.0%
Applied decision threshold: 0.8491

DecisionTreeClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.92      0.89       185
           1       0.59      0.40      0.48        50

    accuracy                           0.81       235
   macro avg       0.72      0.66      0.68       235
weighted avg       0.79      0.81      0.80       235


ROC AUC Score: 0.7241
Matthews Correlation Coefficient (MCC): 0.3773
Balanced Accuracy: 0.6622
DecisionTreeClassifier Confusion Matrix:
[[171  14]
 [ 30  20]]
