Updated ML_Class_1.py, so rerunning the classification algorithm comparison:

In [29]:
#Imports
import numpy as np
import pandas as pd
import sys, os, random
import importlib
from sklearn.preprocessing import StandardScaler

#Algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC, SVC, NuSVC
from sklearn.calibration import CalibratedClassifierCV
from sklearn.neighbors import KNeighborsClassifier, RadiusNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, BaggingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB, BernoulliNB, ComplementNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.linear_model import SGDClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier

#Class Import
# Make the directory two levels above the current working directory importable
# so that `src` resolves. The membership check keeps sys.path from accumulating
# a duplicate entry every time this cell is re-run.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if project_root not in sys.path:
    sys.path.append(project_root)
from src.models import ML_Class_1
importlib.reload(ML_Class_1)  # Re-execute the module so edits to ML_Class_1.py are picked up
from src.models.ML_Class_1 import Model_Tester  # Re-import after the reload so the name binds to the fresh class

#Set Seed
SEED = 1945  # Single source for the seed value used below
# NOTE: the interpreter reads PYTHONHASHSEED at startup, so assigning it here
# does not change hashing in the already-running kernel; it is only inherited
# by child processes started afterwards.
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)


In [19]:
#Compiled data of convoys
#Routes examined are HX, SC, OB, ON, ONS
# Build the CSV path from the project root instead of a machine-specific
# absolute path. The root is taken as two levels above the working directory,
# the same location the src import cell adds to sys.path.
# NOTE(review): assumes data/processed/ lives directly under that root — TODO confirm.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
data_path = os.path.join(project_root, "data", "processed", "Complete_Convoy_Data.csv")
df = pd.read_csv(data_path)
df = df.drop(columns=['Unnamed: 0'])  # Discard the 'Unnamed: 0' column that comes in with the CSV
df.shape #Test

(1174, 21)

In [20]:
#Drop unnecessary/redundant features
columns_to_drop = [
    'Convoy Number',
    'Number of Ships Sunk',
    'Depart_Date',
    'Arrival/Dispersal Date',
    'Number of Escorts Sunk',
    'Number of Stragglers Sunk',
    'Total Tons of Ships Sunk',
    'Escort Sink Percentage',
    'Straggler Sink Percentage',
]
df = df.drop(columns=columns_to_drop)
# Preview only: the reset_index result is displayed, not assigned back to df.
df.reset_index(drop=True).head(3)

Unnamed: 0,Number of Ships,Number of Escort Ships,Number of Stragglers,Total Tons of Convoy,Overall Sink Percentage,Avg Number of U-Boats in Atlantic,Escort Ratio,Time At Sea (Days),Month,Year,Previous Month Avg Sink %,Approx. Sighting Range
0,5.0,2.0,0.0,22877.0,0.0,6.0,0.4,3.0,9.0,1939.0,0.0,12.1902
1,5.0,2.0,0.0,22967.0,0.0,6.0,0.4,5.0,9.0,1939.0,0.0,12.1902
2,7.0,4.0,0.0,21293.0,0.0,6.0,0.571429,5.0,9.0,1939.0,0.0,14.434062


In [21]:
#Convert Overall Sink Percentage into a binary Risk label
# Risk is based on whether a ship was sunk while in a convoy:
#   0 = No Ships Sunk, 1 = At Least One Ship Sunk
sink_pct = df['Overall Sink Percentage']
df['Risk'] = sink_pct.gt(0).astype(int)

# Feature matrix: every remaining column except the label and
# 'Overall Sink Percentage', which is removed because it leaks the label.
feature_frame = df.drop(columns=['Overall Sink Percentage', 'Risk'])
X = np.array(feature_frame)
y = df['Risk'].values #Prediction value

In [22]:
#Initiate and perform train test split on data set
# base_tester is created without a model; the per-model loop further down reads
# its cv_folds, X_train/X_test, y_train/y_test and feature_names so that every
# classifier is evaluated on this one split.
base_tester = Model_Tester()
base_tester.train_test_split(X, y)

In [30]:
#Initial Model Test (Phase 1 - Test to see if works)
# Each entry is (short name, unfitted estimator); every estimator gets the same seed.
# NOTE(review): in the recorded output SVC predicts no class-1 samples (recall 0.00),
# and StandardScaler is imported but not applied in the visible cells —
# confirm whether Model_Tester scales features.
model_seed = 1945
models = [
    ("logreg", LogisticRegression(max_iter=1600, random_state=model_seed)),
    ("svc", SVC(probability=True, random_state=model_seed)),
    ("rf", RandomForestClassifier(random_state=model_seed)),
    ("gb", GradientBoostingClassifier(random_state=model_seed)),
]


#Guide to setting random seeds:

# # Linear models
# LogisticRegression(random_state=1945)
# SGDClassifier(random_state=1945)

# # SVM family
# LinearSVC(random_state=1945)
# SVC(random_state=1945)
# NuSVC(random_state=1945)

# # Neighbors
# KNeighborsClassifier()                 # no randomness
# RadiusNeighborsClassifier()            # no randomness

# # Trees and ensembles
# DecisionTreeClassifier(random_state=1945)
# RandomForestClassifier(random_state=1945)
# ExtraTreesClassifier(random_state=1945)
# BaggingClassifier(random_state=1945)
# GradientBoostingClassifier(random_state=1945)
# AdaBoostClassifier(random_state=1945)

# # Naive Bayes
# GaussianNB()        # deterministic
# BernoulliNB()       # deterministic
# ComplementNB()      # deterministic

# # Discriminant analysis
# LinearDiscriminantAnalysis()           # deterministic
# QuadraticDiscriminantAnalysis()        # deterministic

# # Neural networks
# MLPClassifier(random_state=1945)

In [31]:
# Run every candidate model against the split held by base_tester and collect
# whatever evaluate() returns, keyed by the model's short name.
shared_attrs = ("X_train", "X_test", "y_train", "y_test", "feature_names")
results = {}
for name, clf in models:
    tester = Model_Tester(model=clf, cv_folds=base_tester.cv_folds)
    # Reuse base_tester's split and feature names rather than splitting again.
    for attr in shared_attrs:
        setattr(tester, attr, getattr(base_tester, attr))
    tester.optimize() #show_plots = True to see plots
    results[name] = tester.evaluate()


LogisticRegression Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.97      0.88       185
           1       0.57      0.16      0.25        50

    accuracy                           0.80       235
   macro avg       0.69      0.56      0.57       235
weighted avg       0.76      0.80      0.75       235


ROC AUC Score: 0.7812
Matthews Correlation Coefficient (MCC): 0.2206
Balanced Accuracy: 0.5638
LogisticRegression Confusion Matrix (values only):
[[179   6]
 [ 42   8]]

SVC Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.79      1.00      0.88       185
           1       0.00      0.00      0.00        50

    accuracy                           0.79       235
   macro avg       0.39      0.50      0.44       235
weighted avg       0.62      0.79      0.69       235


ROC AUC Score: 0.4854
Matthews Correlation Coefficient (MCC): 0.0000
Balanced 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])



RandomForestClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.97      0.91       185
           1       0.76      0.38      0.51        50

    accuracy                           0.84       235
   macro avg       0.81      0.67      0.71       235
weighted avg       0.83      0.84      0.82       235


ROC AUC Score: 0.8152
Matthews Correlation Coefficient (MCC): 0.4613
Balanced Accuracy: 0.6738
RandomForestClassifier Confusion Matrix (values only):
[[179   6]
 [ 31  19]]

GradientBoostingClassifier Evaluation:

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.95      0.90       185
           1       0.71      0.44      0.54        50

    accuracy                           0.84       235
   macro avg       0.79      0.70      0.72       235
weighted avg       0.83      0.84      0.83       235


ROC AUC Score: 0.8351
Matthews Correlation Coef