In [2]:
#Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.dates import MonthLocator
import matplotlib.dates as mdates
import seaborn as sns
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler as SS
from sklearn.linear_model import LinearRegression as LinearRegression, Ridge
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report
from sklearn.neighbors import KNeighborsRegressor
import xgboost as xgb
from xgboost import XGBRFClassifier

In [5]:
df = pd.read_csv('Complete_Convoy_Data.csv')
df = df.drop(columns=['Unnamed: 0'])
df2 = df
# df2 = df[df['Time At Sea (Days)'] > 10]
#Dropping the features that are not needed or give away information
df2 = df2.drop(columns=['Convoy Number', 'Number of Ships Sunk', 'Depart_Date', 'Arrival/Dispersal Date', 'Number of Escorts Sunk', \
                         'Number of Stragglers Sunk', 'Total Tons of Ships Sunk', 'Escort Sink Percentage', 'Straggler Sink Percentage'])
df2.reset_index(drop=True)
df2

Unnamed: 0,Number of Ships,Number of Escort Ships,Number of Stragglers,Total Tons of Convoy,Overall Sink Percentage,Avg Number of U-Boats in Atlantic,Escort Ratio,Time At Sea (Days),Month,Year,Previous Month Avg Sink %,Approx. Sighting Range
0,5.0,2.0,0.0,22877.0,0.0,6.0,0.400000,3.0,9.0,1939.0,0.000000,12.190200
1,5.0,2.0,0.0,22967.0,0.0,6.0,0.400000,5.0,9.0,1939.0,0.000000,12.190200
2,7.0,4.0,0.0,21293.0,0.0,6.0,0.571429,5.0,9.0,1939.0,0.000000,14.434062
3,12.0,2.0,0.0,77587.0,0.0,6.0,0.166667,3.0,9.0,1939.0,0.000000,18.351409
4,20.0,3.0,0.0,98187.0,0.0,6.0,0.150000,3.0,9.0,1939.0,0.000000,21.568467
...,...,...,...,...,...,...,...,...,...,...,...,...
1169,85.0,15.0,0.0,511572.0,0.0,0.0,0.176471,15.0,5.0,1945.0,0.098328,23.997420
1170,30.0,9.0,0.0,103961.0,0.0,0.0,0.300000,14.0,5.0,1945.0,0.098328,23.152177
1171,82.0,23.0,0.0,406154.0,0.0,0.0,0.280488,14.0,5.0,1945.0,0.098328,23.996461
1172,43.0,12.0,0.0,210127.0,0.0,0.0,0.279070,13.0,5.0,1945.0,0.098328,23.784495


In [6]:
#K-Fold Cross Validation Function
def K_Fold(model, X, y, K, scaler=None, random_state=1945):
    kf = KFold(n_splits=K, random_state=random_state, shuffle=True)
    train_scores = []
    test_scores = []
    for idxTrain, idxTest in kf.split(X):
        Xtrain = X[idxTrain]
        Xtest = X[idxTest]
        ytrain = y[idxTrain]
        ytest = y[idxTest]
        if scaler is not None:
            Xtrain = scaler.fit_transform(Xtrain)
            Xtest = scaler.transform(Xtest)
        model.fit(Xtrain, ytrain)
        train_scores.append(model.score(Xtrain, ytrain))
        test_scores.append(model.score(Xtest, ytest))
    return train_scores, test_scores

In [7]:
#Train Test Split
df2['High Risk'] = (df2['Overall Sink Percentage'] > 0).astype(int)
X = np.array(df2.drop(columns=['Overall Sink Percentage', 'High Risk']))
y = df2['High Risk'].values
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, train_size = 0.8, random_state=1945)

Optimizing a Gradient Boosting Classifier:

In [8]:
XGB_RFC_Model = XGBRFClassifier()
XGB_RFC_Model.fit(Xtrain, ytrain)
ypredict = XGB_RFC_Model.predict(Xtest)
XGB_RFC_MSE= mean_squared_error(ytest, ypredict)
print('Gradient Boosting Classifier Mean Squared Error', XGB_RFC_MSE)

Gradient Boosting Classifier Mean Squared Error 0.1829787234042553


In [7]:
param_grid = dict(eta=[0.1, 0.3, 0.7], gamma=[0,1,5], max_depth=[5,6,7], min_child_weight=[0.5, 1, 1.5], subsample=[0.5, 1] \
    )
#eta 'Step size shrinkage used in update to prevents overfitting'
#gamma 'Minimum loss reduction required to make a further partition on a leaf node of the tree'
#subsamle 'Subsample ratio of the training instances. Setting it to 0.5 means that XGBoost would randomly sample half of the training data prior to growing trees'

In [8]:
cv = KFold(n_splits=10, random_state=1945, shuffle=True)
grid = GridSearchCV(XGBRFClassifier(), param_grid=param_grid, cv=cv, scoring='accuracy')
grid.fit(Xtrain, ytrain)

In [9]:
print(f'The best parameters are {grid.best_params_} with a score of {grid.best_score_:.2f}')

The best parameters are {'eta': 0.1, 'gamma': 1, 'max_depth': 7, 'min_child_weight': 0.5, 'subsample': 0.5} with a score of 0.85


In [11]:
XGB_RFC_Model_2 = XGBRFClassifier(eta=0.1, gamma=1, max_depth=7, min_child_weight=0.5, subsample=0.5)
XGB_RFC_Model_2.fit(Xtrain, ytrain)
ypredict = XGB_RFC_Model_2.predict(Xtest)
XGB_RFC_Model_2= mean_squared_error(ytest, ypredict)
print('Gradient Boosting Classifier Mean Squared Error', XGB_RFC_Model_2)
print('Gradient Boosting Classifier Classification Report: \n', classification_report(ytest, ypredict))


Gradient Boosting Classifier Mean Squared Error 0.13636363636363635
Gradient Boosting Classifier Classification Report: 
               precision    recall  f1-score   support

           0       0.88      0.96      0.92       141
           1       0.74      0.49      0.59        35

    accuracy                           0.86       176
   macro avg       0.81      0.72      0.75       176
weighted avg       0.85      0.86      0.85       176



In [15]:
eval_results = XGB_RFC_Model_2.evals_result()


AttributeError: 'numpy.float64' object has no attribute 'evals_result'