In [7]:
#Imports
import numpy as np
import pandas as pd
from sklearn.base import clone
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, train_test_split

**This Notebook is a preliminary test for looking at the differences between various ML models.**

Outline of Tests:

#1 Gradient Boosting Regressor predicting overall sink percentage for convoys at sea greater than 10 days <br>
#2 Random Forest Regressor predicting overall sink percentage for convoys at sea greater than 10 days <br>
#3 Gradient Boosting Regressor predicting overall sink percentage for convoys at sea greater than 5 days <br>
#4 Random Forest Regressor predicting overall sink percentage for convoys at sea greater than 5 days <br>

#5 Gradient Boosting Regressor predicting number of ships sunk for convoys at sea greater than 10 days <br>
#6 Random Forest Regressor predicting number of ships sunk for convoys at sea greater than 10 days <br>
#7 Gradient Boosting Regressor predicting number of ships sunk for convoys at sea greater than 5 days <br>
#8 Random Forest Regressor predicting number of ships sunk for convoys at sea greater than 5 days <br>


Note: These are all regressor models, not the classifiers typically used for this project. This is done to test the differences between regression models and classifiers, and also to identify what changes when the time-at-sea cutoff is lowered from greater than 10 days to greater than 5 days.  

In [4]:
#Compiled data of convoys
#Routes examined are HX, SC, OB, ON, ONS
df = pd.read_csv('Complete_Convoy_Data.csv')
# 'Unnamed: 0' is presumably a row index saved into the CSV -- TODO confirm.
df = df.drop(columns=['Unnamed: 0'])
# Only the last expression of a cell is displayed, so a bare `df.shape` here
# was silently discarded; print it explicitly instead.
print(df.shape)
df.head(3)

Unnamed: 0,Convoy Number,Number of Ships,Number of Escort Ships,Number of Stragglers,Number of Ships Sunk,Number of Escorts Sunk,Number of Stragglers Sunk,Total Tons of Convoy,Total Tons of Ships Sunk,Depart_Date,...,Overall Sink Percentage,Escort Sink Percentage,Straggler Sink Percentage,Avg Number of U-Boats in Atlantic,Escort Ratio,Time At Sea (Days),Month,Year,Previous Month Avg Sink %,Approx. Sighting Range
0,OB.1,5.0,2.0,0.0,0.0,0.0,0.0,22877.0,0.0,1939-09-07 00:00:00,...,0.0,0.0,0.0,6.0,0.4,3.0,9.0,1939.0,0.0,12.1902
1,OB.2,5.0,2.0,0.0,0.0,0.0,0.0,22967.0,0.0,1939-09-09 00:00:00,...,0.0,0.0,0.0,6.0,0.4,5.0,9.0,1939.0,0.0,12.1902
2,OB.3,7.0,4.0,0.0,0.0,0.0,0.0,21293.0,0.0,1939-09-11 00:00:00,...,0.0,0.0,0.0,6.0,0.571429,5.0,9.0,1939.0,0.0,14.434062


In [18]:
# Remove columns that are not used as features or that give away the outcome.
# NOTE: this cell overwrites `df`, so running it a second time raises KeyError
# (the columns are already gone); reload the CSV first if a re-run is needed.
dropped_columns = [
    'Convoy Number',
    'Number of Ships Sunk',
    'Depart_Date',
    'Arrival/Dispersal Date',
    'Number of Escorts Sunk',
    'Number of Stragglers Sunk',
    'Total Tons of Ships Sunk',
    'Escort Sink Percentage',
    'Straggler Sink Percentage',
]
df = df.drop(columns=dropped_columns)
df.reset_index(drop=True).head(3)

Unnamed: 0,Number of Ships,Number of Escort Ships,Number of Stragglers,Total Tons of Convoy,Overall Sink Percentage,Avg Number of U-Boats in Atlantic,Escort Ratio,Time At Sea (Days),Month,Year,Previous Month Avg Sink %,Approx. Sighting Range
0,5.0,2.0,0.0,22877.0,0.0,6.0,0.4,3.0,9.0,1939.0,0.0,12.1902
1,5.0,2.0,0.0,22967.0,0.0,6.0,0.4,5.0,9.0,1939.0,0.0,12.1902
2,7.0,4.0,0.0,21293.0,0.0,6.0,0.571429,5.0,9.0,1939.0,0.0,14.434062


**Comparison of Using Overall Sink Percentage vs Number of Ships Sunk and Various cutoffs for time at sea (greater than 10 or 5 days). Time at sea cutoff changes the amount of data used.**

In [5]:
# Build the two subsets compared in this notebook from the time-at-sea column.
days_at_sea = df['Time At Sea (Days)']

# Convoys that were together for more than 10 days.
# NOTE(review): the original note said this removes 298 convoys -- not re-verified here.
df_greater_than_10 = df.loc[days_at_sea > 10]

# Convoys that were together for more than 5 days.
# NOTE(review): the original note said this removes 184 convoys -- not re-verified here.
df_greater_than_5 = df.loc[days_at_sea > 5]

In [20]:
#K-Fold Cross Validation Function
def K_Fold(model, X, y, K, scaler=None, random_state=1945):
    """Run shuffled K-fold cross validation and collect per-fold scores.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)`` and ``score(X, y)``.
        Used as a template only: a fresh copy is fitted on every fold, so the
        object passed in is left exactly as the caller fitted it.
    X : array-like, one row per sample.
    y : array-like of targets, same length as ``X``.
    K : int, number of folds.
    scaler : optional transformer exposing ``fit_transform`` / ``transform``.
        When given, a fresh copy is fitted on each fold's training rows only
        and then applied to that fold's test rows.
    random_state : seed for the fold shuffling.

    Returns
    -------
    (train_scores, test_scores) : two lists of length ``K`` holding whatever
        ``model.score`` returns for each fold (R^2 for scikit-learn regressors).
    """
    # Fold indices are positional, so index plain arrays: on a DataFrame,
    # ``X[idx]`` would be interpreted as a column-label lookup.
    X = np.asarray(X)
    y = np.asarray(y)

    kf = KFold(n_splits=K, shuffle=True, random_state=random_state)
    train_scores = []
    test_scores = []
    for idxTrain, idxTest in kf.split(X):
        Xtrain, Xtest = X[idxTrain], X[idxTest]
        ytrain, ytest = y[idxTrain], y[idxTest]

        if scaler is not None:
            # Fit the scaler on the training rows only to avoid test-set leakage.
            fold_scaler = clone(scaler, safe=False)
            Xtrain = fold_scaler.fit_transform(Xtrain)
            Xtest = fold_scaler.transform(Xtest)

        # Fresh, unfitted copy per fold (safe=False falls back to a deep copy
        # for objects that are not scikit-learn estimators). Refitting the
        # caller's instance would leave it trained on the last fold, which
        # contains rows from the caller's hold-out set.
        fold_model = clone(model, safe=False)
        fold_model.fit(Xtrain, ytrain)
        train_scores.append(fold_model.score(Xtrain, ytrain))
        test_scores.append(fold_model.score(Xtest, ytest))
    return train_scores, test_scores

In [26]:
# 80/20 train/test split for convoys at sea more than 10 days.
# Target is 'Overall Sink Percentage'; every other remaining column is a feature.
target_col = 'Overall Sink Percentage'
X_1 = np.array(df_greater_than_10.drop(columns=[target_col]))
y_1 = df_greater_than_10[target_col].values
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_1, y_1, train_size=0.8, random_state=1945
)
tuple(part.shape for part in (Xtrain, Xtest, ytrain, ytest))

((702, 11), (176, 11), (702,), (176,))

In [None]:
#1 Gradient Boosting Regressor -- K-fold below runs on X_1 / y_1
#  ('Overall Sink Percentage', convoys at sea > 10 days)

# Default hyperparameters with a fixed seed.
# Xtrain/Xtest/ytrain/ytest are whatever the most recently run split cell produced;
# this cell assumes that was the X_1 / y_1 split.
GB_model = GradientBoostingRegressor(random_state=1945)
GB_model.fit(Xtrain, ytrain)
ypredict = GB_model.predict(Xtest)
GB_mse = mean_squared_error(ytest, ypredict)
# A regressor's .score() is the coefficient of determination (R^2), not an
# accuracy, so it is labelled the same way as in the Random Forest cells.
print('Gradient Boosting Regressor Train Score (R\u00b2):', GB_model.score(Xtrain, ytrain))
print('Gradient Boosting Regressor Test Score (R\u00b2):', GB_model.score(Xtest, ytest))
print('Gradient Boosting Regressor Mean Squared Error', GB_mse)
# 10-fold cross validation over the full (unsplit) X_1 / y_1.
train_scores, test_scores = K_Fold(GB_model, X_1, y_1, 10)
print('K-Fold Gradient Boosting Train Score (Mean R\u00b2):', np.mean(train_scores))
print('K-Fold Gradient Boosting Test Score (Mean R\u00b2):', np.mean(test_scores))

Gradient Boosting Regressor Train Score (Mean Accuracy): 0.8108159645821675
Gradient Boosting Regressor Test Score (Mean Accuracy): 0.06908893514660186
Gradient Boosting Regressor Mean Squared Error 13.343385511802069
K-Fold Gradient Boosting Train Score (Mean R²): 0.7650187716574124
K-Fold Gradient Boosting Test Score (Mean R²): -0.2242567375852599


In [None]:
#2 Random Forest Regressor -- K-fold below runs on X_1 / y_1

# Untuned baseline: default hyperparameters with a fixed seed.
Ran_Forest_Model = RandomForestRegressor(random_state=1945)
Ran_Forest_Model.fit(Xtrain, ytrain)

# Hold-out evaluation on the current train/test split.
ypredict = Ran_Forest_Model.predict(Xtest)
RFR_mse = mean_squared_error(ytest, ypredict)
rf_train_r2 = Ran_Forest_Model.score(Xtrain, ytrain)
rf_test_r2 = Ran_Forest_Model.score(Xtest, ytest)
print('Random Forest Regressor Train Score (R\u00b2):', rf_train_r2)
print('Random Forest Regressor Test Score (R\u00b2):', rf_test_r2)
print('Random Forest Regressor Mean Squared Error', RFR_mse)

# 10-fold cross validation over the full X_1 / y_1; report the mean fold score.
train_scores, test_scores = K_Fold(Ran_Forest_Model, X_1, y_1, 10)
rf_cv_train_mean = np.mean(train_scores)
rf_cv_test_mean = np.mean(test_scores)
print('K-Fold Random Forest Regressor Train Score (Mean R\u00b2):', rf_cv_train_mean)
print('K-Fold Random Forest Regressor Test Score (Mean R\u00b2):', rf_cv_test_mean)

Random Forest Regressor Train Score (R²): 0.8583704862153498
Random Forest Regressor Test Score (R²): 0.00961583476495087
Random Forest Regressor Mean Squared Error 14.195854169588886
K-Fold Random Forest Regressor Train Score (Mean R²): 0.8574024500793638
K-Fold Random Forest Regressor Test Score (Mean R²): -0.19246932352660823


In [23]:
# 80/20 train/test split for convoys at sea more than 5 days.
# Target is 'Overall Sink Percentage'; every other remaining column is a feature.
target_col = 'Overall Sink Percentage'
X_2 = np.array(df_greater_than_5.drop(columns=[target_col]))
y_2 = df_greater_than_5[target_col].values
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_2, y_2, train_size=0.8, random_state=1945
)
tuple(part.shape for part in (Xtrain, Xtest, ytrain, ytest))

((750, 11), (188, 11), (750,), (188,))

In [None]:
#3 Gradient Boosting Regressor -- K-fold below runs on X_2 / y_2
#  ('Overall Sink Percentage', convoys at sea > 5 days)

# Default hyperparameters with a fixed seed.
# Xtrain/Xtest/ytrain/ytest are whatever the most recently run split cell produced;
# this cell assumes that was the X_2 / y_2 split.
GB_model = GradientBoostingRegressor(random_state=1945)
GB_model.fit(Xtrain, ytrain)
ypredict = GB_model.predict(Xtest)
GB_mse = mean_squared_error(ytest, ypredict)
# A regressor's .score() is the coefficient of determination (R^2), not an
# accuracy, so it is labelled the same way as in the Random Forest cells.
print('Gradient Boosting Regressor Train Score (R\u00b2):', GB_model.score(Xtrain, ytrain))
print('Gradient Boosting Regressor Test Score (R\u00b2):', GB_model.score(Xtest, ytest))
print('Gradient Boosting Regressor Mean Squared Error', GB_mse)
# 10-fold cross validation over the full (unsplit) X_2 / y_2.
train_scores, test_scores = K_Fold(GB_model, X_2, y_2, 10)
print('K-Fold Gradient Boosting Train Score (Mean R\u00b2):', np.mean(train_scores))
print('K-Fold Gradient Boosting Test Score (Mean R\u00b2):', np.mean(test_scores))

Gradient Boosting Regressor Train Score (Mean Accuracy): 0.7667219579010558
Gradient Boosting Regressor Test Score (Mean Accuracy): 0.05201415377468699
Gradient Boosting Regressor Mean Squared Error 19.485033553328893
K-Fold Gradient Boosting Train Score (Mean R²): 0.7287748550675843
K-Fold Gradient Boosting Test Score (Mean R²): -0.20461029416321042


In [None]:
#4 Random Forest Regressor -- K-fold below runs on X_2 / y_2

# Untuned baseline: default hyperparameters with a fixed seed.
Ran_Forest_Model = RandomForestRegressor(random_state=1945)
Ran_Forest_Model.fit(Xtrain, ytrain)

# Hold-out evaluation on the current train/test split.
ypredict = Ran_Forest_Model.predict(Xtest)
RFR_mse = mean_squared_error(ytest, ypredict)
rf_train_r2 = Ran_Forest_Model.score(Xtrain, ytrain)
rf_test_r2 = Ran_Forest_Model.score(Xtest, ytest)
print('Random Forest Regressor Train Score (R\u00b2):', rf_train_r2)
print('Random Forest Regressor Test Score (R\u00b2):', rf_test_r2)
print('Random Forest Regressor Mean Squared Error', RFR_mse)

# 10-fold cross validation over the full X_2 / y_2; report the mean fold score.
train_scores, test_scores = K_Fold(Ran_Forest_Model, X_2, y_2, 10)
rf_cv_train_mean = np.mean(train_scores)
rf_cv_test_mean = np.mean(test_scores)
print('K-Fold Random Forest Regressor Train Score (Mean R\u00b2):', rf_cv_train_mean)
print('K-Fold Random Forest Regressor Test Score (Mean R\u00b2):', rf_cv_test_mean)

Random Forest Regressor Train Score (R²): 0.8520017273140668
Random Forest Regressor Test Score (R²): 0.15141077335865316
Random Forest Regressor Mean Squared Error 17.442021544876678
K-Fold Random Forest Regressor Train Score (Mean R²): 0.8559517855229009
K-Fold Random Forest Regressor Test Score (Mean R²): -0.1312484101136941


**Testing Overall Sink Percentage vs Number of Ships Sunk**

In [30]:
# Fresh copy of the convoy data, kept separate from `df`, for the second set of tests.
df_2 = pd.read_csv('Complete_Convoy_Data.csv').drop(columns=['Unnamed: 0'])

In [32]:
# Remove columns that are not used as features or that give away the outcome.
# Unlike the first half of the notebook, 'Number of Ships Sunk' is kept and
# 'Overall Sink Percentage' is dropped.
# NOTE: this cell overwrites `df_2`, so running it a second time raises KeyError
# (the columns are already gone); reload the CSV first if a re-run is needed.
dropped_columns = [
    'Convoy Number',
    'Overall Sink Percentage',
    'Depart_Date',
    'Arrival/Dispersal Date',
    'Number of Escorts Sunk',
    'Number of Stragglers Sunk',
    'Total Tons of Ships Sunk',
    'Escort Sink Percentage',
    'Straggler Sink Percentage',
]
df_2 = df_2.drop(columns=dropped_columns)
df_2.reset_index(drop=True).head(3)

Unnamed: 0,Number of Ships,Number of Escort Ships,Number of Stragglers,Number of Ships Sunk,Total Tons of Convoy,Avg Number of U-Boats in Atlantic,Escort Ratio,Time At Sea (Days),Month,Year,Previous Month Avg Sink %,Approx. Sighting Range
0,5.0,2.0,0.0,0.0,22877.0,6.0,0.4,3.0,9.0,1939.0,0.0,12.1902
1,5.0,2.0,0.0,0.0,22967.0,6.0,0.4,5.0,9.0,1939.0,0.0,12.1902
2,7.0,4.0,0.0,0.0,21293.0,6.0,0.571429,5.0,9.0,1939.0,0.0,14.434062


In [37]:
# Build the same two time-at-sea subsets as before, this time from df_2.
days_at_sea = df_2['Time At Sea (Days)']

# Convoys that were together for more than 10 days.
# NOTE(review): the original note read "Removes 184 convoys 289" -- the count is
# ambiguous and was not re-verified here.
df_greater_than_10_2 = df_2.loc[days_at_sea > 10]

# Convoys that were together for more than 5 days.
# NOTE(review): the original note said this removes 184 convoys -- not re-verified here.
df_greater_than_5_2 = df_2.loc[days_at_sea > 5]

In [38]:
# 80/20 train/test split for convoys at sea more than 10 days.
# Target is 'Number of Ships Sunk'; every other remaining column is a feature.
target_col = 'Number of Ships Sunk'
X_3 = np.array(df_greater_than_10_2.drop(columns=[target_col]))
y_3 = df_greater_than_10_2[target_col].values
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_3, y_3, train_size=0.8, random_state=1945
)
tuple(part.shape for part in (Xtrain, Xtest, ytrain, ytest))

((702, 11), (176, 11), (702,), (176,))

In [None]:
#5 Gradient Boosting Regressor -- K-fold below runs on X_3 / y_3
#  ('Number of Ships Sunk', convoys at sea > 10 days)

# Default hyperparameters with a fixed seed.
# Xtrain/Xtest/ytrain/ytest are whatever the most recently run split cell produced;
# this cell assumes that was the X_3 / y_3 split.
GB_model = GradientBoostingRegressor(random_state=1945)
GB_model.fit(Xtrain, ytrain)
ypredict = GB_model.predict(Xtest)
GB_mse = mean_squared_error(ytest, ypredict)
# A regressor's .score() is the coefficient of determination (R^2), not an
# accuracy, so it is labelled the same way as in the Random Forest cells.
print('Gradient Boosting Regressor Train Score (R\u00b2):', GB_model.score(Xtrain, ytrain))
print('Gradient Boosting Regressor Test Score (R\u00b2):', GB_model.score(Xtest, ytest))
print('Gradient Boosting Regressor Mean Squared Error', GB_mse)
# 10-fold cross validation over the full (unsplit) X_3 / y_3.
train_scores, test_scores = K_Fold(GB_model, X_3, y_3, 10)
print('K-Fold Gradient Boosting Train Score (Mean R\u00b2):', np.mean(train_scores))
print('K-Fold Gradient Boosting Test Score (Mean R\u00b2):', np.mean(test_scores))

Gradient Boosting Regressor Train Score (Mean Accuracy): 0.7924348525211582
Gradient Boosting Regressor Test Score (Mean Accuracy): -0.08432872786347101
Gradient Boosting Regressor Mean Squared Error 3.7027011618594603
K-Fold Gradient Boosting Train Score (Mean R²): 0.7558586254211882
K-Fold Gradient Boosting Test Score (Mean R²): -0.24125934028307872


In [None]:
#6 Random Forest Regressor -- K-fold below runs on X_3 / y_3

# Untuned baseline: default hyperparameters with a fixed seed.
Ran_Forest_Model = RandomForestRegressor(random_state=1945)
Ran_Forest_Model.fit(Xtrain, ytrain)

# Hold-out evaluation on the current train/test split.
ypredict = Ran_Forest_Model.predict(Xtest)
RFR_mse = mean_squared_error(ytest, ypredict)
rf_train_r2 = Ran_Forest_Model.score(Xtrain, ytrain)
rf_test_r2 = Ran_Forest_Model.score(Xtest, ytest)
print('Random Forest Regressor Train Score (R\u00b2):', rf_train_r2)
print('Random Forest Regressor Test Score (R\u00b2):', rf_test_r2)
print('Random Forest Regressor Mean Squared Error', RFR_mse)

# 10-fold cross validation over the full X_3 / y_3; report the mean fold score.
train_scores, test_scores = K_Fold(Ran_Forest_Model, X_3, y_3, 10)
rf_cv_train_mean = np.mean(train_scores)
rf_cv_test_mean = np.mean(test_scores)
print('K-Fold Random Forest Regressor Train Score (Mean R\u00b2):', rf_cv_train_mean)
print('K-Fold Random Forest Regressor Test Score (Mean R\u00b2):', rf_cv_test_mean)

Random Forest Regressor Train Score (R²): 0.857097286518973
Random Forest Regressor Test Score (R²): -0.08816053698889137
Random Forest Regressor Mean Squared Error 3.7157857954545452
K-Fold Random Forest Regressor Train Score (Mean R²): 0.8557931261270346
K-Fold Random Forest Regressor Test Score (Mean R²): -0.14126694232031323


In [41]:
# 80/20 train/test split for convoys at sea more than 5 days.
# Target is 'Number of Ships Sunk'; every other remaining column is a feature.
target_col = 'Number of Ships Sunk'
X_4 = np.array(df_greater_than_5_2.drop(columns=[target_col]))
y_4 = df_greater_than_5_2[target_col].values
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X_4, y_4, train_size=0.8, random_state=1945
)
tuple(part.shape for part in (Xtrain, Xtest, ytrain, ytest))

((750, 11), (188, 11), (750,), (188,))

In [None]:
#7 Gradient Boosting Regressor -- K-fold below runs on X_4 / y_4
#  ('Number of Ships Sunk', convoys at sea > 5 days)

# Default hyperparameters with a fixed seed.
# Xtrain/Xtest/ytrain/ytest are whatever the most recently run split cell produced;
# this cell assumes that was the X_4 / y_4 split.
GB_model = GradientBoostingRegressor(random_state=1945)
GB_model.fit(Xtrain, ytrain)
ypredict = GB_model.predict(Xtest)
GB_mse = mean_squared_error(ytest, ypredict)
# A regressor's .score() is the coefficient of determination (R^2), not an
# accuracy, so it is labelled the same way as in the Random Forest cells.
print('Gradient Boosting Regressor Train Score (R\u00b2):', GB_model.score(Xtrain, ytrain))
print('Gradient Boosting Regressor Test Score (R\u00b2):', GB_model.score(Xtest, ytest))
print('Gradient Boosting Regressor Mean Squared Error', GB_mse)
# 10-fold cross validation over the full (unsplit) X_4 / y_4.
train_scores, test_scores = K_Fold(GB_model, X_4, y_4, 10)
print('K-Fold Gradient Boosting Train Score (Mean R\u00b2):', np.mean(train_scores))
print('K-Fold Gradient Boosting Test Score (Mean R\u00b2):', np.mean(test_scores))

Gradient Boosting Regressor Train Score (Mean Accuracy): 0.7622364343330639
Gradient Boosting Regressor Test Score (Mean Accuracy): 0.010142099860464104
Gradient Boosting Regressor Mean Squared Error 7.408698774086474
K-Fold Gradient Boosting Train Score (Mean R²): 0.7273958378147681
K-Fold Gradient Boosting Test Score (Mean R²): -0.3011173625045743


In [None]:
#8 Random Forest Regressor -- K-fold below runs on X_4 / y_4

# Untuned baseline: default hyperparameters with a fixed seed.
Ran_Forest_Model = RandomForestRegressor(random_state=1945)
Ran_Forest_Model.fit(Xtrain, ytrain)

# Hold-out evaluation on the current train/test split.
ypredict = Ran_Forest_Model.predict(Xtest)
RFR_mse = mean_squared_error(ytest, ypredict)
rf_train_r2 = Ran_Forest_Model.score(Xtrain, ytrain)
rf_test_r2 = Ran_Forest_Model.score(Xtest, ytest)
print('Random Forest Regressor Train Score (R\u00b2):', rf_train_r2)
print('Random Forest Regressor Test Score (R\u00b2):', rf_test_r2)
print('Random Forest Regressor Mean Squared Error', RFR_mse)

# 10-fold cross validation over the full X_4 / y_4; report the mean fold score.
train_scores, test_scores = K_Fold(Ran_Forest_Model, X_4, y_4, 10)
rf_cv_train_mean = np.mean(train_scores)
rf_cv_test_mean = np.mean(test_scores)
print('K-Fold Random Forest Regressor Train Score (Mean R\u00b2):', rf_cv_train_mean)
print('K-Fold Random Forest Regressor Test Score (Mean R\u00b2):', rf_cv_test_mean)

Random Forest Regressor Train Score (R²): 0.8437771627851679
Random Forest Regressor Test Score (R²): 0.11908060755435956
Random Forest Regressor Mean Squared Error 6.593336702127659
K-Fold Random Forest Regressor Train Score (Mean R²): 0.8535978372528774
K-Fold Random Forest Regressor Test Score (Mean R²): -0.14360055572159672


# All Results:


#1
Gradient Boosting Regressor Train Score (R²): 0.8108159645821675
Gradient Boosting Regressor Test Score (R²): 0.06908893514660186
Gradient Boosting Regressor Mean Squared Error 13.343385511802069
K-Fold Gradient Boosting Train Score (Mean R²): 0.7650187716574124
K-Fold Gradient Boosting Test Score (Mean R²): -0.2242567375852599


#2
Random Forest Regressor Train Score (R²): 0.8583704862153498
Random Forest Regressor Test Score (R²): 0.00961583476495087
Random Forest Regressor Mean Squared Error 14.195854169588886
K-Fold Random Forest Regressor Train Score (Mean R²): 0.8574024500793638
K-Fold Random Forest Regressor Test Score (Mean R²): -0.19246932352660823


#3
Gradient Boosting Regressor Train Score (R²): 0.7667219579010558
Gradient Boosting Regressor Test Score (R²): 0.05201415377468699
Gradient Boosting Regressor Mean Squared Error 19.485033553328893
K-Fold Gradient Boosting Train Score (Mean R²): 0.7287748550675843
K-Fold Gradient Boosting Test Score (Mean R²): -0.20461029416321042


#4
Random Forest Regressor Train Score (R²): 0.8520017273140668
Random Forest Regressor Test Score (R²): 0.15141077335865316
Random Forest Regressor Mean Squared Error 17.442021544876678
K-Fold Random Forest Regressor Train Score (Mean R²): 0.8559517855229009
K-Fold Random Forest Regressor Test Score (Mean R²): -0.1312484101136941


#5
Gradient Boosting Regressor Train Score (R²): 0.7924348525211582
Gradient Boosting Regressor Test Score (R²): -0.08432872786347101
Gradient Boosting Regressor Mean Squared Error 3.7027011618594603
K-Fold Gradient Boosting Train Score (Mean R²): 0.7558586254211882
K-Fold Gradient Boosting Test Score (Mean R²): -0.24125934028307872

#6
Random Forest Regressor Train Score (R²): 0.857097286518973
Random Forest Regressor Test Score (R²): -0.08816053698889137
Random Forest Regressor Mean Squared Error 3.7157857954545452
K-Fold Random Forest Regressor Train Score (Mean R²): 0.8557931261270346
K-Fold Random Forest Regressor Test Score (Mean R²): -0.14126694232031323


#7
Gradient Boosting Regressor Train Score (R²): 0.7622364343330639
Gradient Boosting Regressor Test Score (R²): 0.010142099860464104
Gradient Boosting Regressor Mean Squared Error 7.408698774086474
K-Fold Gradient Boosting Train Score (Mean R²): 0.7273958378147681
K-Fold Gradient Boosting Test Score (Mean R²): -0.3011173625045743


#8
Random Forest Regressor Train Score (R²): 0.8437771627851679
Random Forest Regressor Test Score (R²): 0.11908060755435956
Random Forest Regressor Mean Squared Error 6.593336702127659
K-Fold Random Forest Regressor Train Score (Mean R²): 0.8535978372528774
K-Fold Random Forest Regressor Test Score (Mean R²): -0.14360055572159672

This notebook is finished; the results will be analyzed elsewhere.