In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
import matplotlib.pyplot as plt
import seaborn as sns
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import sklearn
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix, roc_auc_score, f1_score, average_precision_score, fbeta_score
from sklearn.multiclass import OneVsRestClassifier
import matplotlib.pyplot as plt
import seaborn as sn
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support, roc_curve, RocCurveDisplay
from sklearn.neighbors import KNeighborsClassifier
from scikitplot.metrics import plot_roc
from scikitplot.metrics import plot_precision_recall, roc_curve
import shap
import sklearn
import joblib
#activate high interaction shell so print() is not necessary to show output. https://stackoverflow.com/questions/31764006/ipython-notebook-display-every-line-output-without-print
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

#1. Get data
vmsOp2 = pd.read_csv('data/vmsOp2.csv')
vmsOp2 = vmsOp2.loc[vmsOp2['Date']>'2021-01-01'].copy()

#2. Balance dataset between fishing operations (A, C, N)
print('Unbalanced dataset structure:')
vmsOp2.groupby('Operation').count()

#sample same number of points as smallest category(C)
n_bal = len(vmsOp2.loc[vmsOp2['Operation'] == 'C'])
C_bal = vmsOp2.loc[vmsOp2['Operation']=='C'] #get all points for C
A_bal = vmsOp2.loc[vmsOp2['Operation']=='A'].sample(n_bal)
N_bal = vmsOp2.loc[vmsOp2['Operation']=='N'].sample(n_bal)

print('Balanced dataset structure:')
vmsOp2_bal = pd.concat([C_bal, A_bal, N_bal])
vmsOp2_bal.groupby('Operation').count()


Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


Unbalanced dataset structure:
Balanced dataset structure:


Unnamed: 0_level_0,Code,Date,Latitude,Longitude,geom,Speed,Course,CourseCorrected,TrackCode,Effort,...,IniDate,EndDate,DayTime,DayTime2,geometry,speedDiff-1,speedDiff+1,cogDiff,bufferGeom,bufferCount
Operation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A,440,440,440,440,440,440,440,440,440,440,...,440,440,440,440,440,440,440,440,440,440
C,440,440,440,440,440,440,440,440,440,440,...,440,440,440,440,440,440,440,440,440,440
N,440,440,440,440,440,440,440,440,440,440,...,440,440,440,440,440,440,440,440,440,440


In [15]:
#3. Split dataset into response (y) and predictive variables (x)
finalDF = vmsOp2_bal
finalDF.to_csv('data/balanced_dataset.csv')
x = finalDF[['Speed', 'CourseCorrected', 'cogDiff', 'speedDiff+1', 'speedDiff-1', 'DayTime2', 'bufferCount']]
y = finalDF.Operation

#encode labels 
le = LabelEncoder()
y = pd.Series(le.fit_transform(y))
print('labels encoded as:')
dict(zip(le.classes_, le.transform(le.classes_)))

#4. Split x, y datasets into training and validation 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25)
print('vmsOp2 balanced total len: ' + str(len(finalDF)))
print('vmsOp2 balanced training len: ' + str(len(y_train)))
print('vmsOp2 balanced test len: ' + str(len(y_test)))
y_test.value_counts()

#5. Check preliminar model accuracy (without hyperparameter tunning)
model = RandomForestClassifier(n_estimators=300, max_features=1)
model.fit(x_train, y_train) 
print('The model accuracy in the training dataset is: ' + str(model.score(x_train, y_train)))
print('The model accuracy in the testing dataset is: ' + str(model.score(x_test, y_test)))


labels encoded as:


{'A': 0, 'C': 1, 'N': 2}

vmsOp2 balanced total len: 1320
vmsOp2 balanced training len: 990
vmsOp2 balanced test len: 330


2    115
0    110
1    105
Name: count, dtype: int64

The model accuracy in the training dataset is: 1.0
The model accuracy in the testing dataset is: 0.7333333333333333


In [16]:
#6. Hyperparameter tunning
n_estimators = [int(x) for x in np.linspace(start=100, stop=2000, num = 3)]
'''
max_features = ['sqrt','log2']
max_depth = [2,3,5,10]
min_samples_split = [2,3,4,5] 
min_samples_leaf = [1,5,10,15,20]
bootstrap = [True, False]
max_leaf_nodes = [2,5,10,50,100]
min_impurity_decrease = [0.0, 0.1, 0.5, 1.0]
max_samples = [0.25, 0.5, 0.75, 1.0]
'''
max_features = ['sqrt','log2']
max_depth = [3,5]
min_samples_split = [2,3,5] 
min_samples_leaf = [10,15]
bootstrap = [True, False]
max_leaf_nodes = [10]
min_impurity_decrease = [0.0]
max_samples = [0.25]

param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
              'bootstrap': bootstrap,
              'max_leaf_nodes' : max_leaf_nodes,
              'min_impurity_decrease' : min_impurity_decrease,
              'max_samples' : max_samples
              }

scorers = {
    'precision_score': make_scorer(precision_score, average='micro')
    ,'recall_score': make_scorer(recall_score, average='micro')
    ,'f1_score': make_scorer(f1_score, average='micro')
    ,'accuracy_score': make_scorer(accuracy_score)
}

modelH = RandomForestClassifier(class_weight='balanced')
print('Fitting final model with hyperparameter tunning...')
rf_Grid = GridSearchCV(estimator = modelH, param_grid=param_grid, scoring=scorers, refit='accuracy_score', cv=5, verbose=1, n_jobs=10)
rf_Grid.fit(x_train, y_train)

print(rf_Grid.best_params_)
print('Model accuracy in the training dataset is: ' + str(rf_Grid.score(x_train, y_train)))
print('Model accuracy in the test dataset: ' + str(rf_Grid.score(x_test, y_test)))

rf_Grid.best_estimator_
rf_Grid.best_estimator_.score
pd.DataFrame(rf_Grid.cv_results_).to_csv('results/multiclassModel/gridSearchCV_results.csv')
joblib.dump(rf_Grid, 'results/multiclassModel/fittedModel_multiClass.pkl')


"\nmax_features = ['sqrt','log2']\nmax_depth = [2,3,5,10]\nmin_samples_split = [2,3,4,5] \nmin_samples_leaf = [1,5,10,15,20]\nbootstrap = [True, False]\nmax_leaf_nodes = [2,5,10,50,100]\nmin_impurity_decrease = [0.0, 0.1, 0.5, 1.0]\nmax_samples = [0.25, 0.5, 0.75, 1.0]\n"

Fitting final model with hyperparameter tunning...
Fitting 5 folds for each of 144 candidates, totalling 720 fits


360 fits failed out of a total of 720.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\jsala\anaconda3\envs\jupyter-env\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\jsala\anaconda3\envs\jupyter-env\Lib\site-packages\sklearn\ensemble\_forest.py", line 397, in fit
    raise ValueError(
ValueError: `max_sample` cannot be set if `bootstrap=False`. Either switch to `bootstrap=True` or set `max_sample=None`.

 0.65656566 0.66464646 0.66464646 0.65959596 0.66262626 0.65555556
 0.64848485 0.65757576 0.65656566 0.65353535 0.65858586 0.65959596
 0.

{'bootstrap': True, 'max_depth': 5, 'max_features': 'sqrt', 'max_leaf_nodes': 10, 'max_samples': 0.25, 'min_impurity_decrease': 0.0, 'min_samples_leaf': 10, 'min_samples_split': 3, 'n_estimators': 1050}
Model accuracy in the training dataset is: 0.694949494949495
Model accuracy in the test dataset: 0.696969696969697


<bound method ClassifierMixin.score of RandomForestClassifier(class_weight='balanced', max_depth=5, max_leaf_nodes=10,
                       max_samples=0.25, min_samples_leaf=10,
                       min_samples_split=3, n_estimators=1050)>

['results/multiclassModel/fittedModel_multiClass.pkl']

In [19]:
from modelCheckFunctions import model_scoring_and_confusion_matrix_multiclassModel
#7. Check fitted model performance
#Get model predictions 
y_pred = rf_Grid.best_estimator_.predict(x_test)
y_score = rf_Grid.best_estimator_.predict_proba(x_test)

#Confusion matrix
model_scoring_and_confusion_matrix_multiclassModel(y_test, y_pred)

#Roc curve
plot_roc(le.inverse_transform(y_test), y_score)
plt.show()

#Precision recall curve
plot_precision_recall(le.inverse_transform(y_test), y_score)
plt.show()

ImportError: cannot import name 'model_scoring_and_confusion_matrix_multiclassModel' from 'modelCheckFunctions' (c:\Users\jsala\ownCloud\papers\paperPeixBlau\PS_ML_git\PurseSeine_RandomForest\modelCheckFunctions.py)

In [None]:
#8. Shap values (understanding the model)
#https://www.youtube.com/watch?v=L8_sVRhBDLU&ab_channel=ADataOdyssey
shap.initjs()
type(rf_Grid.best_estimator_)
explainer = shap.TreeExplainer(rf_Grid.best_estimator_)
shap_values = explainer(x_train)