In [24]:
# Standard imports
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [31]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
def evaluate_preds(y_true, y_pred):
    """
    Preforms evaluation comparison on y_true labels vs y_pred labels on our classification model.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    recall = recall_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    metric_dict = {"accuracy" : round(accuracy, 2), "precision" : round(precision, 2), "recall" : round(recall, 2), "f1" : round(f1, 1)}
    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Prec: {precision * 100:.2f}%")
    print(f"Rec: {recall:.2f}%")
    print(f"f1: {f1:.2f}%")

    return metric_dict

## Saving and Loading trained ML model

Two ways to save and load ML models:
1. With Pythons `pickle` module
2. WIth `joblib` module

**Pickle**

In [32]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV ##SIMILAR TO RANDOMIZED but it will go through all combinations of params

grid_2 = {'n_estimators': [100, 200],
          'max_depth': [None],
          'max_features': ['sqrt'],
          'min_samples_split': [2],
          'min_samples_leaf': [2, 4]}

heart_disease = pd.read_csv("heart-disease.csv")
np.random.seed(42)

# SUFFLE THE DATA
heart_disease_shuffeled = heart_disease.sample(frac=1)

X = heart_disease_shuffeled.drop("target", axis=1)
y = heart_disease_shuffeled["target"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

clf = RandomForestClassifier(n_jobs=1)

#Setup Grid search cv
gs_clf = GridSearchCV(estimator=clf, 
                            param_grid=grid_2,
                            cv=5,
                            verbose=2)

#FIt the grid search cv version of clf
gs_clf.fit(X_train, y_train);

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=100; total time=   0.0s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n_estimators=200; total time=   0.1s
[CV] END max_depth=None, max_features=sqrt, min_samples_leaf=2, min_samples_split=2, n

In [33]:
#save an existing model to file
import pickle
pickle.dump(gs_clf, open("gs_random_forest_model_1.pkl", "wb"))

In [34]:
# Import existing model from file
loaded_pickle_model = pickle.load(open("gs_random_forest_model_1.pkl", "rb"))

In [35]:
# make some predictions
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) 
loaded_y_preds = loaded_pickle_model.predict(X_test)
evaluate_preds(y_test, loaded_y_preds)

Acc: 90.16%
Prec: 88.89%
Rec: 0.94%
f1: 0.91%


{'accuracy': 0.9, 'precision': 0.89, 'recall': 0.94, 'f1': 0.9}

In [36]:
gs_y_preds = gs_clf.predict(X_test)
evaluate_preds(y_test, gs_y_preds)

Acc: 90.16%
Prec: 88.89%
Rec: 0.94%
f1: 0.91%


{'accuracy': 0.9, 'precision': 0.89, 'recall': 0.94, 'f1': 0.9}

**joblib**

In [37]:
from joblib import dump, load

#Save model to file
dump(gs_clf, filename="gs_random_forest_model_1.joblib")

['gs_random_forest_model_1.joblib']

In [38]:
# Load the existing model
loaded_joblib_model = load(filename="gs_random_forest_model_1.joblib")

In [40]:
# make some predictions
jl_loaded_y_preds = loaded_joblib_model.predict(X_test)
evaluate_preds(y_test, jl_loaded_y_preds)

Acc: 90.16%
Prec: 88.89%
Rec: 0.94%
f1: 0.91%


{'accuracy': 0.9, 'precision': 0.89, 'recall': 0.94, 'f1': 0.9}

In [41]:
gs_y_preds = gs_clf.predict(X_test)
evaluate_preds(y_test, gs_y_preds)

Acc: 90.16%
Prec: 88.89%
Rec: 0.94%
f1: 0.91%


{'accuracy': 0.9, 'precision': 0.89, 'recall': 0.94, 'f1': 0.9}

**According to sklearn documentation it may be better to use joblib library which is more efficient on object that carry large numpy arrays internally as it is often the case for fitted sklearn estimators, but can only pickle to the disk and not to string**