In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

In [2]:
heart_df = pd.read_csv("data/heart-disease.csv")
heart_df.head() # classification dataset - supervised learning

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
        "max_features": ["auto", "sqrt"],
        "min_samples_split": [2, 4, 6],
        "min_samples_leaf": [1, 2, 4]}

In [4]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)  # Results are reproducable

# Shuffle the data
heart_df_shuffle = heart_df.sample(frac=1)

# Split into X and y
X = heart_df_shuffle.drop("target",axis=1)
y = heart_df_shuffle["target"]


# Split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_jobs=1)

# Setup RandomizedSearchCV --> cross validation - automatically creates the validation sets for us
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=10, # try 20 models total
                            cv=5, # 5-fold cross-validation
                            verbose=2) # print out results

# Fit the RandomizedSearchCV version of clf
rs_clf.fit(X_train, y_train);



Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.6s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.5s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.4s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=1, min_samples_split=4, n_estimators=200; total time=   0.4s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=6, n_estimators=100; total time=   0.2s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=6, n_estimators=100; total time=   0.2s
[CV] END max_depth=30, max_features=sqrt, min_samples_leaf=4, min_samples_split=6, n_estimators=100; tot

In [5]:
# Which combination of hyperparameters got the best results found by RandomizedSearchCV
rs_clf.best_params_

{'n_estimators': 200,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 20}

In [6]:
def evaluate_preds(y_true,y_preds):
    """
    Performs evaluation comparison on y_true labels vs. y_pred labels
    on  a classification model.
    """
    accuracy = accuracy_score(y_true,y_preds)
    precision = precision_score(y_true,y_preds)
    recall = recall_score(y_true,y_preds) 
    f1 = f1_score(y_true,y_preds)
    metric_dict = {
        "accuracy":round(accuracy,2),
        "precision":round(precision,2),
        "recall":round(recall,2),
        "f1":round(f1,2)
    } # A dictionary that stores the results of the evaluation metrics
    
    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}")
    print(f"Recall: {recall:.2f}")
    print(f"F1 score: {f1:.2f}")
    
    return metric_dict

In [7]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Make predictions with the best hyperparameters
rs_y_preds = rs_clf.predict(X_test)

# Evaluate the predictions
rs_metrics = evaluate_preds(y_test, rs_y_preds)

Acc: 83.61%
Precision: 0.87
Recall: 0.82
F1 score: 0.84


## Saving and loading trained Machine Learning Models

Two ways to save and load machine learning models:
1. With Python's `pickle` module
2. With the `joblib module`

### 1) Saving and loading a model with [pickle](https://docs.python.org/3/library/pickle.html)

We'll use pickle's dump() function and pass it our model, gs_clf, along with the open() function containing a string for the filename we want to save our model as, along with the "wb" string which stands for "write binary", which is the file type open() will write our model as.

In [9]:
import pickle
# The python object is our model

# Save an existing model to file
# .pkl --> pickle file
pickle.dump(rs_clf,open("models/rs_random_forest_model1.pkl","wb"))

In [10]:
# Load a saved model

# rb -> read binary
loaded_model = pickle.load(open("models/rs_random_forest_model1.pkl","rb"))

In [11]:
# Make some predictions 

# Check if the model is loaded
pickle_y_preds = loaded_model.predict(X_test)
evaluate_preds(y_test,pickle_y_preds)

Acc: 83.61%
Precision: 0.87
Recall: 0.82
F1 score: 0.84


{'accuracy': 0.84, 'precision': 0.87, 'recall': 0.82, 'f1': 0.84}

### 2) Saving and loading a model with [joblib](https://joblib.readthedocs.io/en/latest/persistence.html)

works relatively the same as pickle.

To save a model, we can use joblib's dump() function, passing it the model (gs_clf) and the desired filename.

In [12]:
from joblib import dump, load

# Save a model to file
dump(rs_clf,filename="models/rs_random_forest_model2.pkl")

['models/rs_random_forest_model2.pkl']

In [13]:
# Import(load) a saved joblid model
loaded_model2 = load(filename="models/rs_random_forest_model2.pkl")

In [14]:
# Make and evaluate joblib predictions 

joblib_y_preds = loaded_model2.predict(X_test)
evaluate_preds(y_test,joblib_y_preds)

Acc: 83.61%
Precision: 0.87
Recall: 0.82
F1 score: 0.84


{'accuracy': 0.84, 'precision': 0.87, 'recall': 0.82, 'f1': 0.84}

You'll notice the evaluation metrics are the same as before.

Which one should you use, pickle or joblib?

According to [Scikit-Learn's documentation](https://scikit-learn.org/stable/modules/model_persistence.html), they suggest it may be more efficient to use joblib as it's more efficient with large numpy array (which is what may be contained in trained/fitted Scikit-Learn models).