In [150]:
# Get the data ready (split into features and labels, prepare train and test sets)
# Choose a model for our problem
# Fit the model to the data and use it to make a prediction
# Evaluate the model
# Experiment to improve
# Save a model for someone else to use

In [176]:
# Step 1. Get the data ready
import pandas as pd
import numpy as np
# Load the heart-disease dataset from the CSV file
heart_disease = pd.read_csv("heart.csv")

# Summary statistics, transposed so that each column of the data becomes a row
heart_disease.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
age,303.0,54.366337,9.082101,29.0,47.5,55.0,61.0,77.0
sex,303.0,0.683168,0.466011,0.0,0.0,1.0,1.0,1.0
cp,303.0,0.966997,1.032052,0.0,0.0,1.0,2.0,3.0
trestbps,303.0,131.623762,17.538143,94.0,120.0,130.0,140.0,200.0
chol,303.0,246.264026,51.830751,126.0,211.0,240.0,274.5,564.0
fbs,303.0,0.148515,0.356198,0.0,0.0,0.0,0.0,1.0
restecg,303.0,0.528053,0.52586,0.0,0.0,1.0,1.0,2.0
thalach,303.0,149.646865,22.905161,71.0,133.5,153.0,166.0,202.0
exang,303.0,0.326733,0.469794,0.0,0.0,0.0,1.0,1.0
oldpeak,303.0,1.039604,1.161075,0.0,0.0,0.8,1.6,6.2


In [152]:
# The target column indicates whether the patient has heart disease (target=1) or not (target=0).
# this is our "label" column

In [153]:
# Feature Selection

In [154]:
# Features: every column except the "target" label column
X = heart_disease.drop(columns="target")

# Labels: the "target" column on its own
y = heart_disease.loc[:, "target"]

# Peek at the first rows of the feature frame
X.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [155]:
# Show the first few labels together with how many rows fall into each class
y.head(n=5), y.value_counts()

(0    1
 1    1
 2    1
 3    1
 4    1
 Name: target, dtype: int64,
 target
 1    165
 0    138
 Name: count, dtype: int64)

In [156]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=43,
)

# Shapes of the four resulting pieces
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((227, 13), (76, 13), (227,), (76,))

In [157]:
# Step 2: Choose the model and hyperparameters

In [158]:
from sklearn.ensemble import RandomForestClassifier

# Start from a random forest built with the library's default settings
clf = RandomForestClassifier()

# Inspect the hyperparameters the classifier was created with
clf.get_params(deep=True)

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [159]:
# Step 3: Fit the model with the data and use it to make predictions

In [160]:
# Train the classifier on the training split
clf.fit(X_train, y_train)

In [161]:
# First rows of the held-out features that the model will be asked to predict
X_test.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
242,64,1,0,145,212,0,0,132,0,2.0,1,2,1
130,54,0,2,160,201,0,1,163,0,0.0,2,1,2
208,49,1,2,120,188,0,1,139,0,2.0,1,3,3
160,56,1,1,120,240,0,1,169,0,0.0,0,0,2
124,39,0,2,94,199,0,1,179,0,0.0,2,0,2


In [162]:
# Predict a label for every row of the test split and display the result
y_preds = clf.predict(X_test)
y_preds

array([0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 1, 0, 0, 0, 0], dtype=int64)

In [163]:
# Step 4: Evaluate the model
# Accuracy is the fraction of correct predictions out of the total number of predictions.
# Scored here on the same rows the model was trained on.
train_acc = clf.score(X_train, y_train)
print(f" Accuracy(Training Dataset): {train_acc * 100}%")

 Accuracy(Training Dataset): 100.0%


In [164]:
# Same accuracy metric, this time on the held-out test split
test_acc = clf.score(X_test, y_test)
print(f"Accuracy(Testing Dataset): {test_acc * 100}%")

Accuracy(Testing Dataset): 85.52631578947368%


In [165]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# Per-class precision, recall and F1 for the test-set predictions
print(classification_report(y_true=y_test, y_pred=y_preds))

              precision    recall  f1-score   support

           0       0.93      0.75      0.83        36
           1       0.81      0.95      0.87        40

    accuracy                           0.86        76
   macro avg       0.87      0.85      0.85        76
weighted avg       0.87      0.86      0.85        76



In [166]:
# Tabulate true labels against predicted labels for the test split
conf_mat = confusion_matrix(y_true=y_test, y_pred=y_preds)
conf_mat

array([[27,  9],
       [ 2, 38]], dtype=int64)

In [167]:
# Accuracy computed directly from the predictions
# (for classifiers this is the same figure the score() method reports)
accuracy_score(y_true=y_test, y_pred=y_preds)

0.8552631578947368

In [168]:
# Sweep the number of trees, scoring each forest on the single test split
# (no cross-validation at this stage)
np.random.seed(42)
for i in range(100, 200, 10):
    print(f"Trying model with {i} estimators ...")
    model = RandomForestClassifier(n_estimators=i)
    model.fit(X_train, y_train)
    print(f"Model accuracy on test set: {model.score(X_test, y_test) * 100:.2f}%")
    print()

Trying model with 100 estimators ...
Model accuracy on test set: 86.84%

Trying model with 110 estimators ...
Model accuracy on test set: 85.53%

Trying model with 120 estimators ...
Model accuracy on test set: 85.53%

Trying model with 130 estimators ...
Model accuracy on test set: 84.21%

Trying model with 140 estimators ...
Model accuracy on test set: 85.53%

Trying model with 150 estimators ...
Model accuracy on test set: 84.21%

Trying model with 160 estimators ...
Model accuracy on test set: 84.21%

Trying model with 170 estimators ...
Model accuracy on test set: 84.21%

Trying model with 180 estimators ...
Model accuracy on test set: 85.53%

Trying model with 190 estimators ...
Model accuracy on test set: 84.21%



In [169]:
# Repeat the sweep, but alongside the single-split score also report the mean of
# sklearn.model_selection.cross_val_score over 5 different train/test splits.
from sklearn.model_selection import cross_val_score

np.random.seed(42)
for i in range(100, 200, 10):
    print(f"Trying model with {i} estimators...")
    model = RandomForestClassifier(n_estimators=i)
    model.fit(X_train, y_train)

    # Score on the one fixed train/test split
    model_score = model.score(X_test, y_test)
    print(f"Model accuracy on single test set split: {model_score * 100:.2f}%")

    # Average score over the 5 cross-validation folds of the full dataset
    cross_val_mean = cross_val_score(model, X, y, cv=5).mean()
    print(f"5-fold cross-validation score: {cross_val_mean * 100:.2f}%")

    print()

Trying model with 100 estimators...
Model accuracy on single test set split: 86.84%
5-fold cross-validation score: 82.15%

Trying model with 110 estimators...
Model accuracy on single test set split: 85.53%
5-fold cross-validation score: 81.17%

Trying model with 120 estimators...
Model accuracy on single test set split: 85.53%
5-fold cross-validation score: 83.16%

Trying model with 130 estimators...
Model accuracy on single test set split: 85.53%
5-fold cross-validation score: 83.14%

Trying model with 140 estimators...
Model accuracy on single test set split: 84.21%
5-fold cross-validation score: 82.48%

Trying model with 150 estimators...
Model accuracy on single test set split: 85.53%
5-fold cross-validation score: 80.17%

Trying model with 160 estimators...
Model accuracy on single test set split: 84.21%
5-fold cross-validation score: 80.83%

Trying model with 170 estimators...
Model accuracy on single test set split: 85.53%
5-fold cross-validation score: 81.83%

Trying model wit

In [170]:
# A high cross-validation score is usually a better indicator of 
# a quality model than a single split accuracy score. 
# But rather than set up and track the results of these experiments manually, 
# we can get Scikit-Learn to do the exploration for us.

In [171]:
# Scikit-Learn's sklearn.model_selection.GridSearchCV is a handy way 
# to search over a set of different hyperparameter values and 
# automatically track which performs the best.

In [172]:
# Let GridSearchCV run the same n_estimators sweep with 5-fold cross-validation
np.random.seed(42)
from sklearn.model_selection import GridSearchCV

# Candidate values for each hyperparameter, keyed by hyperparameter name
# (any hyperparameter of the target model can be listed here)
param_grid = {"n_estimators": list(range(100, 200, 10))}

# Configure the search
grid = GridSearchCV(
    estimator=RandomForestClassifier(),
    param_grid=param_grid,
    cv=5,
    verbose=1,
)

# Run the search on the full dataset
grid.fit(X, y)

# Report the winning setting and its cross-validated score
print(f"The best parameter values are: {grid.best_params_}")
print(f"With a score of: {grid.best_score_*100:.2f}%")

Fitting 5 folds for each of 10 candidates, totalling 50 fits
The best parameter values are: {'n_estimators': 120}
With a score of: 82.82%


In [173]:
# The best model found by the grid search is exposed through the
# best_estimator_ attribute; rebind `clf` to it.
clf = grid.best_estimator_

# Display the selected estimator as the cell output.
# (Python names are case-sensitive: `Clf` is undefined and raises NameError.)
clf

In [174]:
# And now that we've got the best cross-validated model, 
# we can fit and score it on our original single train/test split of the data.

In [175]:
# Refit the selected estimator on the training split only
clf.fit(X_train, y_train)

# Score it on the single held-out test split
# (this can come out lower than the cross-validation score because it reflects just one split)
print(f"Best model score on single split of the data: {clf.score(X_test, y_test)*100:.2f}%")

Best model score on single split of the data: 84.21%


In [177]:
# Step 6: Save a model
# When you've done a few experiments and you're happy with how your model is doing, 
# you'll likely want someone else to be able to use it.

# This may come in the form of a teammate or colleague trying to replicate and 
# validate your results or through a customer using your model as part of a service or application you offer.

# Saving a model also allows you to reuse it later without having to go through retraining it. 
# This is especially helpful when your training times start to increase.


In [178]:
# A Scikit-Learn model can be saved with Python's built-in pickle module.
import pickle

# Save an existing model to file. The `with` block guarantees the file handle is
# flushed and closed, even if pickling raises.
# NOTE(review): `model` is the last estimator fitted in the n_estimators loop,
# not the grid-searched `clf` — confirm this is the model meant to be shared.
with open("random_forest_model_1.pkl", "wb") as model_file:
    pickle.dump(model, model_file)

In [180]:
# Load the saved pickle model back from disk and evaluate it on the test split.
# NOTE: only unpickle files you trust — pickle.load can execute arbitrary code.
# The `with` block closes the file handle as soon as the model has been read.
with open("random_forest_model_1.pkl", "rb") as model_file:
    loaded_pickle_model = pickle.load(model_file)
print(f"Loaded pickle model prediction score: {loaded_pickle_model.score(X_test, y_test) * 100:.2f}%")

Loaded pickle model prediction score: 85.53%


In [181]:
# For larger models, Joblib may be the more efficient way to persist them.
from joblib import dump, load

# Write the model to disk with joblib
dump(value=model, filename="random_forest_model_1.joblib")

['random_forest_model_1.joblib']

In [182]:
# Read the joblib file back in and score the restored model on the test split
loaded_joblib_model = load(filename="random_forest_model_1.joblib")
print(f"Loaded joblib model prediction score: {loaded_joblib_model.score(X_test, y_test) * 100:.2f}%")

Loaded joblib model prediction score: 85.53%
