# Intro to Scikit Learn (sklearn)
from pickletools import read_bytes1


This notebook shows some of the most useful functions for sklearn

In this section:

0. sklearn workflow
1. getting data ready
2. Choose the right estimator/algorithm for problem
3. Fit the model and use it to make predictions
4. Evaluating a model
5. Improve a model
6. Save and load a trained model
7. Putting it all together

## 0. sklearn workflow

### 1. Get data ready

In [1]:
import pandas as pd
import numpy as np
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [2]:
#Create X (features matrix)
X = heart_disease.drop("target",axis=1)

#Create Y (labels)
y = heart_disease["target"]

### 2. Choose the right model and hyperparameters

In [3]:
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

#Well keep the default hyperparameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

### 3. Fit the model to the data

In [4]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [5]:
clf.fit(X_train,y_train);
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
142,42,0,2,120,209,0,1,173,0,0.0,1,0,2
95,53,1,0,142,226,0,0,111,1,0.0,2,0,3
265,66,1,0,112,212,0,0,132,1,0.1,2,1,2
97,52,1,0,108,233,1,1,147,0,0.1,2,3,3
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
259,38,1,3,120,231,0,1,182,1,3.8,1,0,3
210,57,1,2,128,229,0,0,150,0,0.4,1,1,3
15,50,0,2,120,219,0,1,158,0,1.6,1,0,2
207,60,0,0,150,258,0,0,157,0,2.6,1,2,3


In [6]:
# make a prediction
y_preds = clf.predict(X_test)
y_preds

array([0, 1, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0,
       1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1], dtype=int64)

In [7]:
y_test

31     1
33     1
86     1
218    0
188    0
      ..
194    0
13     1
269    0
143    1
283    0
Name: target, Length: 61, dtype: int64

### 4. evaluate teh model on the training data and test data

In [8]:
clf.score(X_train,y_train)

1.0

In [9]:
clf.score(X_test,y_test)

0.8360655737704918

In [10]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
print(classification_report(y_test,y_preds))

              precision    recall  f1-score   support

           0       0.83      0.77      0.80        26
           1       0.84      0.89      0.86        35

    accuracy                           0.84        61
   macro avg       0.84      0.83      0.83        61
weighted avg       0.84      0.84      0.84        61



In [11]:
#Confusion matrix: how good was the prediction (n x n matrix where n is the number of number of classifications 
            #[[True Positive, False Negative],
            # [False Postive, True Negative]]
confusion_matrix(y_test,y_preds)

array([[20,  6],
       [ 4, 31]], dtype=int64)

In [12]:
# Accuracy Score
accuracy_score(y_test, y_preds)

0.8360655737704918

### 5. Improve a model

In [18]:
# Try different amount of n_estimators
np.random.seed(1492)
best_score = [0,0] #[n_esitmators, score]
for i in range(10,100,10):
    print(f"Trying to model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    score = clf.score(X_test, y_test) * 100
    if score > best_score[1]:
        best_score[0] = i
        best_score[1] = score
    print(f"Model accuracy on test set: {score:.2f}%\nBest score is {best_score[1]:.2f} with {best_score[0]} estimators\n")

Trying to model with 10 estimators...
Model accuracy on test set: 88.52%
Best score is 88.52 with 10 estimators

Trying to model with 20 estimators...
Model accuracy on test set: 73.77%
Best score is 88.52 with 10 estimators

Trying to model with 30 estimators...
Model accuracy on test set: 77.05%
Best score is 88.52 with 10 estimators

Trying to model with 40 estimators...
Model accuracy on test set: 78.69%
Best score is 88.52 with 10 estimators

Trying to model with 50 estimators...
Model accuracy on test set: 80.33%
Best score is 88.52 with 10 estimators

Trying to model with 60 estimators...
Model accuracy on test set: 81.97%
Best score is 88.52 with 10 estimators

Trying to model with 70 estimators...
Model accuracy on test set: 73.77%
Best score is 88.52 with 10 estimators

Trying to model with 80 estimators...
Model accuracy on test set: 85.25%
Best score is 88.52 with 10 estimators

Trying to model with 90 estimators...
Model accuracy on test set: 83.61%
Best score is 88.52 wit

### 6. Save a model and load it

In [20]:
import pickle
pickle.dump(clf, open("random_forest_model_0.pkl","wb"))

In [21]:
loaded_model = pickle.load(open("random_forest_model_0.pkl","rb"))
loaded_model.score(X_test,y_test)

0.8360655737704918

In [22]:
# IT WORKS!