# Introduction to Scikit-learn (sklearn)

Common workflow steps:

1. get the data ready
2. choose an algorithm for our problem
3. fit the algorithm and make predictions on our data
4. evaluate a model
5. improve the model
6. save and load a trained model

In [1]:
import numpy as np
import pandas as pd

## 1. Get the data ready

In [6]:
# Load the heart-disease dataset; the path is relative to the notebook's
# working directory. The bare name on the last line displays the frame.
heart_disease = pd.read_csv('_data/heart-disease.csv')
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [7]:
# Feature matrix: every column except the label.
x = heart_disease.drop(columns=["target"])
# Label vector: the "target" column on its own.
y = heart_disease["target"]

## 2. Choose the algorithm for our problem

In [9]:
from sklearn.ensemble import RandomForestClassifier

In [10]:
# Create a random forest classifier with its default hyperparameters,
# then list those hyperparameters as a dict.
classifier = RandomForestClassifier()
classifier.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

## 3. Fit the algorithm and make a prediction

In [11]:
from sklearn.model_selection import train_test_split

In [12]:
# Hold out 20% of the rows as a test set; the remaining 80% are used for training.
# NOTE(review): no random_state is passed, so the split differs between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [13]:
# Train the classifier on the training split only.
classifier.fit(x_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [18]:
# Predict a label for every row of the held-out test features.
y_preds = classifier.predict(x_test)
y_preds

array([0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0])

In [19]:
# True labels of the test rows, shown for comparison with y_preds.
y_test

200    0
33     1
16     1
300    0
67     1
      ..
130    1
65     1
14     1
186    0
249    0
Name: target, Length: 61, dtype: int64

## 4. Evaluate the model

In [24]:
# Mean accuracy on the data the model was trained on.
classifier.score(x_train, y_train)

1.0

In [25]:
# Mean accuracy on the held-out test data, which the model did not see while fitting.
classifier.score(x_test, y_test)

0.8688524590163934

In [26]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [28]:
# Per-class precision, recall and F1; print() is needed because the report
# is a pre-formatted multi-line string.
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.88      0.81      0.85        27
           1       0.86      0.91      0.89        34

    accuracy                           0.87        61
   macro avg       0.87      0.86      0.87        61
weighted avg       0.87      0.87      0.87        61



In [29]:
# Confusion matrix: rows are the true classes, columns are the predicted classes.
confusion_matrix(y_test, y_preds)

array([[22,  5],
       [ 3, 31]])

In [30]:
# Fraction of test labels predicted correctly.
accuracy_score(y_test, y_preds)

0.8688524590163934

## 5. Improve the model

In [34]:
# Try forests of 10, 20, ..., 90 trees: refit from scratch each time and
# report accuracy on the test split.
# Note: `classifier` is rebound on every pass, so after the loop it refers to
# the last model trained (90 trees).
for i in range(10, 100, 10):
    print(f"Testing model with {i} estimators...")
    classifier = RandomForestClassifier(n_estimators=i).fit(x_train, y_train)
    accuracy_pct = classifier.score(x_test, y_test) * 100
    print(f"Model accuracy on test set: {accuracy_pct:.2f}%")
    print("")

Testing model with 10 estimators...
Model accuracy on test set: 80.33%

Testing model with 20 estimators...
Model accuracy on test set: 83.61%

Testing model with 30 estimators...
Model accuracy on test set: 81.97%

Testing model with 40 estimators...
Model accuracy on test set: 85.25%

Testing model with 50 estimators...
Model accuracy on test set: 83.61%

Testing model with 60 estimators...
Model accuracy on test set: 86.89%

Testing model with 70 estimators...
Model accuracy on test set: 86.89%

Testing model with 80 estimators...
Model accuracy on test set: 85.25%

Testing model with 90 estimators...
Model accuracy on test set: 86.89%



## 6. Save and Load trained model

In [36]:
import pickle

In [37]:
# Serialize the trained classifier to disk. The context manager guarantees
# the file handle is flushed and closed even if pickling raises.
with open("_data/exported_random_forest_model.pk1", "wb") as model_file:
    pickle.dump(classifier, model_file)

In [38]:
# Restore the classifier from disk and check that it scores on the test split.
# NOTE: only unpickle files you created yourself -- pickle.load can execute
# arbitrary code embedded in the file.
with open("_data/exported_random_forest_model.pk1", "rb") as model_file:
    loaded_model = pickle.load(model_file)
loaded_model.score(x_test, y_test)

0.8688524590163934

## Split our data into training and test

In order to use our data for machine learning, we must first clean it up and get rid of the noise. This means rows with missing fields are either removed or filled in (for example, with an average value) so that the data is useful and consistent. Sometimes data is stored in a type that isn't easy to process, or in a form that isn't useful, such as kilometres when we want metres.

In [2]:
# Reload the heart-disease dataset and preview its first five rows.
heart_disease = pd.read_csv('_data/heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
# Feature matrix: every column except the label; preview the first five rows.
x = heart_disease.drop(columns=["target"])
x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [4]:
# Label vector: the "target" column on its own; preview the first five values.
y = heart_disease["target"]
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; the remaining 80% are used for training.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

In [7]:
# Sanity check: shapes of the four pieces produced by the split.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((242, 13), (61, 13), (242,), (61,))

## Convert Data into numbers

In [8]:
# Load the car-sales dataset and preview its first five rows.
car_sales = pd.read_csv('_data/car-sales.csv')
car_sales.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Toyota,White,150043,4,"$4,000.00"
1,Honda,Red,87899,4,"$5,000.00"
2,Toyota,Blue,32549,3,"$7,000.00"
3,BMW,Black,11179,5,"$22,000.00"
4,Nissan,White,213095,4,"$3,500.00"


In [18]:
# Features: every column except the sale price.
x = car_sales.drop("Price", axis=1)
# Target: strip the "$" and "," from strings such as "$4,000.00", then convert
# to whole numbers. The pattern is a raw string so that "\$" reaches the regex
# engine as an escaped dollar sign instead of being an invalid Python string
# escape (which triggers a warning on recent Python versions).
y = car_sales["Price"].replace(r'[\$,]', '', regex=True).astype(float).astype(int)
x.shape, y.shape

((10, 4), (10,))

In [24]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

# One-hot encode the categorical columns. "Doors" is numeric in the data but
# is listed here so that it is encoded as a category too.
categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer(
    transformers=[("one_hot", one_hot, categorical_features)],
    remainder="passthrough",  # columns not listed above pass through unchanged
)
transformed_x = transformer.fit_transform(x)
transformed_x

array([[0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 1.50043e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 8.78990e+04],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 3.25490e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 1.00000e+00, 1.11790e+04],
       [0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00, 0.00000e+00,
        0.00000e+00, 0.00000e+00, 0.00000e+00, 1.00000e+00, 0.00000e+00,
        1.00000e+00, 0.00000e+00, 2.13095e+05],
       [0.00000e+00, 0.00000e+

In [14]:
# Show the encoded features as a DataFrame next to the first rows of the
# original features. The cell's value is a tuple of the two objects.
pd.DataFrame(transformed_x), x.head()

(     0    1    2    3    4    5    6    7    8    9   10   11        12
 0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  150043.0
 1  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0   87899.0
 2  0.0  0.0  0.0  1.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0  0.0   32549.0
 3  1.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0   11179.0
 4  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0  213095.0
 5  0.0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  0.0   99213.0
 6  0.0  1.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0   45698.0
 7  0.0  1.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0   54738.0
 8  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0   60000.0
 9  0.0  0.0  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  1.0  0.0   31600.0,
      Make Colour  Odometer (KM)  Doors
 0  Toyota  White         150043      4
 1   Honda    Red          87899      4
 2  Toyota   Blue          32549      3
 3     BMW  Black          11179    

In [25]:
# Split again, this time using the one-hot-encoded features.
# random_state pins the shuffle so the same rows land in the test set on every
# run; without it the split (and therefore the score computed afterwards)
# changes each time the cell is executed.
x_train, x_test, y_train, y_test = train_test_split(
    transformed_x, y, test_size=0.2, random_state=42
)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((8, 13), (2, 13), (8,), (2,))

In [26]:
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global random generator before the forest is built.
np.random.seed(42)
# Fit a default random forest regressor on the training split ...
model = RandomForestRegressor().fit(x_train, y_train)
# ... and score it on the held-out rows.
model.score(x_test, y_test)

0.4054101261926747