# Introduction to scikit-learn

## end-to-end scikit-learn workflow

In [38]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('seaborn')

In [4]:
heart_disease = pd.read_csv('../datasets/heart-disease.csv')
heart_disease.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [9]:
# create features matrix
X = heart_disease.iloc[:, :-1]
X

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3


In [11]:
y = heart_disease.iloc[:, -1]

In [29]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

classifier = RandomForestClassifier(n_estimators=300)
classifier.fit(X_train, y_train)
accuracy = classifier.score(X_test, y_test)
print(f'accuracy: {accuracy}')

cv_accuracy = cross_val_score(estimator=classifier, X=X_test, y=y_test, cv=10, n_jobs=-1)
print(f'cross validation accuracy: {cv_accuracy.mean()}')

cm = confusion_matrix(y_test, classifier.predict(X_test))
print(cm)

accuracy: 0.8360655737704918
cross validation accuracy: 0.8214285714285715
[[25  5]
 [ 5 26]]


In [33]:
print(classification_report(y_test, classifier.predict(X_test)))

              precision    recall  f1-score   support

           0       0.83      0.83      0.83        30
           1       0.84      0.84      0.84        31

    accuracy                           0.84        61
   macro avg       0.84      0.84      0.84        61
weighted avg       0.84      0.84      0.84        61



In [37]:
from sklearn.model_selection import GridSearchCV

param_grid = [
    {
        'n_estimators': range(10, 1000, 10)
    }
]

grid_search = GridSearchCV(estimator=classifier, 
                           param_grid=param_grid, 
                           n_jobs=-1, 
                           scoring='accuracy',
                           cv=10)

grid_search.fit(X_train, y_train)

GridSearchCV(cv=10, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=300, n_jobs=None,
                                              oob_score=False,
                                              rand

In [36]:
best_accuracy = grid_search.best_score_
best_parameters = grid_search.best_params_

print(f'best accuracy: {best_accuracy}')
print(best_parameters)

best accuracy: 0.8301666666666666
{'n_estimators': 80}


## getting data ready
* split data into features and labels
* filling (imputing) or disregarding missing values
* converting categorical values to numerical values

In [40]:
heart_disease = pd.read_csv('../datasets/heart-disease.csv')
heart_disease.head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [42]:
X = heart_disease.drop('target', axis=1)
X. head(5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [47]:
y = heart_disease['target']

In [53]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

X.shape, X_train.shape, X_test.shape

((303, 13), (242, 13), (61, 13))

## making sure data is numerical

In [94]:
car_sales = pd.read_csv('../datasets/car-sales-extended.csv')
car_sales.head(5)

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431,4,15323
1,BMW,Blue,192714,5,19943
2,Honda,White,84714,4,28343
3,Toyota,White,154365,4,13434
4,Nissan,Blue,181577,3,14043


In [57]:
len(car_sales)

1000

In [58]:
car_sales.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
Price             int64
dtype: object

In [93]:
X = car_sales.drop('Price', axis=1)
y = car_sales['Price']
X.head(2)

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5


In [67]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ['Make', 'Colour', 'Doors']
transformer = ColumnTransformer(
    [('one_hot', OneHotEncoder(), categorical_features)], 
    remainder='passthrough'
)

transformed_x = transformer.fit_transform(X)
transformed_x

array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [71]:
pd.DataFrame(transformed_x).head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0


In [101]:
car_sales['Doors'] = car_sales['Doors'].astype(str)
with_dummy_variables = pd.get_dummies(car_sales).drop('Price', axis=1)
cleansed = with_dummy_variables.drop(['Make_BMW', 'Colour_Black', 'Doors_3'], axis=1)
cleansed

Unnamed: 0,Odometer (KM),Make_Honda,Make_Nissan,Make_Toyota,Colour_Blue,Colour_Green,Colour_Red,Colour_White,Doors_4,Doors_5
0,35431,1,0,0,0,0,0,1,1,0
1,192714,0,0,0,1,0,0,0,0,1
2,84714,1,0,0,0,0,0,1,1,0
3,154365,0,0,1,0,0,0,1,1,0
4,181577,0,1,0,1,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...
995,35820,0,0,1,0,0,0,0,1,0
996,155144,0,1,0,0,0,0,1,0,0
997,66604,0,1,0,1,0,0,0,1,0
998,215883,1,0,0,0,0,0,1,1,0


In [106]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

X_train, X_test, y_train, y_test = train_test_split(cleansed, y, test_size=0.2)

classifier = RandomForestRegressor()
classifier.fit(X_train, y_train)
print(classifier.score(X_test, y_test))

print(mean_squared_error(y_test, classifier.predict(X_test)))

0.20714057133089814
52756571.052206494
