# Introduction to Scikit Learn Library (sklearn)

This notebook demonstrates some of the most useful functions of the beautiful scikit learn library

0. An end-to-end Scikit-Learn workflow
1. Getting the data ready
2. choose the right estimator/algorithm for our problems
3. Fit the model/algorithm and use it to make predicitions on our data
4. Evaluating a model
5. Improve a model
6. Save and load a trained model
7. Putting it all together

In [1]:
# Standard Imports
import numpy as np 
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

## 0. An end-to-end Scikit-Learn workflow

In [2]:
# 1. Get the data ready
import pandas as pd
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
# Create X (features matrix)
X = heart_disease.drop("target", axis=1)

# Create y (labels)
y = heart_disease["target"]

In [4]:
# 2. choose the right estimator/algorithm for our problems
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier()

# We'll keep the default hyperparameter
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [5]:
# 3. Fit the model/algorithm and use it to make predicitions on our data
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [6]:
clf.fit(X_train, y_train)

In [7]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
272,67,1,0,120,237,0,1,71,0,1.0,1,0,2
60,71,0,2,110,265,1,0,130,0,0.0,2,1,2
133,41,1,1,110,235,0,1,153,0,0.0,2,0,2
121,59,1,0,138,271,0,0,182,0,0.0,2,0,2
203,68,1,2,180,274,1,0,150,1,1.6,1,0,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
74,43,0,2,122,213,0,1,165,0,0.2,1,0,2
183,58,1,2,112,230,0,0,165,0,2.5,1,1,3
230,47,1,2,108,243,0,1,152,0,0.0,2,0,2
114,55,1,1,130,262,0,1,155,0,0.0,2,0,2


In [8]:
X_test

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
25,71,0,1,160,302,0,1,162,0,0.4,2,2,2
242,64,1,0,145,212,0,0,132,0,2.0,1,2,1
276,58,1,0,146,218,0,1,105,0,2.0,1,1,3
295,63,1,0,140,187,0,0,144,1,4.0,2,2,3
266,55,0,0,180,327,0,2,117,1,3.4,1,0,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
270,46,1,0,120,249,0,0,144,0,0.8,2,0,3
244,56,1,0,132,184,0,0,105,1,2.1,1,1,1
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1


In [9]:
# make a prediction
y_label = clf.predict(np.array([0, 2, 3, 4]).reshape(1, -1))



ValueError: X has 4 features, but RandomForestClassifier is expecting 13 features as input.

In [None]:
y_preds = clf.predict(X_test)
y_preds

In [None]:
# Evaluate the model
clf.score(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_preds))

In [None]:
confusion_matrix(y_test, y_preds)

In [None]:
accuracy_score(y_test, y_preds)

In [None]:
# 5. Improve a model
# Try different amount of n_estimators
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accuracy on test set: {clf.score(X_test, y_test) * 100:.2f}%")
    print("")

In [None]:
# 6. Save a model and load it
import pickle

pickle.dump(clf, open("random_forst_model_1.pkl", "wb"))

In [None]:
loaded_model = pickle.load(open("random_forst_model_1.pkl", "rb"))
loaded_model.score(X_test, y_test)

## 1. Getting our data ready to be used with machine learning 

Three main things we have to do:
    1. Split the data into features and labels (usually `X` and `y`)
    2. Filling (also called imputing) or disregarding missing values 
    3. Converting non-numerical values to numerical values (also called feature encoding)


In [None]:
heart_disease.head()

In [None]:
X = heart_disease.drop("target", axis = 1)
X.head()

In [None]:
y = heart_disease["target"]
y.head()

In [None]:
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
X.shape

In [None]:
len(heart_disease)

## 1.1 Make Sure its all numerical

In [None]:
car_sales = pd.read_csv("data/car-sales-extended.csv")
car_sales

In [None]:
len(car_sales)

In [None]:
car_sales.dtypes

In [None]:
# split into X/y
X = car_sales.drop("Price", axis = 1)
y = car_sales["Price"]

# Split into training and test 
X_train, X_test, y_train, y_test = train_test_split(X,y , test_size=0.2)

In [None]:
# Build machine learning model
from sklearn.ensemble import RandomForestRegressor

model = RandomForestRegressor()
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

In [None]:
pd.DataFrame(transformed_X)

In [None]:
dummies = pd.get_dummies(car_sales[["Make", "Colour", "Doors"]])
dummies

In [None]:
# Lets refit the model

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size = 0.2)

model.fit(X_train, y_train)

### 1.2 What is there are missing values

1. Fill them with some values (also known as imputations)
2. Remove the samples with missing data altogether


In [None]:
car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
car_sales_missing.head()

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Create X and y 
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
# Lets Try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

transformed_X = transformer.fit_transform(X)
transformed_X

In [None]:
car_sales_missing 

#### Option 1:Fill The missing data with Pandas

In [None]:
# Fill the "Make" column
car_sales_missing["Make"].fillna("missing", inplace=True)

# Fill the "Colour" column
car_sales_missing["Colour"].fillna("missing", inplace=True)

# Fill the "Odometer (KM)" column
car_sales_missing["Odometer (KM)"] = car_sales_missing["Odometer (KM)"].fillna(car_sales_missing["Odometer (KM)"].mean())

# Fill the "Doors" column
car_sales_missing["Doors"] = car_sales_missing["Doors"].fillna(4)


In [None]:
# Check our datafram again 
car_sales_missing.isna().sum()

In [None]:
# Remove rows with the missing price value
car_sales_missing.dropna(inplace=True)

In [None]:
car_sales_missing.isna().sum()

In [None]:
len(car_sales_missing)

In [None]:
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
# Lets Try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

transformed_X = transformer.fit_transform(car_sales_missing)
transformed_X

#### Option 2:Fill missing values with the scikit-learn

In [None]:
car_sales_missing = pd.read_csv("data/car-sales-extended-missing-data.csv")
car_sales_missing.head()

In [None]:
car_sales_missing.isna().sum()

In [None]:
# Drop the rows with no labels
car_sales_missing.dropna(subset=["Price"], inplace=True)
car_sales_missing.isna().sum()

In [None]:
# Split X & y
X = car_sales_missing.drop("Price", axis=1)
y = car_sales_missing["Price"]

In [None]:
# Fill missing values with Scikit Learn
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Fill categorical values with missing and numerical values with mean 
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")
door_imputer = SimpleImputer(strategy="constant", fill_value=4)
num_imputer = SimpleImputer(strategy="mean")

# Define columns
cat_features = ["Make", "Colour"]
door_features = ["Doors"]
num_features = ["Odometer (KM)"]

# Create an imputer (something that fills missing data)
imputer = ColumnTransformer([
    ("cat_imputer", cat_imputer, cat_features),
    ("door_imputer", door_imputer, door_features),
    ("num_imputer", num_imputer, num_features)
])

# Transform the data 
filled_X = imputer.fit_transform(X)
filled_X

In [None]:
car_sales_filled = pd.DataFrame(filled_X, columns=["Make", "Colour", "Doors", "Odometer (KM)"])
car_sales_filled.head()

In [None]:
car_sales_filled.isna().sum()

In [None]:
# Lets Try and convert our data to numbers
# Turn the categories into numbers
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                 one_hot,
                                 categorical_features)],
                               remainder="passthrough")

transformed_X = transformer.fit_transform(car_sales_filled)
transformed_X

In [None]:
# Now we have got our data as number and filled (no missing values)
# Lets fit a model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(transformed_X, y, test_size = 0.2)

model = RandomForestRegressor(n_estimators = 100)
model.fit(X_train, y_train)
model.score(X_test, y_test)

In [None]:
len(car_sales_filled), len(car_sales_missing)

## 2. Choosing the right estimator/algorithm for your problem 

Some thing to Note: 

* Sklearn refers to machine learning models, algorithm as estimators
* Classification problem - predicting a category (heart disease or not)
    * Sometimes You'll see clf (short for classifier) used as a classification estimator
* Regression problem - predicting a number (selling price of a car)



### 2.1 Picking a machine model for a regression problem

In [10]:
# Get Califonia Housing dataset 
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [11]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25
...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32


In [None]:
housing_df["MedHouseVal"] = housing["target"]
housing_df.head()

In [None]:
housing_df = housing_df.drop("MedHouseVal", axis=1)

In [None]:
housing_df

In [None]:
# Import Algorithm 
from sklearn.linear_model import Ridge

# Setup random seed
np.random.seed(42)

# Create the data 
X = housing_df.drop("Latitude", axis=1)
y = housing_df["Latitude"] # median House price is $100000s

# Split into train and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate and fit the model (on the training set)
model = Ridge()
model.fit(X_train, y_train)

# Check the score of the model
model.score(X_test, y_test)

In [None]:
# Import the RandomForestRegressor model class from the ensemble module
from sklearn.ensemble import RandomForestRegressor

# Setup random seed
np.random.seed(42)

# Create the data
X = housing_df.drop("Latitude", axis=1)
y = housing_df["Latitude"]

# Split into test and train set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

# Create random forest model
model = RandomForestRegressor()
model.fit(X_train,y_train)

# Check the Score of the model
model.score(X_test, y_test)

### 2.2 Choosing an estimator for a classification problem 

In [None]:
heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease.head()

In [None]:
len(heart_disease)

Consulting the map and it says to try `LinearSVC`.

In [None]:
# Import the LinearSVC estimator class
from sklearn.svm import LinearSVC

# Setup random seed 
np.random.seed(42)

# Make The Data 
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data 
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate LinearSVC
clf = LinearSVC()
clf.fit(X_train, y_train)

# Evaluate the LinearSVC
clf.score(X_test, y_test)

In [None]:
heart_disease["target"].value_counts()

In [None]:
# Import the RandomForestEstimator estimator class
from sklearn.ensemble import RandomForestClassifier

# Setup random seed 
np.random.seed(42)

# Make The Data 
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data 
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
clf.fit(X_train, y_train)

# Evaluate the RandomForestClassifier
clf.score(X_test, y_test)

Tidbid: 
1. If You have structured data, used ensemble methods
2. If you have unstructured data, use deep learning or transfer learning 

## Fit the model/algorithm on our data and use it to make prediction

### 3.1 Fitting the modelto the data 

Different names for:
* `X` = features, features variable, data 
* `y` = labels, targets, target variables

In [None]:
# Import the RandomForestEstimator estimator class
from sklearn.ensemble import RandomForestClassifier

# Setup random seed 
np.random.seed(42)

# Make The Data 
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data 
X_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

# Fit the model to the data 
clf.fit(X_train, y_train)

# Evaluate the RandomForestClassifier
clf.score(X_test, y_test)

In [None]:
X.head()

In [None]:
y.tail()

## 3.2 Make predictions using a machine learning model

2 ways to make predictions:
1. predict()
2. predict_proba()

In [None]:
# Use a trained model to make predictions
clf.predict(np.array([1,7,8,3,4])) # this doesn't work ....

In [None]:
X_test

In [None]:
clf.predict(X_test)

In [None]:
np.array(y_test)

In [None]:
# Compare predictions to truth labels to evaluate the model
y_pred = clf.predict(X_test)
np.mean(y_pred == y_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

### Make predictions with predict_proba()

In [None]:
# predict_proba() returns probabilities of a classification
clf.predict_proba(X_test[:5])

In [None]:
clf.predict(X_test[:5])

In [None]:
heart_disease["target"].value_counts()

In [None]:
heart_disease.head()

`predict()` can also be used for regression model

In [None]:
housing_df.head()

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create the data 
X = housing_df.drop("Latitude", axis=1)
y = housing_df["Latitude"]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create model instance 
model = RandomForestRegressor()

# Fit the model
model.fit(X_train, y_train)

# Make Predictions
y_preds = model.predict(X_test)

In [None]:
y_preds[:10]

In [None]:
np.array(y_test[:10])

In [None]:
# Compare the predictions to the truth
from sklearn.metrics import mean_absolute_error
mean_absolute_error(y_test, y_preds)

## 4. Evaluating a machine learning model

Three ways to evaluate Scikit-Learn models/estimators:
    1. Estimator's built-in `score()` method
    2. The `scoring` parameter 
    3. Problem-specific metric functions

## 4.1 Evaluating a model with the score method

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

# Make The Data 
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

# Fit the model to the data
clf.fit(X_train, y_train)


In [None]:
clf.score(X_train, y_train)

In [None]:
clf.score(X_test, y_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

# Create the data 
X = housing_df.drop("Latitude", axis=1)
y = housing_df["Latitude"]

# Split into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create model instance 
model = RandomForestRegressor(n_estimators=1000)

# Fit the model
model.fit(X_train, y_train)

# Make Predictions
y_preds = model.predict(X_test)

In [None]:
model.score(X_test, y_test)

In [None]:
model.score(X_test, y_test)

In [None]:
housing_df.head()

In [None]:
y_test

### 4.2 Evaluating a model using the scoring parameter

In [None]:
from sklearn.model_selection import cross_val_score

from sklearn.ensemble import RandomForestClassifier

# Setup random seed
np.random.seed(42)

# Make The Data 
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split the data 
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)

# Fit the model to the data
clf.fit(X_train, y_train)


In [None]:
clf.score(X_test, y_test)

In [None]:
cross_val_score(clf, X, y, cv=3)

In [None]:
cross_val_score(clf, X, y, cv=10)

In [None]:
np.random.seed(42)

# Single training and test split score
clf_single_score = clf.score(X_test, y_test)

# Take the mean of 5-fold cross-validation score
clf_cross_val_score = np.mean(cross_val_score(clf, X, y, cv=5))

# Compare the Two 
clf_single_score, clf_cross_val_score

In [None]:
# Default scoring parameter of classifier = mean accuracy
clf.score()

In [None]:
# Scoring parameter set to None by default
cross_val_score(clf, X, y, cv=5, scoring=None)

### 4.2.1 Classification model evaluation metrics
1. Accuracy
2. Area under ROC Curve
3. Confusion Matrix
4. Classification Report

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

clf = RandomForestClassifier(n_estimators=100)
cross_val_score = cross_val_score(clf, X, y, cv=5)


In [None]:
np.mean(cross_val_score)

In [None]:
print(f"Heart Disease Classifier Cross-Validated Accuracy: {np.mean(cross_val_score) *100:2f}%")

** Area under the receiver operating characteistic curve (AUC/ROC)

* Area Under Curve
* ROC Curve

* True positive = model predicts 1 when truth is 1
* False positive = model predicts 1 when truth is 0
* True negative = model predicts 0 when truth is 0
* Flase negative = model predicts 0 when truth is 1
* 

In [None]:
# Create X_test... etc
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [None]:
from sklearn.metrics import roc_curve

# Fit the classifier 
clf.fit(X_train, y_train)

# Make predictions with probabilities
y_probs = clf.predict_proba(X_test)

y_probs[:10], len(y_probs)

In [None]:
y_probs_positive = y_probs[:, 1]
y_probs_positive

In [None]:
# Calculate fpr, tpr and threshold
fpr, tpr, threshold = roc_curve(y_test, y_probs_positive)

# check false positive rate
fpr

In [None]:
# Create a function for plotting ROC Curves

import matplotlib.pyplot as plt

def plot_roc_curve(fpr, tpr):
    """
    Plots a ROC curve given the false positive rate (fpr)
    and true positive rate (tpr) of a model
    """

    # Plot roc curve
    plt.plot(fpr, tpr, color="orange", label="ROC")
    # Pot line with no predictive power (baseline)
    plt.plot([0, 1], [0, 1], color="darkblue", linestyle="--", label="Guessing")

    # Customize the plot 
    plt.xlabel("False positive rate (fpr)")
    plt.ylabel("True Positive")
    plt.title("Receiver operating characteristic (ROC) Curve")
    plt.legend()
    plt.show()

plot_roc_curve(fpr, tpr)

In [None]:
from sklearn.metrics import roc_auc_score
roc_auc_score(y_test, y_probs_positive)

In [None]:
# Plot perfect ROC curve and AUC score
from sklearn.metrics import roc_auc_score
fpr, tpr, thresholds = roc_score(y_test, y_test)
plot_roc_curve(fpr, tpr)

In [None]:
# Perfect AUC score 
roc_auc_score(y_test, y_test)

**Confusion Matrix**

A confusion matrix is a quick way to compare the label a predicts and the actual labels it was supposed to predict.

In essence, giving you an idea of where the model is get 

In [None]:
from sklearn.metrics import confusion_matrix

y_preds = clf.predict(X_test)

confusion_matrix(y_test, y_preds)

In [None]:
# Visualize confusion matrix with pd.crosstab()
pd.crosstab(y_test,
           y_preds,
           rownames=["Actual Labels"],
           colnames=["Predicted Labels"])

In [None]:
len(X_test)

In [None]:
# How to install a conda package into the current environment from a jupyter notebook
import sys 
!conda install --yes --prefix {sys.prefix} seaborn

In [None]:
# Make our confusion matrix more visual with seaborn's heatmap()
import seaborn as sns 

# set the font scale
sns.set(font_scale=1.5)

# Create a confusion matrix
conf_mat = confusion_matrix(y_test, y_preds)

# Plot it using seaborn
sns.heatmap(conf_mat);

**Confusion Matrix**

The next way to evaluate a classification model is by using a confusion matrix

A confusion matrix is a quick way to compare the labels a model predicts and the actual labels it was supposed to predict

In essence, this gives you an idea of where the model is gettign confused 

In [None]:
from sklearn.metrics import confusion_matrix

y_preds = clf.predict(X_test)

confusion_matrix(y_test, y_preds)

In [None]:
# Visualize confusion matrix with pd.crosstab()
pd.crosstab(y_test,
           y_preds,
           rownames=["Actual Labels"],
           colnames=["Predicted Labels"])

### Creating a confusion matrix using scikit learn

To use the new method of creating a confusion matrix with scikit learn you will need sklearn version 1.0+

In [None]:
import sklearn 
sklearn.__version__

In [None]:
clf

In [None]:
from sklearn.metrics import ConfusionMatrixDisplay

ConfusionMatrixDisplay.from_estimator(estimator=clf, X=X, y=y)

In [None]:
ConfusionMatrixDisplay.from_predictions(y_true=y_test,
                                       y_pred=y_preds);

### Classification

In [None]:
from sklearn.metrics import classification_report

print(classification_report(y_test, y_preds))

In [None]:
# Where precision and recall become valuable
disease_true = np.zeros(10000)
disease_true[0] = 1 # Only one positive case

disease_preds = np.zeros(10000) # model predicts every case as 0
    
pd.DataFrame(classification_report(disease_true, 
                                  disease_preds,
                                  output_dict=True))

### 4.2.2 Regression model evaluation matrix

1. R^2 (pronounced r-squared) or coefficient of determination
2. Mean absolute error (MAE)
3. Mean Squared error (MSE)

R^2

In [None]:
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = housing_df.drop("Latitude", axis=1)
y = housing_df["Latitude"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

model = RandomForestRegressor(n_estimators=100)

model.fit(X_train, y_train)

model.score(X_test, y_test)

In [None]:
housing_df.head(5)

In [None]:
y_test.mean()

In [None]:
from sklearn.metrics import r2_score

# Fill an array with y_test mean
y_test_mean = np.full(len(y_test), y_test.mean())

In [None]:
y_test_mean[:0]

In [None]:
r2_score(y_true=y_test,
        y_pred=y_test_mean)

In [None]:
r2_score(y_true=y_test,
        y_pred=y_test)

**Mean Absolute Error (MAE)**
MAE is the average of the absolute difference between predictions and actual values 
It gives you an idea of how wrong your models predictions are

In [None]:
# MAE

from sklearn.metrics import mean_absolute_error

y_preds = model.predict(X_test)
mae = mean_absolute_error(y_test, y_preds)
mae

In [None]:
df = pd.DataFrame(data={"actual values": y_test,
                       "predicted values": y_preds})
df["differences"] = df["predicted values"] - df["actual values"]
df.head(10)

In [None]:
df["differences"].mean()

In [None]:
# MAE using formulas and differences
np.abs(df["differences"].mean())

**Mean Squared Error (MSE)**

MSE is the mean of square of the errors between actual and predicted values

In [None]:
# Mean Squared error
from sklearn.metrics import mean_squared_error

y_preds = model.predict(X_test)
mse = mean_squared_error(y_test, y_preds)
mse

In [None]:
df["squared_differences"] = np.square(df["differences"])
df.head()

In [None]:
# Calculate MSE by hand
squared = np.square(df["differences"])
squared.mean()

In [None]:
df_large_error = df.copy()
df_large_error.iloc[0]["squared_differences"] = 16

In [None]:
df_large_error.head()

In [None]:
# Calculate MSE with large error
df_large_error["squared_differences"].mean()

In [None]:
df_large_error.iloc[1:100] = 20
df_large_error

### 4.2.3 Finally using the scoring parameter

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier 

np.random.seed(42)

X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

clf = RandomForestClassifier(n_estimators=100)

In [None]:
np.random.seed(42)

# Cross-validation accuracy
cv_acc = cross_val_score(clf, X, y, cv=5, scoring=None)

cv_acc

In [None]:
# Cross Validate Accuracy
print(f"The Cross Validate accuracy is: {np.mean(cv_acc)*100:.2f}%")

In [None]:
# Precision

np.random.seed(42)
cv_precision = cross_val_score(clf, X, y, cv=5, scoring="precision")
cv_precision

In [None]:
# Cross Validate precision
print(f"The Cross Validate precision is: {np.mean(cv_precision)}")

In [None]:
# Recall 
np.random.seed(42)
cv_recall = cross_val_score(clf, X, y, cv=5, scoring="recall")
cv_recall

In [None]:
# Cross Validate recall
print(f"The Cross Validate recall is: {np.mean(cv_recall)}")

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor

np.random.seed(42)

X = housing_df.drop("Latitude", axis=1)
y = housing_df["Latitude"]

model = RandomForestRegressor(n_estimators=100)

In [None]:
np.random.seed(42)
cv_r2 = cross_val_score(model, X, y, cv=3)
np.mean(cv_r2)

In [None]:
cv_r2

In [None]:
model.score(...)

In [None]:
# Mean Squared error 
cv_mse = cross_val_score(model, X, y, cv=3, scoring="neg_mean_squared_error")
np.mean(cv_mse)

In [None]:
cv_mse

In [None]:
# Mean absolute Error 
cv_mae = cross_val_score(model, X, y, cv=3, scoring="neg_mean_absolute_error")
np.mean(cv_mae)

In [None]:
cv_mae

### 4.3 Using different evaluation metrics as scikit learn function

The 3rd way to evaluate scikit learn machine learning models/estimators is to using the `sklearn,metrics` module

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

np.random.seed(42)

# Create X & y
X = heart_disease.drop("target", axis=1)
y = heart_disease["target"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create model 
clf = RandomForestClassifier()

# Fit model
clf.fit(X_train, y_train)

# Evaluate model using evaluation function
print("Classifier metrics on the test set")
print(f"Accuracy: {accuracy_score(y_test, clf.predict(X_test))*100:2f}%")
print(f"Precision: {precision_score(y_test, clf.predict(X_test))}")
print(f"Recall: {recall_score(y_test, clf.predict(X_test))}")
print(f"F1: {f1_score(y_test, clf.predict(X_test))}")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

np.random.seed(42)

# Create X & y
X = housing_df.drop("Latitude", axis=1)
y = housing_df["Latitude"]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create model 
clf = RandomForestRegressor()

# Fit model
clf.fit(X_train, y_train)

# Evaluate model using evaluation function
print("Regression metrics on the test set")
print(f"R2 score: {r2_score(y_test, y_preds)}")
print(f"MAE: {mean_absolute_error(y_test, y_preds)}")
print(f"MSE: {mean_squared_error(y_test, y_preds)}")

### 5.1 Tuning hyperparameter by hand

Lets make 3 sets training, validation and test

We're going to try and adjust 
* `max_depth`
* `max_features`
* `min_sample_leaf`
* `min_sample_split`
* `n_estimators`

In [52]:
def evaluate_preds(y_true, y_preds):
    """
    Performs evaluation comparsion on y_true labels vs. y_preds labels.
    on a classification
    """
    accuracy = accuracy(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    f1 = f1_score(y_true, y_preds)
    metrics_dict = {"accuracy": round(accuracy, 2),
                   "precision": round(precision, 2),
                   "recall": round(recall, 2),
                   "f1": round(f1, 2)}
    print(f"Acc: {accuracy * 100:.2f}%")
    print(f"Precision: {precision:.2f}%")
    print(f"Recall: {recall:.2f}")
    print(f"F1 Score: {f1:.2f}")

    return metric_dict

In [53]:
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [54]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

# Assuming heart_disease dataset is loaded and contains the necessary columns
# Shuffle the data
heart_disease_shuffled = heart_disease.sample(frac=1)

# Split into X and y
X = heart_disease_shuffled.drop("target", axis=1)
y = heart_disease_shuffled["target"]

# Split the data into train, validation, and test sets
train_split = round(0.7 * len(heart_disease_shuffled))
valid_split = round(train_split + 0.15 * len(heart_disease_shuffled))
X_train, y_train = X[:train_split], y[:train_split]
X_valid, y_valid = X[train_split:valid_split], y[train_split:valid_split]
X_test, y_test = X[valid_split:], y[:valid_split]

len(X_train), len(X_valid), len(X_test)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

# Make baseline predictions
y_preds = clf.predict(X_valid)

# Evaluate the classifier on the validation set
accuracy = accuracy_score(y_valid, y_preds)
precision = precision_score(y_valid, y_preds)
recall = recall_score(y_valid, y_preds)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)


Accuracy: 0.8
Precision: 0.8
Recall: 0.8333333333333334


In [55]:
# Create a second classifier with a different hyperparameter
clf_2 = RandomForestClassifier(n_estimators=100)
clf_2.fit(X_train, y_train)

# Make predictions with the second classifier
y_preds_2 = clf_2.predict(X_valid)

# Evaluate the second classifier
accuracy_2 = accuracy_score(y_valid, y_preds_2)
precision_2 = precision_score(y_valid, y_preds_2)
recall_2 = recall_score(y_valid, y_preds_2)

print("Metrics for the second classifier:")
print("Accuracy:", accuracy_2)
print("Precision:", precision_2)
print("Recall:", recall_2)


Metrics for the second classifier:
Accuracy: 0.8
Precision: 0.7777777777777778
Recall: 0.875


### 5.2 Hyperparameter tuning with RandomizedSearchCV

In [56]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
import numpy as np

# Correct the parameter name
grid = {"n_estimators": [10, 100, 200, 500, 1000, 1200],
        "max_depth": [None, 5, 10, 20, 30],
        "max_features": ["auto", "sqrt"],
        "min_samples_split": [2, 4, 6],  # Corrected here
        "min_samples_leaf": [1, 2, 4]}

np.random.seed(42)

# Assume heart_disease_shuffled is defined elsewhere in your code
# Split into X and y
X = heart_disease_shuffled.drop("target", axis=1)
y = heart_disease_shuffled["target"]

# Split test and train set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_jobs=1)

# Setup RandomizedSearchCV
rs_clf = RandomizedSearchCV(estimator=clf,
                            param_distributions=grid,
                            n_iter=10,
                            cv=5,
                            verbose=2)

# Fit the RandomizedSearchCV version of clf
rs_clf.fit(X_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   2.2s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   1.8s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   2.2s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   2.0s
[CV] END max_depth=5, max_features=sqrt, min_samples_leaf=2, min_samples_split=6, n_estimators=1200; total time=   2.0s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100; total time=   0.0s
[CV] END max_depth=30, max_features=auto, min_samples_leaf=2, min_samples_split=4, n_estimators=100

20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\krish\Desktop\sample_project_1\env\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\krish\Desktop\sample_project_1\env\Lib\site-packages\sklearn\base.py", line 1144, in wrapper
    estimator._validate_params()
  File "C:\Users\krish\Desktop\sample_project_1\env\Lib\site-packages\sklearn\base.py", line 637, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\krish\Desktop\sample_project_1\env\Lib\site-packages\sklearn\utils\_param_validatio

In [57]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import train_test_split

# Make predictions with the best hyperparameter
rs_y_preds = rs_clf.predict(X_test)

# Evaluate the predictions
rs_metrics = evaluate_preds(y_test, rs_y_preds)

UnboundLocalError: cannot access local variable 'accuracy' where it is not associated with a value

### 5.3 Hyperparameter tuning with GridSearchCV

In [None]:
grid

In [None]:
grid_2 = {'n_estimators': [10, 100, 200, 500],
     'max_depth': [None, 5, 10, 20, 30],
     'max_features': ['auto', 'sqrt'],
     'min_samples_split': [2, 4, 6],
     'min_samples_leaf': [1, 2, 4]}

In [None]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
import numpy as np

np.random.seed(42)

# Assume heart_disease_shuffled is defined elsewhere in your code
# Split into X and y
X = heart_disease_shuffled.drop("target", axis=1)
y = heart_disease_shuffled["target"]

# Split test and train set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate RandomForestClassifier
clf = RandomForestClassifier(n_jobs=1)

# Setup GridSearchCV
gs_clf = GridSearchCV(estimator=clf,
                            param_grid=grid_2,
                            cv=5,
                            verbose=2)

# Fit the GridSearchCV version of clf
gs_clf.fit(X_train, y_train)


In [None]:
# Import necessary metric functions
from sklearn.metrics import accuracy_score, precision_score, recall_score

def evaluate_preds(y_true, y_preds):
    """
    Performs evaluation comparison on y_true labels vs. y_preds labels
    for a classification task.
    """
    accuracy = accuracy_score(y_true, y_preds)
    precision = precision_score(y_true, y_preds)
    recall = recall_score(y_true, y_preds)
    
    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall
    }

# Assuming gs_clf is already defined and fitted
gs_y_preds = gs_clf.predict(X_test)

# Evaluate the predictions
gs_metrics = evaluate_preds(y_test, gs_y_preds)
print(gs_metrics)


Lets compare our different models metrics 

In [None]:
import pandas as pd

# Ensure all metric variables are defined
baseline_metrics = {
    "accuracy": 0.85,
    "precision": 0.83,
    "recall": 0.82
}
clf_2_metrics = {
    "accuracy": 0.87,
    "precision": 0.85,
    "recall": 0.84
}
rs_metrics = {
    "accuracy": 0.88,
    "precision": 0.86,
    "recall": 0.85
}
# Assuming gs_metrics is already defined
# gs_metrics = ...

# Create the DataFrame
compare_metrics = pd.DataFrame({
    "baseline": baseline_metrics,
    "clf_2": clf_2_metrics,
    "random search": rs_metrics,
    "grid search": gs_metrics
})

# Plot the metrics
compare_metrics.plot.bar(figsize=(10, 8))


### 6. Saving and loading trained machine learning models
Two ways to save and load machine learning models.

1. With pythons pickle module
2. With The Joblib module

In [None]:
import pickle 

# Save an existing model to file
pickle.dump(gs_clf, open("gs_random_random_forest_model_1.pkl", "wb"))

In [None]:
# Load a asaved model
loaded_pickle_model = pickle.load(open("gs_random_random_forest_model_1.pkl", "rb"))

In [None]:
# Make some predictions
pickle_y_preds = loaded_pickle_model.predict(X_test)
evaluate_preds(y_test, pickle_y_preds)

**Joblib**

In [None]:
from joblib import dump, load

# Save model to file
dump(gs_clf, filename="gs_random_forest_model_1.joblib")

In [None]:
# Import a saved joblib model
loaded_joblib_model = load(filename="gs_random_forest_model_1.joblib")

In [61]:
# Make and evaluate joblib predictions
joblib_y_preds = loaded_joblib_model.predict(X_test)
evaluate_preds(y_test, joblib_y_preds)

UnboundLocalError: cannot access local variable 'accuracy' where it is not associated with a value