# A quick Machine Learning Modelling Turtorial with Python and Scikit-Learn

In [127]:
#what is Scikit-Learn also known as sklearn, is an open-source python ML library
#it is built on NumPy, and Matplotlib, both Python libraries

# A Scikit-Learn Workflow

In [128]:
# 1. Get data ready
# 2. Pick a model (to suit the problem)
# 3. Fit the model to the data and make a prediction
# 4. Evaluate the model
# 5. Improve through experimentation
# 6. Save and reload your trained model
# 7. Putting it all together in a pipeline

In [129]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn


# Random Forest Classifier Workflow for Classifying Heart Disease

# 1. Get the data ready

In [256]:
#lets use an example dataset 
heart_disease = pd.read_csv('heart-disease.csv')
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [131]:
# each row and column are diff patient characteristics expect target 1 has or 0 not

In [132]:
#NOTE: it is a common custom to save features to a variable X and labels to a Y
#in practive we'd like to use the X to build a predictive algorithm to predict the Y

In [257]:
# Create X (all the feature columns)
X = heart_disease.drop("target", axis=1)

#Create y (the target column)
y = heart_disease["target"]

#check the head of the features DataFrame
X.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


# 2 Pick a model 

In [134]:
# ours is a classification problem. We want to classify if someone has it or not.

In [260]:
from sklearn.ensemble import RandomForestClassifier

In [259]:
clf = RandomForestClassifier(n_estimators=100)
#well keep the default parameters
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'sqrt',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

# 3. Fit the model to the training data

In [261]:

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2) #.2 is 20% so 80% is going to be used to train and 20% to test

In [262]:
clf.fit(X_train, y_train); #we are basically saying find the patterns in our data

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [265]:
X_train

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
120,64,0,0,130,303,0,1,122,0,2.0,1,2,2
223,56,0,0,200,288,1,0,133,1,4.0,0,2,3
131,49,0,1,134,271,0,1,162,0,0.0,1,0,2
166,67,1,0,120,229,0,0,129,1,2.6,1,2,3
183,58,1,2,112,230,0,0,165,0,2.5,1,1,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...
285,46,1,0,140,311,0,1,120,1,1.8,1,2,3
276,58,1,0,146,218,0,1,105,0,2.0,1,1,3
225,70,1,0,145,174,0,1,125,1,2.6,0,0,3
26,59,1,2,150,212,1,1,157,0,1.6,2,0,2


In [140]:
# lets make a prediction
#this will call an error because of the fact that the array we made does not match the X_train df
#y_label = clf.predict(np.array([0,2,3,4]))

In [141]:
y_preds = clf.predict(X_test)
y_preds

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0])

In [142]:
y_test

163    1
33     1
15     1
49     1
57     1
      ..
93     1
3      1
285    0
77     1
235    0
Name: target, Length: 61, dtype: int64

# 4. Evaluate the Model on the training data and test data

In [143]:
clf.score(X_train,y_train)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


1.0

In [144]:
# 1.0 is the maximum score it can get

In [145]:
clf.score(X_test, y_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.8360655737704918

In [146]:
from sklearn.metrics import classification_report, confusion_matrix,accuracy_score
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.79      0.79      0.79        24
           1       0.86      0.86      0.86        37

    accuracy                           0.84        61
   macro avg       0.83      0.83      0.83        61
weighted avg       0.84      0.84      0.84        61



  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [147]:
confusion_matrix(y_test,y_preds)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([[19,  5],
       [ 5, 32]])

In [148]:
accuracy_score(y_test,y_preds)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.8360655737704918

# 5. Improve the Model 

In [149]:
#try different amounts of n_esimators

In [150]:
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators ...")
    clf = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model Accuracy on Test set: {clf.score(X_test, y_test) * 100:.2f}%")
    print("")

Trying model with 10 estimators ...
Model Accuracy on Test set: 80.33%

Trying model with 20 estimators ...
Model Accuracy on Test set: 86.89%

Trying model with 30 estimators ...
Model Accuracy on Test set: 83.61%

Trying model with 40 estimators ...


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array,

Model Accuracy on Test set: 81.97%

Trying model with 50 estimators ...
Model Accuracy on Test set: 85.25%

Trying model with 60 estimators ...


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype

Model Accuracy on Test set: 83.61%

Trying model with 70 estimators ...
Model Accuracy on Test set: 80.33%

Trying model with 80 estimators ...


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


Model Accuracy on Test set: 81.97%

Trying model with 90 estimators ...
Model Accuracy on Test set: 83.61%



  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [151]:
#This will give you accuracy on diff estimators

# 6.  Save a model

In [152]:
import pickle

pickle.dump(clf, open("random_forest_model.pkl", "wb"))

In [153]:
loaded_model = pickle.load(open("random_forest_model.pkl", "rb"))
loaded_model.score(X_test, y_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.8360655737704918

# Creating Machine Learning Models 

# 0. Import Needed Libraries and Tools

In [154]:
#Let's listify the steps
what_we_are_covering = [
    "0. An end-to-end Scikit-Learn workflow",
    "1. Getting the data ready",
    "2. Choose the right estimator/algorithm for our problems",
    "3. Fit the model/algorithm and use it to make predictions",
    "4. Evaluating a model",
    "5. Improve the model",
    "6. Save and load a trained model",
    "7. Putting it all together"]

In [155]:
what_we_are_covering

['0. An end-to-end Scikit-Learn workflow',
 '1. Getting the data ready',
 '2. Choose the right estimator/algorithm for our problems',
 '3. Fit the model/algorithm and use it to make predictions',
 '4. Evaluating a model',
 '5. Improve the model',
 '6. Save and load a trained model',
 '7. Putting it all together']

# 1. Getting our data ready to be used 
    Three main things we need to do:
        1. Split the data in features and labels (usaully 'X' and 'y')
        2. filling/imputing or disregarding missing values
        3. Converting non-numerical values to numerical values (also called feature encoding)

In [156]:
heart_disease = pd.read_csv("heart-disease.csv")

In [157]:
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [158]:
x = heart_disease.drop("target", axis = 1) #this has dropped the target column
x.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2


In [159]:
y = heart_disease["target"] #this is selecting just the target column for y
y.head()

0    1
1    1
2    1
3    1
4    1
Name: target, dtype: int64

In [160]:
#split into training and test data we cant test on data we used to train

In [161]:
from sklearn.model_selection import train_test_split #this allows us to spliy our data
x_train, x_train, y_train, y_test = train_test_split(x,y, test_size=0.3) #test size is 30%


In [162]:
x_train.shape, x_train.shape, y_test.shape, y_train.shape

((91, 13), (91, 13), (91,), (212,))

# 1.1 make sure it is all numerical

In [163]:
car_sales = pd.read_csv("car-sales-extended.csv")

In [164]:
car_sales_X = car_sales.drop('Price', axis =1)
car_sales_Y = car_sales["Price"]

In [165]:
car_sales_X.head()
car_sales_Y.head()

0    15323
1    19943
2    28343
3    13434
4    14043
Name: Price, dtype: int64

In [166]:
len(car_sales_X)

1000

In [167]:
car_sales_X.dtypes

Make             object
Colour           object
Odometer (KM)     int64
Doors             int64
dtype: object

In [168]:
#turn the categories into numbers
#we will be using OneHot Encoder to change the categorical to numerical

from sklearn.preprocessing import OneHotEncoder #import the oneHoteEncoder
from sklearn.compose import ColumnTransformer #as well as column transfer allows us to change columns

categorical_features = ["Make", "Colour", "Doors"] #identify categorical data
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_features)],
                                remainder = "passthrough")
transformed_carsales = transformer.fit_transform(car_sales_X)
transformed_carsales

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [169]:
pd.DataFrame(transformed_carsales)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,35431.0
1,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,192714.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,84714.0
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,154365.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,181577.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,35820.0
996,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,155144.0
997,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,66604.0
998,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,215883.0


In [170]:
car_sales_X.head()

Unnamed: 0,Make,Colour,Odometer (KM),Doors
0,Honda,White,35431,4
1,BMW,Blue,192714,5
2,Honda,White,84714,4
3,Toyota,White,154365,4
4,Nissan,Blue,181577,3


In [171]:
#look up at index zero, that is our first car that is a white honda with 4 doors, KM 35431, and price 15323. it will get a 1 for honda and white
#so lets look at index 2 it should match so lets see if we an detwermine what each new column coresponds to in regards to make, colour, doorsd
# it looks like in each category it is alphabetical so 
# 0 = BMW, 1=Honda, 2=Nissan, 3=Toyota, then we look at the next ones is color, black blue etc white is 8 then doors 3,4,5 

In [172]:
car

NameError: name 'car' is not defined

In [None]:
#lets refit our model 
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
model.fit(carx_train, cary_train)
model.score(carx_test, cary_test)

# Quick note: OneHotEncoder can now handle NaN/None values 

# 1.2 What if there is missing data?
    1. Fill them with some value (also known as imputation). For example you can use the mean of the other values, 
    2. Remove the samples with missing data. This can result in using less data to build the model. 

In [173]:
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing                           

Unnamed: 0,Make,Colour,Odometer (KM),Doors,Price
0,Honda,White,35431.0,4.0,15323.0
1,BMW,Blue,192714.0,5.0,19943.0
2,Honda,White,84714.0,4.0,28343.0
3,Toyota,White,154365.0,4.0,13434.0
4,Nissan,Blue,181577.0,3.0,14043.0
...,...,...,...,...,...
995,Toyota,Black,35820.0,4.0,32042.0
996,,White,155144.0,3.0,5716.0
997,Nissan,Blue,66604.0,4.0,31570.0
998,Honda,White,215883.0,4.0,4001.0


In [174]:
#if your data set is this big you are not going to go through each row 
#thankfully there is a method called pd.DataFrame.isna() which detects missing values
car_sales_missing.isna().sum()

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [175]:
#hmm there is about 50 or so per column. Lets try what we did before, convert to 
#numerical split, train, test and fir to a model. 


In [176]:
#create the features
X_missing = car_sales_missing.drop("Price", axis =1)
print(f"Number of missing X vales:\n{X_missing.isna().sum()}")

Number of missing X vales:
Make             49
Colour           50
Odometer (KM)    50
Doors            50
dtype: int64


In [177]:
#Create Labels 
y_missing = car_sales_missing["Price"]
print(f"Number of missing y values: {y_missing.isna().sum()}")

Number of missing y values: 50


In [178]:
#Convert the categorical columns into one-hot encodings 
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()

transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_features)],
                                remainder= "passthrough",
                                sparse_threshold=0) #returen a sparse matrix or not
transformed_X_Missing = transformer.fit_transform(X_missing)
transformed_X_Missing

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [179]:
#split into training and test sets
X_missing_train, X_Missing_test, y_missing_train,y_missing_test = train_test_split(transformed_X_Missing,
                                                                                   y_missing,
                                                                                   test_size = 0.2)
#fit and score to a model
model = RandomForestRegressor()
model.fit(X_missing_train,y_missing_train)
model.score(X_Missing_test, y_missing_test)

ValueError: Input X contains NaN.
RandomForestRegressor does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

# 1.2.1 Fill missing data with pandas

In [180]:
#for categorical data the simplist way is to fill the missing fieslds with the string
#"missing"
#you can also use the most common value if we look at doors the most common is 4
#with Odometer we can use the mean value of all the other values. 
#for price since price is the target we can remove them and it will cause less harm than 
#imputing however you can design an experiment to tes this.


In [181]:
#we are going to us fillna(value="missing", inplace=Ture) to fill with missing
car_sales_missing["Make"].fillna(value="missing", inplace=True)

In [182]:
#lets do the same for Colour
car_sales_missing["Colour"].fillna(value="missing", inplace=True)

In [183]:
#how many missing do we have now?
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [184]:
#lets move to doors, ets fill with 4 the most common value this is the same as filling 
#with the median or mode of the doors columns
#you would start with finding the most commonvalue
car_sales_missing["Doors"].value_counts()

Doors
4.0    811
5.0     75
3.0     64
Name: count, dtype: int64

In [185]:
#now lets fill with most common values
car_sales_missing["Doors"].fillna(value=4, inplace=True)

In [186]:
#next for Odometer lets use the mean value of itself.
car_sales_missing["Odometer (KM)"].fillna(value=car_sales_missing["Odometer (KM)"].mean(), inplace=True)

In [187]:
#how many missing values do we have now?
car_sales_missing.isna().sum()

Make              0
Colour            0
Odometer (KM)     0
Doors             0
Price            50
dtype: int64

In [188]:
#for price we could input the mean or median  but we want to prevent inputing
#too many fake labels so lets remove them
car_sales_missing.dropna(inplace=True)


In [189]:
car_sales_missing.isna().sum()

Make             0
Colour           0
Odometer (KM)    0
Doors            0
Price            0
dtype: int64

In [190]:
#Check the number of total samples 
len(car_sales_missing)

950

In [191]:
#Can we do it now? Lets try

In [192]:
#create features
X_missing = car_sales_missing.drop("Price", axis=1)
print(f"Number of missing X values:\n{X_missing.isna().sum()}")

#create labels
y_missing = car_sales_missing["Price"]
print(f"Number of missing y values: {y_missing.isna().sum()}")

Number of missing X values:
Make             0
Colour           0
Odometer (KM)    0
Doors            0
dtype: int64
Number of missing y values: 0


In [193]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]

one_hot = OneHotEncoder()

transformer = ColumnTransformer([( "one_hot",
                                  one_hot,
                                  categorical_features)],
                                remainder = "passthrough",
                                sparse_threshold=0) 
transformed_X_Missing = transformer.fit_transform(X_missing)
transformed_X_Missing

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([[0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 3.54310e+04],
       [1.00000e+00, 0.00000e+00, 0.00000e+00, ..., 0.00000e+00,
        1.00000e+00, 1.92714e+05],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 8.47140e+04],
       ...,
       [0.00000e+00, 0.00000e+00, 1.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 6.66040e+04],
       [0.00000e+00, 1.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.15883e+05],
       [0.00000e+00, 0.00000e+00, 0.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 2.48360e+05]])

In [194]:
#split data into training and test sets
np.random.seed(42)
X_missing_train,X_Missing_test, y_missing_train,y_missing_test = train_test_split(transformed_X_Missing,
                                                                                  y_missing,
                                                                                  test_size=0.2)
#fit and score a model
model = RandomForestRegressor()
model.fit(X_missing_train, y_missing_train)
model.score(X_Missing_test,y_missing_test)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.22011714008302485

# Filling in missing data and transforming categorical data with Sckikit-Learn

In [195]:
#Scikit-Learn provides a class called sklearn.impute.SimpleImputer()
#we need to re-import the dataframe so that we have missing values again. 

In [196]:
car_sales_missing = pd.read_csv("car-sales-extended-missing-data.csv")
car_sales_missing.isna().sum()
                                

Make             49
Colour           50
Odometer (KM)    50
Doors            50
Price            50
dtype: int64

In [197]:
#we will begin by removing the price missing rows
car_sales_missing.dropna(subset=["Price"], inplace=True)

In [198]:
car_sales_missing.isna().sum()


Make             47
Colour           46
Odometer (KM)    48
Doors            47
Price             0
dtype: int64

In [199]:
#since we dont have to fill in any price data lets split into features X and Labels Y


In [200]:
Missing_X = car_sales_missing.drop("Price", axis=1)
Missing_y = car_sales_missing["Price"]

#split dtat into train and test 
np.random.seed(42)
Missing_X_train, Missing_X_test, Missing_y_train, Missing_y_test = train_test_split(
                                                                Missing_X,
                                                                Missing_y,
    test_size=0.2)

In [201]:
from sklearn.impute import SimpleImputer
#create categorical variable imputer 
cat_imputer = SimpleImputer(strategy="constant", fill_value="missing")

#create door column imputer
door_imputer = SimpleImputer(strategy="constant", fill_value=4)

#create Odometer (KM) column imputer
num_imputer = SimpleImputer(strategy="mean")

In [202]:
#next we need to define which columns will be imputed on we will need them


In [203]:
#define different column features 
categorical_features = ["Make", "Colour"]
door_feature = ["Doors"]
numerical_features = ["Odometer (KM)"]


In [204]:
#now how are we going to do this? ColumnTransfer() takes input a list in the ofrm of name 
#of transform, transformer to use, and column trnasformer

In [205]:
from sklearn.compose import ColumnTransformer
#create series of column transforms to perform
imputer = ColumnTransformer ([
    ("cat_imputer", cat_imputer, categorical_features),
    ("door_imputer", door_imputer, door_feature),
    ("num_imputer", num_imputer, numerical_features)])

In [206]:
# we need to 
# 1. Learn the imputation values from the training set
#2. fill the missing values in the training set with values learned in 1
#3. fill in misisng values in the testing set with values learned in 1
# we are not calculating many variables , remember the tes set should always remain unseen data
 

## So when filling in the test set, they should only be with values calculated or 
## imputer from the training sets 

In [207]:
#we can acheive steps 1 & 2 with columntransformer.fit_transform() method 

In [208]:
#find values to fill and transform training data
filled_X_train = imputer.fit_transform(Missing_X_train)

# fill values in to the test set with values learned from the training
filled_X_test = imputer.transform(Missing_X_test)

#check filled X train
filled_X_train

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dty

array([['Honda', 'White', 4.0, 71934.0],
       ['Toyota', 'Red', 4.0, 162665.0],
       ['Honda', 'White', 4.0, 42844.0],
       ...,
       ['Toyota', 'White', 4.0, 196225.0],
       ['Honda', 'Blue', 4.0, 133117.0],
       ['Honda', 'missing', 4.0, 150582.0]], dtype=object)

In [209]:
#we now have filled all missing data, but its not all numerical

In [210]:
filled_X_train_df = pd.DataFrame(filled_X_train,
                                 columns=["Make", "Colour", "Doors","Odometer (KM)"])
filled_X_test_df = pd.DataFrame(filled_X_test,
                                columns=["Make", "Colour", "Doors", "Odometer (KM)"])
#check missing data in trainging set
filled_X_train_df.isna().sum()

Make             0
Colour           0
Doors            0
Odometer (KM)    0
dtype: int64

In [211]:
# lets one hot encode the make, colour and doors columsn
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

categorical_features = ["Make", "Colour", "Doors"]
one_hot = OneHotEncoder()
transformer = ColumnTransformer([("one_hot",
                                  one_hot,
                                  categorical_features)],
                                remainder="passthrough",
                                sparse_threshold=0)
#fill train and test values seperateley
transformed_Missing_X_train = transformer.fit_transform(filled_X_train_df)
transformed_Missing_X_test = transformer.transform(filled_X_test_df)

#Check transformed and filled X_train
transformed_Missing_X_train

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([[0.0, 1.0, 0.0, ..., 1.0, 0.0, 71934.0],
       [0.0, 0.0, 0.0, ..., 1.0, 0.0, 162665.0],
       [0.0, 1.0, 0.0, ..., 1.0, 0.0, 42844.0],
       ...,
       [0.0, 0.0, 0.0, ..., 1.0, 0.0, 196225.0],
       [0.0, 1.0, 0.0, ..., 1.0, 0.0, 133117.0],
       [0.0, 1.0, 0.0, ..., 1.0, 0.0, 150582.0]], dtype=object)

In [212]:
#Now that we've transformed X, let's see if we can fit a model
np.random.seed(42)
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()

#make sure to use the transformed data (filled and one hot coded X)
model.fit(transformed_Missing_X_train, Missing_y_train)
model.score(transformed_Missing_X_test, Missing_y_test)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.21229043336119102

### Important note always keep your training and test sets seperate. so better to 
### do it like sklearn where you split then input not input then split you dont want
### the model to see ANY test data before evaluation

# 2. Choosing the right estimator/algorithm for your problem
    Note:
    - Scikit_Learn refers to machine learning models and algorithms as estimators
    -classification problem, predicting a category (Heart disease yes or not)
        -clf is used a lot for classification estimators
    -Regression problem for predicting a number
    -Unsupervised problem data with no labels
        - grouping unabelled samples with other similar unlabelled samples.

scikit_Learn_cheat_sheet = ![Algorithm cheat](sklearn-ml-map.png)

In [213]:
# remember markdown for image is ![alt text](filepath)

## 2.1 Picking a machine learning model for a regression problem

In [214]:
#Lets start with regression or predicting a number lets use California Housing data
from sklearn.datasets import fetch_california_housing
housing = fetch_california_housing()
housing; #gets downloaded as a dictionary

In [215]:
#Since its a dictionary you can turn it into a DataFrame so we can inpsect it


In [217]:
housing_df = pd.DataFrame(housing["data"], columns=housing["feature_names"])
housing_df["target"] = pd.Series(housing["target"])
housing_df.head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,target
0,8.3252,41.0,6.984127,1.02381,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.97188,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.80226,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422


In [218]:
#how many samples?
len(housing_df)

20640

In [222]:
#Beutiful our goal here is to use the feature columns such as
#MedInc = median income in block groupd
#HouseAge = median house age in block group
#AveRooms = Average number of rooms per household
#AveBedrms = average number of bedrooms per household.
#to predict the target column 
#which is the median house value for specfici 

In [224]:
#if we look at the map above we  see that it is a regression problem involving RidgeRegression
#import the ridge model class from the linear_model module
from sklearn.linear_model import Ridge
#set up random seed
np.random.seed(42)
#split the data into features (X) and labels (y)
housing_x = housing_df.drop("target", axis=1)
housing_y = housing_df["target"]
#split into train test sets
housing_x_test, housing_x_train, housing_y_test, housing_y_train = train_test_split(housing_x,housing_y, test_size=0.2)
model= Ridge()
model.fit(housing_x_train, housing_y_train)
#check the score of the model (on the test set)
#the default score() metric of regression algorithm is R^2
model.score(housing_x_test,housing_y_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.2609354104670423

In [227]:
#lets say we want to improve it if we look back at the map our next step is to try EnsembleRegressors
from sklearn.ensemble import RandomForestRegressor

#set up random seed 
np.random.seed(42)

#split the data into features and labels
h2_x = housing_df.drop("target", axis=1)
h2_y = housing_df["target"]

#split into train and test sets
h2_x_train, h2_x_test, h2_y_train, h2_y_test = train_test_split(h2_x,h2_y, test_size=0.2)

#institate and fit the model (on the training set)
model_2 = RandomForestRegressor()
model_2.fit(h2_x_train, h2_y_train)

#check the score of the model (on the test set)
#the default score metric of regression algorithms are R^2
model_2.score(h2_x_test, h2_y_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.8066196804802649

In [228]:
#Wow we got a huge boost, so remember if it doesnt work at first experiment experiment 



## 2.2 Picking a machine learning model for a classification problem

In [229]:
#now lets look at a classification problem, lets say we want to predict if they will have somethign or not


In [232]:
heart_disease = pd.read_csv("heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [234]:
#how many samples are tehre?
len(heart_disease)


303

In [235]:
#lets look at our map, we have over 50, predicting a category , we have labeled data
#its less than 100K, we are going to do Linear svc (linear support vector classifier)
#first 

In [None]:
#the following throws an error big time

In [255]:
#Import LinearSVC from svm modlue
#from sklearn.svm import LinearSVC

#setup random seed
#np.random.seed(42)

#split into features/data and target/labels
#h_x = heart_disease.drop("target", axis=1)
#h_y = heart_disease["target"]

#attempt at fixing a value error
#from sklearn import preprocessing
#from sklearn import utils
#lab = preprocessing.LabelEncoder()
#h_y_transformed = lab.fit_transform(h_y)

#split  into train and test sets 
#h_x_test,h_x_train,h_y_train,h_y_test = train_test_split(h_x,h_y_transformed, test_size=0.2)

#instantiate and fit teh model (on the trainign set)
#clf = LinearSVC(max_iter=100 #iterations on the data 1000 is the default
                #) #dual="auto" choses best parameters for the model autmatically
#clf.fit(h2_x_train, h2_y_train)

#check the score of the model (on the test set)
#clf.score(h2_x_test,h2_y_test)

# 3. Fit the model to data and using it to make predictions

### 3.1 Fitting a model to data


In [241]:
#Fitinng is the process of having a machine learning model learn patterns from a dataset

In [242]:
#calling the fit method will cause the maching learning algorithm to attempt to find
#patterns between x and y or if theres no y itll only find the patterns in x

In [243]:
#passing x and y to fit() will cause the model to go through all of the examples
#in x and see what theri y is the how is different depending on the algorightm 


## 3.2 Making predictions using a machine learning model

In [244]:
#Now that we have a model trained, you can use it to make predictions two ways predict()
#and predict_proba()

In [267]:
#use a trained modle to make predictions
clf.predict(X_test) 

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1])

In [268]:
#compare predictions ot the truth
y_preds = clf.predict(X_test)
np.mean(y_preds == y_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.819672131147541

In [269]:
#another way to evaluate predicts (comparing them to the truth labels) is with 
#scikit learns sklearn.metrics moduelde youll find accuracy_score
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_preds)

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.819672131147541

In [270]:
#predict_prob() returns  the probabilities of a label


In [271]:
#return probabilites rather than labels
clf.predict_proba(X_test[:5])

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([[0.85, 0.15],
       [0.91, 0.09],
       [0.03, 0.97],
       [0.89, 0.11],
       [0.24, 0.76]])

In [273]:
#lets look at the diff
clf.predict(X_test[:5])

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([0, 0, 1, 0, 1])

In [275]:
#find prediction probabilites for 1 sample
clf.predict_proba(X_test[:1])

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([[0.85, 0.15]])

In [276]:
#this output means for the sample X_test[:1] the model is predicting label 0 (index 0)
# with a prob of0.9
#because the highest probability is at index 0, is over 0.5 a label of 0 is assigned
#return the label for 1 sample
clf.predict(X_test[:1])

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


array([0])

# 1. Evaluating a model 

In [277]:
# main idea is to compare the models redictions to what they shouldve ideally been (truth
# label



### 3 differnt ways of evaluating model
    1. The score() method. will return a metric associated with teh type of model your using. 
    2. scoring parameter this can be passed to methods such as cross_val_score() or
    GridsearchCV() 
    3.problem specific metric functions in sklearn.metrics 


## 4.1 general model evaluation with score()

In [278]:
#once the model has been fit on the training data, we can call the score() method
# on it and evaluate our mode on the test data. 
#this will also vary depending on the problem
#classification usually = metrics.accuracy_score()
#regression usually = metrics.r2_score


In [279]:
#you can use SHIFT + TAB to check on differnt things
#we have used this one a lot, highest score is 1.00 or 100%

## 4.2 Evaluating your modesl using the scoreing parameter

In [281]:
#parameters will be different based on teh problem you are working on

In [282]:
heart_disease

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [287]:
#import cross_val_score from model_selection module
from sklearn.model_selection import cross_val_score

#import the RandomForesetClassifier model class from ensemble model
from sklearn.ensemble import RandomForestClassifier

#set up random seed 
np.random.seed(42)

#split into data X(features/data) and Y(target/labels)
h4_x = heart_disease.drop("target", axis=1)
h4_y = heart_disease["target"]

#splot into train and test sets
h4_x_train, h4_x_test, h4_y_train, h4_y_test = train_test_split(h4_x,h4_y, test_size=0.2)

#instantiate the model (On the training set)
clf = RandomForestClassifier(n_estimators=100)

#call the fit method on the model and pass it training data
clf.fit(h4_x_train,h4_y_train);

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


In [288]:
#let see both score() and cross_val_score() in action
clf.score(h4_x_test,h4_y_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.8524590163934426

In [289]:
#using cross_val_score()
cross_val_score(clf,h4_x,h4_y, cv=5) #cv numer of split to test 5 is default

  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if

array([0.81967213, 0.86885246, 0.81967213, 0.78333333, 0.76666667])

In [290]:
#first difference cross_val_score() returns an array. This is because cv stands for c
#cross validation 
#if you want to see the parameters just hit SHIFT TAB in the brackets

![](sklearn-cross-validation.png)

In [292]:
#this is an example showing cross validation

In [293]:
#tries to solve the problen of not training on all the data and avoid getting lucky scores
#insteat of using 1 training split it does it 5 times on a diff split each time
#we will get back 5 different scores, and taking the mean will give us  a more indepth
#idea of how we are doing.
np.random.seed(42)

#simgle trianigna dn test split score
clf_single_score = clf.score(h4_x_test,h4_y_test)

#take the mean of the 5-fold
clf_cross_val_score = np.mean(cross_val_score(clf, h4_x,h4_y,cv=5))

clf_single_score, clf_cross_val_score

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if

(0.8524590163934426, 0.8248087431693989)

In [294]:
#so despite the cross_val being lower you would wnat to report on it.
#but note that teh scoring parameter is set to none by default. you can change it if needed
#as diff problems need diff things.

## 4.2.1 Classification Model Evaluation Metrics
    There are 4 main evaluation metrics/methods your come across
        1. Accuracy
        2. Area under ROC Curve (receiver operating characteristic curve
        3. Confusion Matrix
        4. Classification Report

In [296]:
#import cross_val_score from model_selection module
from sklearn.model_selection import cross_val_score

#import the RandomForesetClassifier model class from ensemble model
from sklearn.ensemble import RandomForestClassifier

#set up random seed 
np.random.seed(42)

#split into data X(features/data) and Y(target/labels)
h4_x = heart_disease.drop("target", axis=1)
h4_y = heart_disease["target"]

#splot into train and test sets
h4_x_train, h4_x_test, h4_y_train, h4_y_test = train_test_split(h4_x,h4_y, test_size=0.2)

#instantiate the model (On the training set)
clf = RandomForestClassifier(n_estimators=100)

#call the fit method on the model and pass it training data
clf.fit(h4_x_train,h4_y_train)
clf.score(h4_x_test,h4_y_test)

  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


0.8524590163934426

### Accuracy

In [297]:
#this is default for the score(_ function and probably the most commonly used for 
#classification
# as you will see if may not always be the best to use
#usually decimal but easily convert to percent
print(f"Heart Disease Classifier Accuracy: {clf.score(h4_x_test,h4_y_test) * 100:.2f}%")

Heart Disease Classifier Accuracy: 85.25%


  if not hasattr(array, "sparse") and array.dtypes.apply(is_sparse).any():
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):
  if is_sparse(pd_dtype):
  if is_sparse(pd_dtype) or not is_extension_array_dtype(pd_dtype):


### Area under Receiver Operating Characteristic (ROC) Curve