In [1]:
#Standard import for all projects
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

Three ways to evaluate scikit-learn models/estimators:
1. The estimator's built-in `score()` method
2. The `scoring` parameter
3. Problem-specific metric functions

## 1. Evaluating a model with the score method

In [3]:
# Read the heart-disease CSV (path is relative to the notebook's working
# directory) into a DataFrame, then preview its first rows.
heart_disease = pd.read_csv("data/heart-disease.csv")
heart_disease.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [12]:
# Estimator and train/test splitting helper used in this section
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG before any randomised step below
np.random.seed(42)

# Label: the "target" column; features: every other column
y = heart_disease["target"]
X = heart_disease.drop(columns="target")

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Train a random forest with default hyperparameters on the training split;
# the fit call stays last so the cell displays the fitted estimator
clf = RandomForestClassifier()
clf.fit(X_train, y_train)



RandomForestClassifier()

In [13]:
# Evaluate the fitted RandomForestClassifier (use the patterns the model learned to make predictions)
# The highest value for the .score() method is 1.0, the lowest is 0.0
# Here the model is scored on the same training data it was fit on.
clf.score(X_train, y_train)

1.0

In [14]:
# Score the same fitted classifier on the held-out test split
clf.score(X_test, y_test)

0.8524590163934426

### score() on regression problem

In [32]:
from sklearn.ensemble import RandomForestClassifier

# NOTE(review): the section heading mentions regression, but this cell fits a
# RandomForestClassifier on the "target" column, so model.score() is a
# classifier score -- TODO confirm whether a regressor was intended here.

# Re-seed NumPy's global RNG before the randomised steps below
np.random.seed(42)

# Label column and feature matrix
y = heart_disease["target"]
X = heart_disease.drop(columns="target")

# 20% of the rows go to the test set
# (train_test_split is in scope from an earlier cell's import)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Build the estimator; n_estimators is a hyperparameter you can tweak to try
# to improve the score
model = RandomForestClassifier(n_estimators=100)

# Train on the training split (kept last so the cell displays the estimator)
model.fit(X_train, y_train)

RandomForestClassifier()

In [33]:
model.score(X_test, y_test)

0.8524590163934426

## 2. Evaluating using the 'scoring' parameter 

In [34]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, train_test_split

# Re-seed NumPy's global RNG before the randomised steps below
np.random.seed(42)

# Label column and feature matrix
y = heart_disease["target"]
X = heart_disease.drop(columns="target")

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
)

# Fit a default random forest on the training split; the trailing semicolon
# suppresses the estimator repr in the cell output
clf = RandomForestClassifier()
clf.fit(X_train, y_train);



In [35]:
# Score the fitted classifier on the single held-out test split
clf.score(X_test, y_test)

0.8524590163934426

In [36]:
# Cross-validate the classifier on the full X and y rather than on the
# train/test split above. No `scoring` argument is passed here, so
# scikit-learn uses the estimator's default scorer; the result holds one
# score per fold.
cross_val_score(clf, X, y)

array([0.81967213, 0.86885246, 0.81967213, 0.78333333, 0.76666667])