In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

### Classification problem

As a refresher, the `scoring` parameter can be passed to a function like `cross_val_score()` to tell Scikit-Learn which evaluation metric to compute during cross-validation.

Let's check it out with our classification model.

In [2]:
# Read the heart-disease CSV (a supervised classification dataset) into a DataFrame
heart_df = pd.read_csv("data/heart-disease.csv")

# Preview the first five rows
heart_df.head(n=5)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
from sklearn.model_selection import train_test_split

# Import cross_val_score from the model_selection module
from sklearn.model_selection import cross_val_score

# Import the RandomForestClassifier model class from the ensemble module
from sklearn.ensemble import RandomForestClassifier

# Seed NumPy's global random state so repeated runs give the same results
np.random.seed(42)

# Separate the label column (y) from the feature columns (X)
y = heart_df["target"]
X = heart_df.drop(columns="target")

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Create the classifier with its default hyperparameters...
clf = RandomForestClassifier()

# ...and train it on the training split only
clf.fit(X_train, y_train)

RandomForestClassifier()

## The scoring parameters

In [4]:
# 1) Cross-validated accuracy
# With scoring=None, cross_val_score uses the estimator's own default scorer
default_fold_scores = cross_val_score(clf, X, y, cv=5, scoring=None)

# One score per fold comes back; average them into a single number
clf_cross_valid_score = np.mean(default_fold_scores)
print(f"Heart Disease Classifier Cross-Validated Accuracy = {(clf_cross_valid_score)*100:.2f}%")

Heart Disease Classifier Cross-Validated Accuracy = 81.84%


In [5]:
# 2) Accuracy, requested explicitly by name
accuracy_fold_scores = cross_val_score(clf, X, y, cv=5, scoring="accuracy")

# Average the per-fold accuracies
cv_acc = np.mean(accuracy_fold_scores)
print(f"Heart Disease Classifier Cross-Validated Accuracy = {(cv_acc)*100:.2f}%")

Heart Disease Classifier Cross-Validated Accuracy = 81.83%


In [6]:
# 3) Precision
# Precision = TP / (TP + FP): of the samples predicted positive, how many really are
cv_precision = np.mean(cross_val_score(clf,X,y,cv=5,scoring="precision"))
# Label the printed value as precision (it was previously mislabelled as accuracy)
print(f"Heart Disease Classifier Cross-Validated Precision = {(cv_precision)*100:.2f}%")

Heart Disease Classifier Cross-Validated Accuracy = 82.93%


In [7]:
# 4) Recall
# Recall = TP / (TP + FN): of the truly positive samples, how many were found
cv_recall = np.mean(cross_val_score(clf,X,y,cv=5,scoring="recall"))
# Label the printed value as recall (it was previously mislabelled as accuracy)
print(f"Heart Disease Classifier Cross-Validated Recall = {(cv_recall)*100:.2f}%")

Heart Disease Classifier Cross-Validated Accuracy = 84.24%


In [8]:
# 5) F1
# F1 is the harmonic mean of precision and recall:
#   F1 = 2 * (precision * recall) / (precision + recall)
cv_f1 = np.mean(cross_val_score(clf,X,y,cv=5,scoring="f1"))
# Label the printed value as F1 (it was previously mislabelled as accuracy)
print(f"Heart Disease Classifier Cross-Validated F1 = {(cv_f1)*100:.2f}%")

Heart Disease Classifier Cross-Validated Accuracy = 85.29%


### Regression problem

In [9]:
# Import the Boston housing dataset of SKlearn - built in regression dataset
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably only runs on older versions - confirm the pinned version
# or switch to another regression dataset.
from sklearn.datasets import load_boston
# The returned object is indexed below by its "data", "feature_names" and "target" keys
boston = load_boston()

In [10]:
# Convert the dataset to a pandas DataFrame - easier to inspect

# Column labels come from the "feature_names" key, the values from the "data" key
feature_columns = boston["feature_names"]
boston_df = pd.DataFrame(boston["data"], columns=feature_columns)

# Add the labels from the "target" key as a new "target" column
boston_df["target"] = pd.Series(boston["target"])

# Display the resulting frame
boston_df

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [11]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor

# Seed NumPy's global random state so repeated runs give the same results
np.random.seed(42)

# Separate the label column (y) from the feature columns (X)
y = boston_df["target"]
X = boston_df.drop(columns="target")

# Hold out 20% of the rows as a test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Instantiate the regressor with default hyperparameters...
model = RandomForestRegressor()

# ...and fit it on the training split only
model.fit(X_train, y_train)

RandomForestRegressor()

## The scoring parameters

In [12]:
# 1) Default evaluation metric --> r^2
cv_r2 = cross_val_score(model, X, y, cv=5, scoring="r2") # or scoring="None"

# Take the mean of the array returned  
print(f"The cross-validated R^2 score is: {np.mean(cv_r2):.2f}")

The cross-validated R^2 score is: 0.61


In [13]:
# 2) MAE (mean absolute error) --> neg_mean_absolute_error
cv_mae = cross_val_score(model, X, y, cv=5, scoring="neg_mean_absolute_error")
print(f"The cross-validated MAE score is: {np.mean(cv_mae):.2f}")

The cross-validated MAE score is: -3.06


Why the "neg_" prefix? It stands for "negative": the scorer returns the error multiplied by -1.

Because Scikit-Learn documentation states:

["All scorer objects follow the convention that higher return values are better than lower return values."](https://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values)

**In this case, that means a negative value closer to 0 (i.e. a higher score, and therefore a smaller error) is better.**

In [14]:
# 3) MSE (mean squared error) --> neg_mean_squared_error
cv_mse = cross_val_score(model, X, y, cv=5,scoring="neg_mean_squared_error")
print(f"The cross-validated MSE score is: {np.mean(cv_mse):.2f}")

The cross-validated MSE score is: -21.28


In [None]:
# -----------------------------------------------------------------