In [1]:
# standard import
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the Boston housing dataset through scikit-learn's bundled loader.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell presumably requires scikit-learn < 1.2 — confirm the pinned version.
from sklearn.datasets import load_boston
boston = load_boston()

In [3]:
# Assemble one frame: a column per feature plus the regression target
boston_df = pd.DataFrame(
    data=boston["data"],
    columns=boston["feature_names"],
).assign(target=pd.Series(boston["target"]))

In [4]:
# Baseline model: a random forest regressor scored on a held-out 20% split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG before the split and the fit (statement order matters)
np.random.seed(3)

# Separate the feature columns from the target column
X = boston_df.drop(columns="target")
y = boston_df["target"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit on the training portion, then show clf.score on the test portion
clf = RandomForestRegressor()
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.9079961076816832

###  Regression model evaluation metrics

Model evaluation metrics documentation - https://scikit-learn.org/stable/modules/model_evaluation.html

1. R^2 (pronounced r-squared) or coefficient of determination.
2. Mean absolute error (MAE)
3. Mean squared error (MSE)

**R^2**

What R-squared does: Compares your model's predictions to the mean of the targets. Values can range from negative infinity (a very poor model) to 1. For example, if all your model does is predict the mean of the targets, its R^2 value would be 0. And if your model perfectly predicts a range of numbers, its R^2 value would be 1.

In [5]:
from sklearn.metrics import r2_score

# Constant "prediction": the mean of y_test repeated once per test sample
y_test_mean = np.full(shape=len(y_test), fill_value=y_test.mean())

In [6]:
# Display the mean of the test targets — the value every entry of y_test_mean holds
y_test.mean()

22.865686274509798

In [7]:
# A constant prediction equal to the target mean scores R^2 = 0
r2_score(y_true=y_test, y_pred=y_test_mean)

0.0

In [8]:
# Predictions identical to the targets score R^2 = 1
r2_score(y_true=y_test, y_pred=y_test)

1.0

**Mean absolute error (MAE)**

MAE is the average of the absolute differences between predictions and actual values. It gives you an idea of how wrong your model's predictions are.

In [10]:
# Mean absolute error
from sklearn.metrics import mean_absolute_error

# Predict on the held-out features, then compare against the true targets
y_preds = clf.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_preds)
mae

1.9551568627450968

In [11]:
# Side-by-side table of targets, predictions and the signed error (predicted - actual)
df = pd.DataFrame(
    {"actual values": y_test, "predicted values": y_preds}
).assign(
    differences=lambda frame: frame["predicted values"] - frame["actual values"]
)
df

Unnamed: 0,actual values,predicted values,differences
224,44.8,43.762,-1.038
137,17.1,18.224,1.124
453,17.8,20.616,2.816
303,33.1,30.668,-2.432
254,21.9,22.487,0.587
...,...,...,...
434,11.7,13.660,1.960
294,21.7,21.876,0.176
306,33.4,34.821,1.421
463,20.2,21.794,1.594


**Mean squared error (MSE)**

In [13]:
# Mean squared error
from sklearn.metrics import mean_squared_error

# Re-run the predictions on the test features and score them
y_preds = clf.predict(X_test)
mse = mean_squared_error(y_true=y_test, y_pred=y_preds)
mse

7.613763980392157

In [14]:
# Reproduce MSE manually: multiply each difference by itself, then average
squared = df["differences"] * df["differences"]
squared.mean()

7.613763980392157

In [17]:
# cross_val_score is used by the cross-validation cells that follow
from sklearn.model_selection import cross_val_score
# Unfitted random forest configured with 100 trees; passed to cross_val_score below
model = RandomForestRegressor(n_estimators=100)

In [18]:
# 5-fold cross-validation with scoring=None, averaged over the folds
np.random.seed(42)
cv_r2 = cross_val_score(estimator=model, X=X, y=y, cv=5, scoring=None)
cv_r2.mean()

0.622375083951403

In [19]:
# Same 5-fold run with R^2 requested explicitly; show the per-fold scores
np.random.seed(42)
cv_r2 = cross_val_score(estimator=model, X=X, y=y, cv=5, scoring="r2")
cv_r2

array([0.76861165, 0.85851765, 0.74941131, 0.47891315, 0.25642166])

In [20]:
# Cross-validated mean absolute error (this scorer reports the negated value)
cv_mae = cross_val_score(
    estimator=model, X=X, y=y, cv=5, scoring="neg_mean_absolute_error"
)
cv_mae

array([-2.12751961, -2.53956436, -3.42026733, -3.82432673, -3.06893069])

In [21]:
# Cross-validated mean squared error (negated scorer), averaged over the folds
cv_mse = cross_val_score(
    estimator=model, X=X, y=y, cv=5, scoring="neg_mean_squared_error"
)
cv_mse.mean()

-21.02253826604542

**Regression evaluation functions**

In [22]:
# Recap cell: split the data, fit a random forest, predict, and print R^2 / MAE / MSE
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Seed NumPy's global RNG before the split and the fit below
np.random.seed(42)

# Features are every column except "target"; "target" is the label
X = boston_df.drop("target", axis=1)
y = boston_df["target"]

# Hold out 20% of the rows as the test set.
# NOTE: this rebinds X_train/X_test/y_train/y_test created by the earlier split cell.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# NOTE: rebinds `model` (previously the unfitted estimator used for cross-validation)
model = RandomForestRegressor(n_estimators=100)
model.fit(X_train, y_train)

# Make predictions using our regression model
y_preds = model.predict(X_test)

# Evaluate the regression model
print("Regression model metrics on the test set")
print(f"R^2: {r2_score(y_test, y_preds)}")
print(f"MAE: {mean_absolute_error(y_test, y_preds)}")
print(f"MSE: {mean_squared_error(y_test, y_preds)}")

Regression model metrics on the test set
R^2: 0.8739690141174031
MAE: 2.1226372549019623
MSE: 9.242328990196082
