Boston Housing Data

In order to gain a better understanding of the metrics used in regression settings, I will be looking at the Boston Housing Dataset.

First use the cell below to read in the dataset and set up the training and testing data that will be used for the rest of the problem.



In [12]:
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import numpy as np

# import models
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.linear_model import LinearRegression

# import metrics
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# Load the dataset and separate the feature matrix from the target vector.
# NOTE(review): load_boston was removed from scikit-learn in release 1.2, so this
# cell presumably needs an older release — confirm the pinned version.
boston = load_boston()
X, y = boston.data, boston.target

# Hold out 33% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [6]:
# Instantiate each of the models you imported
# For now use the defaults for all the hyperparameters. The only argument set
# explicitly is the random seed: without it the tree-based models give
# different scores every time the notebook is re-run.
RANDOM_STATE = 42  # same seed value as the train/test split

# Decision Tree
dt_mod = DecisionTreeRegressor(random_state=RANDOM_STATE)

# Random Forest
rf_mod = RandomForestRegressor(random_state=RANDOM_STATE)

# Adaptive Boosting
ada_mod = AdaBoostRegressor(random_state=RANDOM_STATE)

# Linear Regression (deterministic, so no seed is needed)
lm_mod = LinearRegression()

In [9]:
# Train every model on the training split, in the same order as before:
# decision tree, random forest, AdaBoost.
for estimator in (dt_mod, rf_mod, ada_mod):
    estimator.fit(X_train, y_train)

# The linear model is fitted last and kept as the cell's final expression,
# so the notebook still displays the fitted estimator.
lm_mod.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [11]:
# Score the held-out features with each fitted model (same order as fitting)
dt_pred, rf_pred, ada_pred, lm_pred = (
    fitted.predict(X_test) for fitted in (dt_mod, rf_mod, ada_mod, lm_mod)
)

In [17]:
def r2(actual, preds):
    '''
    Compute the r-squared score, 1 - SSE/SST, where SSE is the sum of squared
    residuals and SST is the sum of squared deviations of actual from its mean.

    INPUT:
    actual - numpy array, pd series or list of actual y values
    preds - numpy array, pd series or list of predicted y values
    OUTPUT:
    returns the r-squared score as a float
    '''
    # Work on plain ndarrays so the subtraction is positional. Two pandas
    # Series with different indexes would otherwise be aligned by label,
    # yielding NaNs that the sum skips and a misleading "perfect" score.
    # The conversion also lets plain Python lists be passed in.
    actual = np.asarray(actual, dtype=float)
    preds = np.asarray(preds, dtype=float)

    sse = np.sum((actual - preds) ** 2)
    sst = np.sum((actual - np.mean(actual)) ** 2)

    # A constant target makes SST zero and the ratio undefined. Follow
    # scikit-learn's convention: 1.0 for perfect predictions, 0.0 otherwise.
    # Empty input is left to fall through unchanged (nan).
    if actual.size and sst == 0:
        return 1.0 if sse == 0 else 0.0

    return 1 - sse / sst

# Print the hand-written score and scikit-learn's score on consecutive lines
print(r2(y_test, dt_pred), r2_score(y_test, dt_pred), sep="\n")
print("Since the above match, we can see that we have correctly calculated the r2 value.")

0.7252124886013344
0.7252124886013344
Since the above match, we can see that we have correctly calculated the r2 value.


In [16]:
def mse(actual, preds):
    '''
    Compute the mean squared error: the sum of squared differences between
    actual and preds divided by the number of actual values.

    INPUT:
    actual - numpy array, pd series or list of actual y values
    preds - numpy array, pd series or list of predicted y values
    OUTPUT:
    returns the mean squared error as a float
    '''
    # Work on plain ndarrays so the subtraction is positional. Two pandas
    # Series with different indexes would otherwise be aligned by label,
    # yielding NaNs that the sum skips and an error of 0.0.
    # The conversion also lets plain Python lists be passed in.
    actual = np.asarray(actual, dtype=float)
    preds = np.asarray(preds, dtype=float)

    return np.sum((actual - preds) ** 2) / len(actual)


# Print the hand-written MSE and scikit-learn's MSE on consecutive lines
print(mse(y_test, dt_pred), mean_squared_error(y_test, dt_pred), sep="\n")
print("If the above match, you are all set!")

20.795508982035926
20.795508982035926
If the above match, you are all set!


In [20]:
def mae(actual, preds):
    '''
    Compute the mean absolute error: the sum of absolute differences between
    actual and preds divided by the number of actual values.

    INPUT:
    actual - numpy array, pd series or list of actual y values
    preds - numpy array, pd series or list of predicted y values
    OUTPUT:
    returns the mean absolute error as a float
    '''
    # Work on plain ndarrays so the subtraction is positional. Two pandas
    # Series with different indexes would otherwise be aligned by label,
    # yielding NaNs that the sum skips and an error of 0.0.
    # The conversion also lets plain Python lists be passed in.
    actual = np.asarray(actual, dtype=float)
    preds = np.asarray(preds, dtype=float)

    return np.sum(np.abs(actual - preds)) / len(actual)

# Print the hand-written MAE and scikit-learn's MAE on consecutive lines
print(mae(y_test, dt_pred), mean_absolute_error(y_test, dt_pred), sep="\n")
print("If the above match, you are all set!")

3.18622754491018
3.18622754491018
If the above match, you are all set!


In [32]:
# Report every metric for every model, one model at a time.
model_predictions = (
    ('DT', dt_pred),
    ('RF', rf_pred),
    ('ADA', ada_pred),
    ('lm', lm_pred),
)
# Heading printed above each value, paired with the function that computes it.
metric_functions = (("R2", r2), ('MSE', mse), ('MAE', mae))

for label, y_hat in model_predictions:
    for heading, metric in metric_functions:
        print(heading)
        print(label, ":\t", metric(y_test, y_hat))
    print("\n")
    

R2
DT :	 0.7252124886013344
MSE
DT :	 20.795508982035926
MAE
DT :	 3.18622754491018


R2
RF :	 0.8314785799182443
MSE
RF :	 12.753449700598804
MAE
RF :	 2.3514371257485034


R2
ADA :	 0.799901633436377
MSE
ADA :	 15.143145909304039
MAE
ADA :	 2.6385133840517563


R2
lm :	 0.7258515818230061
MSE
lm :	 20.747143360308847
MAE
lm :	 3.15128783658839


