In [1]:
# Importing important libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.datasets import load_boston
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier

In [2]:
# Extracting the dataset
boston = load_boston()
print(boston.keys())

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])


In [3]:
# EDA
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [4]:
# Separating input and output features
X = pd.DataFrame(boston.data, columns = boston.feature_names)
y = pd.Series(boston.target)

In [5]:
# Train Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 2)

In [6]:
# Verfiying if train test split is performed properly or not
print('Input Features Shape :', X.shape)
print('Output Feature Shape :', y.shape)
print('X train Shape :', X_train.shape)
print('X test Shape :', X_test.shape)
print('y train Shape :', y_train.shape)
print('y test Shape :', y_test.shape)

Input Features Shape : (506, 13)
Output Feature Shape : (506,)
X train Shape : (404, 13)
X test Shape : (102, 13)
y train Shape : (404,)
y test Shape : (102,)


In [7]:
# Training the model
model = GradientBoostingRegressor(max_depth = 2, n_estimators = 3, learning_rate = 1.0)
model.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=3)

In [8]:
# Error
errors = [mean_squared_error(y_test, y_pred) for y_pred in model.staged_predict(X_test)]

In [9]:
# Getting best n_estimators
best_n_estimators = np.argmin(errors)
best_n_estimators

2

In [10]:
# Getting best model
best_regressor = GradientBoostingRegressor(max_depth = 2, 
                                           n_estimators = best_n_estimators, 
                                           learning_rate = 1.0)
best_regressor.fit(X_train, y_train)

GradientBoostingRegressor(learning_rate=1.0, max_depth=2, n_estimators=2)

In [11]:
# Predictions from best model
y_pred = best_regressor.predict(X_test)

In [12]:
# Geting RMSE and R2 Score
print('RMSE :', np.sqrt(mean_squared_error(y_test, y_pred)))
print('R2 Score :', r2_score(y_test, y_pred))

RMSE : 5.019121636764081
R2 Score : 0.6988802557382686
