In [31]:
import numpy as np

from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import validation_curve

In [2]:
boston = load_boston()
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [3]:
p = 0.75

idx = int(p * boston.data.shape[0]) + 1

X_train, X_test = np.split(boston.data, [idx])
Y_train, Y_test = np.split(boston.target, [idx])

In [4]:
def L_i_derivative (y_predict,y):
    return (y_predict - y)

In [5]:
base_algorithms = []
base_alg_coeffs = []

def treegboost_pred(base_algs, coeffs, X):
    return sum(i * j.predict(X) for i,j in zip(coeffs,base_algs))

dtr = DecisionTreeRegressor(max_depth = 5, random_state = 42)
dtr.fit(X_train, Y_train)
base_algorithms.append(dtr)
base_alg_coeffs.append(0.9)

for i in range (2):
    dtr = DecisionTreeRegressor(max_depth = 5, random_state = 42)
    dtr.fit(X_train, L_i_derivative(treegboost_pred(base_algorithms, base_alg_coeffs, X_train),Y_train))
    base_algorithms.append(dtr)
    base_alg_coeffs.append(0.9/(1.0+i))
    
#print(treegboost_pred(base_algorithms, base_alg_coeffs, X_test))
#print(Y_test)
print("RMSE: ", np.sqrt(mean_squared_error(Y_test, treegboost_pred(base_algorithms,base_alg_coeffs,X_test))))
    

RMSE:  4.969222077628377


In [22]:
GBT = GradientBoostingRegressor()
arg_range = [20,50,100,150,200,350,500,700,800,900,1000,1450]
train_scores, test_scores = validation_curve(GBT, boston.data, boston.target, param_name = "n_estimators", param_range = arg_range, cv = 5)
print("train score:", train_scores.mean(axis = 1), "strain score std:", train_scores.std(axis =1))
print("test score:", test_scores.mean(axis = 1), "strain score std:", test_scores.std(axis = 1))


train score: [0.90529915 0.96243631 0.97875555 0.98651671 0.99131279 0.99735812
 0.99906909 0.99974376 0.99986577 0.99992858 0.99996045 0.99999719] strain score std: [8.53761524e-03 4.55782388e-03 2.71681971e-03 1.67808540e-03
 1.26722791e-03 4.15989389e-04 1.64600898e-04 3.84028821e-05
 1.99854744e-05 1.11433679e-05 5.05626467e-06 4.79690372e-07]
test score: [0.57183394 0.66247202 0.67791619 0.66951505 0.68298986 0.67590452
 0.67275961 0.66369047 0.6668695  0.67268193 0.66069097 0.67126719] strain score std: [0.22512869 0.1684089  0.15946315 0.17292391 0.15074363 0.16130299
 0.16375608 0.1791528  0.17303449 0.15724698 0.17586587 0.16423517]


In [23]:
arg_range = [2,3,4,5,8,12,20,40,100,150]
train_scores, test_scores = validation_curve(GBT, boston.data, boston.target, param_name = "max_depth", param_range = arg_range, cv = 5)
print("train score:", train_scores.mean(axis = 1), "strain score std:", train_scores.std(axis =1))
print("test score:", test_scores.mean(axis = 1), "strain score std:", test_scores.std(axis = 1))


train score: [0.95096293 0.97875555 0.9925238  0.99774812 0.99999608 1.
 1.         1.         1.         1.        ] strain score std: [5.48192721e-03 2.71681971e-03 9.06733107e-04 3.82204891e-04
 1.54415760e-06 5.98859230e-11 4.18776655e-14 0.00000000e+00
 0.00000000e+00 0.00000000e+00]
test score: [0.68954001 0.67774182 0.63802253 0.56530986 0.47728097 0.39633303
 0.38763882 0.38697361 0.383208   0.3861982 ] strain score std: [0.15509841 0.15450158 0.17632729 0.22345417 0.33126798 0.44566117
 0.40949888 0.40288348 0.43118986 0.42457114]


In [25]:
GBT.fit(X_train,Y_train)
print("RMSE: ", np.sqrt(mean_squared_error(Y_test, GBT.predict(X_test))))

RMSE:  4.261642365798029


In [30]:
lreg = LinearRegression()
lreg.fit(X_train,Y_train)
print("RMSE: ", np.sqrt(mean_squared_error(Y_test, lreg.predict(X_test))))

RMSE:  7.819688142087344
