In [None]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import AdaBoostRegressor
from sklearn.datasets import make_regression
import pandas as pd
# Synthetic regression sample: 5000 rows, 5 features of which 4 are informative.
# The fixed random_state makes the generated sample reproducible; shuffle=False
# is passed so samples and features are not shuffled after generation.
X, Y = make_regression(
    n_samples=5000,
    n_features=5,
    n_informative=4,
    shuffle=False,
    random_state=123,
)
# Feature engineering, normally an important step, is skipped here and the
# sample is used as generated (the house price example covers that step).
print("XY shape", X.shape, Y.shape)

In [None]:
# Collect features and target in one DataFrame (columns x0..x{n-1}, then y)
# for quick inspection and plotting. Column names follow the width of X, so
# changing the number of generated features needs no edit here.
df = pd.DataFrame(X, columns=['x%d' % i for i in range(X.shape[1])])
df['y'] = Y
print(df.head(10))
# One scatter plot of the target against each feature. The target column
# itself is skipped: plotting y against y only yields a trivial diagonal.
for col in df.columns.drop('y'):
    df.plot.scatter(x=col, y='y')

In [None]:
# Baseline boosted regressor: 100 boosting stages with scikit-learn's default
# base learner. The fit involves random resampling, so the seed is fixed to
# keep the plots reproducible across kernel restarts.
BDT = AdaBoostRegressor(n_estimators=100, random_state=123)
BDT.fit(X, Y)
# NOTE: these are in-sample predictions (same rows as used for the fit), so
# the agreement between prediction and target shown below is optimistic.
Y_pred = BDT.predict(X)
df['pred'] = Y_pred
df.plot.scatter(x='pred', y='y')

In [None]:
# Residual per sample (true target minus prediction), stored as a new column
# and shown as a 100-bin histogram.
df['diff'] = df['y'].sub(df['pred'])
df['diff'].hist(bins=100)

In [None]:
from sklearn.tree import DecisionTreeRegressor as DTR
# Manual hyperparameter scan, scored on an independent hold-out sample.
ntrain = 3000
# The first ntrain rows are used for fitting, the remaining rows for scoring.
Xtrain, Xtest = X[:ntrain], X[ntrain:]
Ytrain, Ytest = Y[:ntrain], Y[ntrain:]

for ndepth in [2,3,5]:
    for ntree in [50,100,200]:
        # The base learner is passed positionally: its keyword was renamed
        # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
        # name was removed in 1.4, while the first positional slot works in
        # both old and new releases. The fixed seed keeps the scan
        # reproducible and the settings comparable with each other.
        BDT = AdaBoostRegressor(DTR(max_depth=ndepth), n_estimators=ntree, random_state=123)
        BDT.fit(Xtrain, Ytrain)
        # score() of a regressor is the R^2 coefficient, here on the hold-out rows.
        score = BDT.score(Xtest, Ytest)
        print("performance scan, ntree:",ntree,"ndepth:",ndepth,"score:",score)

In [None]:
from sklearn.tree import DecisionTreeRegressor as DTR
from sklearn.model_selection import cross_val_score

# Hyperparameter scan over tree depth and number of trees, scored by the mean
# cross-validation score on the full sample.
for ndepth in [2,3,5]:
    for ntree in [50,100,200]:
        # The base learner is passed positionally: its keyword was renamed
        # from `base_estimator` to `estimator` in scikit-learn 1.2 and the old
        # name was removed in 1.4, while the first positional slot works in
        # both old and new releases. The fixed seed keeps the scan
        # reproducible and the settings comparable with each other.
        BDT = AdaBoostRegressor(DTR(max_depth=ndepth), n_estimators=ntree, random_state=123)
        # cv=2 keeps the scan cheap; cv=5 is the commonly used choice when a
        # more reliable performance estimate is wanted.
        score = cross_val_score(BDT, X, Y, cv=2).mean()
        print("performance scan, ntree:",ntree,"ndepth:",ndepth,"score:",score)

In [None]:
# Take the parameter set chosen from the scan (hard-coded here; these are the
# largest values scanned) and refit on the full training data to obtain the
# final regressor.
ntree,ndepth = 200,5
# The base learner is passed positionally: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 and the old name was
# removed in 1.4, while the first positional slot works in both old and new
# releases. The seed is fixed so the final model is reproducible.
BDT = AdaBoostRegressor(DTR(max_depth=ndepth), n_estimators=ntree, random_state=123)
bestregressor = BDT.fit(X,Y)  # fit() returns the fitted estimator itself
# Question: what is the performance of the best regressor?
# score = BDT.score(X, Y) cannot be used for this, because it is evaluated on
# the very rows used for the fit and therefore gives a biased (too optimistic)
# training performance. An additional, independent sample would be needed to
# obtain an unbiased performance estimate.