###### Importing libraries

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
% matplotlib inline

In [2]:
%load_ext autoreload
%autoreload 2

###### Reading in the data to model

In [25]:
X_test = pd.read_csv('../Data/test_x2.csv')
X = pd.read_csv('../Data/train_x2.csv')
y = pd.read_csv('../Data/train_y2.csv', header = None)
print(X_test.shape)
print(X.shape)
print(y.shape)

(1459, 181)
(1458, 181)
(1458, 1)


###### Gradient Boosting (book, p. 198)

In [122]:
# http://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html

In [4]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, np.ravel(y))

In [97]:

gbrt = GradientBoostingRegressor(max_depth=3, n_estimators=3000, random_state=42, learning_rate = 0.02,
                                min_samples_leaf = 15, min_samples_split=10)
gbrt.fit(X_train, y_train)

errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]
bst_n_estimators = np.argmin(errors)

gbrt_best = GradientBoostingRegressor(max_depth=3, n_estimators=bst_n_estimators, random_state=42, learning_rate = 0.02)
gbrt_best.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.02, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=1250, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False)

In [98]:
bst_n_estimators  # 270 (OUT OF 300)

1250

In [99]:
min(errors) # 0.014635885043588544

0.014989687644554045

In [100]:
min(errors)**0.5

0.12243237988601727

###### Grid Search with Cross Validation

In [28]:
boost_model = GradientBoostingRegressor(random_state=42, learning_rate = 0.01)  # 

In [29]:
from sklearn import model_selection
grid_para_boost = [
    {'max_depth': [2],
     'min_samples_split': [10],  # [8, 9, 10, 11],
     'max_features': [11,12],  #[12, 14, 16],
      'subsample': [0.6, 0.7, 0.8],   # [0.5, 0.6, 0.7],
      'n_estimators': [10000]}
]

In [30]:
grid_search_boost = model_selection.GridSearchCV(boost_model, grid_para_boost, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
grid_search_boost.fit(X_train, y_train)

GridSearchCV(cv=5, error_score='raise',
       estimator=GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=3, max_features=None,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             n_estimators=100, presort='auto', random_state=42,
             subsample=1.0, verbose=0, warm_start=False),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid=[{'max_depth': [2], 'min_samples_split': [10], 'max_features': [11, 12], 'subsample': [0.6, 0.7, 0.8], 'n_estimators': [10000]}],
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_mean_squared_error', verbose=0)

In [31]:
grid_search_boost.best_params_
# Best parameters I found first time: {'max_depth': 2, 'max_features': 10, 'min_samples_split': 14}
# Second time I ran it I got best params: {'max_depth': 2, 'max_features': 10, 'min_samples_split': 12, 'subsample': 0.8}
# Third time I ran it I got best params: {'max_depth': 2, 'max_features': 12, 'min_samples_split': 10, 'subsample': 0.7}
# Fourtt time I ran it I got best params: {'max_depth': 2, 'max_features': 12, 'min_samples_split': 10, 'n_estimators': 9000, 'subsample': 0.6}
# Fifth time I ran: {'max_depth': 2,  'max_features': 12,  'min_samples_split': 10,  'n_estimators': 9000,  'subsample': 0.5}
# Sixths time I ran: {'max_depth': 2,  'max_features': 15,  'min_samples_split': 8, 'n_estimators': 11000, 'subsample': 0.6}
# Seventh time I ran - with learning rate 0.01: {'max_depth': 2,  'max_features': 14,  'min_samples_split': 8,  'n_estimators': 10000,'subsample': 0.5}

{'max_depth': 2,
 'max_features': 11,
 'min_samples_split': 10,
 'n_estimators': 10000,
 'subsample': 0.6}

In [32]:
grid_search_boost.score(X_train, y_train)

-0.0027569935680823816

In [6]:
mytrees = list()
myerr = list()
for rand in range(20):

    print(rand)
    X_train, X_val, y_train, y_val = train_test_split(X, np.ravel(y), random_state = rand)
    gbrt = GradientBoostingRegressor(max_depth = 2, max_features = 11, min_samples_split = 10, subsample = 0.6,
     random_state=42, learning_rate = 0.01, n_estimators = 5000, verbose = 0)
# First time I used (random state was undefined):
# gbrt = GradientBoostingRegressor(max_depth = 2, max_features = 10, min_samples_split = 14,
#     random_state=42, learning_rate = 0.02, n_estimators = 100000, verbose = 0)
# Second time I used (random state was undefined):
# gbrt = GradientBoostingRegressor(max_depth = 2, max_features = 10, min_samples_split = 12, subsample = 0.8,
#     random_state=42, learning_rate = 0.02, n_estimators = 100000, verbose = 0)
# Third time I used (with random_state = 123):
# gbrt = GradientBoostingRegressor(max_depth = 2, max_features = 12, min_samples_split = 10, subsample = 0.7,
#     random_state=42, learning_rate = 0.02, n_estimators = 10000, verbose = 0)
# Fifths time:
# gbrt = GradientBoostingRegressor(max_depth = 2, max_features = 12, min_samples_split = 10, subsample = 0.5,
#     random_state=42, learning_rate = 0.02, n_estimators = 10000, verbose = 0)
# Sixths time:
# gbrt = GradientBoostingRegressor(max_depth = 2, max_features = 15, min_samples_split = 8, subsample = 0.6, 
#                                    random_state=42, learning_rate = 0.02, n_estimators = 8000, verbose = 0)
# With the new data first time:
# gbrt = GradientBoostingRegressor(max_depth = 2, max_features = 12, min_samples_split = 10, subsample = 0.7,
#      random_state=42, learning_rate = 0.01, n_estimators = 10000, verbose = 0)

    gbrt.fit(X_train, y_train)
    errors = [mean_squared_error(y_val, y_pred)**0.5 for y_pred in gbrt.staged_predict(X_val)]
    bst_n_estimators = np.argmin(errors)

    mytrees.append(bst_n_estimators)
    myerr.append(min(errors))

# gbrt_best = GradientBoostingRegressor()
# gbrt_best.fit(X_train, y_train)
# First time, the best number of trees was 6,706 and its error was 0.11978486920254075
# Second time, the best number of trees was 5,449 and its error was 0.11386754321967392 or, with random_state 123 above 
# the error was 0.1111 for the best number of trees 6,233
# Third time, the best number of trees was 3,916 and its error was 0.11005619929888709 (with traintest random state=123)
# and when random state for train-test split was = 12, the best number of trees was 8942, and error 0.1117828243038694
# and for range(20) average RMSE across 20 runs was: 0.11822365072318571
# Fourth time, the best number of trees was 4433 and random state was 12 and error 0.111, 
# but for random state 123 it was 5309 trees and error 0.11034254374190201.
# Firth time I got 5705 trees for random state 123 and error: 0.11005303800608779
# and for random state 12 - 5831 trees and error: 0.11294471708044936
# and for range(20) average RMSE across 20 runs was: 0.11818895489231944
# Sixth time:
# and for range(20) average RMSE across 20 runs was: 0.11918941133084104
# Seventh time: 0.11843656398659105 - best estimators 8672
np.mean(myerr)
# Eighth time - 0.1152

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19


0.1152030762562872

In [33]:
bst_n_estimators  # 270 (OUT OF 300)

4054

In [34]:
min(errors)

0.12362774625269257

In [9]:
mytrees

[4982,
 4033,
 3902,
 2587,
 4892,
 4895,
 4994,
 4996,
 4603,
 4880,
 4999,
 4379,
 4999,
 4929,
 4988,
 4862,
 4609,
 4757,
 4989,
 2575]

In [10]:
myerr

[0.10794639376127524,
 0.11872493806424715,
 0.11182043270982346,
 0.12655817446800563,
 0.10408685292504148,
 0.12302379421987837,
 0.10680724439727693,
 0.12099045104164678,
 0.10704291282584164,
 0.12497143321992053,
 0.10781964434852237,
 0.11323368732528978,
 0.1306322773983431,
 0.12230449624037233,
 0.1127721386705772,
 0.11124752512062976,
 0.10928910905293525,
 0.11501649184749926,
 0.10725532577140991,
 0.12251820171720797]

In [26]:
X_train, X_val, y_train, y_val = train_test_split(X, np.ravel(y))
gbrt_last = GradientBoostingRegressor(max_depth = 2, max_features = 11, min_samples_split = 10, subsample = 0.6,
     random_state=42, learning_rate = 0.01, n_estimators = 4800, verbose = 0)
gbrt_last.fit(X_train, y_train)
y_pred = gbrt_last.predict(X_val)

In [27]:
np.corrcoef(y_val, y_pred)  # Correlation is 0.9611! - based on the updated data

array([[ 1.      ,  0.955567],
       [ 0.955567,  1.      ]])

###### Submitting to LB

In [29]:
gbrt_last2 = GradientBoostingRegressor(max_depth = 2, max_features = 11, min_samples_split = 10, subsample = 0.6,
     random_state=42, learning_rate = 0.01, n_estimators = 4800, verbose = 0)
gbrt_last2.fit(X, np.ravel(y))

GradientBoostingRegressor(alpha=0.9, criterion='friedman_mse', init=None,
             learning_rate=0.01, loss='ls', max_depth=2, max_features=11,
             max_leaf_nodes=None, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=10, min_weight_fraction_leaf=0.0,
             n_estimators=4800, presort='auto', random_state=42,
             subsample=0.6, verbose=0, warm_start=False)

In [31]:
test_pred = gbrt_last2.predict(X_test)

In [32]:
forkagel = pd.DataFrame({'Id' : (np.arange(len(X_test))+1461),
            'SalePrice': np.exp(test_pred)})

In [33]:
forkagel.to_csv(path_or_buf="../Data/predictions_justGradientBoost.csv",index=None)