In [1]:
import pandas as pd 
import numpy as np
import warnings 
from sklearn.model_selection import train_test_split
warnings.filterwarnings("ignore")

# Load the cleaned bike-rental table; every column but the last is a feature,
# and the last column is used as the target.
df_bikes = pd.read_csv('./bike_rentals_cleaned.csv')
X_bikes, y_bikes = df_bikes.iloc[:, :-1], df_bikes.iloc[:, -1]

# Hold out a test set; random_state pins the shuffle so reruns match.
X_train, X_test, y_train, y_test = train_test_split(
    X_bikes, y_bikes, random_state=2
)


In [2]:
from sklearn.tree import DecisionTreeRegressor
# Stage 1: fit a depth-2 tree to the raw target, then keep the residuals
# (actual minus predicted) for the next stage to learn from.
tree_1 = DecisionTreeRegressor(random_state=2, max_depth=2).fit(X_train, y_train)
y_train_pred = tree_1.predict(X_train)
y2_train = y_train - y_train_pred

In [3]:
# Stage 2: fit a second depth-2 tree to the stage-1 residuals, then compute
# what is still unexplained after it.
tree_2 = DecisionTreeRegressor(random_state=2, max_depth=2).fit(X_train, y2_train)
y2_train_pred = tree_2.predict(X_train)
y3_train = y2_train - y2_train_pred


In [4]:
# Stage 3: a third depth-2 tree trained on the residuals left after tree_2.
tree_3 = DecisionTreeRegressor(random_state=2, max_depth=2)
tree_3.fit(X=X_train, y=y3_train)


In [5]:
# Score the held-out features with each of the three fitted trees in turn.
y1_pred, y2_pred, y3_pred = (
    stage_tree.predict(X_test) for stage_tree in (tree_1, tree_2, tree_3)
)

In [6]:
# Ensemble prediction: tree_1's prediction of the target plus the residual
# predictions from tree_2 and tree_3, added element-wise.
y_pred = y1_pred + y2_pred + y3_pred

In [7]:
from sklearn.metrics import mean_squared_error as MSE
# Root mean squared error of the summed three-tree prediction on the test set.
MSE(y_true=y_test, y_pred=y_pred) ** 0.5

911.0479538776444

In [8]:
from sklearn.ensemble import GradientBoostingRegressor
# Library equivalent of the hand-built ensemble: 3 depth-2 trees with
# learning_rate=1.0, so each tree's output is added without shrinkage.
gbr = GradientBoostingRegressor(
    n_estimators=3, max_depth=2, learning_rate=1.0, random_state=2
).fit(X_train, y_train)
y_pred = gbr.predict(X_test)
MSE(y_test, y_pred) ** 0.5

911.0479538776439

In [9]:
# Same configuration, but with 30 boosting rounds instead of 3.
gbr = GradientBoostingRegressor(
    n_estimators=30, max_depth=2, learning_rate=1.0, random_state=2
).fit(X_train, y_train)
y_pred = gbr.predict(X_test)
MSE(y_test, y_pred) ** 0.5

857.1072323426944

In [10]:
# 300 boosting rounds, still with learning_rate=1.0 (no shrinkage).
gbr = GradientBoostingRegressor(
    n_estimators=300, max_depth=2, learning_rate=1.0, random_state=2
).fit(X_train, y_train)
y_pred = gbr.predict(X_test)
MSE(y_test, y_pred) ** 0.5

936.3617413678853

In [11]:
# 300 boosting rounds again, this time leaving learning_rate at the
# estimator's default instead of forcing 1.0.
gbr = GradientBoostingRegressor(
    n_estimators=300, max_depth=2, random_state=2
).fit(X_train, y_train)
y_pred = gbr.predict(X_test)
MSE(y_test, y_pred) ** 0.5

653.7456840231495

In [12]:
# Sweep learning_rate with depth and round count held fixed, printing the
# test-set RMSE for each candidate.
learning_rate_values = [0.001, 0.01, 0.05, 0.1, 0.15, 0.2, 0.3, 0.5, 1.0]
for value in learning_rate_values:
    gbr = GradientBoostingRegressor(
        learning_rate=value, n_estimators=300, max_depth=2, random_state=2
    ).fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = MSE(y_test, y_pred) ** 0.5
    print(f"Learning Rate: {value}, Score: {rmse}")

Learning Rate: 0.001, Score: 1633.0261400367258
Learning Rate: 0.01, Score: 831.5430182728547
Learning Rate: 0.05, Score: 685.0192988749717
Learning Rate: 0.1, Score: 653.7456840231495
Learning Rate: 0.15, Score: 687.666134269379
Learning Rate: 0.2, Score: 664.312804425697
Learning Rate: 0.3, Score: 689.4190385930236
Learning Rate: 0.5, Score: 693.8856905068778
Learning Rate: 1.0, Score: 936.3617413678853


In [13]:
# Sweep the per-tree max_depth (None is included as one of the candidates),
# printing the test-set RMSE for each setting.
depths = [None, 1, 2, 3, 4]
for depth in depths:
    gbr = GradientBoostingRegressor(
        n_estimators=300, max_depth=depth, random_state=2
    ).fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = MSE(y_test, y_pred) ** 0.5
    print(f"Max Depth: {depth}, Score: {rmse}")

Max Depth: None, Score: 869.2788645118395
Max Depth: 1, Score: 707.8261886858736
Max Depth: 2, Score: 653.7456840231495
Max Depth: 3, Score: 646.4045923317708
Max Depth: 4, Score: 663.048387855927


In [15]:
# Sweep the subsample setting at max_depth=3, printing the test-set RMSE
# for each candidate.
samples = [1, 0.9, 0.8, .7, .6, .5]
for sample in samples:
    gbr = GradientBoostingRegressor(
        n_estimators=300, max_depth=3, subsample=sample, random_state=2
    ).fit(X_train, y_train)
    y_pred = gbr.predict(X_test)
    rmse = MSE(y_test, y_pred) ** 0.5
    print(f"Subsample: {sample}, Score: {rmse}")

Subsample: 1, Score: 646.4045923317708
Subsample: 0.9, Score: 620.1819001443569
Subsample: 0.8, Score: 617.2355650565677
Subsample: 0.7, Score: 612.9879156983139
Subsample: 0.6, Score: 622.6385116402317
Subsample: 0.5, Score: 626.9974073227554


In [16]:
from sklearn.model_selection import RandomizedSearchCV

# Candidate values sampled by the randomized search (3 x 3 x 3 grid,
# of which n_iter=10 combinations are tried).
params = {
    'subsample': [0.65, 0.70, 0.75],
    'n_estimators': [300, 500, 1000],
    'learning_rate': [0.05, 0.075, 0.1],
}

gbr = GradientBoostingRegressor(max_depth=3, random_state=2)
rand_reg = RandomizedSearchCV(
    gbr,
    params,
    n_iter=10,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
    random_state=2,
)
rand_reg.fit(X_train, y_train)

best_model = rand_reg.best_estimator_
best_params = rand_reg.best_params_
print("Best params:", best_params)

# The scorer is negated MSE, so flip the sign before taking the root.
# NOTE(review): per the scikit-learn docs, best_score_ is the mean
# cross-validated score of the best candidate, so the number printed as
# "Training score" is a 5-fold CV RMSE rather than an error measured on
# the training set.
best_score = np.sqrt(-rand_reg.best_score_)
print("Training score: {:.3f}".format(best_score))

# Evaluate the best estimator found by the search on the held-out test set.
y_pred = best_model.predict(X_test)
rmse_test = MSE(y_test, y_pred)**0.5
print('Test set score: {:.3f}'.format(rmse_test))

Best params: {'subsample': 0.65, 'n_estimators': 300, 'learning_rate': 0.05}
Training score: 636.200
Test set score: 625.985


In [17]:
# Hand-picked configuration: many rounds (1600) paired with a small
# learning rate (0.02) and 75% row subsampling.
gbr = GradientBoostingRegressor(
    n_estimators=1600,
    learning_rate=0.02,
    max_depth=3,
    subsample=0.75,
    random_state=2,
).fit(X_train, y_train)
y_pred = gbr.predict(X_test)
MSE(y_test, y_pred) ** 0.5


596.9544588974487

In [18]:
from xgboost import XGBRegressor
# XGBoost counterpart of the tuned GradientBoostingRegressor above.
# NOTE(review): eta is presumably XGBoost's name for the learning rate,
# mirroring learning_rate=0.02 in the previous cell - confirm.
xg_reg = XGBRegressor(
    n_estimators=1600,
    eta=0.02,
    max_depth=3,
    subsample=0.75,
    random_state=2,
)
xg_reg.fit(X_train, y_train)
y_pred = xg_reg.predict(X_test)
MSE(y_test, y_pred) ** 0.5

584.3395337495713

**Approaching big data -- gradient boosting versus XGBoost**

In [19]:
# Load the exoplanet dataset and preview its first five rows.
df = pd.read_csv('./exoplanets.csv')
df.head(n=5)



Unnamed: 0,LABEL,FLUX.1,FLUX.2,FLUX.3,FLUX.4,FLUX.5,FLUX.6,FLUX.7,FLUX.8,FLUX.9,...,FLUX.3188,FLUX.3189,FLUX.3190,FLUX.3191,FLUX.3192,FLUX.3193,FLUX.3194,FLUX.3195,FLUX.3196,FLUX.3197
0,2,93.85,83.81,20.1,-26.98,-39.56,-124.71,-135.18,-96.27,-79.89,...,-78.07,-102.15,-102.15,25.13,48.57,92.54,39.32,61.42,5.08,-39.54
1,2,-38.88,-33.83,-58.54,-40.09,-79.31,-72.81,-86.55,-85.33,-83.97,...,-3.28,-32.21,-32.21,-24.89,-4.86,0.76,-11.7,6.46,16.0,19.93
2,2,532.64,535.92,513.73,496.92,456.45,466.0,464.5,486.39,436.56,...,-71.69,13.31,13.31,-29.89,-20.88,5.06,-11.8,-28.91,-70.02,-96.67
3,2,326.52,347.39,302.35,298.13,317.74,312.7,322.33,311.31,312.42,...,5.71,-3.73,-3.73,30.05,20.03,-12.67,-8.77,-17.31,-17.35,13.98
4,2,-1107.21,-1112.59,-1118.95,-1095.1,-1057.55,-1034.48,-998.34,-1022.71,-989.57,...,-594.37,-401.66,-401.66,-357.24,-443.76,-438.54,-399.71,-384.65,-411.79,-510.54


In [20]:
# Summarise the frame: row count, column count, dtypes and memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5087 entries, 0 to 5086
Columns: 3198 entries, LABEL to FLUX.3197
dtypes: float64(3197), int64(1)
memory usage: 124.1 MB


In [21]:
# Total number of missing cells across the whole frame
# (per-column counts first, then summed).
df.isna().sum().sum()


0

In [22]:
# The first column is taken as the target; every remaining column is a feature.
y = df.iloc[:, 0]
X = df.iloc[:, 1:]

# Rebinds X_train / X_test / y_train / y_test, replacing the bike-rental split.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=2)

In [23]:
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier 
from sklearn.metrics import accuracy_score

import time 

# Demonstrate the start/end timing pattern on a cheap call (df.info()).
# time.perf_counter() is used instead of time.time() because it is the
# monotonic, highest-resolution clock intended for measuring short
# intervals; time.time() is wall-clock time and can tick coarsely on some
# platforms, which makes sub-second measurements unreliable.
start = time.perf_counter()
df.info()
end = time.perf_counter()
elapsed = end - start
print(f"\nRun Time: {elapsed} seconds.")


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5087 entries, 0 to 5086
Columns: 3198 entries, LABEL to FLUX.3197
dtypes: float64(3197), int64(1)
memory usage: 124.1 MB

Run Time: 0.0468754768371582 seconds.


In [24]:
# Time a 100-tree, depth-2 GradientBoostingClassifier on the current
# X_train / X_test split. The timed region covers fit, predict, scoring
# and the score print.
# perf_counter() is the monotonic, high-resolution clock meant for
# measuring elapsed intervals (time.time() is wall-clock time).
start = time.perf_counter()
gbr = GradientBoostingClassifier(n_estimators=100, max_depth=2,
                                 random_state=2)
gbr.fit(X_train, y_train)
y_pred = gbr.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
score = accuracy_score(y_test, y_pred)
print(f"Score: {score}")
end = time.perf_counter()
elapsed = end - start
print(f"\nRun Time: {elapsed} seconds")

Score: 0.9874213836477987

Run Time: 629.8467800617218 seconds


In [26]:
# Time a 100-tree, depth-2 XGBClassifier on the current X_train / X_test
# split. The timed region covers fit, predict, scoring and the score print.
# perf_counter() is the monotonic, high-resolution clock meant for
# measuring elapsed intervals (time.time() is wall-clock time).
start = time.perf_counter()
xg_reg = XGBClassifier(n_estimators=100, max_depth=2,
                       random_state=2)
# Labels are shifted down by one for both fitting and scoring.
# NOTE(review): presumably LABEL takes the values 1 and 2, so this maps them
# to 0 and 1 as XGBClassifier expects - confirm against the data.
xg_reg.fit(X_train, y_train - 1)
y_pred = xg_reg.predict(X_test)
# accuracy_score is documented as (y_true, y_pred): ground truth goes first.
score = accuracy_score(y_test - 1, y_pred)
print(f"Score: {score}")
end = time.perf_counter()
elapsed = end - start
print(f"\nRun Time: {elapsed} seconds")


Score: 0.9913522012578616

Run Time: 36.13928723335266 seconds
