In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Single seed reused by the train/test split and by every model in this notebook.
randstate = 1_000

In [3]:
# Read the bike-share CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer="bikeshare.csv")

In [4]:
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,season,yr,mnth,hr,holiday,weekday,notbizday,weathersit,temp,hum,windspeed,dteday,cnt
0,1,0,1,0,0,6,1,1,-1.334609,0.947345,-1.553844,2011-01-01,16
1,1,0,1,1,0,6,1,1,-1.438475,0.895513,-1.553844,2011-01-01,40
2,1,0,1,2,0,6,1,1,-1.438475,0.895513,-1.553844,2011-01-01,32
3,1,0,1,3,0,6,1,1,-1.334609,0.636351,-1.553844,2011-01-01,13
4,1,0,1,4,0,6,1,1,-1.334609,0.636351,-1.553844,2011-01-01,1


In [5]:
# Remove the raw date column. Rebinding `df` (rather than `inplace=True`) combined with
# `errors='ignore'` keeps this cell idempotent: re-running it no longer raises a KeyError
# once 'dteday' has already been dropped.
df = df.drop(columns='dteday', errors='ignore')

In [6]:
# Fraction of missing values per column -- 0.0 everywhere means there are no nulls.
df.isna().mean()

season        0.0
yr            0.0
mnth          0.0
hr            0.0
holiday       0.0
weekday       0.0
notbizday     0.0
weathersit    0.0
temp          0.0
hum           0.0
windspeed     0.0
cnt           0.0
dtype: float64

In [7]:
# Number of fully duplicated rows (counted with the vectorised sum, cast back to a plain int).
int(df.duplicated().sum())

2

In [8]:
# Discard the duplicated rows found above, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

# **Boosting Regression**

## Question 1

In [9]:
from sklearn.model_selection import train_test_split

# Features are every column except the target `cnt`; the target is the `cnt` column itself.
X = df.drop(columns='cnt')
y = df.loc[:, 'cnt']

# Hold out 30% of the rows for testing; the shared seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=randstate,
)

First, let's go ahead and build our vanilla RF model. We will then build out our boosting models and reference the RF model as a benchmark.

#### *Speed*

In [10]:
from sklearn.ensemble import RandomForestRegressor

In [18]:
# trained vanilla RF model

# %%time
RF_regression = RandomForestRegressor(random_state=randstate, n_estimators=100)
RF_regression.fit(X_train, y_train)

RandomForestRegressor(random_state=1000)

In [14]:
from sklearn.ensemble import AdaBoostRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor

In [16]:
%%time
ADB_regression = AdaBoostRegressor(random_state=randstate, n_estimators=100)
ADB_regression.fit(X_train, y_train)

CPU times: total: 469 ms
Wall time: 470 ms


AdaBoostRegressor(n_estimators=100, random_state=1000)

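- Look at the time difference between vanilla RF and AdaBoost! (Note: the RF cell output above shows no recorded timing, so re-run that cell with timing enabled to verify this comparison.)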

In [19]:
%%time
GBM_regression = GradientBoostingRegressor(random_state=randstate, n_estimators=100)
GBM_regression.fit(X_train, y_train)

CPU times: total: 688 ms
Wall time: 694 ms


GradientBoostingRegressor(random_state=1000)

In [20]:
%%time
XGB_regression = XGBRegressor(random_state=randstate, n_estimators=100)
XGB_regression.fit(X_train, y_train)

CPU times: total: 2.78 s
Wall time: 208 ms


XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, feature_types=None, gamma=0, gpu_id=-1,
             grow_policy='depthwise', importance_type=None,
             interaction_constraints='', learning_rate=0.300000012, max_bin=256,
             max_cat_threshold=64, max_cat_to_onehot=4, max_delta_step=0,
             max_depth=6, max_leaves=0, min_child_weight=1, missing=nan,
             monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=1000, ...)

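No surprise, XGBoost is the fastest by wall time (208 ms), even though its total CPU time (2.78 s) is the highest of the three timed models, which suggests it trains on multiple threads. From fastest to slowest wall time, the ranking is as follows:

1. XGBoost (208 ms)
2. AdaBoost (470 ms)
3. GBM (694 ms)
4. Random Forest (no timing is recorded in the output above)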

#### *Performance*

In [21]:
# Test-set predictions from each fitted regressor, unpacked in the same order as the models.
y_hat_RF, y_hat_ADB, y_hat_GBM, y_hat_XGB = (
    fitted_model.predict(X_test)
    for fitted_model in (RF_regression, ADB_regression, GBM_regression, XGB_regression)
)

In [22]:
# Side-by-side table of the true counts and each model's predictions (indexed like y_test).
predictions = y_test.to_frame(name='y_test').assign(
    y_hat_RF=y_hat_RF,
    y_hat_ADB=y_hat_ADB,
    y_hat_GBM=y_hat_GBM,
    y_hat_XGB=y_hat_XGB,
)
predictions.head()

Unnamed: 0,y_test,y_hat_RF,y_hat_ADB,y_hat_GBM,y_hat_XGB
4495,26,21.51,34.971014,11.944526,28.991903
4418,47,63.72,62.055024,43.340632,81.034119
7141,11,5.27,34.971014,11.875055,-4.260192
14766,232,245.43,448.818367,327.30781,246.974426
13601,278,244.99,379.906046,246.650621,270.168213


In [23]:
# Test-set R^2 of the random forest, rounded to four decimals.
np.round(RF_regression.score(X_test, y_test), decimals=4)


0.9479

In [24]:
# Test-set R^2 of AdaBoost, rounded to four decimals.
np.round(ADB_regression.score(X_test, y_test), decimals=4)

0.658

In [25]:
# Test-set R^2 of gradient boosting, rounded to four decimals.
np.round(GBM_regression.score(X_test, y_test), decimals=4)

0.8422

In [26]:
# Test-set R^2 of XGBoost, rounded to four decimals.
np.round(XGB_regression.score(X_test, y_test), decimals=4)

0.9539

The best performance comes from XGBoost, narrowly ahead of the vanilla RF model, which is a very strong benchmark. The models, ranked from highest to lowest $R^2$, are as follows:

1. XGBoost: $R^2 = 0.9539$
2. Random Forest: $R^2 = 0.9479$
3. GBM: $R^2 = 0.8422$
4. AdaBoost: $R^2 = 0.658$

I am curious what happened with the AdaBoost model; why is the $R^2$ so low?

Overall, however, XGBoost is the clear winner in terms of both speed and performance. The model was trained in ~200 ms of wall time and achieved an $R^2$ of about 0.95.

---

# **Boosting Classification**

## Question 2

#### *Speed*

In [None]:
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
%%time
RF_classification = RandomForestClassifier(random_state=randstate, n_estimators=100)
RF_classification.fit(X_train, y_train)

- Look at how slow vanilla RF is at training!

In [None]:
%%time
ADB_classification = AdaBoostClassifier(random_state=randstate, n_estimators=100)
ADB_classification.fit(X_train, y_train)

In [None]:
%%time
GBM_classification = GradientBoostingClassifier(random_state=randstate, n_estimators=100)
GBM_classification.fit(X_train, y_train)

In [None]:
%%time
XGB_classification = XGBClassifier(random_state=randstate, n_estimators=100)
XGB_classification.fit(X_train, y_train)

#### *Performance*

In [None]:
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score

In [None]:
# Test-set predictions from each fitted classifier. The estimators are named
# `*_classification` in the training cells above; the `RF_classifier` / `AdB_classifier` /
# `GBM_classifier` / `XGB_classifier` names used here before were never defined and would
# raise a NameError.
y_hat_RF = RF_classification.predict(X_test)
y_hat_ADB = ADB_classification.predict(X_test)
y_hat_GBM = GBM_classification.predict(X_test)
y_hat_XGB = XGB_classification.predict(X_test)

In [None]:
# F1 score of each classifier on the held-out test set.
# NOTE(review): f1_score is called without `average`, so this presumably expects a binary
# target -- confirm against the classification labels actually used.
print('RF f1 = {}'.format(f1_score(y_true=y_test, y_pred=y_hat_RF)))
print('AdB f1 = {}'.format(f1_score(y_true=y_test, y_pred=y_hat_ADB)))
print('GBM f1 = {}'.format(f1_score(y_true=y_test, y_pred=y_hat_GBM)))
print('XGB f1 = {}'.format(f1_score(y_true=y_test, y_pred=y_hat_XGB)))