# Model IV : Ensembling

This is the last model of this part of the project. We will used estimated models to perform an ensembling.

In [51]:
import pandas as pd
import numpy as np
from scipy import stats
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold
from sklearn.metrics import mean_absolute_error, make_scorer
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.tree import DecisionTreeRegressor
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import StackingRegressor

# Step 1: Prepare the data

In [52]:
data5 = pd.read_csv('data/data_after_prep.csv')

In [53]:
transformed_rented_bike_count, lambda_best_fit = stats.boxcox(data5['rented_bike_count'])

boxcox_target = pd.DataFrame()
boxcox_target['boxcox_rented_bike_count'] = transformed_rented_bike_count

In [54]:
X = data5.drop(columns=['rented_bike_count']) 
y_t = boxcox_target['boxcox_rented_bike_count'] # _t stands for transformed

In [55]:
X_train, X_test, y_train_t, y_test_t = train_test_split(X, y_t, test_size=0.3, random_state=123)
len(X_train)

7458

For reverse from box-cox transformed values to the real interpretable values please see the ensembling script.

# Step 2: Copy paste the best models

### Random Forest

In [56]:
best_params_rf = {
    'max_depth': 20,
    'min_samples_leaf': 3,
    'min_samples_split': 7,
    'n_estimators': 100
}

best_rf_regressor5 = RandomForestRegressor(random_state=123, **best_params_rf)

### XGBOOST

In [57]:
best_params_xgb = {
'colsample_bytree': 0.9,
 'learning_rate': 0.01,
 'max_depth': 8,
 'n_estimators': 1400,
 'subsample': 0.8
 }

best_xgb_model4 = xgb.XGBRegressor(**best_params_xgb, random_state=123)

### Regression Tree

In [58]:
best_params_dtr = {
    'max_depth': 30,
    'min_samples_leaf': 3, 
    'min_samples_split': 9}

best_regressor2 = DecisionTreeRegressor(**best_params_dtr)

In [59]:
models = {'Random Forest' : RandomForestRegressor(random_state=123, **best_params_rf),
                        'XGB' :  xgb.XGBRegressor(**best_params_xgb, random_state=123),
                         'DTR' :  DecisionTreeRegressor(**best_params_dtr, random_state=123)}


predictions_df = pd.DataFrame()

for model_name, model in models.items():

    model.fit(X_train, y_train_t)
    train_predictions = model.predict(X_train)
    
    predictions_df[model_name] = train_predictions

predictions_df


Unnamed: 0,Random Forest,XGB,DTR
0,10.259062,10.241086,10.369159
1,14.749179,14.765772,14.572358
2,13.071292,13.074230,13.225625
3,14.339373,14.550099,14.075066
4,15.281739,15.578500,15.467207
...,...,...,...
7453,15.467101,15.649019,15.563404
7454,8.747498,8.887631,8.814815
7455,14.104087,14.157540,13.871781
7456,11.181591,11.195338,11.229902


# Step 3: Stacking

In [60]:
def custom_mae(y_true, y_pred):
    return mean_absolute_error(y_true, y_pred)

In [61]:
from vecstack import stacking
from sklearn.model_selection import RandomizedSearchCV

models = [RandomForestRegressor(random_state=123, **best_params_rf), 
          xgb.XGBRegressor(**best_params_xgb, random_state=123),
          DecisionTreeRegressor(**best_params_dtr, random_state=123)]


S_Train, S_test = stacking(models,                   
                           X_train, y_train_t, X_test,   
                           regression=True,
                           mode='oof_pred_bag',
                           needs_proba=False,
                           save_dir=None, 
                           metric= custom_mae, 
                           n_folds=5, 
                           stratified=True,
                           shuffle=True, 
                           random_state=123, 
                           verbose=2)




task:         [regression]
metric:       [custom_mae]
mode:         [oof_pred_bag]
n_models:     [3]

model  0:     [RandomForestRegressor]
    fold  0:  [0.61222189]
    fold  1:  [0.63121820]
    fold  2:  [0.64083715]
    fold  3:  [0.66761426]
    fold  4:  [0.65088324]
    ----
    MEAN:     [0.64055495] + [0.01859514]
    FULL:     [0.64054993]

model  1:     [XGBRegressor]
    fold  0:  [0.52995083]
    fold  1:  [0.53645839]
    fold  2:  [0.53397705]
    fold  3:  [0.56033081]
    fold  4:  [0.54490790]
    ----
    MEAN:     [0.54112499] + [0.01077874]
    FULL:     [0.54112191]

model  2:     [DecisionTreeRegressor]
    fold  0:  [0.79069244]
    fold  1:  [0.78081929]
    fold  2:  [0.79229353]
    fold  3:  [0.82547695]
    fold  4:  [0.84446634]
    ----
    MEAN:     [0.80674971] + [0.02413486]
    FULL:     [0.80674214]



Since we already trained our models and applied the cross-validation, we can use the sklearn package (which do not utilize any CV).

In [62]:
models = [
    ('rf', RandomForestRegressor(random_state=123, **best_params_rf)),
    ('xgb', xgb.XGBRegressor(**best_params_xgb, random_state=123)),
    ('dtr', DecisionTreeRegressor(**best_params_dtr, random_state=123))
    ]
meta_model = xgb.XGBRegressor(**best_params_xgb, random_state=123)

# Create the StackingRegressor
stacking_regressor = StackingRegressor(estimators=models, final_estimator=meta_model)
stacking_regressor.fit(X_train, y_train_t)

# Evaluate the StackingRegressor 
score = stacking_regressor.score(X_test, y_test_t)

# Make predictions using the meta-model
y_pred = stacking_regressor.predict(X_test)

# Step 4: Evaluate the stacking model

In [63]:
def custom_mape_scorer(y_true, y_pred):
    ape = abs((y_true - y_pred) / y_true) * 100
    return np.mean(ape)

In [64]:
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)

In [66]:
scores_mae = cross_val_score(stacking_regressor, X_train, y_train_t, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
scores_mse = cross_val_score(stacking_regressor, X_train, y_train_t, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

scores_mae = abs(scores_mae)
scores_mse = abs(scores_mse)

print('Mean MAE: %.3f (%.3f) Mean MSE: %.3f' % (scores_mae.mean(), scores_mae.std(), scores_mse.mean()) )

mape_scores = cross_val_score(stacking_regressor, X_train, y_train_t, cv=5, scoring=make_scorer(custom_mape_scorer))

print("MAPE scores:", mape_scores)
print("Mean MAPE:", np.mean(mape_scores))

Mean MAE: 0.566 (0.029) Mean MSE: 0.838
MAPE scores: [7.12125442 7.01969374 6.33868515 6.58329872 7.42769338]
Mean MAPE: 6.89812508306818


# Step 5: Conclusion

Stacking model was performed using the target variable with Boc-Cox transformation. Surpisingly, its performance was worse in relation to the meta model: XGBoost.

# Step 6: Final prediciton

In [67]:
scores_mae = cross_val_score(stacking_regressor, X_test, y_test_t, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
scores_mse = cross_val_score(stacking_regressor, X_test, y_test_t, scoring='neg_mean_squared_error', cv=cv, n_jobs=-1)

scores_mae = abs(scores_mae)
scores_mse = abs(scores_mse)

print('Mean MAE: %.3f (%.3f) Mean MSE: %.3f' % (scores_mae.mean(), scores_mae.std(), scores_mse.mean()) )

mape_scores = cross_val_score(stacking_regressor, X_test, y_test_t, cv=5, scoring=make_scorer(custom_mape_scorer))

print("MAPE scores:", mape_scores)
print("Mean MAPE:", np.mean(mape_scores))

Mean MAE: 0.705 (0.053) Mean MSE: 1.191
MAPE scores: [8.0657022  8.90722287 7.83579876 8.34958414 9.55975083]
Mean MAPE: 8.543611758396105


In [70]:
y_test_pred = stacking_regressor.predict(X_test)  
y_test_pred = pd.DataFrame(y_test_pred)
y_test_pred.to_csv('y_pred_STACK.csv', index=False)