In [1]:
import os
import pandas as pd
import datetime
import seaborn as sns
import holidays
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder
from sklearn.compose import make_column_transformer
from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import FunctionTransformer, StandardScaler
from scipy.stats import uniform, randint
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold, RandomizedSearchCV, train_test_split
import xgboost as xgb
import lightgbm as lgb
import catboost
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import StackingRegressor, AdaBoostRegressor
from sklearn.model_selection import cross_val_predict
from sklearn.linear_model import LassoLarsCV, ElasticNetCV, RidgeCV, LassoCV, SGDRegressor
import numpy as np
import problem
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import HistGradientBoostingRegressor
from catboost import CatBoostRegressor
from sklearn.impute import SimpleImputer
#!pip install category_encoders
from category_encoders.target_encoder import TargetEncoder
import warnings
warnings.filterwarnings('ignore')

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


In [2]:
# load train data
X, y = problem.get_train_data()
external_data = pd.read_csv(os.path.join('submissions', 'test_final', 'external_data.csv'))

In [3]:
external_data.dtypes

DateOfDeparture              object
event_level_dep             float64
mean_temp_dep                 int64
year                          int64
month                         int64
Departure                    object
Arrival                      object
departures_performed        float64
passengers                  float64
utilization_rate            float64
distance                    float64
passengers_per_departure    float64
event_level_arr             float64
gdp_capita_dep              float64
population_dep                int64
gdp_capita_arr              float64
population_arr                int64
sum_gdp_dep_arr             float64
sum_pop_dep_arr               int64
avg_monthly_cost_gallon     float64
perc_cancelled              float64
is_holiday                     bool
holidays_distance             int64
dtype: object

In [4]:
external_data.head()

Unnamed: 0,DateOfDeparture,event_level_dep,mean_temp_dep,year,month,Departure,Arrival,departures_performed,passengers,utilization_rate,...,gdp_capita_dep,population_dep,gdp_capita_arr,population_arr,sum_gdp_dep_arr,sum_pop_dep_arr,avg_monthly_cost_gallon,perc_cancelled,is_holiday,holidays_distance
0,2011-09-01,0.0,29,2011,9,ATL,BOS,465.0,61995.0,0.864427,...,56783.0,437812,81680.0,630505,138463.0,1068317,0.3087,0.0,False,4
1,2011-09-01,0.0,24,2011,9,EWR,IAH,303.0,40906.0,0.73564,...,72237.0,8272948,66407.0,2126032,138644.0,10398980,0.3087,0.0,False,4
2,2011-09-01,0.0,28,2011,9,DEN,LAS,599.0,77612.0,0.909263,...,59659.0,620530,47641.0,586606,107300.0,1207136,0.3087,0.0,False,4
3,2011-09-01,0.0,24,2011,9,EWR,DEN,257.0,32136.0,0.821577,...,72237.0,8272948,59659.0,620530,131896.0,8893478,0.3087,0.0,False,4
4,2011-09-01,0.0,24,2011,9,EWR,CLT,440.0,38418.0,0.783641,...,72237.0,8272948,61277.0,754829,133514.0,9027777,0.3087,0.0,False,4


In [5]:
def _merge_external_data(X):
    external_data = pd.read_csv(os.path.join('submissions', 'test_final', 'external_data.csv'))
    
    # create a "year" and "month" columns to enable the merge
    X.loc[:, 'DateOfDeparture'] = pd.to_datetime(X.loc[:, 'DateOfDeparture'])
    external_data.loc[:, 'DateOfDeparture'] = pd.to_datetime(external_data.loc[:, 'DateOfDeparture'])
    #external_data = external_data.fillna(0.0)
    
    X['connexion'] = X['Departure'] + '_' + X['Arrival']
    X['connexion'] = ['_'.join(np.sort(x.split('_'))) for x in X['connexion']]

    external_data['event_level_dep_arr'] = external_data['event_level_dep'] * external_data['event_level_arr']
    external_data.drop(columns=['event_level_dep', 'event_level_arr'],inplace=True)

    X_merged = X.merge(external_data, how='left', on=['Departure', 'Arrival', 'DateOfDeparture'])
    
    pass_per_month = external_data[external_data['year'] == 2012].groupby('month').agg('sum')['passengers'].to_dict()
    X_merged['month_importance'] = X_merged['month'].replace(pass_per_month)
    
    pass_per_connexion = X_merged.groupby('connexion').agg('mean')['passengers'].to_dict()
    X_merged['connexion_importance'] = X_merged['connexion'].replace(pass_per_connexion)
    
    
    
    # super cheat
    #X_merged['log_passengers_PAX'] = X_merged['log_passengers'] - y
    
    X_merged.drop(columns=['mean_temp_dep','passengers_per_departure', 'avg_monthly_cost_gallon'], inplace=True)
    return X_merged

In [6]:

def _encode_dates(X):
    X_encoded = X.copy()

    # Make sure that DateOfDeparture is of datetime format
    X_encoded.loc[:, 'DateOfDeparture'] = pd.to_datetime(X_encoded.loc[:, 'DateOfDeparture'])
    
    # Encode the DateOfDeparture
    X_encoded.loc[:, 'day'] = X_encoded['DateOfDeparture'].dt.day
    X_encoded.loc[:, 'weekday'] = X_encoded['DateOfDeparture'].dt.weekday
    X_encoded.loc[:, 'week'] = X_encoded['DateOfDeparture'].dt.week
    X_encoded.loc[:, 'is_weekend'] = [True if x in [5, 6] else False for x in X_encoded.loc[:, 'weekday']]
    X_encoded.loc[:, 'n_days'] = X_encoded['DateOfDeparture'].apply(lambda date: (date - pd.to_datetime("1970-01-01")).days)

    # Encode holidays
    us_holidays = holidays.US()
    X_encoded.loc[:, 'is_holiday'] = [x in us_holidays for x in X_encoded['DateOfDeparture']]
    X_encoded.loc[:, 'is_beginning_holidays'] = [(x not in us_holidays) & (x + datetime.timedelta(days=1) in us_holidays) for x in X_encoded['DateOfDeparture']]
    X_encoded.loc[:, 'is_end_holidays'] = [(x in us_holidays) & (x + datetime.timedelta(days=1) not in us_holidays) for x in X_encoded['DateOfDeparture']]
    
    X_encoded.drop(columns=['DateOfDeparture'], inplace=True)

    return X_encoded 


In [33]:
def get_estimator():
    # preprocessing the data
    data_merger = FunctionTransformer(_merge_external_data)
    date_encoder = FunctionTransformer(_encode_dates)

    # create a preprocessor 
    preprocessor = make_pipeline(data_merger, date_encoder)

    ###
    target_encoder = TargetEncoder()
    target_cols = ['connexion', 'event_level_dep_arr', 'day', 'weekday', 'week', 'year', 'month']


    numerical_cols = ['WeeksToDeparture', 'std_wtd',
                'departures_performed', 'passengers',
                'utilization_rate', 'distance', 'gdp_capita_dep', 'population_dep',
                'gdp_capita_arr', 'population_arr', 'sum_gdp_dep_arr',
                'sum_pop_dep_arr', 'month_importance', 'connexion_importance']

    numerical_encoder = make_pipeline(StandardScaler())#, fill_value="missing"))

    categorical_cols = ['Departure', 'Arrival']

    categorical_encoder = make_pipeline(OrdinalEncoder())

    encoder = make_column_transformer(
        (categorical_encoder, categorical_cols),
        (numerical_encoder, numerical_cols),
        (target_encoder, target_cols),
        remainder='passthrough'  # passthrough numerical columns as they are
    )

    # Models
    xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42, reg_alpha=0.795, reg_lambda=0.172,
                                colsample_bytree=0.836, gamma=0.042, learning_rate=0.114,
                                max_depth=13, subsample=0.920, booster='dart')

    #lgb_model = lgb.LGBMRegressor(objective='regression', random_state=42, metric="rmse", learning_rate=0.114, max_depth=41, 
                                  #n_estimators=775, num_leaves=20, reg_lambda=0.123, reg_alpha=0.46, boosting='dart')
    
    lgb_model = lgb.LGBMRegressor(objective='regression', random_state=42, metric="rmse", learning_rate=0.221, max_depth=10, 
                                  n_estimators=1931, num_leaves=28, reg_lambda=0.502, reg_alpha=0.121, boosting='dart')

    hist_model = HistGradientBoostingRegressor(learning_rate=0.18, l2_regularization=0.511, max_depth=45)

    ada_model = AdaBoostRegressor(n_estimators=493, learning_rate=0.011)
    
    cat_model = CatBoostRegressor(loss_function='RMSE', n_estimators=5000, learning_rate=0.05, max_depth=6, verbose=False)
    
    randf_model = RandomForestRegressor(n_estimators=296, max_depth=39, min_samples_split=3, min_samples_leaf=1 )
    
    gbr_model = GradientBoostingRegressor(learning_rate=0.111, max_depth=7, min_samples_leaf=2, min_samples_split=6,
                                         n_estimators=217)
    #sgd_model = SGDRegressor()

    #estimators = [('xgb', xgb_model), ('lgb', lgb_model), ('hist', hist_model)]
    estimators = [('xgb', xgb_model), ('lgb', lgb_model)]
    
    stacked_model = StackingRegressor(estimators=estimators,
                                      final_estimator=LassoLarsCV(normalize=True)
                                      )
    
    # create the final pipeline
    return make_pipeline(preprocessor, encoder, stacked_model)


In [34]:
pipeline = get_estimator()

In [35]:

scores = cross_val_score(
    pipeline, X, y, cv=5, scoring='neg_mean_squared_error'
)
rmse_scores = np.sqrt(-scores)

print(
    f"RMSE: {np.mean(rmse_scores):.4f} +/- {np.std(rmse_scores):.4f}"
)

RMSE: 0.3118 +/- 0.0192


# Model Testing

DART optimizer: 
LGB: RMSE: 0.3138 +/- 0.0198
XGB: RMSE: 0.3363
Stacked + DART LGB+XGB+HIST: RMSE: 0.3120 +/- 0.0195
Stacked + DART LGB+XGB: RMSE: 0.3118 +/- 0.0192

### + target encoding (connexion)
Untuned:
LGB: RMSE: 0.3292 +/- 0.0184
XGB: RMSE: 0.3364 +/- 0.0192
HIST: RMSE: 0.3472 +/- 0.0261
Stacked, LGB+XGB+HIST: RMSE: 0.3229 +/- 0.0224

Tuned:
LGB: RMSE: 0.3292 +/- 0.0184
XGB: RMSE: 0.3362 +/- 0.0192
HIST: RMSE: 0.3441 +/- 0.0198
Stacked, LGB+XGB+HIST: RMSE: 0.3230 +/- 0.0210

### + monthly importance
Untuned:
LGB: RMSE: 0.3318 +/- 0.0205
XGB: RMSE: 0.3443 +/- 0.0191
HIST: RMSE: 0.3490 +/- 0.0205
Stacked, LGB+XGB+HIST: 0.3274

Tuned:
LGB: RMSE: 0.3300 +/- 0.0195
XGB: RMSE: 0.3419 +/- 0.0135
HIST: RMSE: 0.3463 +/- 0.0179
Stacked, LGB+XGB+HIST: RMSE: 0.3244 +/- 0.0177

Ordinal encoding + no scaler:
Stacked, LGB+XGB+HIST: RMSE: 0.3265 +/- 0.0186

Ordinal encoding + standard scaler:
Stacked, LGB+XGB+HIST: RMSE: 0.3265

Standard scaler + one hot encoder: 
LGB: RMSE: 0.3397 +/- 0.0172
Cat:
Stacked, LGB+XGB+HIST:

Standar scaler:
LGB: RMSE: 0.3318 +/- 0.0205
Stacked, LGB+XGB+HIST: RMSE: 0.3274 +/- 0.0203

One hot encoder:
LGB: RMSE: 0.3394 +/- 0.0181
Stacked, LGB+XGB+HIST:


RMSE: 0.3266 +/- 0.0189
fill 0: RMSE: 0.3379 +/- 0.0254
fill 1: RMSE: 0.3379 +/- 0.0253

## Untuned:

### +distance to holiday:
Stacked, LGB+XGB+GBR: RMSE: 0.3279 +/- 0.0208
Stacked, LGB+XGB+HIST: RMSE: 0.3286 +/- 0.0196
LGB: RMSE: 0.3327 +/- 0.0185
HIST: RMSE: 0.3443 +/- 0.0186
XGB: RMSE: 0.3418 +/- 0.0221
RANDF: RMSE: 0.4082 +/- 0.0248
GBR: RMSE: 0.3422 +/- 0.0226

### Final estimator: LassoLarsCV
Stacked: RMSE: 0.3410 +/- 0.0268
Randf: RMSE: 0.4349 +/- 0.0307
Stacked, LGB+XGB+ADA: RMSE: 0.3385 +/- 0.0228
Stacked, LGB+XGB+GBR: RMSE: 0.3380 +/- 0.0229
Stacked, LGB+XGB+GBR+RANDF: RMSE: 0.3372 +/- 0.0223
Stacked, LGB+XGB+GBR+ADA: RMSE: 0.3377 +/- 0.0227

### Final estimator: ElasticNetCV
Stacked, LGB+XGB+GBR: RMSE: 0.3460 +/- 0.0255
Stacked, LGB+XGB+GBR+ADA: RMSE: 0.3445 +/- 0.0253

### Final estimator: RidgeCV
Stacked, LGB+XGB+GBR: RMSE: 0.3478 +/- 0.0258

### Final estimator: Randf
Stacked, LGB+XGB+GBR: RMSE: 0.3612 +/- 0.0193

### Final estimator: LassoCV
Stacked, LGB+XGB+GBR: RMSE: 0.3382 +/- 0.0231

### Final estimator: LGB
Stacked, LGB+XGB+GBR: RMSE: 0.3469 +/- 0.0234


## Tuned:
### +distance to holiday:
Stacked, LGB+XGB+RANDF: RMSE: 0.3255 +/- 0.0196
Stacked, LGB+XGB+GBR+RANDF: RMSE: 0.3232 +/- 0.0203
Stacked, LGB+XGB+GBR: RMSE: 0.3255 +/- 0.0205
Stacked, LGB+XGB+HIST: RMSE: 0.3261 +/- 0.0199
HIST: RMSE: 0.3440 +/- 0.0183
XGB: RMSE: 0.3418 +/- 0.0221
LGB: RMSE: 0.3327 +/- 0.0185
RANDF:RMSE: 0.4079 +/- 0.0243
GBR: RMSE: 0.3422 +/- 0.0226

### Final estimator: LassoLarsCV
Stacked, LGB+XGB+GBR+RANDF: RMSE: 0.3348 +/- 0.0244
Stacked, LGB+XGB+GBR: RMSE: 0.3374 +/- 0.0252
Stacked, LGB+XGB+ADA+RANDF: RMSE: 0.3368 +/- 0.0222
Stacked, LGB+XGB+ADA: RMSE: 0.3382 +/- 0.0227
Stacked, LGB+XGB+RANDF: RMSE: 0.3381 +/- 0.0229
Stacked, LGB+XGB: RMSE: 0.3392 +/- 0.0238
Stacked, LGB+XGB+HIST+RANDF: RMSE: 0.3378 +/- 0.0229
Stacked, LGB+XGB+HIST: RMSE: 0.3390 +/- 0.0238
Stacked, LGB+HIST: RMSE: 0.3431 +/- 0.0227
Stacked, XGB+HIST: RMSE: 0.3440 +/- 0.0245
LGB: RMSE: 0.3441 +/- 0.0253
XGB: RMSE: 0.3577 +/- 0.0257
Hist: RMSE: 0.3565 +/- 0.0231
Randf: RMSE: 0.4338 +/- 0.0307
GBR: RMSE: 0.3540 +/- 0.0294

# Hyperparameters

### LGB
lgb_model = lgb.LGBMRegressor(objective='regression', random_state=42, metric="rmse", learning_rate=0.114, max_depth=41,n_estimators=775, num_leaves=20, reg_lambda=0.123, reg_alpha=0.46)

### XGB
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", random_state=42, reg_alpha=0.795, reg_lambda=0.172,
                                colsample_bytree=0.836, gamma=0.042, learning_rate=0.114,
                                max_depth=13, subsample=0.920)


### HIST
hist_model = HistGradientBoostingRegressor(learning_rate=0.18, l2_regularization=0.511, max_depth=45)


### RANDF
randf_model = RandomForestRegressor(n_estimators=296, max_depth=39, min_samples_split=3, min_samples_leaf=1 )

### ADA
ada_model = AdaBoostRegressor(n_estimators=493, learning_rate=0.011)

### GBR
gbr_model = GradientBoostingRegressor(learning_rate=0.111, max_depth=7, min_samples_leaf=2, min_samples_split=6,
                                     n_estimators=217)


In [463]:
pipeline.get_params().keys()

dict_keys(['memory', 'steps', 'verbose', 'pipeline', 'columntransformer', 'gradientboostingregressor', 'pipeline__memory', 'pipeline__steps', 'pipeline__verbose', 'pipeline__functiontransformer-1', 'pipeline__functiontransformer-2', 'pipeline__functiontransformer-1__accept_sparse', 'pipeline__functiontransformer-1__check_inverse', 'pipeline__functiontransformer-1__func', 'pipeline__functiontransformer-1__inv_kw_args', 'pipeline__functiontransformer-1__inverse_func', 'pipeline__functiontransformer-1__kw_args', 'pipeline__functiontransformer-1__validate', 'pipeline__functiontransformer-2__accept_sparse', 'pipeline__functiontransformer-2__check_inverse', 'pipeline__functiontransformer-2__func', 'pipeline__functiontransformer-2__inv_kw_args', 'pipeline__functiontransformer-2__inverse_func', 'pipeline__functiontransformer-2__kw_args', 'pipeline__functiontransformer-2__validate', 'columntransformer__n_jobs', 'columntransformer__remainder', 'columntransformer__sparse_threshold', 'columntransf

In [327]:
# GBR Hyper-paramter fine tuning (danger)
'''
params = {
    'gradientboostingregressor__n_estimators': randint(50, 500),
    'gradientboostingregressor__learning_rate': uniform(0.01, 1.0),
    'gradientboostingregressor__max_depth': randint(1, 50),
    'gradientboostingregressor__min_samples_split': randint(2, 10),
    'gradientboostingregressor__min_samples_leaf': randint(1, 5)
}

search = RandomizedSearchCV(pipeline, param_distributions=params, random_state=42, scoring='neg_root_mean_squared_error',
                            n_iter=200, cv=5, verbose=1, n_jobs=-1, return_train_score=True)

search.fit(X, y)

search.best_params_

#report_best_scores(search.cv_results_, 1)
'''

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.4min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed: 11.6min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 27.0min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed: 50.3min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 63.2min finished


{'gradientboostingregressor__learning_rate': 0.11112267612279024,
 'gradientboostingregressor__max_depth': 7,
 'gradientboostingregressor__min_samples_leaf': 2,
 'gradientboostingregressor__min_samples_split': 6,
 'gradientboostingregressor__n_estimators': 217}

In [373]:
# Ada Hyper-paramter fine tuning (danger)
'''
params = {
    'adaboostregressor__n_estimators': randint(50, 500),
    'adaboostregressor__learning_rate': uniform(0.01, 1.0),
}

search = RandomizedSearchCV(pipeline, param_distributions=params, random_state=42, scoring='neg_root_mean_squared_error',
                            n_iter=100, cv=5, verbose=1, n_jobs=-1, return_train_score=True)

search.fit(X, y)

search.best_params_

#report_best_scores(search.cv_results_, 1)
'''

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   50.6s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 17.0min
[Parallel(n_jobs=-1)]: Done 500 out of 500 | elapsed: 20.1min finished


{'adaboostregressor__learning_rate': 0.010778765841014329,
 'adaboostregressor__n_estimators': 493}

In [320]:
# Random forest Hyper-paramter fine tuning (danger)
'''
params = {
    'randomforestregressor__n_estimators': randint(50, 500),
    'randomforestregressor__max_depth': randint(1, 50),
    'randomforestregressor__min_samples_split': randint(2, 10),
    'randomforestregressor__min_samples_leaf': randint(1, 5)
    
}

search = RandomizedSearchCV(pipeline, param_distributions=params, random_state=42, scoring='neg_root_mean_squared_error',
                            n_iter=200, cv=5, verbose=1, n_jobs=-1, return_train_score=True)

search.fit(X, y)

search.best_params_

#report_best_scores(search.cv_results_, 1)
'''

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   40.2s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  4.6min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 11.2min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed: 20.2min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 26.8min finished


{'randomforestregressor__max_depth': 39,
 'randomforestregressor__min_samples_leaf': 1,
 'randomforestregressor__min_samples_split': 3,
 'randomforestregressor__n_estimators': 296}

In [16]:
# LGBM Hyper-paramter fine tuning (danger)
#'''
params = {
    'lgbmregressor__learning_rate': uniform(0.01, 0.3),
    'lgbmregressor__num_leaves': randint(10, 200),
    'lgbmregressor__n_estimators': randint(50, 2000),
    'lgbmregressor__max_depth': randint(1, 50),
    'lgbmregressor__reg_lambda':  uniform(0.01, 1.0),
    'lgbmregressor__reg_alpha':  uniform(0.01, 1.0)
}

search = RandomizedSearchCV(pipeline, param_distributions=params, random_state=42, scoring='neg_root_mean_squared_error',
                            n_iter=200, cv=5, verbose=1, n_jobs=-1, return_train_score=True)

search.fit(X, y)

search.best_params_

#report_best_scores(search.cv_results_, 1)
#'''

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:  1.1min
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  8.2min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed: 19.8min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed: 34.8min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 45.6min finished


{'lgbmregressor__learning_rate': 0.22152530928899797,
 'lgbmregressor__max_depth': 10,
 'lgbmregressor__n_estimators': 1931,
 'lgbmregressor__num_leaves': 28,
 'lgbmregressor__reg_alpha': 0.12119748230615134,
 'lgbmregressor__reg_lambda': 0.5026251042908592}

In [356]:
# Hist Finetuning
#pipeline.get_params().keys()
'''
params = {
    "histgradientboostingregressor__learning_rate": uniform(0.03, 0.3), # default 0.1 
    "histgradientboostingregressor__l2_regularization": uniform(0, 1),
    "histgradientboostingregressor__max_depth": randint(1, 100),
}

search = RandomizedSearchCV(pipeline, param_distributions=params, random_state=42, n_iter=200, cv=5, verbose=1, n_jobs=-1, return_train_score=True)

search.fit(X, y)

search.best_params_

#report_best_scores(search.cv_results_, 1)
'''

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   37.4s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  3.1min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  7.2min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 16.8min finished


{'histgradientboostingregressor__l2_regularization': 0.5113423988609378,
 'histgradientboostingregressor__learning_rate': 0.18045488840615986,
 'histgradientboostingregressor__max_depth': 45}

In [26]:
# XGB Finetuning
#'''
params = {
    "xgbregressor__colsample_bytree": uniform(0.7, 0.3),
    "xgbregressor__gamma": uniform(0.01, 1.0),
    "xgbregressor__learning_rate": uniform(0.03, 0.3), # default 0.1 
    "xgbregressor__max_depth": randint(1, 20),
    "xgbregressor__subsample": uniform(0.01, 1.0),
    "xgbregressor__reg_alpha": uniform(0.01, 1.0),
    "xgbregressor__reg_lambda": uniform(0.01, 1.0)
}

search = RandomizedSearchCV(pipeline, param_distributions=params, random_state=42, n_iter=200, cv=5, verbose=1, n_jobs=-1, return_train_score=True)

search.fit(X, y)

search.best_params_
#report_best_scores(search.cv_results_, 1)
#'''

Fitting 5 folds for each of 200 candidates, totalling 1000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done  18 tasks      | elapsed:   30.2s
[Parallel(n_jobs=-1)]: Done 168 tasks      | elapsed:  2.7min
[Parallel(n_jobs=-1)]: Done 418 tasks      | elapsed:  6.9min
[Parallel(n_jobs=-1)]: Done 768 tasks      | elapsed: 12.6min
[Parallel(n_jobs=-1)]: Done 1000 out of 1000 | elapsed: 16.2min finished


{'xgbregressor__colsample_bytree': 0.9401760097327247,
 'xgbregressor__gamma': 0.2101502442448101,
 'xgbregressor__learning_rate': 0.08024477467772093,
 'xgbregressor__max_depth': 17,
 'xgbregressor__reg_alpha': 0.961549974355327,
 'xgbregressor__reg_lambda': 0.13421996672805936,
 'xgbregressor__subsample': 0.9062213026850383}

In [341]:
!ramp-test --submission test_final

[38;5;178m[1mTesting Number of air passengers prediction[0m
[38;5;178m[1mReading train and test files from ./data ...[0m
[38;5;178m[1mReading cv ...[0m
[38;5;178m[1mTraining submissions/test_final ...[0m
[38;5;178m[1mCV fold 0[0m
This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.
  X_encoded.loc[:, 'week'] = X_encoded['DateOfDeparture'].dt.week
  X_encoded.loc[:, 'week'] = X_encoded['DateOfDeparture'].dt.week
  X_encoded.loc[:, 'week'] = X_encoded['DateOfDeparture'].dt.week
	[38;5;178m[1mscore   rmse       time[0m
	[38;5;10m[1mtrain[0m  [38;5;10m[1m0.097[0m  [38;5;150m39.440579[0m
	[38;5;12m[1mvalid[0m  [38;5;12m[1m0.354[0m   [38;5;