# LightGBM Baseline Notebook
reference: https://www.kaggle.com/code/greysky/lightgbm-starter-with-us-map-lb-1-0871

In [1]:
import os

import numpy as np
import pandas as pd
import lightgbm as lgb
import optuna

from utils import lagging_on, transpose_columns

pd.set_option('display.max_columns', 50)
# folder path
HOST_PROVIDED = 'data/host_provided'
EXTERNAL = 'data/external'
SAVE_PATH = 'data/ready'
# train data metadata
date_col = 'first_day_of_month'
cat_cols = ['county', 'state']
target = 'microbusiness_density'
idx = 'row_id'
# feature engineering parameter
LAGS = 12

  from .autonotebook import tqdm as notebook_tqdm


## Feature Engineering

### Load host-provided datasets
- train.csv
- test.csv
- sample_submission.csv
- census_starter.csv

In [2]:
df_train = pd.read_csv(os.path.join(HOST_PROVIDED, 'train.csv'), index_col=idx)
df_test = pd.read_csv(os.path.join(HOST_PROVIDED, 'test.csv'), index_col=idx)
df_subm = pd.read_csv(os.path.join(HOST_PROVIDED, 'sample_submission.csv'), index_col=idx)
df_census = pd.read_csv(os.path.join(HOST_PROVIDED, 'census_starter.csv'), index_col="cfips")

### Preprocessing

In [3]:
# get cfips - (state - county) mapping
state_dict = df_train[['cfips', 'state', 'county']]
state_dict = state_dict.set_index('cfips')
state_dict = state_dict.drop_duplicates()
state_dict = state_dict.to_dict()
# map test df's cfips to state and county, then concatenate train / test set for feature engineering
df_test['state'] = df_test['cfips'].map(state_dict['state'])
df_test['county'] = df_test['cfips'].map(state_dict['county'])
df_all = pd.concat([df_train, df_test], axis=0)
# convert date column to datetime type
df_all[date_col] = pd.to_datetime(df_all[date_col])
# separate date time to year and month, add relative month to the first month as 'scale' column
df_all['year'] = df_all[date_col].dt.year
df_all['month'] = df_all[date_col].dt.month
df_all['scale'] = (df_all[date_col] - df_all[date_col].min()).dt.days
df_all['scale'] = df_all['scale'].factorize()[0]
# drop and convert
df_all = df_all.drop(columns=[date_col])
df_all[cat_cols] = df_all[cat_cols].astype('category')

### Generate lag data
generate two time series on the following columns, the time span is 12 (LAGS) months
- microbusiness_density
- active

In [4]:
# add lag data on "microbusiness_density" and "active" columns
df_all = lagging_on(df_all, groupby="cfips", columns=["microbusiness_density", "active"], n_lag=LAGS)
# drop the first month (lagging data columns are empty)
df_all = df_all[df_all['scale'] != 0]


### Join the dataset with population_data.csv

In [5]:
df_population = pd.read_csv(os.path.join(EXTERNAL, 'population_data.csv'))
# since the host use 2-year-lagged data
df_population.year = df_population.year + 2000 + 2
df_all = pd.merge(df_all.reset_index(), df_population, how="left", left_on=['cfips', 'year'], right_on=['cfips', 'year']).drop(columns=["Geography", "Geographic Area Name"])

### Aggregate neighbour county's population
sum aggregation for:
- Total population
- Under 18 years

mean aggregation for:
- median_hh_inc

In [6]:

# load data
df_neighbours = pd.read_csv(os.path.join(EXTERNAL, 'county-neighbours.csv'))
df_population = pd.read_csv(os.path.join(EXTERNAL, 'population_data.csv'))
df_neighbours = df_neighbours.rename(columns={"Neighbour county code": "cfips"})
# join two df
df_neighbours_merge = pd.merge(df_neighbours, df_population, how="left", left_on=['cfips'], right_on=['cfips'])
income_columns = ["median_hh_inc_2017", "median_hh_inc_2018", "median_hh_inc_2019", "median_hh_inc_2020", "median_hh_inc_2021"]
# join with host-provided census data's income related columns
df_neighbours_merge = pd.merge(df_neighbours_merge, df_census[income_columns].reset_index(), how="left", left_on=['cfips'], right_on=['cfips']).dropna()
# sum aggregate
agg_neighbour = df_neighbours_merge.groupby(["Countycode", "year"])[["Total population", "Under 18 years"]].sum()
agg_neighbour =agg_neighbour.rename(columns={"Total population": "sum_neighbour_p", "Under 18 years": "sum_neighbour_u18"})
# mean aggregate
mean_neighbour_inc = df_neighbours_merge.groupby(["Countycode"])[income_columns].mean()
# transpose table
hh_inc_f = lambda x:float(x.replace("median_hh_inc_20", ""))
agg_neighbour["mean_neighbour_median_hh_inc"] = transpose_columns(mean_neighbour_inc, col_f=hh_inc_f)
agg_neighbour = agg_neighbour.reset_index().rename(columns={"Countycode": "cfips"})
agg_neighbour.year = agg_neighbour.year + 2000 + 2
df_all = pd.merge(df_all, agg_neighbour, how="left", left_on=["cfips", "year"], right_on=["cfips", "year"])

In [7]:
agg_neighbour

Unnamed: 0,cfips,year,sum_neighbour_p,sum_neighbour_u18,mean_neighbour_median_hh_inc
0,1001,2019.0,403031.0,95232.0,40975.400000
1,1001,2020.0,402348.0,94665.0,42990.800000
2,1001,2021.0,401368.0,94004.0,44472.800000
3,1001,2022.0,400244.0,93386.0,46675.800000
4,1001,2023.0,410571.0,96299.0,48653.800000
...,...,...,...,...,...
15695,56045,2019.0,213771.0,50830.0,57808.571429
15696,56045,2020.0,214664.0,50380.0,58570.571429
15697,56045,2021.0,216106.0,50564.0,60684.285714
15698,56045,2022.0,217815.0,50411.0,63240.285714


### Transpose census_starter.csv and index by their 'year'
transpose the following columns:
- pct_bb
- pct_college
- pct_foreign_born
- pct_it_workers
- median_hh_inc

In [8]:
# transpose and create a new dataframe
pct_bb_cols = ["pct_bb_2017", "pct_bb_2018", "pct_bb_2019", "pct_bb_2020", "pct_bb_2021"]
pct_bb_f = lambda x:float(x.replace("pct_bb_20", ""))
df_census_transposed = transpose_columns(df_census, columns=pct_bb_cols, col_name="year", col_f=pct_bb_f)
df_census_transposed = pd.DataFrame(df_census_transposed, columns=["pct_bb"])
pct_college_cols = ["pct_college_2017", "pct_college_2018", "pct_college_2019", "pct_college_2020", "pct_college_2021"]
pct_college_f = lambda x:float(x.replace("pct_college_20", ""))
df_census_transposed["pct_college"] = transpose_columns(df_census, columns=pct_college_cols, col_name="year", col_f=pct_college_f)
pct_foreign_born_cols = ["pct_foreign_born_2017", "pct_foreign_born_2018", "pct_foreign_born_2019", "pct_foreign_born_2020", "pct_foreign_born_2021"]
pct_foreign_born_f = lambda x:float(x.replace("pct_foreign_born_20", ""))
df_census_transposed["pct_foreign_born"] = transpose_columns(df_census, columns=pct_foreign_born_cols, col_name="year", col_f=pct_foreign_born_f)
pct_it_workers_cols = ["pct_it_workers_2017", "pct_it_workers_2018", "pct_it_workers_2019", "pct_it_workers_2020", "pct_it_workers_2021"]
pct_it_workers_f = lambda x:float(x.replace("pct_it_workers_20", ""))
df_census_transposed["pct_it_workers"] = transpose_columns(df_census, columns=pct_it_workers_cols, col_name="year", col_f=pct_it_workers_f)
median_hh_inc_cols = ["median_hh_inc_2017", "median_hh_inc_2018", "median_hh_inc_2019", "median_hh_inc_2020", "median_hh_inc_2021"]
median_hh_inc_f = lambda x:float(x.replace("median_hh_inc_20", ""))
df_census_transposed["median_hh_inc"] = transpose_columns(df_census, columns=median_hh_inc_cols, col_name="year", col_f=median_hh_inc_f)
df_census_transposed.reset_index().to_csv(os.path.join(EXTERNAL, "census_transposed.csv"), index=False)
# join the dataset with the transposed census dataset
df_census_transposed = df_census_transposed.reset_index()
df_census_transposed.year = df_census_transposed.year + 2000 + 2
df_all = pd.merge(df_all, df_census_transposed, how="left", left_on=["cfips", "year"], right_on=["cfips", "year"])

In [9]:
df_census_transposed

Unnamed: 0,cfips,year,pct_bb,pct_college,pct_foreign_born,pct_it_workers,median_hh_inc
0,1001,2019.0,76.6,14.5,2.1,1.3,55317.0
1,1001,2020.0,78.9,15.9,2.0,1.1,58786.0
2,1001,2021.0,80.6,16.1,2.3,0.7,58731.0
3,1001,2022.0,82.7,16.7,2.3,0.6,57982.0
4,1001,2023.0,85.5,16.4,2.1,1.1,62660.0
...,...,...,...,...,...,...,...
15703,56045,2019.0,71.1,14.1,3.8,0.6,59605.0
15704,56045,2020.0,73.3,13.5,4.1,0.6,52867.0
15705,56045,2021.0,76.8,13.4,1.7,0.0,57031.0
15706,56045,2022.0,79.7,12.7,2.3,0.0,53333.0


### (Optional) Add masked dataset replica to train a more robust model

In [None]:
# (Optional) add masked replica
from utils import random_masking
columns = [f"lag_{i+1}_microbusiness_density" for i in np.arange(LAGS)] + [f"lag_{i+1}_active" for i in np.arange(LAGS)]
df_all_replica = random_masking(df_all, columns=columns, n_rep=10, threshold=0.8)

### Feature set ready

In [10]:
df_all = df_all.set_index("row_id")

## Explore LightGBM hyperparmeters with Optuna

### Define training trick and loss function

In [11]:
# training trick
def to_percent(X, y):
    yhat = y / X['lag_1_microbusiness_density']
    yhat[X['lag_1_microbusiness_density'] == 0] = 0 # denominator cannot be 0
    return yhat

def from_percent(X, yhat):
    y = yhat * X[f'lag_1_microbusiness_density']
    return y
# loss function
def smape(y_true, y_pred):
    smap = np.zeros(len(y_true))
    num = np.abs(y_true - y_pred)
    dem = ((np.abs(y_true) + np.abs(y_pred)) / 2)
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]
    return 100 * np.mean(smap)

In [12]:
SMAPE_ENABLED = True
def lgb_objective(trial):
    # define hyperparameters and searching range
    params = {
        'n_iter'           : 200,
        'verbosity'        : -1,
        'objective'        : 'l1',
        'random_state'     : 42,
        'extra_trees'      : True,
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'colsample_bynode' : trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'max_depth'        : trial.suggest_int('max_depth', 3, 10),
        'learning_rate'    : trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'lambda_l1'        : trial.suggest_float('lambda_l1', 1e-2, 10.0),
        'lambda_l2'        : trial.suggest_float('lambda_l2', 1e-2, 10.0),
        'num_leaves'       : trial.suggest_int('num_leaves', 8, 1024),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 5, 250),}
    # LightGBM regression model
    model  = lgb.LGBMRegressor(**params)
    # training data pipeline
    X, y   = df_all.drop(columns=[target, "active"]), df_all[target]
    # split train / validation set
    train_times = list(range(38))
    X_train = X[X['scale'].isin(train_times)]
    y_train = y[X['scale'].isin(train_times)]
    valid_times = [38]
    X_valid = X[X['scale'].isin(valid_times)]
    y_valid = y[X['scale'].isin(valid_times)]
    # training
    if SMAPE_ENABLED:
        y_train = to_percent(X=X_train, y=y_train)
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    if SMAPE_ENABLED:
        y_pred = from_percent(X=X_valid, yhat=y_pred)
    return smape(y_valid, y_pred)

In [13]:
study = optuna.create_study(direction='minimize', study_name='Regressor')
study.optimize(lgb_objective, n_trials=50)

[32m[I 2023-02-20 23:35:47,105][0m A new study created in memory with name: Regressor[0m




[32m[I 2023-02-20 23:35:51,253][0m Trial 0 finished with value: 1.0682763749613446 and parameters: {'colsample_bytree': 0.7074145505094599, 'colsample_bynode': 0.3136684886107868, 'max_depth': 10, 'learning_rate': 0.0342958946166189, 'lambda_l1': 6.826332093262305, 'lambda_l2': 3.404079767774333, 'num_leaves': 556, 'min_data_in_leaf': 145}. Best is trial 0 with value: 1.0682763749613446.[0m




[32m[I 2023-02-20 23:35:53,904][0m Trial 1 finished with value: 1.0753098811841098 and parameters: {'colsample_bytree': 0.4360827113312845, 'colsample_bynode': 0.5544234855373612, 'max_depth': 10, 'learning_rate': 0.013449885118313479, 'lambda_l1': 8.34983438617155, 'lambda_l2': 8.97548760916854, 'num_leaves': 32, 'min_data_in_leaf': 102}. Best is trial 0 with value: 1.0682763749613446.[0m




[32m[I 2023-02-20 23:35:56,823][0m Trial 2 finished with value: 1.069463682109735 and parameters: {'colsample_bytree': 0.5742479712518404, 'colsample_bynode': 0.8078105128525591, 'max_depth': 6, 'learning_rate': 0.019797637050175107, 'lambda_l1': 6.752785118666874, 'lambda_l2': 6.454554004722474, 'num_leaves': 343, 'min_data_in_leaf': 128}. Best is trial 0 with value: 1.0682763749613446.[0m




[32m[I 2023-02-20 23:35:58,784][0m Trial 3 finished with value: 1.0672931821033171 and parameters: {'colsample_bytree': 0.1267665007021934, 'colsample_bynode': 0.6644223339101001, 'max_depth': 9, 'learning_rate': 0.09561656829336337, 'lambda_l1': 8.489207615595332, 'lambda_l2': 2.014764862434698, 'num_leaves': 334, 'min_data_in_leaf': 67}. Best is trial 3 with value: 1.0672931821033171.[0m




[32m[I 2023-02-20 23:36:01,299][0m Trial 4 finished with value: 1.0752592010499538 and parameters: {'colsample_bytree': 0.2719117277759554, 'colsample_bynode': 0.7072526267800726, 'max_depth': 10, 'learning_rate': 0.010500828458287047, 'lambda_l1': 7.28621969473679, 'lambda_l2': 8.798872586568486, 'num_leaves': 933, 'min_data_in_leaf': 224}. Best is trial 3 with value: 1.0672931821033171.[0m




[32m[I 2023-02-20 23:36:03,749][0m Trial 5 finished with value: 1.0743126786884005 and parameters: {'colsample_bytree': 0.37287493967551466, 'colsample_bynode': 0.10793524668587376, 'max_depth': 4, 'learning_rate': 0.08580562411024734, 'lambda_l1': 1.6285103591078471, 'lambda_l2': 7.489728876509457, 'num_leaves': 909, 'min_data_in_leaf': 204}. Best is trial 3 with value: 1.0672931821033171.[0m




[32m[I 2023-02-20 23:36:07,566][0m Trial 6 finished with value: 1.0747445436730059 and parameters: {'colsample_bytree': 0.6664554845510159, 'colsample_bynode': 0.6657208198631619, 'max_depth': 4, 'learning_rate': 0.011433292450299127, 'lambda_l1': 7.209619085608966, 'lambda_l2': 7.417893522304241, 'num_leaves': 130, 'min_data_in_leaf': 162}. Best is trial 3 with value: 1.0672931821033171.[0m




[32m[I 2023-02-20 23:36:09,514][0m Trial 7 finished with value: 1.0742262447316682 and parameters: {'colsample_bytree': 0.1666492518864236, 'colsample_bynode': 0.23540677237396568, 'max_depth': 10, 'learning_rate': 0.042962046295198605, 'lambda_l1': 4.9850979911804165, 'lambda_l2': 3.599566345240722, 'num_leaves': 123, 'min_data_in_leaf': 93}. Best is trial 3 with value: 1.0672931821033171.[0m




[32m[I 2023-02-20 23:36:13,475][0m Trial 8 finished with value: 1.0710989079014472 and parameters: {'colsample_bytree': 0.970927462618238, 'colsample_bynode': 0.4193423733995769, 'max_depth': 3, 'learning_rate': 0.03818160885848873, 'lambda_l1': 8.213244120881265, 'lambda_l2': 1.6561786350717573, 'num_leaves': 468, 'min_data_in_leaf': 14}. Best is trial 3 with value: 1.0672931821033171.[0m




[32m[I 2023-02-20 23:36:15,209][0m Trial 9 finished with value: 1.078153754862564 and parameters: {'colsample_bytree': 0.11894167004742044, 'colsample_bynode': 0.2512493805153436, 'max_depth': 8, 'learning_rate': 0.031381531756689854, 'lambda_l1': 9.5101241229372, 'lambda_l2': 4.348954294429021, 'num_leaves': 396, 'min_data_in_leaf': 221}. Best is trial 3 with value: 1.0672931821033171.[0m




[32m[I 2023-02-20 23:36:16,915][0m Trial 10 finished with value: 1.06730916523438 and parameters: {'colsample_bytree': 0.11457051490074381, 'colsample_bynode': 0.9325058731673672, 'max_depth': 8, 'learning_rate': 0.08729179563828518, 'lambda_l1': 9.989460520317493, 'lambda_l2': 0.10807012763473534, 'num_leaves': 787, 'min_data_in_leaf': 26}. Best is trial 3 with value: 1.0672931821033171.[0m




[32m[I 2023-02-20 23:36:18,587][0m Trial 11 finished with value: 1.0758886372916787 and parameters: {'colsample_bytree': 0.10059149400771036, 'colsample_bynode': 0.9796278574584906, 'max_depth': 8, 'learning_rate': 0.09774083407210145, 'lambda_l1': 9.868512974852806, 'lambda_l2': 0.08505105149519565, 'num_leaves': 714, 'min_data_in_leaf': 24}. Best is trial 3 with value: 1.0672931821033171.[0m




[32m[I 2023-02-20 23:36:20,639][0m Trial 12 finished with value: 1.0685083689670207 and parameters: {'colsample_bytree': 0.2628550134671752, 'colsample_bynode': 0.9940649250190979, 'max_depth': 8, 'learning_rate': 0.06741562887831647, 'lambda_l1': 9.829145274039984, 'lambda_l2': 0.16437130730217633, 'num_leaves': 731, 'min_data_in_leaf': 57}. Best is trial 3 with value: 1.0672931821033171.[0m




[32m[I 2023-02-20 23:36:23,730][0m Trial 13 finished with value: 1.0670680691221335 and parameters: {'colsample_bytree': 0.2490337514017953, 'colsample_bynode': 0.8489022819548419, 'max_depth': 7, 'learning_rate': 0.0672490543800021, 'lambda_l1': 5.212355674916926, 'lambda_l2': 2.09084806506577, 'num_leaves': 662, 'min_data_in_leaf': 57}. Best is trial 13 with value: 1.0670680691221335.[0m




[32m[I 2023-02-20 23:36:25,755][0m Trial 14 finished with value: 1.0697467856396763 and parameters: {'colsample_bytree': 0.2862915838319303, 'colsample_bynode': 0.8241521735509453, 'max_depth': 6, 'learning_rate': 0.06116774254079159, 'lambda_l1': 5.085484636273474, 'lambda_l2': 2.2692510355059823, 'num_leaves': 300, 'min_data_in_leaf': 68}. Best is trial 13 with value: 1.0670680691221335.[0m




[32m[I 2023-02-20 23:36:28,274][0m Trial 15 finished with value: 1.0702527373739215 and parameters: {'colsample_bytree': 0.4275376467423025, 'colsample_bynode': 0.5556861804316375, 'max_depth': 7, 'learning_rate': 0.059485657399093735, 'lambda_l1': 3.9162624030069715, 'lambda_l2': 2.1288448617764666, 'num_leaves': 577, 'min_data_in_leaf': 59}. Best is trial 13 with value: 1.0670680691221335.[0m




[32m[I 2023-02-20 23:36:30,760][0m Trial 16 finished with value: 1.0664853656477997 and parameters: {'colsample_bytree': 0.23545111300646682, 'colsample_bynode': 0.7238845925930432, 'max_depth': 7, 'learning_rate': 0.09931032640596525, 'lambda_l1': 2.5600082865232783, 'lambda_l2': 5.322100489765608, 'num_leaves': 249, 'min_data_in_leaf': 88}. Best is trial 16 with value: 1.0664853656477997.[0m




[32m[I 2023-02-20 23:36:32,595][0m Trial 17 finished with value: 1.0671119302689902 and parameters: {'colsample_bytree': 0.2546166445628348, 'colsample_bynode': 0.8572389669865634, 'max_depth': 5, 'learning_rate': 0.06951466898607873, 'lambda_l1': 0.05280054539542167, 'lambda_l2': 5.550991236698545, 'num_leaves': 640, 'min_data_in_leaf': 100}. Best is trial 16 with value: 1.0664853656477997.[0m




[32m[I 2023-02-20 23:36:34,974][0m Trial 18 finished with value: 1.0717514177855336 and parameters: {'colsample_bytree': 0.38877160031977687, 'colsample_bynode': 0.7459092708625165, 'max_depth': 7, 'learning_rate': 0.048886146930037344, 'lambda_l1': 2.8151654581283445, 'lambda_l2': 5.106087731546045, 'num_leaves': 250, 'min_data_in_leaf': 181}. Best is trial 16 with value: 1.0664853656477997.[0m




[32m[I 2023-02-20 23:36:36,709][0m Trial 19 finished with value: 1.0721180449627754 and parameters: {'colsample_bytree': 0.2031570123560259, 'colsample_bynode': 0.8730672179834753, 'max_depth': 5, 'learning_rate': 0.052441296738592866, 'lambda_l1': 2.646524080771359, 'lambda_l2': 4.30886380458505, 'num_leaves': 478, 'min_data_in_leaf': 119}. Best is trial 16 with value: 1.0664853656477997.[0m




[32m[I 2023-02-20 23:36:40,174][0m Trial 20 finished with value: 1.0663971937937626 and parameters: {'colsample_bytree': 0.33687726026393516, 'colsample_bynode': 0.7398432387131895, 'max_depth': 7, 'learning_rate': 0.07363845542016947, 'lambda_l1': 4.894131017623922, 'lambda_l2': 5.800919197464499, 'num_leaves': 194, 'min_data_in_leaf': 39}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:36:43,848][0m Trial 21 finished with value: 1.068710772000266 and parameters: {'colsample_bytree': 0.33898112667703906, 'colsample_bynode': 0.7707039294841016, 'max_depth': 7, 'learning_rate': 0.07611750512763184, 'lambda_l1': 5.558173860403811, 'lambda_l2': 5.871238244698378, 'num_leaves': 197, 'min_data_in_leaf': 39}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:36:46,647][0m Trial 22 finished with value: 1.066823086812858 and parameters: {'colsample_bytree': 0.2185670278713702, 'colsample_bynode': 0.6356097277951348, 'max_depth': 6, 'learning_rate': 0.07666952693085195, 'lambda_l1': 3.9812197817599673, 'lambda_l2': 6.226675636951219, 'num_leaves': 67, 'min_data_in_leaf': 87}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:36:49,016][0m Trial 23 finished with value: 1.0725461415375857 and parameters: {'colsample_bytree': 0.2045700403271649, 'colsample_bynode': 0.6759129869761475, 'max_depth': 6, 'learning_rate': 0.0987771320272634, 'lambda_l1': 3.975496347913768, 'lambda_l2': 6.2835177752222995, 'num_leaves': 61, 'min_data_in_leaf': 90}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:36:51,231][0m Trial 24 finished with value: 1.072850586840701 and parameters: {'colsample_bytree': 0.3264877483606814, 'colsample_bynode': 0.6071521922604376, 'max_depth': 5, 'learning_rate': 0.07908467820629098, 'lambda_l1': 3.6275390802020433, 'lambda_l2': 6.629464656660028, 'num_leaves': 186, 'min_data_in_leaf': 80}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:36:53,368][0m Trial 25 finished with value: 1.072052812672337 and parameters: {'colsample_bytree': 0.19473652493192592, 'colsample_bynode': 0.48827397880378753, 'max_depth': 6, 'learning_rate': 0.07908868368546718, 'lambda_l1': 2.102289554086699, 'lambda_l2': 4.694686658623235, 'num_leaves': 11, 'min_data_in_leaf': 42}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:36:56,563][0m Trial 26 finished with value: 1.0678572316838195 and parameters: {'colsample_bytree': 0.4731541727103159, 'colsample_bynode': 0.7458185537647765, 'max_depth': 9, 'learning_rate': 0.05756292666323276, 'lambda_l1': 3.382969592899487, 'lambda_l2': 5.180274431314877, 'num_leaves': 122, 'min_data_in_leaf': 122}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:36:59,525][0m Trial 27 finished with value: 1.0756027437476912 and parameters: {'colsample_bytree': 0.32266196955420834, 'colsample_bynode': 0.6149732742531187, 'max_depth': 7, 'learning_rate': 0.07521348654691752, 'lambda_l1': 4.4544291288831435, 'lambda_l2': 7.216696979501718, 'num_leaves': 236, 'min_data_in_leaf': 39}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:01,448][0m Trial 28 finished with value: 1.0690591394282727 and parameters: {'colsample_bytree': 0.1901870114287057, 'colsample_bynode': 0.7432686280213293, 'max_depth': 9, 'learning_rate': 0.049594054782762936, 'lambda_l1': 5.859783711980422, 'lambda_l2': 5.730994605946724, 'num_leaves': 416, 'min_data_in_leaf': 137}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:03,707][0m Trial 29 finished with value: 1.0769224263467234 and parameters: {'colsample_bytree': 0.49678711894273686, 'colsample_bynode': 0.6166556052205427, 'max_depth': 5, 'learning_rate': 0.08687254865949887, 'lambda_l1': 4.542756625077319, 'lambda_l2': 7.933141433772935, 'num_leaves': 269, 'min_data_in_leaf': 6}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:05,932][0m Trial 30 finished with value: 1.0706456557654094 and parameters: {'colsample_bytree': 0.3510318869879638, 'colsample_bynode': 0.4791509699226567, 'max_depth': 6, 'learning_rate': 0.06865160087419186, 'lambda_l1': 5.978378124108588, 'lambda_l2': 6.591356566713304, 'num_leaves': 159, 'min_data_in_leaf': 81}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:08,680][0m Trial 31 finished with value: 1.0673731953827326 and parameters: {'colsample_bytree': 0.29900437710288313, 'colsample_bynode': 0.8833761416879186, 'max_depth': 7, 'learning_rate': 0.0646895467713203, 'lambda_l1': 4.5257993259883245, 'lambda_l2': 3.746136239261765, 'num_leaves': 553, 'min_data_in_leaf': 51}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:11,013][0m Trial 32 finished with value: 1.0713652166892085 and parameters: {'colsample_bytree': 0.23433356935793093, 'colsample_bynode': 0.7875768764067831, 'max_depth': 7, 'learning_rate': 0.09981322224578552, 'lambda_l1': 5.127722793931147, 'lambda_l2': 4.990965686658364, 'num_leaves': 72, 'min_data_in_leaf': 77}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:13,132][0m Trial 33 finished with value: 1.07529058679049 and parameters: {'colsample_bytree': 0.25527516622007673, 'colsample_bynode': 0.8142240020722289, 'max_depth': 8, 'learning_rate': 0.07482934997672719, 'lambda_l1': 6.262143348852489, 'lambda_l2': 3.056667417740313, 'num_leaves': 596, 'min_data_in_leaf': 113}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:15,025][0m Trial 34 finished with value: 1.0696870177850213 and parameters: {'colsample_bytree': 0.16240815110274592, 'colsample_bynode': 0.7293605987358154, 'max_depth': 6, 'learning_rate': 0.08446555195396169, 'lambda_l1': 3.0381360722571444, 'lambda_l2': 6.023170479456755, 'num_leaves': 374, 'min_data_in_leaf': 106}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:17,507][0m Trial 35 finished with value: 1.0669098975012108 and parameters: {'colsample_bytree': 0.3897315981077099, 'colsample_bynode': 0.7098943801246103, 'max_depth': 7, 'learning_rate': 0.05653405271688305, 'lambda_l1': 5.418791050201063, 'lambda_l2': 5.758036917859114, 'num_leaves': 859, 'min_data_in_leaf': 71}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:19,869][0m Trial 36 finished with value: 1.0679594328101942 and parameters: {'colsample_bytree': 0.41017744029014475, 'colsample_bynode': 0.7062114298103804, 'max_depth': 6, 'learning_rate': 0.05633016497480004, 'lambda_l1': 6.304261177658711, 'lambda_l2': 5.574165621701946, 'num_leaves': 863, 'min_data_in_leaf': 157}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:22,428][0m Trial 37 finished with value: 1.0771727426410977 and parameters: {'colsample_bytree': 0.37044441336058076, 'colsample_bynode': 0.6648610179642953, 'max_depth': 8, 'learning_rate': 0.08834128190968994, 'lambda_l1': 4.101628422445387, 'lambda_l2': 9.8507640533473, 'num_leaves': 76, 'min_data_in_leaf': 69}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:24,671][0m Trial 38 finished with value: 1.0689664078901902 and parameters: {'colsample_bytree': 0.45886966637280047, 'colsample_bynode': 0.6879457728459288, 'max_depth': 4, 'learning_rate': 0.028349676849280723, 'lambda_l1': 3.429022073626375, 'lambda_l2': 6.879938095182423, 'num_leaves': 337, 'min_data_in_leaf': 97}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:28,015][0m Trial 39 finished with value: 1.0718244121984912 and parameters: {'colsample_bytree': 0.300097707862357, 'colsample_bynode': 0.7819239262965767, 'max_depth': 9, 'learning_rate': 0.0461212952046177, 'lambda_l1': 6.960112611843142, 'lambda_l2': 6.384286491162129, 'num_leaves': 814, 'min_data_in_leaf': 131}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:30,173][0m Trial 40 finished with value: 1.0709801173714517 and parameters: {'colsample_bytree': 0.5190893195063744, 'colsample_bynode': 0.6403187338795213, 'max_depth': 5, 'learning_rate': 0.05425185067876921, 'lambda_l1': 4.64257508622431, 'lambda_l2': 6.120441029322825, 'num_leaves': 224, 'min_data_in_leaf': 85}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:32,525][0m Trial 41 finished with value: 1.069881336489372 and parameters: {'colsample_bytree': 0.23369492210033393, 'colsample_bynode': 0.7078773299970205, 'max_depth': 7, 'learning_rate': 0.06424806446426867, 'lambda_l1': 5.397606836441919, 'lambda_l2': 4.818086287721212, 'num_leaves': 1013, 'min_data_in_leaf': 52}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:34,926][0m Trial 42 finished with value: 1.0701365114286816 and parameters: {'colsample_bytree': 0.39247817966335896, 'colsample_bynode': 0.8191882183834879, 'max_depth': 7, 'learning_rate': 0.07186524111866874, 'lambda_l1': 5.4993156632818705, 'lambda_l2': 4.212917073817503, 'num_leaves': 926, 'min_data_in_leaf': 28}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:36,925][0m Trial 43 finished with value: 1.0709881115271502 and parameters: {'colsample_bytree': 0.2990279266507322, 'colsample_bynode': 0.9110560661804576, 'max_depth': 6, 'learning_rate': 0.06296172067073312, 'lambda_l1': 4.860141555664287, 'lambda_l2': 5.4181684997836586, 'num_leaves': 657, 'min_data_in_leaf': 70}. Best is trial 20 with value: 1.0663971937937626.[0m




[32m[I 2023-02-20 23:37:38,700][0m Trial 44 finished with value: 1.0661006696128192 and parameters: {'colsample_bytree': 0.15046211777171975, 'colsample_bynode': 0.8322770646559513, 'max_depth': 8, 'learning_rate': 0.08873464524004498, 'lambda_l1': 4.148533625228286, 'lambda_l2': 5.925083614051801, 'num_leaves': 701, 'min_data_in_leaf': 48}. Best is trial 44 with value: 1.0661006696128192.[0m




[32m[I 2023-02-20 23:37:41,097][0m Trial 45 finished with value: 1.0722211594278486 and parameters: {'colsample_bytree': 0.1464853801769625, 'colsample_bynode': 0.7734203124805837, 'max_depth': 8, 'learning_rate': 0.08961899853829518, 'lambda_l1': 4.028600534424717, 'lambda_l2': 6.979366689638655, 'num_leaves': 1016, 'min_data_in_leaf': 31}. Best is trial 44 with value: 1.0661006696128192.[0m




[32m[I 2023-02-20 23:37:44,242][0m Trial 46 finished with value: 1.0685664242485766 and parameters: {'colsample_bytree': 0.12141829311725458, 'colsample_bynode': 0.7031351961529084, 'max_depth': 8, 'learning_rate': 0.08327993248822492, 'lambda_l1': 4.383974799302272, 'lambda_l2': 7.653051669092191, 'num_leaves': 737, 'min_data_in_leaf': 16}. Best is trial 44 with value: 1.0661006696128192.[0m




[32m[I 2023-02-20 23:37:46,807][0m Trial 47 finished with value: 1.0699405595100902 and parameters: {'colsample_bytree': 0.15983750714062317, 'colsample_bynode': 0.6454260317790604, 'max_depth': 9, 'learning_rate': 0.09466545535410104, 'lambda_l1': 3.597164171869299, 'lambda_l2': 5.952842405725468, 'num_leaves': 872, 'min_data_in_leaf': 48}. Best is trial 44 with value: 1.0661006696128192.[0m




[32m[I 2023-02-20 23:37:48,830][0m Trial 48 finished with value: 1.0656925661973748 and parameters: {'colsample_bytree': 0.22202173638227363, 'colsample_bynode': 0.8252477342750326, 'max_depth': 8, 'learning_rate': 0.07857132545866048, 'lambda_l1': 4.949683443818313, 'lambda_l2': 6.474362496376458, 'num_leaves': 120, 'min_data_in_leaf': 108}. Best is trial 48 with value: 1.0656925661973748.[0m




[32m[I 2023-02-20 23:37:50,946][0m Trial 49 finished with value: 1.0689018382853406 and parameters: {'colsample_bytree': 0.1352648873893217, 'colsample_bynode': 0.9271095508393492, 'max_depth': 10, 'learning_rate': 0.07927062251835817, 'lambda_l1': 2.355951653012947, 'lambda_l2': 6.696223908197568, 'num_leaves': 142, 'min_data_in_leaf': 146}. Best is trial 48 with value: 1.0656925661973748.[0m


In [14]:
study.best_value

1.0656925661973748

In [15]:
study.best_params

{'colsample_bytree': 0.22202173638227363,
 'colsample_bynode': 0.8252477342750326,
 'max_depth': 8,
 'learning_rate': 0.07857132545866048,
 'lambda_l1': 4.949683443818313,
 'lambda_l2': 6.474362496376458,
 'num_leaves': 120,
 'min_data_in_leaf': 108}

## Retrain LightGBM model with the optimal hyperparameter set

In [16]:
params = {
    'n_iter'           : 200,
    'verbosity'        : -1,
    'objective'        : 'l1',
    'random_state'     : 42,
    'extra_trees'      : True,
    'colsample_bytree': 0.22202173638227363,
    'colsample_bynode': 0.8252477342750326,
    'max_depth': 8,
    'learning_rate': 0.07857132545866048,
    'lambda_l1': 4.949683443818313,
    'lambda_l2': 6.474362496376458,
    'num_leaves': 120,
    'min_data_in_leaf': 108
    }
model = lgb.LGBMRegressor(**params)
X, y = df_all.drop(columns=[target, "active"]), df_all[target]
# include validation set in the training set
train_times = list(range(39))
X_train = X[X['scale'].isin(train_times)]
y_train = y[X['scale'].isin(train_times)]
# test set
X_test, y_test = X[y.isnull()], y[y.isnull()]
# retrain and predict
if SMAPE_ENABLED:
        y_train = to_percent(X=X_train, y=y_train)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
if SMAPE_ENABLED:
    y_pred = from_percent(X=X_test, yhat=y_pred)





## Create submission based on the prediction

In [17]:
df_subm.loc[X_test.index, target] = y_pred
df_subm.fillna(0).to_csv(os.path.join(SAVE_PATH, "submission_6.csv"))