In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

import lightgbm as lgb
import optuna

SMAPE_ENABLED = True
LAGS = [1, 2, 3, 4, 5, 6, 7, 8]

In [44]:
def to_percent(X, y):
    yhat = (X['lags(1)'] - y) / X['lags(1)']
    yhat[X['lags(1)'] == 0] = 0 # denominator cannot be 0
    return yhat

def from_percent(X, y):
    yhat = X['lags(1)'] - (y * X[f'lags(1)'])
    return yhat

def smape(y_true, y_pred):
    smap = np.zeros(len(y_true))
    
    num = np.abs(y_true - y_pred)
    dem = ((np.abs(y_true) + np.abs(y_pred)) / 2)
    
    pos_ind = (y_true != 0) | (y_pred != 0)
    smap[pos_ind] = num[pos_ind] / dem[pos_ind]
    
    return 100 * np.mean(smap)

def lgb_objective(trial):
    params = {
        'n_iter'           : 200,
        'verbosity'        : -1,
        'objective'        : 'l1',
        'random_state'     : 42,
        'extra_trees'      : True,
        'colsample_bytree' : trial.suggest_float('colsample_bytree', 0.1, 1.0),
        'colsample_bynode' : trial.suggest_float('colsample_bynode', 0.1, 1.0),
        'max_depth'        : trial.suggest_int('max_depth', 3, 10),
        'learning_rate'    : trial.suggest_float('learning_rate', 0.01, 0.1, log=True),
        'lambda_l1'        : trial.suggest_float('lambda_l1', 1e-2, 10.0),
        'lambda_l2'        : trial.suggest_float('lambda_l2', 1e-2, 10.0),
        'num_leaves'       : trial.suggest_int('num_leaves', 8, 1024),
        'min_data_in_leaf' : trial.suggest_int('min_data_in_leaf', 5, 250),}
    
    model  = lgb.LGBMRegressor(**params)
    X, y   = df_all.drop(columns=[target]), df_all[target]
    
    train_times = list(range(38))
    valid_times = [38]
    
    y_train = y[X['scale'].isin(train_times)]
    y_valid = y[X['scale'].isin(valid_times)]
    
    X_train = X[X['scale'].isin(train_times)]
    X_valid = X[X['scale'].isin(valid_times)]
    
    if SMAPE_ENABLED:
        y_train = to_percent(X_train, y_train)
    
    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)
    
    if SMAPE_ENABLED:
        y_pred = from_percent(X_valid, y_pred)
    
    return smape(y_valid, y_pred)

In [7]:
BASE = '../input/'
date_col = 'first_day_of_month'
cat_cols = ['county', 'state']
target = 'microbusiness_density'
idx = 'row_id'

In [49]:
df_census = pd.read_csv(BASE + 'census_starter.csv', index_col='cfips')
df_train = pd.read_csv(BASE + 'train.csv',  index_col=idx)
df_test = pd.read_csv(BASE + 'test.csv',  index_col=idx)
df_subm = pd.read_csv(BASE + 'sample_submission.csv',  index_col=idx)

In [50]:
state_dict = df_train[['cfips', 'state', 'county']]
state_dict = state_dict.set_index('cfips')
state_dict = state_dict.drop_duplicates()
state_dict = state_dict.to_dict()

df_test['state'] = df_test['cfips'].map(state_dict['state'])
df_test['county'] = df_test['cfips'].map(state_dict['county'])

df_all = pd.concat([df_train, df_test], axis=0)

df_all[date_col] = pd.to_datetime(df_all[date_col])

df_all['year'] = df_all[date_col].dt.year
df_all['month'] = df_all[date_col].dt.month
df_all['scale'] = (df_all[date_col] - df_all[date_col].min()).dt.days
df_all['scale'] = df_all['scale'].factorize()[0]

In [51]:
for i in LAGS:
    df_all[f'lags({i})'] = df_all.groupby('cfips')[target].shift(i)
    
df_all['active'] = df_all.groupby('cfips')['active'].shift(8)

In [52]:
df_all = df_all.drop(columns=[date_col])
df_all[cat_cols] = df_all[cat_cols].astype('category')

df_all = df_all.reset_index()
df_all = df_all.set_index('cfips')

df_all[df_census.columns] = df_census

df_all = df_all.reset_index()
df_all = df_all.set_index(idx)

In [53]:
df_all = df_all[df_all['scale'] != 0]

In [45]:
study = optuna.create_study(direction='minimize', study_name='Regressor')
study.optimize(lgb_objective, n_trials=30, show_progress_bar=True)

[32m[I 2023-01-31 12:29:07,170][0m A new study created in memory with name: Regressor[0m


  0%|          | 0/30 [00:00<?, ?it/s]



[32m[I 2023-01-31 12:31:32,668][0m Trial 0 finished with value: 1.0700032817911214 and parameters: {'colsample_bytree': 0.9811096308258251, 'colsample_bynode': 0.5681824208096472, 'max_depth': 6, 'learning_rate': 0.06594559584194976, 'lambda_l1': 6.147756675365701, 'lambda_l2': 9.775103002414689, 'num_leaves': 361, 'min_data_in_leaf': 158}. Best is trial 0 with value: 1.0700032817911214.[0m
[32m[I 2023-01-31 12:34:48,068][0m Trial 1 finished with value: 1.0720098413751513 and parameters: {'colsample_bytree': 0.7881175164053723, 'colsample_bynode': 0.6060340368861783, 'max_depth': 5, 'learning_rate': 0.09786614129578035, 'lambda_l1': 0.9783211029540011, 'lambda_l2': 9.922666085742053, 'num_leaves': 22, 'min_data_in_leaf': 160}. Best is trial 0 with value: 1.0700032817911214.[0m
[32m[I 2023-01-31 12:36:33,338][0m Trial 2 finished with value: 1.0732000258447734 and parameters: {'colsample_bytree': 0.6749796339773719, 'colsample_bynode': 0.11545600139968382, 'max_depth': 6, 'learni

[32m[I 2023-01-31 12:38:51,165][0m Trial 9 finished with value: 1.0733360900758453 and parameters: {'colsample_bytree': 0.5201047339933245, 'colsample_bynode': 0.7286925631078145, 'max_depth': 3, 'learning_rate': 0.07763868616300061, 'lambda_l1': 4.021090296161261, 'lambda_l2': 0.8888700387132186, 'num_leaves': 561, 'min_data_in_leaf': 209}. Best is trial 6 with value: 1.0699579531126844.[0m
[32m[I 2023-01-31 12:38:59,629][0m Trial 10 finished with value: 1.0744633975241342 and parameters: {'colsample_bytree': 0.3525135661188991, 'colsample_bynode': 0.9461369410323662, 'max_depth': 10, 'learning_rate': 0.015714976057187198, 'lambda_l1': 7.409543399375718, 'lambda_l2': 3.7566001982298483, 'num_leaves': 1001, 'min_data_in_leaf': 98}. Best is trial 6 with value: 1.0699579531126844.[0m
[32m[I 2023-01-31 12:39:09,860][0m Trial 11 finished with value: 1.0731272892703003 and parameters: {'colsample_bytree': 0.9986425716087616, 'colsample_bynode': 0.8063860314751822, 'max_depth': 3, 'l

[32m[I 2023-01-31 12:40:20,294][0m Trial 19 finished with value: 1.0706343346651814 and parameters: {'colsample_bytree': 0.2197756780033366, 'colsample_bynode': 0.856886446841294, 'max_depth': 5, 'learning_rate': 0.06047403159604191, 'lambda_l1': 1.3464944808426305, 'lambda_l2': 4.3781804401522955, 'num_leaves': 420, 'min_data_in_leaf': 5}. Best is trial 12 with value: 1.0687445428538411.[0m
[32m[I 2023-01-31 12:40:30,641][0m Trial 20 finished with value: 1.0700237510563209 and parameters: {'colsample_bytree': 0.8550425015649755, 'colsample_bynode': 0.6767094400641329, 'max_depth': 4, 'learning_rate': 0.046202225554687906, 'lambda_l1': 0.06303518571283862, 'lambda_l2': 4.877411903213675, 'num_leaves': 195, 'min_data_in_leaf': 69}. Best is trial 12 with value: 1.0687445428538411.[0m
[32m[I 2023-01-31 12:40:36,984][0m Trial 21 finished with value: 1.0722010024378938 and parameters: {'colsample_bytree': 0.2470733320771698, 'colsample_bynode': 0.8946465386109335, 'max_depth': 7, 'l

[32m[I 2023-01-31 12:41:39,330][0m Trial 28 finished with value: 1.0703026627251397 and parameters: {'colsample_bytree': 0.5153010954733311, 'colsample_bynode': 0.6574987003981061, 'max_depth': 9, 'learning_rate': 0.03468472871399092, 'lambda_l1': 0.7556649037948184, 'lambda_l2': 7.605980190378571, 'num_leaves': 233, 'min_data_in_leaf': 80}. Best is trial 12 with value: 1.0687445428538411.[0m
[32m[I 2023-01-31 12:41:51,403][0m Trial 29 finished with value: 1.0767145567402612 and parameters: {'colsample_bytree': 0.9166382633419011, 'colsample_bynode': 0.7774397687107211, 'max_depth': 6, 'learning_rate': 0.08441665836019674, 'lambda_l1': 6.372539752351248, 'lambda_l2': 6.484989974416297, 'num_leaves': 106, 'min_data_in_leaf': 33}. Best is trial 12 with value: 1.0687445428538411.[0m


In [54]:
params = {
    'n_iter': 200,
    'verbosity': -1,
    'objective': 'l1',
    'random_state': 42,
    'extra_trees': True,
    'colsample_bytree': 0.8841279649367693,
    'colsample_bynode': 0.10142964450634374,
    'max_depth': 8,
    'learning_rate': 0.013647749926797374,
    'lambda_l1': 1.8386216853616875,
    'lambda_l2': 7.557660410418351,
    'num_leaves': 61,
    'min_data_in_leaf': 213}

model = lgb.LGBMRegressor(**params)

In [55]:
X, y   = df_all.drop(columns=[target]), df_all[target]
X_test, y_test = X[y.isnull()], y[y.isnull()]

train_times = list(range(38))

X_train = X[X['scale'].isin(train_times)]
y_train = y[X['scale'].isin(train_times)]

if SMAPE_ENABLED:
    y_train = to_percent(X_train, y_train)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

if SMAPE_ENABLED:
    y_pred = from_percent(X_test, y_pred)
    
df_subm.loc[y_pred.index, target] = y_pred



In [56]:
X, y   = df_all.drop(columns=[target]), df_all[target]
valid_times = list(range(31, 39))
results = []

for valid_time in valid_times:
    train_times = list(range(valid_time))
    
    X_train = X[X['scale'].isin(train_times)]
    X_valid = X[X['scale'].isin([valid_time])]

    y_train = y[X['scale'].isin(train_times)]
    y_valid = y[X['scale'].isin([valid_time])]
    
    if SMAPE_ENABLED:
        y_train = to_percent(X_train, y_train)

    model.fit(X_train, y_train)
    y_pred = model.predict(X_valid)

    if SMAPE_ENABLED:
        y_pred = from_percent(X_valid, y_pred)
        
    X_valid['y_true'] = y_valid
    X_valid['y_base'] = X_valid['lags(1)']
    X_valid['y_pred'] = y_pred
    
    results.append(X_valid[['scale', 'month', 'state', 'county', 'cfips', 'y_true', 'y_base', 'y_pred']])
    
df_record = pd.concat(results, axis=0)
df_record.head()




A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy




Unnamed: 0_level_0,scale,month,state,county,cfips,y_true,y_base,y_pred
row_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1001_2022-03-01,31,3,Alabama,Autauga County,1001,3.336785,3.334431,3.337316
1003_2022-03-01,31,3,Alabama,Baldwin County,1003,7.945311,7.8233,7.832575
1005_2022-03-01,31,3,Alabama,Barbour County,1005,1.196728,1.206827,1.207489
1007_2022-03-01,31,3,Alabama,Bibb County,1007,1.264755,1.23665,1.237323
1009_2022-03-01,31,3,Alabama,Blount County,1009,1.797986,1.777708,1.779424


In [57]:
base_err = df_record.groupby('cfips').apply(lambda x: smape(x.y_true, x.y_base))
pred_err = df_record.groupby('cfips').apply(lambda x: smape(x.y_true, x.y_pred))

blacklist = base_err[(base_err + 1e-3) < pred_err].index

print(f'Avg SMAPE(model): {pred_err.mean():.3f}')
print(f'Avg SMAPE(base): {base_err.mean():.3f}')
print(len(blacklist))

Avg SMAPE(model): 1.409
Avg SMAPE(base): 1.412
1694


In [58]:
idx = df_test[df_test['cfips'].isin(blacklist)].index

df_subm.loc[idx, target] = df_all.loc[idx, 'lags(1)']

In [59]:
df_subm[target] = df_subm[target].fillna(0)

In [60]:
df_subm.head()

Unnamed: 0_level_0,microbusiness_density
row_id,Unnamed: 1_level_1
1001_2022-11-01,3.468241
1003_2022-11-01,8.375471
1005_2022-11-01,1.232074
1007_2022-11-01,1.28724
1009_2022-11-01,1.833519


In [None]:
df_subm.to_csv('submission.csv')