https://www.kaggle.com/ymatioun/simple-lightgbm
https://github.com/kaggle/docker-python

In [59]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O

In [60]:
# Any results you write to the current directory are saved as output.
import time
import lightgbm as lgb
from sklearn.model_selection import KFold
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import seaborn as sns

In [61]:
def add_agg(merged_df, gr_cols, new_col_name, incr_yr):
    agg1 = train_df.groupby(gr_cols)['target'].agg('mean').reset_index()
    cols2 = gr_cols.copy()
    cols2.append(new_col_name)
    agg1.columns = cols2
    if incr_yr:
        agg1['year']+=1
    merged_df = pd.merge(merged_df, agg1, how='left', left_on=gr_cols, right_on=gr_cols)
    return merged_df

In [62]:
# read raw data
start_time = time.time()
print('  Loading data...')
#input_dir  = os.path.join(os.pardir, 'input')
train_df   = pd.read_csv('trainv.csv')
test_df    = pd.read_csv('testv.csv')
print('    Time elapsed %.0f sec'%(time.time()-start_time))

# Merge test/train datasets into a single one and separate unneeded columns
merged_df = pd.concat([train_df, test_df], sort=False)

  Loading data...
    Time elapsed 0 sec


In [63]:
#train_df.shape #(54780, 4)
#test_df.shape   #(2670, 4)
merged_df        #(57450,5)

Unnamed: 0,date,hospital,item,qty,id
0,01/01/21,1,1,13.0,
1,02/01/21,1,1,11.0,
2,03/01/21,1,1,14.0,
3,04/01/21,1,1,13.0,
4,05/01/21,1,1,10.0,
...,...,...,...,...,...
2665,26/09/21,10,3,,2694.0
2666,27/09/21,10,3,,2695.0
2667,28/09/21,10,3,,2696.0
2668,29/09/21,10,3,,2697.0


In [64]:
# add columns: date related
merged_df['date']  = pd.to_datetime(merged_df['date'])
merged_df['year']  = merged_df['date'].dt.year
merged_df['month']  = merged_df['date'].dt.month
merged_df['day']   = merged_df['date'].dt.dayofweek
merged_df.drop('date', axis=1, inplace=True)

In [65]:
merged_df   # date column is removed  and year,month,day day added seperately

Unnamed: 0,hospital,item,qty,id,year,month,day
0,1,1,13.0,,2021,1,4
1,1,1,11.0,,2021,2,0
2,1,1,14.0,,2021,3,0
3,1,1,13.0,,2021,4,3
4,1,1,10.0,,2021,5,5
...,...,...,...,...,...,...,...
2665,10,3,,2694.0,2021,9,6
2666,10,3,,2695.0,2021,9,0
2667,10,3,,2696.0,2021,9,1
2668,10,3,,2697.0,2021,9,2


In [66]:
# add grouped columns
train_df=pd.DataFrame(merged_df[merged_df.qty.notna()].values)
train_df.columns=merged_df.columns
train_df['target']=train_df['qty'] # rename

In [67]:
train_df

Unnamed: 0,hospital,item,qty,id,year,month,day,target
0,1.0,1.0,13.0,,2021.0,1.0,4.0,13.0
1,1.0,1.0,11.0,,2021.0,2.0,0.0,11.0
2,1.0,1.0,14.0,,2021.0,3.0,0.0,14.0
3,1.0,1.0,13.0,,2021.0,4.0,3.0,13.0
4,1.0,1.0,10.0,,2021.0,5.0,5.0,10.0
...,...,...,...,...,...,...,...,...
54775,10.0,3.0,32.0,,2021.0,3.0,5.0,32.0
54776,10.0,3.0,33.0,,2021.0,3.0,6.0,33.0
54777,10.0,3.0,39.0,,2021.0,3.0,0.0,39.0
54778,10.0,3.0,34.0,,2021.0,3.0,1.0,34.0


In [68]:
# scale for item+hospital+year(prev). Need to scale predictions back up!
merged_df = add_agg(merged_df,['item','hospital','year'],'tsy',0)
merged_df['qty']/=merged_df['tsy']
merged_df = merged_df[merged_df.year>2020]
tsy=merged_df.pop('tsy')

In [69]:

merged_df

Unnamed: 0,hospital,item,qty,id,year,month,day
0,1,1,0.650927,,2021,1,4
1,1,1,0.550784,,2021,2,0
2,1,1,0.700998,,2021,3,0
3,1,1,0.650927,,2021,4,3
4,1,1,0.500713,,2021,5,5
...,...,...,...,...,...,...,...
57445,10,3,,2694.0,2021,9,6
57446,10,3,,2695.0,2021,9,0
57447,10,3,,2696.0,2021,9,1
57448,10,3,,2697.0,2021,9,2


In [71]:

# pop qty and ID
ID=merged_df[merged_df.id.notna()]['id']
target=merged_df[merged_df.qty.notna()]['qty']
merged_df.drop(['id','qty'], axis=1, inplace=True)
len_train=target.shape[0]

In [72]:
target

0        0.650927
1        0.550784
2        0.700998
3        0.650927
4        0.500713
           ...   
54775    0.778336
54776    0.802659
54777    0.948597
54778    0.826982
54779    0.948597
Name: qty, Length: 54780, dtype: float64

In [73]:
merged_df

Unnamed: 0,hospital,item,year,month,day
0,1,1,2021,1,4
1,1,1,2021,2,0
2,1,1,2021,3,0
3,1,1,2021,4,3
4,1,1,2021,5,5
...,...,...,...,...,...
57445,10,3,2021,9,6
57446,10,3,2021,9,0
57447,10,3,2021,9,1
57448,10,3,2021,9,2


In [74]:
# use lightgbm for regression
print('    Time elapsed %.0f sec'%(time.time()-start_time))

    Time elapsed 156 sec


In [75]:
# specify your configurations as a dict
params = {
    'nthread': 10,
    'max_depth': 8,
    'task': 'train',
    'boosting_type': 'gbdt',
    'objective': 'regression_l1',
    'metric': 'mape', # this is abs(a-e)/max(1,a)
    'num_leaves': 31,
    'learning_rate': 0.25,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'lambda_l1': 0.06,
    'lambda_l2': 0.1,
    'verbose': -1
}


In [84]:
# do the training
num_folds = 5
test_x = merged_df[len_train:].values
all_x = merged_df[:len_train].values
all_y = target.values
oof_preds = np.zeros([all_y.shape[0]])
sub_preds = np.zeros([test_x.shape[0]])
feature_importance_df = pd.DataFrame()
folds = KFold(n_splits=num_folds, shuffle=True, random_state=345665)
for n_fold, (train_idx, valid_idx) in enumerate(folds.split(all_x)):
    train_x, train_y = all_x[train_idx], all_y[train_idx]
    valid_x, valid_y = all_x[valid_idx], all_y[valid_idx]
    lgb_train = lgb.Dataset(train_x,train_y)
    lgb_valid = lgb.Dataset(valid_x,valid_y)
        
    # train
    gbm = lgb.train(params, lgb_train, 1000, 
        valid_sets=[lgb_train, lgb_valid],
        early_stopping_rounds=100, verbose_eval=100)
    oof_preds[valid_idx] = gbm.predict(valid_x, num_iteration=gbm.best_iteration)
    sub_preds[:] += gbm.predict(test_x, num_iteration=gbm.best_iteration) / folds.n_splits
    valid_idx += 1
    importance_df = pd.DataFrame()
    importance_df['feature'] = merged_df.columns
    importance_df['importance'] = gbm.feature_importance()
    importance_df['fold'] = n_fold + 1
    feature_importance_df = pd.concat([feature_importance_df, importance_df], axis=0)
e = 2 * abs(all_y - oof_preds) / ( abs(all_y)+abs(oof_preds) )
e = e.mean()
print('Full validation score %.4f' %e)

Training until validation scores don't improve for 100 rounds
[100]	training's mape: 0.178692	valid_1's mape: 0.185002
Early stopping, best iteration is:
[16]	training's mape: 0.180186	valid_1's mape: 0.183002
Training until validation scores don't improve for 100 rounds
[100]	training's mape: 0.178277	valid_1's mape: 0.185737
Early stopping, best iteration is:
[12]	training's mape: 0.180473	valid_1's mape: 0.183748
Training until validation scores don't improve for 100 rounds
[100]	training's mape: 0.178639	valid_1's mape: 0.183667
Early stopping, best iteration is:
[10]	training's mape: 0.180852	valid_1's mape: 0.181011
Training until validation scores don't improve for 100 rounds
[100]	training's mape: 0.179346	valid_1's mape: 0.181429
Early stopping, best iteration is:
[15]	training's mape: 0.181141	valid_1's mape: 0.179252
Training until validation scores don't improve for 100 rounds
[100]	training's mape: 0.178966	valid_1's mape: 0.183371
Early stopping, best iteration is:
[16]	t

In [78]:
# Write submission file
pred = (sub_preds * tsy[len_train:] ).astype(np.float32)
out_df = pd.DataFrame({'id': ID.astype(np.int32), 'qty': pred})
out_df.to_csv('submission1.csv', index=False)
print('    Time elapsed %.0f sec'%(time.time()-start_time))

    Time elapsed 369 sec


In [90]:
oof_preds[valid_idx].shape

(10956,)

In [None]:
plt.legend(['Forecast','Actual'])
plt.title('Forecast vs Actuals with Cleaned Model')
plt.show()

In [91]:
sub_preds[:].shape

(2670,)