<a href="https://colab.research.google.com/github/KPC6796/M5/blob/master/M5_LightGBM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# After running
! git clone --recursive https://github.com/Microsoft/LightGBM

#You can run this oneliner which will build and compile LightGBM with GPU enabled in colab:
! cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu; 

Cloning into 'LightGBM'...
remote: Enumerating objects: 18286, done.[K
remote: Total 18286 (delta 0), reused 0 (delta 0), pack-reused 18286[K
Receiving objects: 100% (18286/18286), 12.28 MiB | 7.94 MiB/s, done.
Resolving deltas: 100% (13362/13362), done.
Submodule 'include/boost/compute' (https://github.com/boostorg/compute) registered for path 'compute'
Cloning into '/content/LightGBM/compute'...
remote: Enumerating objects: 21728, done.        
remote: Total 21728 (delta 0), reused 0 (delta 0), pack-reused 21728        
Receiving objects: 100% (21728/21728), 8.51 MiB | 6.15 MiB/s, done.
Resolving deltas: 100% (17565/17565), done.
Submodule path 'compute': checked out '36c89134d4013b2e5e45bc55656a18bd6141995a'
-- The C compiler identification is GNU 7.5.0
-- The CXX compiler identification is GNU 7.5.0
-- Check for working C compiler: /usr/bin/cc
-- Check for working C compiler: /usr/bin/cc -- works
-- Detecting C compiler ABI info
-- Detecting C compiler ABI info - done
-- Detectin

In [2]:
URL_calendar = "https://slavadatasets.s3.us-east-2.amazonaws.com/calendar.csv"
URL_sales_train ='https://slavadatasets.s3.us-east-2.amazonaws.com/sales_train_validation.csv'
URL_prices = 'https://slavadatasets.s3.us-east-2.amazonaws.com/sell_prices.csv'

In [3]:
from  datetime import datetime, timedelta
import numpy as np, pandas as pd
import gc
import io
import dask.dataframe as dd
import lightgbm as lgb

In [4]:
CAL_DTYPES={"event_name_1": "category", "event_name_2": "category", "event_type_1": "category", 
         "event_type_2": "category", "weekday": "category", 'wm_yr_wk': 'int16', "wday": "int16",
        "month": "int16", "year": "int16", "snap_CA": "float32", 'snap_TX': 'float32', 'snap_WI': 'float32' }
PRICE_DTYPES = {"store_id": "category", "item_id": "category", "wm_yr_wk": "int16","sell_price":"float32" }

In [5]:
pd.options.display.max_columns = 50

In [6]:
def create_dt(is_train = True, nrows = None, first_day = 1200):
     
    prices = dd.read_csv(URL_prices,dtype = PRICE_DTYPES).compute()
    for col, col_dtype in PRICE_DTYPES.items():
        if col_dtype == "category":
            prices[col] = prices[col].cat.codes.astype("int16")
            prices[col] -= prices[col].min()
        
    cal = dd.read_csv(URL_calendar,dtype = CAL_DTYPES).compute()
    cal["date"] = pd.to_datetime(cal["date"])
    for col, col_dtype in CAL_DTYPES.items():
        if col_dtype == "category":
            cal[col] = cal[col].cat.codes.astype("int16")
            cal[col] -= cal[col].min()
    
    start_day = max(1 if is_train  else tr_last-max_lags, first_day)
    numcols = [f"d_{day}" for day in range(start_day,tr_last+1)]
    catcols = ['id', 'item_id', 'dept_id','store_id', 'cat_id', 'state_id']
    dtype = {numcol:"float32" for numcol in numcols} 
    dtype.update({col: "category" for col in catcols if col != "id"})
    dt = dd.read_csv(URL_sales_train, 
                     nrows = nrows, usecols = catcols + numcols, dtype = dtype).compute()
    
    for col in catcols:
        if col != "id":
            dt[col] = dt[col].cat.codes.astype("int16")
            dt[col] -= dt[col].min()
    
    if not is_train:
        for day in range(tr_last+1, tr_last+ 28 +1):
            dt[f"d_{day}"] = np.nan
    
    dt = pd.melt(dt,
                  id_vars = catcols,
                  value_vars = [col for col in dt.columns if col.startswith("d_")],
                  var_name = "d",
                  value_name = "sales")
    
    dt = dt.merge(cal, on= "d", copy = False)
    dt = dt.merge(prices, on = ["store_id", "item_id", "wm_yr_wk"], copy = False)
    
    return dt

In [17]:
create_dt()

ValueError: ignored

In [7]:
def create_fea(dt):
    lags = [7, 28]
    lag_cols = [f"lag_{lag}" for lag in lags ]
    for lag, lag_col in zip(lags, lag_cols):
        dt[lag_col] = dt[["id","sales"]].groupby("id")["sales"].shift(lag)

    wins = [7, 28]
    for win in wins :
        for lag,lag_col in zip(lags, lag_cols):
            dt[f"rmean_{lag}_{win}"] = dt[["id", lag_col]].groupby("id")[lag_col].transform(lambda x : x.rolling(win).mean())

    
    
    date_features = {
        
        "wday": "weekday",
        "week": "weekofyear",
        "month": "month",
        "quarter": "quarter",
        "year": "year",
        "mday": "day",
#         "ime": "is_month_end",
#         "ims": "is_month_start",
    }
    
#     dt.drop(["d", "wm_yr_wk", "weekday"], axis=1, inplace = True)
    
    for date_feat_name, date_feat_func in date_features.items():
        if date_feat_name in dt.columns:
            dt[date_feat_name] = dt[date_feat_name].astype("int16")
        else:
            dt[date_feat_name] = getattr(dt["date"].dt, date_feat_func).astype("int16")

In [18]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import matplotlib.pyplot as plt
import seaborn as sns

  import pandas.util.testing as tm


In [23]:
df_cal = pd.read_csv('/calendar.csv')
df_eval = pd.read_csv('/sales_train_validation.csv.zip')
df_price = pd.read_csv('/sell_prices.csv.zip')
df_sample_output = pd.read_csv('/sample_submission.csv.zip')

In [24]:
holiday = ['NewYear', 'OrthodoxChristmas', 'MartinLutherKingDay', 'SuperBowl', 'PresidentsDay', 'StPatricksDay', 'Easter', 'Cinco De Mayo', 'IndependenceDay', 'EidAlAdha', 'Thanksgiving', 'Christmas']
weekend = ['Saturday', 'Sunday']

df_cal['is_holiday_1'] = df_cal['event_name_1'].apply(lambda x : 1 if x in holiday else 0 )
df_cal['is_holiday_2'] = df_cal['event_name_1'].apply(lambda x : 1 if x in holiday else 0 )
df_cal['is_holiday'] = df_cal[['is_holiday_1','is_holiday_2']].max(axis=1)
df_cal['is_weekend'] = df_cal['weekday'].apply(lambda x : 1 if x in weekend else 0 )

In [25]:
df_cal = df_cal.drop(['weekday', 'wday', 'month', 'year', 'event_name_1', 'event_type_1', 'event_name_2', 'event_type_2'], axis='columns')

In [26]:
del_col = []
for x in range(1851):
    del_col.append('d_' + str(x+1))

In [27]:
df_eval = df_eval.drop(del_col, axis='columns')

In [28]:
df_eval = df_eval.melt(['id','item_id','dept_id','cat_id','store_id','state_id'], var_name='d', value_name='qty')
print(df_eval.shape)
df_eval.head()

(1890380, 8)


Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,qty
0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_001,HOBBIES_1,HOBBIES,CA_1,CA,d_1852,0
1,HOBBIES_1_002_CA_1_validation,HOBBIES_1_002,HOBBIES_1,HOBBIES,CA_1,CA,d_1852,0
2,HOBBIES_1_003_CA_1_validation,HOBBIES_1_003,HOBBIES_1,HOBBIES,CA_1,CA,d_1852,0
3,HOBBIES_1_004_CA_1_validation,HOBBIES_1_004,HOBBIES_1,HOBBIES,CA_1,CA,d_1852,1
4,HOBBIES_1_005_CA_1_validation,HOBBIES_1_005,HOBBIES_1,HOBBIES,CA_1,CA,d_1852,0


In [29]:
df_eval = pd.merge(df_eval, df_cal, how='left', on='d')

In [30]:
df_eval = pd.merge(df_eval, df_price, how='left', on=['item_id', 'wm_yr_wk', 'store_id'])

In [31]:
df_eval_test = df_eval.query('d == "d_1852"')

In [32]:
df_eval_test = df_eval_test[['id', 'store_id', 'item_id', 'dept_id', 'cat_id', 'state_id', 'd', 'qty', 'sell_price']]

In [33]:
df_eval_test['qty'] = df_eval_test['d'].apply(lambda x: int(x.replace(x, '0')))

In [34]:
tmp_df = df_eval_test

In [35]:
for x in range(28):
    df_eval_test = df_eval_test.append(tmp_df)

In [36]:
df_eval_test = df_eval_test.reset_index(drop=True)

In [37]:
lst_d = []
i = 0
lst_index = df_eval_test.index
for x in lst_index:
    lst_d.append('d_' + str(((lst_index[i]) // 30490) + 1942))
    i = i + 1

In [38]:
df_eval_test['d'] = lst_d

In [39]:
df_eval_test = pd.merge(df_eval_test, df_cal, how='left', on='d')

In [40]:
df_eval_test = pd.merge(df_eval_test, df_price, how='left', on=['item_id', 'wm_yr_wk', 'store_id'])

In [41]:
import gc
del tmp_df
gc.collect()

325

In [42]:
df_eval = pd.get_dummies(data=df_eval, columns=['dept_id', 'cat_id', 'store_id', 'state_id'])
df_eval_test = pd.get_dummies(data=df_eval_test, columns=['dept_id', 'cat_id', 'store_id', 'state_id'])

In [43]:
df_eval_test = df_eval_test.drop(['sell_price_x', 'snap_CA', 'snap_TX', 'snap_WI'], axis='columns')
df_eval_test = df_eval_test.rename(columns={'sell_price_y': 'sell_price'})
df_eval = df_eval.drop(['snap_CA', 'snap_TX', 'snap_WI'], axis='columns') 

In [44]:
from sklearn.model_selection import train_test_split
target_col = 'qty'
exclude_cols = ['id', 'item_id', 'd', 'date', 'wm_yr_wk']
feature_cols = [col for col in df_eval.columns if col not in exclude_cols]
y = np.array(df_eval[target_col])
X = np.array(df_eval[feature_cols])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [45]:
lgb_train = lgb.Dataset(X_train, y_train)
lgb_eval = lgb.Dataset(X_test, y_test)
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'n_jobs': -1,
    'seed': 250,
    'learning_rate': 0.01,
    'bagging_fraction': 0.75,
    'bagging_freq': 10, 
    'colsample_bytree': 0.75}

model = lgb.train(params, lgb_train, num_boost_round=5000, early_stopping_rounds=750, valid_sets = [lgb_train, lgb_eval], verbose_eval=100)

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 384
[LightGBM] [Info] Number of data points in the train set: 1417785, number of used features: 29
[LightGBM] [Info] Start training from score 1.375912
Training until validation scores don't improve for 750 rounds
[100]	training's rmse: 1.56638	valid_1's rmse: 1.55736
[200]	training's rmse: 0.743187	valid_1's rmse: 0.738473
[300]	training's rmse: 0.39397	valid_1's rmse: 0.390096
[400]	training's rmse: 0.237782	valid_1's rmse: 0.232816
[500]	training's rmse: 0.171587	valid_1's rmse: 0.164233
[600]	training's rmse: 0.147176	valid_1's rmse: 0.138432
[700]	training's rmse: 0.135609	valid_1's rmse: 0.125652
[800]	training's rmse: 0.129512	valid_1's rmse: 0.119463
[900]	training's rmse: 0.125858	valid_1's rmse: 0.11558
[1000]	training's rmse: 0.123383	valid_1's rmse: 0.112933
[1100]	training's rmse: 0.121819	valid_1's rmse: 0.111345
[1200]	tra

In [46]:
pred = model.predict(df_eval_test[feature_cols])

In [47]:
df_eval_test['pred_qty'] = pred

In [48]:
predictions = df_eval_test[['id', 'date', 'pred_qty']]
predictions = pd.pivot(predictions, index = 'id', columns = 'date', values = 'pred_qty').reset_index()
predictions

date,id,NaN,2016-05-23,2016-05-24,2016-05-25,2016-05-26,2016-05-27,2016-05-28,2016-05-29,2016-05-30,2016-05-31,2016-06-01,2016-06-02,2016-06-03,2016-06-04,2016-06-05,2016-06-06,2016-06-07,2016-06-08,2016-06-09,2016-06-10,2016-06-11,2016-06-12,2016-06-13,2016-06-14,2016-06-15,2016-06-16,2016-06-17,2016-06-18,2016-06-19
0,FOODS_1_001_CA_1_validation,0.014686,-0.000637,-0.000637,-0.000637,-0.000637,-0.000637,-0.000357,-0.000357,-0.000637,-0.000637,-0.000637,-0.000637,-0.000637,-0.000357,-0.000357,-0.000637,-0.000637,-0.000637,-0.000637,-0.000637,-0.000357,-0.000357,-0.000637,-0.000637,-0.000637,-0.000637,-0.000637,-0.000357,-0.000357
1,FOODS_1_001_CA_2_validation,0.001337,0.001931,0.001931,0.001931,0.001931,0.001931,0.002233,0.002233,0.001931,0.001931,0.001931,0.001931,0.001931,0.002233,0.002233,0.001931,0.001931,0.001931,0.001931,0.001931,0.002233,0.002233,0.001931,0.001931,0.001931,0.001931,0.001931,0.002233,0.002233
2,FOODS_1_001_CA_3_validation,0.150235,0.000691,0.000691,0.000691,0.000691,0.000691,0.002439,0.002439,0.000691,0.000691,0.000691,0.000691,0.000691,0.002439,0.002439,0.000691,0.000691,0.000691,0.000691,0.000691,0.002439,0.002439,0.000691,0.000691,0.000691,0.000691,0.000691,0.002439,0.002439
3,FOODS_1_001_CA_4_validation,-0.035811,0.000139,0.000139,0.000139,0.000139,0.000139,-0.001154,-0.001154,0.000139,0.000139,0.000139,0.000139,0.000139,-0.001154,-0.001154,0.000139,0.000139,0.000139,0.000139,0.000139,-0.001154,-0.001154,0.000139,0.000139,0.000139,0.000139,0.000139,-0.001154,-0.001154
4,FOODS_1_001_TX_1_validation,-0.008844,-0.000399,-0.000399,-0.000399,-0.000399,-0.000399,-0.001588,-0.001588,-0.000399,-0.000399,-0.000399,-0.000399,-0.000399,-0.001588,-0.001588,-0.000399,-0.000399,-0.000399,-0.000399,-0.000399,-0.001588,-0.001588,-0.000399,-0.000399,-0.000399,-0.000399,-0.000399,-0.001588,-0.001588
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30485,HOUSEHOLD_2_516_TX_2_validation,-0.031467,-0.000404,-0.000404,-0.000404,-0.000404,-0.000404,-0.000953,-0.000953,-0.000404,-0.000404,-0.000404,-0.000404,-0.000404,-0.000953,-0.000953,-0.000404,-0.000404,-0.000404,-0.000404,-0.000404,-0.000953,-0.000953,-0.000404,-0.000404,-0.000404,-0.000404,-0.000404,-0.000953,-0.000953
30486,HOUSEHOLD_2_516_TX_3_validation,-0.031290,-0.000280,-0.000280,-0.000280,-0.000280,-0.000280,-0.000733,-0.000733,-0.000280,-0.000280,-0.000280,-0.000280,-0.000280,-0.000733,-0.000733,-0.000280,-0.000280,-0.000280,-0.000280,-0.000280,-0.000733,-0.000733,-0.000280,-0.000280,-0.000280,-0.000280,-0.000280,-0.000733,-0.000733
30487,HOUSEHOLD_2_516_WI_1_validation,-0.046290,-0.000104,-0.000104,-0.000104,-0.000104,-0.000104,-0.000708,-0.000708,-0.000104,-0.000104,-0.000104,-0.000104,-0.000104,-0.000708,-0.000708,-0.000104,-0.000104,-0.000104,-0.000104,-0.000104,-0.000708,-0.000708,-0.000104,-0.000104,-0.000104,-0.000104,-0.000104,-0.000708,-0.000708
30488,HOUSEHOLD_2_516_WI_2_validation,-0.034450,-0.001090,-0.001090,-0.001090,-0.001090,-0.001090,-0.000314,-0.000314,-0.001090,-0.001090,-0.001090,-0.001090,-0.001090,-0.000314,-0.000314,-0.001090,-0.001090,-0.001090,-0.001090,-0.001090,-0.000314,-0.000314,-0.001090,-0.001090,-0.001090,-0.001090,-0.001090,-0.000314,-0.000314


In [49]:
predictions = predictions.drop(predictions.columns[1], axis=1)

In [50]:
predictions.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

In [51]:
x = 2744099 + 1 - 853720
df_val = df_eval[x:]

In [52]:
predictions_v = df_val[['id', 'date', 'qty']]
predictions_v = pd.pivot(predictions_v, index = 'id', columns = 'date', values = 'qty').reset_index()

In [53]:
predictions_v['id'] = predictions['id'].apply(lambda x: x.replace('evaluation', 'validation'))

In [55]:
predictions_v.head()

date,id
0,FOODS_1_001_CA_1_validation
1,FOODS_1_001_CA_2_validation
2,FOODS_1_001_CA_3_validation
3,FOODS_1_001_CA_4_validation
4,FOODS_1_001_TX_1_validation


In [56]:
predictions_v.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

ValueError: ignored

In [57]:
predictions_v.columns

Index(['id'], dtype='object', name='date')

In [59]:
predictions_concat = pd.concat([predictions, predictions_v], axis=0)
predictions_concat.head()

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,-0.000637,-0.000637,-0.000637,-0.000637,-0.000637,-0.000357,-0.000357,-0.000637,-0.000637,-0.000637,-0.000637,-0.000637,-0.000357,-0.000357,-0.000637,-0.000637,-0.000637,-0.000637,-0.000637,-0.000357,-0.000357,-0.000637,-0.000637,-0.000637,-0.000637,-0.000637,-0.000357,-0.000357
1,FOODS_1_001_CA_2_validation,0.001931,0.001931,0.001931,0.001931,0.001931,0.002233,0.002233,0.001931,0.001931,0.001931,0.001931,0.001931,0.002233,0.002233,0.001931,0.001931,0.001931,0.001931,0.001931,0.002233,0.002233,0.001931,0.001931,0.001931,0.001931,0.001931,0.002233,0.002233
2,FOODS_1_001_CA_3_validation,0.000691,0.000691,0.000691,0.000691,0.000691,0.002439,0.002439,0.000691,0.000691,0.000691,0.000691,0.000691,0.002439,0.002439,0.000691,0.000691,0.000691,0.000691,0.000691,0.002439,0.002439,0.000691,0.000691,0.000691,0.000691,0.000691,0.002439,0.002439
3,FOODS_1_001_CA_4_validation,0.000139,0.000139,0.000139,0.000139,0.000139,-0.001154,-0.001154,0.000139,0.000139,0.000139,0.000139,0.000139,-0.001154,-0.001154,0.000139,0.000139,0.000139,0.000139,0.000139,-0.001154,-0.001154,0.000139,0.000139,0.000139,0.000139,0.000139,-0.001154,-0.001154
4,FOODS_1_001_TX_1_validation,-0.000399,-0.000399,-0.000399,-0.000399,-0.000399,-0.001588,-0.001588,-0.000399,-0.000399,-0.000399,-0.000399,-0.000399,-0.001588,-0.001588,-0.000399,-0.000399,-0.000399,-0.000399,-0.000399,-0.001588,-0.001588,-0.000399,-0.000399,-0.000399,-0.000399,-0.000399,-0.001588,-0.001588


In [60]:
predictions_concat.to_csv('submission.csv', index=False)

In [62]:
import os
os.getcwd()

'/content'