In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
import warnings
warnings.filterwarnings('ignore')
import joblib
from lightgbm import LGBMRegressor

# This code is adapted from THIEN1504's Kaggle notebook: https://www.kaggle.com/code/thien1504/m5-simple-fast-model

In [2]:
# Load the preprocessed feature table.
# NOTE(review): unpickling can execute arbitrary code -- only load a data.pkl you produced yourself.
data = pd.read_pickle('data.pkl')

# Frames that later cells fill with predictions through .loc assignment.
# Explicit copies make them independent of `data`, so those writes are
# unambiguous instead of relying on a (silenced) SettingWithCopyWarning.
in_valid_window = (data['d'] >= 1914) & (data['d'] < 1942)
valid = data.loc[in_valid_window, ['id', 'd', 'sold']].copy()
test = data.loc[data['d'] >= 1942, ['id', 'd', 'sold']].copy()

In [9]:
def lightgbm_model(X_train,y_train,X_valid,y_valid,N_e):
    """Fit an LGBMRegressor with early stopping and return the fitted model.

    Parameters
    ----------
    X_train, y_train : training features and target.
    X_valid, y_valid : hold-out features and target, passed as the second
        entry of ``eval_set``.
    N_e : int
        Upper bound on the number of boosting rounds (``n_estimators``).

    Returns
    -------
    LGBMRegressor
        The fitted estimator.

    Notes
    -----
    Early stopping and periodic logging are configured through callbacks.
    The ``early_stopping_rounds=`` / ``verbose=`` keyword arguments of
    ``fit()`` were deprecated and are rejected by LightGBM 4.x.
    """
    # Local import: the notebook's import cell only pulls in LGBMRegressor.
    from lightgbm import early_stopping, log_evaluation

    model = LGBMRegressor(
        n_estimators=N_e,
        learning_rate=0.3,
        # NOTE(review): LightGBM appears to apply row subsampling only when
        # subsample_freq > 0 (default 0), so this may be inert -- confirm.
        subsample=0.8,
        colsample_bytree=0.8,
        max_depth=8,
        num_leaves=224,
    )
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        eval_metric='rmse',
        callbacks=[
            # Stop after 20 rounds without improvement.
            early_stopping(stopping_rounds=20),
            # Print the evaluation metrics every 20 rounds.
            log_evaluation(period=20),
        ],
    )
    return model

In [4]:
# Candidate values for n_estimators: 50, 100, ..., 400.
N_e_list = list(range(50, 401, 50))

# The sweep uses store 0 only.
df = data[data['store_id'] == 0]

# Days before 1914 are used for fitting; days 1914..1941 for validation.
train_rows = df['d'] < 1914
valid_rows = (df['d'] >= 1914) & (df['d'] < 1942)
X_train, y_train = df[train_rows].drop(columns='sold'), df.loc[train_rows, 'sold']
X_valid, y_valid = df[valid_rows].drop(columns='sold'), df.loc[valid_rows, 'sold']

for N_e in N_e_list:
    print (N_e)
    model = lightgbm_model(X_train, y_train, X_valid, y_valid, N_e)
    # Only the printed evaluation log is kept; release the model right away.
    del model
    gc.collect()

50
[20]	training's rmse: 2.07691	training's l2: 4.31355	valid_1's rmse: 2.05543	valid_1's l2: 4.22479
[40]	training's rmse: 2.03671	training's l2: 4.14819	valid_1's rmse: 2.05072	valid_1's l2: 4.20545
100
[20]	training's rmse: 2.07691	training's l2: 4.31355	valid_1's rmse: 2.05543	valid_1's l2: 4.22479
[40]	training's rmse: 2.03671	training's l2: 4.14819	valid_1's rmse: 2.05072	valid_1's l2: 4.20545
[60]	training's rmse: 2.0051	training's l2: 4.02042	valid_1's rmse: 2.04763	valid_1's l2: 4.19279
150
[20]	training's rmse: 2.07691	training's l2: 4.31355	valid_1's rmse: 2.05543	valid_1's l2: 4.22479
[40]	training's rmse: 2.03671	training's l2: 4.14819	valid_1's rmse: 2.05072	valid_1's l2: 4.20545
[60]	training's rmse: 2.0051	training's l2: 4.02042	valid_1's rmse: 2.04763	valid_1's l2: 4.19279
200
[20]	training's rmse: 2.07691	training's l2: 4.31355	valid_1's rmse: 2.05543	valid_1's l2: 4.22479
[40]	training's rmse: 2.03671	training's l2: 4.14819	valid_1's rmse: 2.05072	valid_1's l2: 4.205

In [5]:
# Raw input tables: the calendar and the wide per-item sales table.
cal = pd.read_csv(
    './data/calendar.csv'
)
sales_train_evaluation = pd.read_csv(
    'data/sales_train_evaluation.csv'
)

In [6]:
def downcase(df):
    """Narrow the column dtypes of ``df`` in place to reduce memory use.

    Rules, applied per column:

    * integer columns -> the smallest of int8 / int16 / int32 / int64 whose
      range contains the column's min and max (bounds inclusive);
    * float columns -> the smallest of float16 / float32 / float64 whose
      finite range contains the column's min and max.  Only the range is
      checked, so narrowing can reduce precision;
    * object / pandas-string columns -> ``category``, except a column named
      ``'date'``, which is parsed as ``%Y-%m-%d`` datetimes;
    * every other dtype is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to shrink.  It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, for call chaining.
    """
    for col, dtype in df.dtypes.items():
        dtype_name = str(dtype)

        if 'int' in dtype_name:
            lo, hi = df[col].min(), df[col].max()
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                # Inclusive bounds: a column whose extreme equals the dtype
                # limit (e.g. 127 for int8) still fits that dtype.
                if limits.min <= lo and hi <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Nothing narrower fits (or min/max are NaN for an empty column).
                df[col] = df[col].astype(np.int64)

        elif 'float' in dtype_name:
            lo, hi = df[col].min(), df[col].max()
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= lo and hi <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                df[col] = df[col].astype(np.float64)

        # `np.object` was removed from NumPy; test the dtype through pandas
        # instead, and accept the dedicated pandas string dtype as well.
        elif pd.api.types.is_object_dtype(dtype) or isinstance(dtype, pd.StringDtype):
            if col == 'date':
                df[col] = pd.to_datetime(df[col], format='%Y-%m-%d')
            else:
                df[col] = df[col].astype('category')
    return df

In [7]:
# Narrow dtypes first; this also turns the string columns (including 'id')
# into pandas categoricals.
cal = downcase(cal)
sales_train_evaluation = downcase(sales_train_evaluation)

# Wide -> long: one row per (series, day column) with the value in 'sold',
# then attach the calendar attributes for that day.
catalog_all = pd.melt(
    sales_train_evaluation,
    id_vars=['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id'],
    var_name='d',
    value_name='sold',
)
catalog_all = pd.merge(catalog_all, cal, on='d', how='left')

# Lookup from integer category code back to the original id string.
# A categorical's code is the position of the value in `.cat.categories`,
# so enumerating the categories yields the same mapping as zipping the
# codes with the values row by row, without iterating every melted row.
d_id = dict(enumerate(catalog_all['id'].cat.categories))

del cal, sales_train_evaluation
# The original referenced `gc.collect` without calling it, so no collection ran.
gc.collect()

<function gc.collect(generation=2)>

In [8]:
for i in range(10):
    # Restrict to one store, then split its rows by day number:
    # d < 1914 -> fit, 1914 <= d < 1942 -> validation, d >= 1942 -> test.
    df = data[data['store_id'] == i]
    day = df['d']
    train_part = df[day < 1914]
    valid_part = df[(day >= 1914) & (day < 1942)]
    test_part = df[day >= 1942]

    X_train, y_train = train_part.drop(columns='sold'), train_part['sold']
    X_valid, y_valid = valid_part.drop(columns='sold'), valid_part['sold']
    X_test = test_part.drop(columns='sold')
    del day, train_part, valid_part, test_part

    print (f'-----Train model for store {i}------')
    model = lightgbm_model(X_train, y_train, X_valid, y_valid, 350)

    print (f'-----predicting for store {i}------')
    # Write the predictions over 'sold' in the matching rows of the
    # global `valid` / `test` frames (aligned on the row index).
    pred_val = model.predict(X_valid)
    valid.loc[X_valid.index, 'sold'] = pred_val
    pred_eva = model.predict(X_test)
    test.loc[X_test.index, 'sold'] = pred_eva

    print ('----------store model---------------')
    # Persist one model file per store.
    filename = f'model_store_LGBM_{i}.pkl'
    joblib.dump(model, filename)

    # Free the per-store objects before moving on to the next store.
    del model, X_train, y_train, X_valid, y_valid, X_test
    gc.collect()

-----Train model for store 0------
[20]	training's rmse: 2.07691	training's l2: 4.31355	valid_1's rmse: 2.05543	valid_1's l2: 4.22479
[40]	training's rmse: 2.03671	training's l2: 4.14819	valid_1's rmse: 2.05072	valid_1's l2: 4.20545
[60]	training's rmse: 2.0051	training's l2: 4.02042	valid_1's rmse: 2.04763	valid_1's l2: 4.19279
-----predicting for store 0------
----------store model---------------
-----Train model for store 1------
[20]	training's rmse: 1.67564	training's l2: 2.80776	valid_1's rmse: 1.93255	valid_1's l2: 3.73473
[40]	training's rmse: 1.65082	training's l2: 2.72519	valid_1's rmse: 1.93148	valid_1's l2: 3.73062
[60]	training's rmse: 1.63322	training's l2: 2.66741	valid_1's rmse: 1.93154	valid_1's l2: 3.73085
-----predicting for store 1------
----------store model---------------
-----Train model for store 2------
[20]	training's rmse: 2.78657	training's l2: 7.76495	valid_1's rmse: 2.42142	valid_1's l2: 5.86326
[40]	training's rmse: 2.71705	training's l2: 7.38238	valid_1'

In [10]:
# Submission template; later cells use only its 'id' column.
sample_sub = pd.read_csv(
    'data/sample_submission.csv'
)

In [11]:
# Only the 'id' column of the template is needed.
sample_sub = sample_sub[['id']]

# Output header: 'id' followed by F1..F28.
f_col = ['id'] + [f'F{i}' for i in range(1,29)]

# First part: map the integer id codes back to strings via d_id, pivot to
# one row per id with one column per day, and rename the ids from
# "...evaluation" to "...validation" before merging with the first 30490
# template rows.
# NOTE(review): presumably those 30490 rows are the validation ids -- confirm.
valid['id'] = valid['id'].map(d_id)
valid = valid.pivot(index='id', columns='d', values='sold').reset_index()
valid['id'] = valid['id'].str.replace("evaluation","validation")
out_val = sample_sub[:30490].merge(valid, on='id')
out_val.columns = f_col

# Second part: same reshaping for the test predictions, merged with the
# remaining template rows (ids are used as they are).
test['id'] = test['id'].map(d_id)
test = test.pivot(index='id', columns='d', values='sold').reset_index()
out_eva = sample_sub[30490:].merge(test, on='id')
out_eva.columns = f_col

# Stack both parts into one frame with a fresh 0..n-1 index.
submit = pd.concat([out_val, out_eva], ignore_index=True)

In [None]:
# index=False: without it to_csv also writes the row index as an extra,
# unnamed first column ahead of the `id` column.
submit.to_csv('submission_LGBM.csv', index=False)