In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import time
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

%matplotlib inline

In [2]:
# Directory holding the competition input files.
inputDir = "../../input/m5-forecasting-accuracy/"

# Load the calendar, sales history and price tables in one pass.
calendar_df, train_df, prices_df = [
    pd.read_csv(inputDir + csv_name)
    for csv_name in ('calendar.csv', 'sales_train_validation.csv', 'sell_prices.csv')
]

# First and last day labels of the validation and evaluation windows.
validation_start, validation_end = "d_1914", "d_1941"
evaluation_start, evaluation_end = "d_1942", "d_1969"

In [3]:
# Build the base calendar features used for training.
# For each event type, model a small influence on the days around an event:
#   *_forward : score carried to the days following an event
#   *_back    : score carried to the days preceding an event
event_type = ["Cultural", "National", "Religious", "Sporting"]
forward = [name + "_forward" for name in event_type]
back = [name + "_back" for name in event_type]


def _event_decay(is_event, reverse=False):
    """Return a per-day score that is 1 on event days and halves each day after.

    The score sequence after an event is 1, 0.5, 0.25, 0.125 and then 0,
    because any carried value below 0.1 is reset to zero.

    Parameters
    ----------
    is_event : sequence of bool
        One flag per calendar day, True where the event occurs.
    reverse : bool, default False
        Walk the days from last to first, so the score spreads to the days
        *before* each event instead of after it.

    Returns
    -------
    numpy.ndarray
        Float array with one score per day, in the original day order.
    """
    n_days = len(is_event)
    scores = np.zeros(n_days, dtype=float)
    day_order = reversed(range(n_days)) if reverse else range(n_days)
    carried = 0.0
    for day in day_order:
        if is_event[day]:
            carried = 1.0
        scores[day] = carried
        carried /= 2
        if carried < 0.1:
            carried = 0.0
    return scores


# Assign whole float columns at once instead of writing cell by cell into
# integer columns; column order (forward, back per event type) is preserved.
for name, forward_col, back_col in zip(event_type, forward, back):
    is_event = ((calendar_df["event_type_1"] == name)
                | (calendar_df["event_type_2"] == name)).values
    calendar_df[forward_col] = _event_decay(is_event)
    calendar_df[back_col] = _event_decay(is_event, reverse=True)

# Drop the columns that are no longer needed.
calendar_df = calendar_df.drop(columns=["date", "weekday", "event_type_1", "event_type_2", "event_name_1", "event_name_2"])
print(calendar_df.shape)
calendar_df.to_csv('tmp.csv', index=False)
calendar_df.head()

(1969, 16)


Unnamed: 0,wm_yr_wk,wday,month,year,d,snap_CA,snap_TX,snap_WI,Cultural_forward,Cultural_back,National_forward,National_back,Religious_forward,Religious_back,Sporting_forward,Sporting_back
0,11101,1,1,2011,d_1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11101,2,1,2011,d_2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11101,3,1,2011,d_3,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11101,4,2,2011,d_4,1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11101,5,2,2011,d_5,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
def predict(good_id):
    """Fit a LightGBM regressor for one sales series and forecast its future days.

    Uses the module-level ``train_df``, ``prices_df`` and ``calendar_df`` frames
    and the ``validation_start`` / ``evaluation_end`` day labels.

    Parameters
    ----------
    good_id
        Row label in ``train_df`` selecting the item/store series to model.

    Returns
    -------
    pandas.DataFrame
        Single-column frame whose index is ``'store'``, ``'good'`` followed by
        the day labels of the predicted rows; values are the store id, the
        item id and the predicted sales.
    """
    this_store = train_df.loc[good_id, "store_id"]
    this_good = train_df.loc[good_id, "item_id"]

    # Keep only the price rows of the chosen item in the chosen store.
    is_selected = (prices_df["store_id"] == this_store) & (prices_df["item_id"] == this_good)
    selected_price_df = prices_df[is_selected]

    # Joins on the columns the two frames share.
    data_df = pd.merge(calendar_df, selected_price_df)
    # Merging rewrites the index, so set the day label as index afterwards.
    data_df.index = data_df["d"]
    # Drop identifier columns that are not features.
    data_df = data_df.drop(columns=["d", "store_id", "item_id"])

    # Training data: from the first day with a price row up to d_1913.
    start_day = data_df.index[0]
    end_day = "d_1913"
    data_x = data_df.loc[start_day:end_day, :]
    data_y = train_df.loc[good_id, start_day:end_day].T.astype('float')

    # NOTE(review): train_test_split shuffles by default, so the hold-out set
    # mixes days from the whole history rather than being the most recent days.
    train_x, valid_x, train_y, valid_y = train_test_split(data_x, data_y, test_size=0.33, random_state=0)

    # Model parameters.
    metric = 'rmse'
    objective = 'regression'
    lgbm_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metric,
        'verbose': -1
    }

    lgb_train = lgb.Dataset(train_x, train_y, params={'verbose': -1})
    lgb_eval = lgb.Dataset(valid_x, valid_y, params={'verbose': -1})

    # Early stopping is configured through a callback: the
    # early_stopping_rounds / evals_result / verbose_eval keywords of
    # lgb.train were removed in LightGBM 4.0.
    gbm = lgb.train(params=lgbm_params,
                    train_set=lgb_train,
                    valid_sets=[lgb_train, lgb_eval],
                    callbacks=[lgb.early_stopping(stopping_rounds=10, verbose=False)])

    pred_x = data_df.loc[validation_start:evaluation_end, :]
    dfa = pd.DataFrame([this_store, this_good], index=['store', 'good'])
    # Label predictions with the days actually present in pred_x instead of a
    # hard-coded range, so a missing price row cannot cause a length mismatch.
    dfb = pd.DataFrame(gbm.predict(pred_x), index=list(pred_x.index))
    return pd.concat([dfa, dfb], axis=0)

In [5]:
# Forecast the first 10 series and time the whole run.
start = time.time()
# Build all per-series rows first and concatenate once; growing a DataFrame
# with concat inside a loop copies the accumulated frame on every iteration.
df = pd.concat([predict(i).T for i in range(10)], axis=0)
elapsed_time = time.time() - start
print ("elapsed_time:{0}".format(elapsed_time) + "[sec]")
df

elapsed_time:5.340412855148315[sec]


Unnamed: 0,store,good,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
0,CA_1,HOBBIES_1_001,0.628288,0.726315,0.726315,0.463902,0.463902,0.815791,0.628288,0.628288,...,0.463902,0.672995,0.628288,0.628288,0.726315,0.726315,0.463902,0.463902,0.672995,0.628288
0,CA_1,HOBBIES_1_002,0.068299,0.068299,0.0717508,0.12995,0.0646741,0.329452,0.457873,0.25302,...,0.202833,0.648553,0.522612,0.325376,0.283531,0.323352,0.363794,0.196221,0.620859,0.479103
0,CA_1,HOBBIES_1_003,0.451548,0.484937,0.515311,0.57261,0.57261,0.853484,0.63692,0.388258,...,0.518831,0.901316,0.740689,0.532594,0.484937,0.562968,0.58212,0.58212,0.853484,0.607101
0,CA_1,HOBBIES_1_004,1.61463,1.37661,1.55635,2.1429,1.9982,2.8932,3.79893,1.57495,...,1.60188,2.45432,3.02369,1.51992,1.40821,1.47311,1.686,1.76992,2.68381,3.31693
0,CA_1,HOBBIES_1_005,1.37109,1.40896,1.45933,1.64118,1.64118,1.34993,1.64289,0.896183,...,1.15367,1.23622,1.32149,1.2787,1.40896,1.33752,1.69274,1.5877,1.5483,1.32877
0,CA_1,HOBBIES_1_006,0.858192,0.858192,1.04748,1.04748,1.04748,0.858192,0.830346,0.830346,...,1.01964,0.858192,0.858192,0.858192,0.858192,1.04748,1.04748,1.04748,0.858192,0.858192
0,CA_1,HOBBIES_1_007,0.303748,0.303748,0.303748,0.284884,0.284884,0.314443,0.314443,0.303748,...,0.332769,0.314443,0.314443,0.303748,0.303748,0.303748,0.332769,0.332769,0.314443,0.314443
0,CA_1,HOBBIES_1_008,6.09614,5.05064,5.87224,5.75855,6.12085,7.11396,7.98272,5.89815,...,6.65122,9.07724,8.92692,7.18186,4.95972,6.51504,7.52878,7.29095,7.88928,8.36981
0,CA_1,HOBBIES_1_009,0.989713,0.989713,0.989713,0.989713,0.989713,0.989713,0.989713,0.989713,...,0.989713,0.989713,0.989713,0.989713,0.989713,0.989713,0.989713,0.989713,0.989713,0.989713
0,CA_1,HOBBIES_1_010,0.629248,0.629248,0.629248,0.683674,0.683674,0.542379,0.759083,0.523793,...,0.60499,0.594278,0.735852,0.629248,0.567512,0.567512,0.648708,0.648708,0.586021,0.759083
