In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import lightgbm as lgb
import time
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

%matplotlib inline

In [2]:
# Location of the M5 competition files, relative to this notebook.
inputDir = "../../input/m5-forecasting-accuracy/"

# Read the three input tables in a fixed order: calendar, sales history, prices.
calendar_df, train_df, prices_df = (
    pd.read_csv(inputDir + csv_name)
    for csv_name in ('calendar.csv', 'sales_train_validation.csv', 'sell_prices.csv')
)

# First and last day labels of the validation and evaluation windows.
validation_start, validation_end = "d_1914", "d_1941"
evaluation_start, evaluation_end = "d_1942", "d_1969"

In [3]:
# Build the base table of training features.
# Model the lingering influence of each event on the surrounding days:
# "<type>_forward" is 1 on an event day and decays over the following days,
# "<type>_back" is 1 on an event day and decays over the preceding days.
event_type = ["Cultural", "National", "Religious", "Sporting"]
forward = [name + "_forward" for name in event_type]
back = [name + "_back" for name in event_type]


def _event_decay(is_event, decay=0.5, floor=0.1):
    """Return a per-day score that is 1 on flagged days and decays afterwards.

    Parameters
    ----------
    is_event : sequence of bool
        One flag per day, in the order in which the decay should propagate.
    decay : float
        Factor applied to the running score after each day.
    floor : float
        A decayed score below this value is reset to 0.

    Returns
    -------
    numpy.ndarray
        float64 array with one score per entry of ``is_event``.
    """
    scores = np.zeros(len(is_event), dtype=float)
    score = 0.0
    for pos, hit in enumerate(is_event):
        if hit:
            score = 1.0
        scores[pos] = score
        # With the defaults the sequence after an event is 1, 0.5, 0.25, 0.125, 0.
        score *= decay
        if score < floor:
            score = 0.0
    return scores


for name, forward_col, back_col in zip(event_type, forward, back):
    # A day counts for this type if either of its two event slots carries it.
    is_event = (
        (calendar_df["event_type_1"] == name) | (calendar_df["event_type_2"] == name)
    ).to_numpy()
    # Whole float columns are assigned at once, so no integer column is ever
    # upcast by a scalar write.
    calendar_df[forward_col] = _event_decay(is_event)
    # Running the same decay over the reversed calendar spreads the score
    # onto the days that precede the event.
    calendar_df[back_col] = _event_decay(is_event[::-1])[::-1]

# Drop the columns that are not used as features.
# NOTE: this rebinds calendar_df without the event_type_* columns, so this cell
# cannot be re-run unless the calendar is loaded again first.
calendar_df = calendar_df.drop(columns=["date", "weekday", "event_type_1", "event_type_2", "event_name_1", "event_name_2"])
print(calendar_df.shape)
calendar_df.to_csv('tmp.csv', index=False)
calendar_df.head()

(1969, 16)


Unnamed: 0,wm_yr_wk,wday,month,year,d,snap_CA,snap_TX,snap_WI,Cultural_forward,Cultural_back,National_forward,National_back,Religious_forward,Religious_back,Sporting_forward,Sporting_back
0,11101,1,1,2011,d_1,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,11101,2,1,2011,d_2,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,11101,3,1,2011,d_3,0,0,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,11101,4,2,2011,d_4,1,1,0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,11101,5,2,2011,d_5,1,0,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
def predict(good_id):
    """Fit a LightGBM regressor for one sales series and forecast its future days.

    Reads the module-level ``train_df``, ``prices_df``, ``calendar_df``,
    ``validation_start`` and ``evaluation_end``.

    Parameters
    ----------
    good_id
        Row label of the series in ``train_df``.

    Returns
    -------
    pandas.DataFrame
        A single-column frame whose first two rows ('store', 'good') hold the
        store and item identifiers and whose remaining rows hold the predicted
        value for each day label from ``validation_start`` to ``evaluation_end``.
    """
    this_store = train_df.loc[good_id, "store_id"]
    this_good = train_df.loc[good_id, "item_id"]

    # Keep only the price rows of the chosen store and item.
    selected_price_df = prices_df[
        (prices_df["store_id"] == this_store) & (prices_df["item_id"] == this_good)
    ]

    # Join the calendar features with the prices on the shared week key.
    # The merge builds a fresh index, so the day label is installed as the
    # index only afterwards.
    data_df = pd.merge(calendar_df, selected_price_df, on="wm_yr_wk")
    data_df = data_df.set_index("d").drop(columns=["store_id", "item_id"])

    # Training data: from the first day that has a price up to the last day
    # with known sales.
    start_day = data_df.index[0]
    end_day = "d_1913"
    data_x = data_df.loc[start_day:end_day, :]
    # Pick the targets by the feature rows' own labels so that X and y always
    # stay aligned day for day.
    data_y = train_df.loc[good_id, data_x.index].astype('float')

    # NOTE: train_test_split shuffles the rows, so the hold-out days are
    # interleaved with the training days instead of forming a trailing window.
    train_x, valid_x, train_y, valid_y = train_test_split(data_x, data_y, test_size=0.33, random_state=0)

    # Model parameters
    metric = 'rmse'
    objective = 'regression'
    lgbm_params = {
        'boosting_type': 'gbdt',
        'objective': objective,
        'metric': metric,
    }

    lgb_train = lgb.Dataset(train_x, train_y)
    lgb_eval = lgb.Dataset(valid_x, valid_y)
    evals_result = {}

    # LightGBM 4.0 removed the early_stopping_rounds / evals_result /
    # verbose_eval keyword arguments of lgb.train; the equivalent callbacks
    # work on both old and new releases. The logging callback was named
    # print_evaluation before it became log_evaluation.
    log_evaluation = getattr(lgb, "log_evaluation", None) or lgb.print_evaluation
    gbm = lgb.train(
        params=lgbm_params,
        train_set=lgb_train,
        valid_sets=[lgb_train, lgb_eval],
        callbacks=[
            lgb.early_stopping(stopping_rounds=10),
            log_evaluation(period=100),
            lgb.record_evaluation(evals_result),
        ],
    )

    # Forecast the validation and evaluation windows in one go.
    pred_x = data_df.loc[validation_start:evaluation_end, :]
    dfa = pd.DataFrame([this_store, this_good], index=['store', 'good'])
    # Label the predictions with the days that were actually fed to the model
    # rather than with a hard-coded range.
    dfb = pd.DataFrame(gbm.predict(pred_x), index=pred_x.index.tolist())
    return pd.concat([dfa, dfb], axis=0)

In [5]:
# Forecast the first few series and time the whole run.
n_series = 10

start = time.time()
# Collect one row per series and concatenate once: growing a DataFrame inside
# the loop copies all earlier rows on every iteration.
rows = [predict(i).T for i in range(n_series)]
# ignore_index gives each series its own row label; without it every row
# would keep the label 0 it received from predict().
df = pd.concat(rows, axis=0, ignore_index=True)
elapsed_time = time.time() - start
print("elapsed_time:{0}".format(elapsed_time) + "[sec]")
df

Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	training's rmse: 0.509414	valid_1's rmse: 0.578985
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	training's rmse: 0.509414	valid_1's rmse: 0.578985
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	training's rmse: 0.509414	valid_1's rmse: 0.578985
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	training's rmse: 0.509414	valid_1's rmse: 0.578985
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	training's rmse: 0.509414	valid_1's rmse: 0.578985
Training until validation scores don't improve for 10 rounds
Early stopping, best iteration is:
[20]	training's rmse: 0.509414	valid_1's rmse: 0.578985
Training until validation scores don't improve for 10 rounds
Early stopping, best iterat

Unnamed: 0,store,good,d_1914,d_1915,d_1916,d_1917,d_1918,d_1919,d_1920,d_1921,...,d_1960,d_1961,d_1962,d_1963,d_1964,d_1965,d_1966,d_1967,d_1968,d_1969
0,CA_1,HOBBIES_1_002,0.068299,0.068299,0.0717508,0.12995,0.0646741,0.329452,0.457873,0.25302,...,0.202833,0.648553,0.522612,0.325376,0.283531,0.323352,0.363794,0.196221,0.620859,0.479103
0,CA_1,HOBBIES_1_002,0.068299,0.068299,0.0717508,0.12995,0.0646741,0.329452,0.457873,0.25302,...,0.202833,0.648553,0.522612,0.325376,0.283531,0.323352,0.363794,0.196221,0.620859,0.479103
0,CA_1,HOBBIES_1_002,0.068299,0.068299,0.0717508,0.12995,0.0646741,0.329452,0.457873,0.25302,...,0.202833,0.648553,0.522612,0.325376,0.283531,0.323352,0.363794,0.196221,0.620859,0.479103
0,CA_1,HOBBIES_1_002,0.068299,0.068299,0.0717508,0.12995,0.0646741,0.329452,0.457873,0.25302,...,0.202833,0.648553,0.522612,0.325376,0.283531,0.323352,0.363794,0.196221,0.620859,0.479103
0,CA_1,HOBBIES_1_002,0.068299,0.068299,0.0717508,0.12995,0.0646741,0.329452,0.457873,0.25302,...,0.202833,0.648553,0.522612,0.325376,0.283531,0.323352,0.363794,0.196221,0.620859,0.479103
0,CA_1,HOBBIES_1_002,0.068299,0.068299,0.0717508,0.12995,0.0646741,0.329452,0.457873,0.25302,...,0.202833,0.648553,0.522612,0.325376,0.283531,0.323352,0.363794,0.196221,0.620859,0.479103
0,CA_1,HOBBIES_1_002,0.068299,0.068299,0.0717508,0.12995,0.0646741,0.329452,0.457873,0.25302,...,0.202833,0.648553,0.522612,0.325376,0.283531,0.323352,0.363794,0.196221,0.620859,0.479103
0,CA_1,HOBBIES_1_002,0.068299,0.068299,0.0717508,0.12995,0.0646741,0.329452,0.457873,0.25302,...,0.202833,0.648553,0.522612,0.325376,0.283531,0.323352,0.363794,0.196221,0.620859,0.479103
0,CA_1,HOBBIES_1_002,0.068299,0.068299,0.0717508,0.12995,0.0646741,0.329452,0.457873,0.25302,...,0.202833,0.648553,0.522612,0.325376,0.283531,0.323352,0.363794,0.196221,0.620859,0.479103
0,CA_1,HOBBIES_1_002,0.068299,0.068299,0.0717508,0.12995,0.0646741,0.329452,0.457873,0.25302,...,0.202833,0.648553,0.522612,0.325376,0.283531,0.323352,0.363794,0.196221,0.620859,0.479103
