## 参考  
### kaggle notebook lightGBM
https://www.kaggle.com/girmdshinsei/for-japanese-beginner-with-wrmsse-in-lgbm?scriptVersionId=31044557

### lightGBM 時系列
https://qiita.com/ground0state/items/657861de619a4e4a30de

### Quick start catboost
https://catboost.ai/docs/concepts/python-quickstart.html

### Pythonでcatboostを使ってみる
#### (cat_featuresの使い方を調べた)
https://qiita.com/shin_mura/items/3d9ce25a60bdd25a3333

### XGBoostパラメータのまとめとランダムサーチ実装
https://qiita.com/FJyusk56/items/0649f4362587261bd57a

In [None]:
import pandas as pd
import os
import gc
import numpy as np
import matplotlib.pyplot as plt

# ラベルエンコーダー
from sklearn import preprocessing, metrics

# lightGBM
import lightgbm as lgb

# CatBoost
from catboost import Pool, CatBoostRegressor

# XGBoost
import xgboost as xgb

# 1セルでまとめて.head()、.tail()等を入力しても大丈夫になる
from IPython.display import display

# 誤差算定
from sklearn.metrics import mean_absolute_error

# 二乗平均平方根誤差 (RMSE)
from sklearn.metrics import mean_squared_error

# 決定係数
from sklearn.metrics import r2_score

import warnings
warnings.filterwarnings('ignore')

# DataFrameの表示数を変更
pd.set_option('max_columns', 500)
pd.set_option('max_rows', 500)

### データ作成

In [None]:
# ローカル用
path = os.getcwd() + "/"

# kaggle Notebook用
INPUT_DIR = '../input/m5-forecasting-accuracy'

calendar.csv -製品の販売日に関する情報が含まれています。
sales_train_validation.csv -製品および店舗ごとの過去の毎日の販売台数データが​​含まれています [d_1 - d_1913]
sample_submission.csv-提出の正しい形式。詳細については、「評価」タブを参照してください。
sell_prices.csv -店舗および日付ごとに販売された製品の価格に関する情報が含まれています。

sales_train_evaluation.csv-締め切りの1か月前に1回ご利用いただけます。売上高が含まれます[d_1 - d_1941]

各行は含むidの連結であるitem_idとstore_idのいずれかである、validation（公共のランキングに対応する）、またはevaluation（プライベートランキングに対応します）。　　 F1-F28各行で販売されるアイテムの28予測日（）を予測しています。　　 以下のためのvalidation行、これに相当するd_1914 - d_1941、とのためevaluationの行、これに相当しますd_1942 - d_1969。　　

validation d_1914 - d_1941の単価と量を予測する。１か月前に１回公開される。 evaluation d_1942-d_1969の単価と量を予測する

### 作成データの読込

In [3]:
try:
    stv_melt = pd.read_pickle(path + "melt_stv.pkl")
except FileNotFoundError:
    1

### 通常通りcsvを読み込んでデータを作る

In [None]:
# sales_train_validation.csv
try:
    stv = pd.read_csv(path + "sales_train_validation.csv") # ローカル用
except FileNotFoundError:
    stv = pd.read_csv(f"{INPUT_DIR}/sales_train_validation.csv") # kaggle用

    
# calendar.csv
try:
    cal = pd.read_csv(path + "calendar.csv") # ローカル用
except FileNotFoundError:
    cal = pd.read_csv(f"{INPUT_DIR}/calendar.csv") # kaggle用

    
# sell_prices.csv
try:
    price = pd.read_csv(path + "sell_prices.csv") # ローカル用
except FileNotFoundError:
    price = pd.read_csv(f"{INPUT_DIR}/sell_prices.csv") # kaggle用

    
# sample_submission.csv
try:
    ss = pd.read_csv(path + "sample_submission.csv") # ローカル用
except FileNotFoundError:
    ss = pd.read_csv(f"{INPUT_DIR}/sample_submission.csv") # kaggle用

stv.shape

### ロースペックマシン限定

stv = pd.read_csv(path + "sales_train_validation.csv",
                               skiprows=lambda x: x not in range(0,1001))

cal = pd.read_csv(path + "calendar.csv",
                               skiprows=lambda x: x not in range(0,3001))

price = pd.read_csv(path + "sell_prices.csv",
                               skiprows=lambda x: x not in range(0,3001))

ss = pd.read_csv(path + "sample_submission.csv",
                               skiprows=lambda x: x not in range(0,1001))

In [None]:
display(stv.head())
display(stv.tail())
display(stv.dtypes)
display(cal.head())
display(cal.tail())
display(cal.dtypes)
display(cal.max())
display(price.head())
display(price.tail())
display(price.dtypes)
display(price.max())
display(price.shape)
display(ss.head())
display(ss.tail())
display(ss.shape)

In [None]:
day1_1913 = [f"d_{i}" for i in range(1, 1914)]

In [None]:
stv_melt =  pd.melt(stv, id_vars=['id','store_id','item_id'],
           value_vars=day1_1913,
           var_name = "d", value_name = "vol")

In [None]:
del day1_1913
gc.collect()

In [None]:
product = stv[["id","item_id","store_id"]]

In [None]:
ss_val = ss[0:30490]
ss_val.columns = ["id"] + [f"d_{d}" for d in range(1914, 1942)]

ss_eva = ss[30490:60980]
ss_eva.columns = ["id"] + [f"d_{d}" for d in range(1942, 1970)]

In [None]:
ss_eva['id'] = ss_eva['id'].str.replace('_evaluation','_validation')

In [None]:
ss_val = pd.merge(ss_val, product, how = 'left', left_on = ['id'], right_on = ['id'])
ss_eva = pd.merge(ss_eva, product, how = 'left', left_on = ['id'], right_on = ['id'])

In [None]:
display(ss_val.head(3))
display(ss_val.tail(3))
display(ss_val.shape)
display(ss_eva.head(3))
display(ss_eva.tail(3))
display(ss_eva.shape)

In [None]:
val_1914_1941 = [f"d_{i}" for i in range(1914, 1942)]
eva_1942_1969 = [f"d_{i}" for i in range(1942, 1970)]

In [None]:
val_melt =  pd.melt(ss_val, id_vars=['id','store_id', "item_id"],
           value_vars=val_1914_1941,
           var_name = "d", value_name = "vol")
eva_melt =  pd.melt(ss_eva, id_vars=['id','store_id', "item_id"],
           value_vars=eva_1942_1969,
           var_name = "d", value_name = "vol")

In [None]:
stv_melt = pd.concat([stv_melt, val_melt, eva_melt])

In [None]:
display(stv_melt.head(3))
display(stv_melt.tail(3))
display(stv_melt.shape)

In [None]:
del ss, ss_val, ss_eva, val_1914_1941, eva_1942_1969, val_melt, eva_melt, product
gc.collect()

In [None]:
cal = cal[["date","wm_yr_wk","d","event_name_1","event_type_1","event_name_2","event_type_2"]]

In [None]:
stv_melt = pd.merge(stv_melt, cal, how = 'left', left_on = ['d'], right_on = ['d'])

In [None]:
del cal
gc.collect()

In [None]:
display(stv_melt.head())
display(stv_melt.tail())
display(stv_melt.dtypes)
display(stv_melt.shape)

In [None]:
stv_melt = stv_melt.merge(price, on = ['store_id', 'item_id', 'wm_yr_wk'], how = 'left')

In [None]:
del price
gc.collect()

In [None]:
display(stv_melt.head())
display(stv_melt.tail())
display(stv_melt.dtypes)
display(stv_melt.shape)

### 特徴量作成

In [None]:
stv_melt["date2"] = pd.to_datetime(stv_melt["date"])

In [None]:
stv_melt["year"] = stv_melt["date2"].dt.year
stv_melt["month"] = stv_melt["date2"].dt.month
stv_melt["week"] = stv_melt["date2"].dt.week
stv_melt["day"] = stv_melt["date2"].dt.day
stv_melt["dayofweek"] = stv_melt["date2"].dt.dayofweek

stv_melt["year"] = stv_melt["year"].astype('int8')
stv_melt["month"] = stv_melt["month"].astype('int8')
stv_melt["week"] = stv_melt["week"].astype('int8')
stv_melt["day"] = stv_melt["day"].astype('int8')
stv_melt["dayofweek"] = stv_melt["dayofweek"].astype('int8')

In [None]:
stv_melt = stv_melt.drop("date2", axis=1)

In [None]:
#　ラグの作成
for i in [7,30,90]:
    stv_melt['shift%s'%i] = stv_melt["vol"].shift(i)

In [None]:
# 平均値
'''
for i in [7,30,90]:
    stv_melt['mean%s'%i] = stv_melt["vol"].rolling(i).mean()
'''

In [None]:
# 中央値
'''
for i in [7,30,90]:
    stv_melt['median%s'%i] = stv_melt["vol"].rolling(i).median()
'''

In [None]:
# 最小値
'''
for i in [7,30,90]:
    stv_melt['min%s'%i] = stv_melt["vol"].rolling(i).min()
'''

In [None]:
display(stv_melt.head(3))
display(stv_melt.tail(3))
display(stv_melt.dtypes)

In [None]:
stv_melt["vol"] = stv_melt[["vol"]].astype('int16')
stv_melt["wm_yr_wk"] = stv_melt[ "wm_yr_wk"].astype('int16')

In [None]:
stv_melt["sell_price"] = stv_melt["sell_price"].astype('float16')
stv_melt["shift7"] = stv_melt["shift7"].astype('float16')

In [None]:
stv_melt["shift30"] = stv_melt["shift30"].astype('float16')
stv_melt["shift90"] = stv_melt["shift90"].astype('float16')

In [None]:
# ラベルエンコーダー
lbl = preprocessing.LabelEncoder()
stv_melt["store_id"] = lbl.fit_transform(stv_melt["store_id"])
stv_melt["item_id"] = lbl.fit_transform(stv_melt["item_id"])

stv_melt["store_id"] = stv_melt["store_id"].astype('int8')
stv_melt["item_id"] = stv_melt["item_id"].astype('int8')

In [None]:
# event は欠損値があるので前処理
stv_melt["event_name_1"] = stv_melt["event_name_1"].fillna("missing", inplace=True)
stv_melt["event_type_1"] = stv_melt["event_type_1"].fillna("missing", inplace=True)
stv_melt["event_name_2"] = stv_melt["event_name_2"].fillna("missing", inplace=True)
stv_melt["event_type_2"] = stv_melt["event_type_2"].fillna("missing", inplace=True)

stv_melt["event_name_1"] = lbl.fit_transform(stv_melt["event_name_1"])
stv_melt["event_type_1"] = lbl.fit_transform(stv_melt["event_type_1"])
stv_melt["event_name_2"] = lbl.fit_transform(stv_melt["event_name_2"])
stv_melt["event_type_2"] = lbl.fit_transform(stv_melt["event_type_2"])

stv_melt["event_name_1"] = stv_melt["event_name_1"].astype('int8')
stv_melt["event_name_2"] = stv_melt["event_name_2"].astype('int8')
stv_melt["event_type_1"] = stv_melt["event_type_1"].astype('int8')
stv_melt["event_type_2"] = stv_melt["event_type_2"].astype('int8')

In [None]:
display(stv_melt.head(3))
display(stv_melt.tail(3))
display(stv_melt.dtypes)

### 学習用データセットの作成

In [7]:
x_train = stv_melt[stv_melt['date'] <= '2016-03-27']
y_train = x_train['vol']
x_val   = stv_melt[(stv_melt['date'] > '2016-03-27') & (stv_melt['date'] <= '2016-04-24')]
y_val   = x_val['vol']
test    = stv_melt[(stv_melt['date'] > '2016-04-24')]

In [8]:
display(test.head())
display(test.tail())
display(test.dtypes)
display(test.shape)

Unnamed: 0,id,store_id,item_id,d,vol,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,sell_price,year,month,week,day,dayofweek,shift7,shift30,shift90
58327370,HOBBIES_1_001_CA_1_validation,0,-99,d_1914,0,2016-04-25,11613,0,0,0,0,8.382812,-32,4,17,25,0,0.0,2.0,0.0
58327371,HOBBIES_1_002_CA_1_validation,0,-98,d_1914,0,2016-04-25,11613,0,0,0,0,3.970703,-32,4,17,25,0,1.0,0.0,2.0
58327372,HOBBIES_1_003_CA_1_validation,0,-97,d_1914,0,2016-04-25,11613,0,0,0,0,2.970703,-32,4,17,25,0,1.0,10.0,12.0
58327373,HOBBIES_1_004_CA_1_validation,0,-96,d_1914,0,2016-04-25,11613,0,0,0,0,4.640625,-32,4,17,25,0,0.0,2.0,0.0
58327374,HOBBIES_1_005_CA_1_validation,0,-95,d_1914,0,2016-04-25,11613,0,0,0,0,2.880859,-32,4,17,25,0,0.0,0.0,4.0


Unnamed: 0,id,store_id,item_id,d,vol,date,wm_yr_wk,event_name_1,event_type_1,event_name_2,event_type_2,sell_price,year,month,week,day,dayofweek,shift7,shift30,shift90
60034805,FOODS_3_823_WI_3_validation,9,-104,d_1969,0,2016-06-19,11621,0,0,0,0,2.980469,-32,6,24,19,6,0.0,0.0,0.0
60034806,FOODS_3_824_WI_3_validation,9,-103,d_1969,0,2016-06-19,11621,0,0,0,0,2.480469,-32,6,24,19,6,0.0,0.0,0.0
60034807,FOODS_3_825_WI_3_validation,9,-102,d_1969,0,2016-06-19,11621,0,0,0,0,3.980469,-32,6,24,19,6,0.0,0.0,0.0
60034808,FOODS_3_826_WI_3_validation,9,-101,d_1969,0,2016-06-19,11621,0,0,0,0,1.280273,-32,6,24,19,6,0.0,0.0,0.0
60034809,FOODS_3_827_WI_3_validation,9,-100,d_1969,0,2016-06-19,11621,0,0,0,0,1.0,-32,6,24,19,6,0.0,0.0,0.0


id               object
store_id           int8
item_id            int8
d                object
vol               int16
date             object
wm_yr_wk          int16
event_name_1       int8
event_type_1       int8
event_name_2       int8
event_type_2       int8
sell_price      float16
year               int8
month              int8
week               int8
day                int8
dayofweek          int8
shift7          float16
shift30         float16
shift90         float16
dtype: object

(1707440, 20)

In [9]:
del stv_melt
gc.collect()

22

### lightGBM モデルの実行

In [10]:
features = [
    "store_id",
    "item_id",
    "sell_price",
    "shift7",
    "shift30",
    "shift90",
    "event_name_1",
    "event_type_1",
    "event_name_2",
    "event_type_2",
    "year",
    "month",
    "week",
    "day",
    "dayofweek"
]

In [11]:
params = {
    'boosting_type': 'gbdt',
    'metric': 'rmse',
    'objective': 'regression',
    'n_jobs': -1,
    'seed': 236,
    'learning_rate': 0.1,
    'bagging_fraction': 0.75,
    'bagging_freq': 10, 
    'colsample_bytree': 0.75
}

In [12]:
train_set = lgb.Dataset(x_train[features], y_train)
val_set = lgb.Dataset(x_val[features], y_val)

In [13]:
model_lgb = lgb.train(params,
                      train_set,
                      num_boost_round = 3,# 2500
                      early_stopping_rounds = 50,
                      valid_sets = [train_set, val_set],
                      verbose_eval = 100)

Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[3]	training's rmse: 3.74209	valid_1's rmse: 3.46698


In [14]:
val_pred = model_lgb.predict(x_val[features])
val_score = np.sqrt(metrics.mean_squared_error(val_pred, y_val))
print(f'Our val rmse score は {val_score}')

Our val rmse score は 3.466979875948609


In [15]:
y_pred = model_lgb.predict(test[features])
test['vol'] = y_pred

In [16]:
predictions = test[['id', 'date', 'vol']]
predictions = pd.pivot(predictions, index = 'id', columns = 'date', values = 'vol').reset_index()

In [17]:
display(predictions.head())
display(predictions.tail())
display(predictions.shape)

date,id,2016-04-25,2016-04-26,2016-04-27,2016-04-28,2016-04-29,2016-04-30,2016-05-01,2016-05-02,2016-05-03,2016-05-04,2016-05-05,2016-05-06,2016-05-07,2016-05-08,2016-05-09,2016-05-10,2016-05-11,2016-05-12,2016-05-13,2016-05-14,2016-05-15,2016-05-16,2016-05-17,2016-05-18,2016-05-19,2016-05-20,2016-05-21,2016-05-22,2016-05-23,2016-05-24,2016-05-25,2016-05-26,2016-05-27,2016-05-28,2016-05-29,2016-05-30,2016-05-31,2016-06-01,2016-06-02,2016-06-03,2016-06-04,2016-06-05,2016-06-06,2016-06-07,2016-06-08,2016-06-09,2016-06-10,2016-06-11,2016-06-12,2016-06-13,2016-06-14,2016-06-15,2016-06-16,2016-06-17,2016-06-18,2016-06-19
0,FOODS_1_001_CA_1_validation,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615
1,FOODS_1_001_CA_2_validation,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615
2,FOODS_1_001_CA_3_validation,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519
3,FOODS_1_001_CA_4_validation,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261
4,FOODS_1_001_TX_1_validation,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176


date,id,2016-04-25,2016-04-26,2016-04-27,2016-04-28,2016-04-29,2016-04-30,2016-05-01,2016-05-02,2016-05-03,2016-05-04,2016-05-05,2016-05-06,2016-05-07,2016-05-08,2016-05-09,2016-05-10,2016-05-11,2016-05-12,2016-05-13,2016-05-14,2016-05-15,2016-05-16,2016-05-17,2016-05-18,2016-05-19,2016-05-20,2016-05-21,2016-05-22,2016-05-23,2016-05-24,2016-05-25,2016-05-26,2016-05-27,2016-05-28,2016-05-29,2016-05-30,2016-05-31,2016-06-01,2016-06-02,2016-06-03,2016-06-04,2016-06-05,2016-06-06,2016-06-07,2016-06-08,2016-06-09,2016-06-10,2016-06-11,2016-06-12,2016-06-13,2016-06-14,2016-06-15,2016-06-16,2016-06-17,2016-06-18,2016-06-19
30485,HOUSEHOLD_2_516_TX_2_validation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092
30486,HOUSEHOLD_2_516_TX_3_validation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.48233,1.48233,1.462784,1.462784,1.462784,1.462784,1.462784,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092
30487,HOUSEHOLD_2_516_WI_1_validation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092
30488,HOUSEHOLD_2_516_WI_2_validation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092
30489,HOUSEHOLD_2_516_WI_3_validation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092


(30490, 57)

In [18]:
del params, model_lgb, y_pred
gc.collect()

44

In [19]:
pre_val = predictions.iloc[:,:29]

In [20]:
pre_eva = pd.concat([predictions.iloc[:,0],predictions.iloc[:,29:57]], axis=1)
pre_eva['id'] = pre_eva['id'].str.replace('_validation', '_evaluation')

In [21]:
del predictions
gc.collect()

78

In [22]:
pre_val.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
pre_eva.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

In [23]:
display(pre_val.head())
display(pre_val.tail())
display(pre_val.shape)

display(pre_eva.head())
display(pre_eva.tail())
display(pre_eva.shape)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_validation,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615
1,FOODS_1_001_CA_2_validation,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615
2,FOODS_1_001_CA_3_validation,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519
3,FOODS_1_001_CA_4_validation,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261
4,FOODS_1_001_TX_1_validation,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
30485,HOUSEHOLD_2_516_TX_2_validation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092
30486,HOUSEHOLD_2_516_TX_3_validation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.48233,1.48233,1.462784,1.462784,1.462784,1.462784,1.462784,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092
30487,HOUSEHOLD_2_516_WI_1_validation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092
30488,HOUSEHOLD_2_516_WI_2_validation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092
30489,HOUSEHOLD_2_516_WI_3_validation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092


(30490, 29)

Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
0,FOODS_1_001_CA_1_evaluation,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615
1,FOODS_1_001_CA_2_evaluation,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615,1.164546,1.164546,1.164546,1.164546,1.164546,1.200615,1.200615
2,FOODS_1_001_CA_3_evaluation,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519,1.220519
3,FOODS_1_001_CA_4_evaluation,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261,1.093261
4,FOODS_1_001_TX_1_evaluation,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176,1.10863,1.10863,1.10863,1.10863,1.10863,1.128176,1.128176


Unnamed: 0,id,F1,F2,F3,F4,F5,F6,F7,F8,F9,F10,F11,F12,F13,F14,F15,F16,F17,F18,F19,F20,F21,F22,F23,F24,F25,F26,F27,F28
30485,HOUSEHOLD_2_516_TX_2_evaluation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092
30486,HOUSEHOLD_2_516_TX_3_evaluation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092
30487,HOUSEHOLD_2_516_WI_1_evaluation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092
30488,HOUSEHOLD_2_516_WI_2_evaluation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092
30489,HOUSEHOLD_2_516_WI_3_evaluation,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092,1.013545,1.013545,1.013545,1.013545,1.013545,1.033092,1.033092


(30490, 29)

### 誤差率

In [24]:
sta = pd.read_csv(path + "sales_train_evaluation.csv")
sta = sta[["d_1914", "d_1915","d_1916","d_1917","d_1918","d_1919","d_1920","d_1921","d_1922","d_1923","d_1924","d_1925","d_1926","d_1927","d_1928","d_1929","d_1930","d_1931","d_1932","d_1933","d_1934","d_1935","d_1936","d_1937","d_1938","d_1939","d_1940","d_1941"]]
sta.columns = ["F1", "F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","F25","F26","F27","F28"]

for i in ["F1", "F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","F25","F26","F27","F28"]:
    sta[i] = sta[i].astype('float64')

In [25]:
pre_val_temp = pre_val.drop("id",axis=1)

In [26]:
lgb_score = []

lgb_score.append(np.sqrt(mean_squared_error(sta, pre_val_temp)))
lgb_score.append(mean_absolute_error(sta, pre_val_temp))
lgb_score.append(r2_score(sta, pre_val_temp))

for i in range(3):
    print(lgb_score[i])

f = open('lgb_score.txt', 'w')
for x in lgb_score:
    f.write(str(x) + "\n")
f.close()

3.668300951468225
1.6090214481293426
-0.01735749727012554


In [27]:
del pre_val_temp, pre_val, lgb_score
gc.collect()

66

In [28]:
pre_uni = pd.concat([sta, pre_eva], axis=0)

In [29]:
pre_uni.to_csv('submission_lgb.csv', index = False)

In [30]:
del sta, pre_uni
gc.collect()

66

### catboostの実行

In [None]:
features = [
    "store_id",
    "item_id",
    "sell_price",
    "shift7",
    "shift30",
    "shift90",
    "event_name_1",
    "event_type_1",
    "event_name_2",
    "event_type_2",
    "year",
    "month",
    "week",
    "day",
    "dayofweek"
]

In [None]:
# initialize Pool
train_pool = Pool(x_train[features], 
                  y_train)

test_pool = Pool(test[features])

In [None]:
# specify the training parameters
model2 = CatBoostRegressor(iterations=3,# 1000
                          depth=5,
                          learning_rate=0.2,# 0.05
                          loss_function='RMSE')

In [None]:
#train the model
model2.fit(train_pool)

In [None]:
# make the prediction using the resulting model
preds2 = model2.predict(test_pool)

In [None]:
del model2
gc.collect()

In [None]:
test2 = test
test2['vol'] = preds2

In [None]:
predictions2 = test2[['id', 'date', 'vol']]
predictions2 = pd.pivot(predictions2, index = 'id', columns = 'date', values = 'vol').reset_index()

In [None]:
del train_pool, test_pool, test2, preds2
gc.collect()

In [None]:
pre_val2 = predictions2.iloc[:,:29]

In [None]:
pre_eva2 = pd.concat([predictions2.iloc[:,0],predictions2.iloc[:,29:57]], axis=1)
pre_eva2['id'] = pre_eva2['id'].str.replace('_validation', '_evaluation')

In [None]:
del predictions2
gc.collect()

In [None]:
pre_val2.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
pre_eva2.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

In [None]:
display(pre_val2.head())
display(pre_val2.tail())
display(pre_val2.shape)

display(pre_eva2.head())
display(pre_eva2.tail())
display(pre_eva2.shape)

### 誤差率

In [None]:
sta = pd.read_csv(path + "sales_train_evaluation.csv")
sta = sta[["d_1914", "d_1915","d_1916","d_1917","d_1918","d_1919","d_1920","d_1921","d_1922","d_1923","d_1924","d_1925","d_1926","d_1927","d_1928","d_1929","d_1930","d_1931","d_1932","d_1933","d_1934","d_1935","d_1936","d_1937","d_1938","d_1939","d_1940","d_1941"]]
sta.columns = ["F1", "F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","F25","F26","F27","F28"]

for i in ["F1", "F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","F25","F26","F27","F28"]:
    sta[i] = sta[i].astype('float64')

In [None]:
pre_val2_temp = pre_val2.drop("id",axis=1)

In [None]:
ctb_score = []

ctb_score.append(np.sqrt(mean_squared_error(sta, pre_val2_temp)))
ctb_score.append(mean_absolute_error(sta, pre_val2_temp))
ctb_score.append(r2_score(sta, pre_val2_temp))

for i in range(3):
    print(cbt_score[i])

f = open('ctb_score.txt', 'w')
for x in ctb_score:
    f.write(str(x) + "\n")
f.close()

In [None]:
del pre_val2_temp, pre_val2, ctb_score
gc.collect()

In [None]:
pre_uni2 = pd.concat([sta, pre_eva2], axis=0)

In [None]:
pre_uni2.to_csv('submission_ctb.csv', index = False)

In [None]:
del sta, pre_uni2, pre_eva2, features
gc.collect()

### XGboost モデルの実行

In [None]:
xgb_features = [
    "store_id",
    "sell_price",
    "event_name_1",
    "event_type_1",
    "event_name_2",
    "event_type_2",
    "year",
    "month",
    "week",
    "day"
]

In [None]:
# 学習用のパラメータ
xgb_params = {
        # 回帰問題
        'objective': 'reg:linear',
        # 学習用の指標 (RMSE)
        'eval_metric': 'rmse',
    }
# param['nthread'] = 4
# param['eval_metric'] = 'auc'

In [None]:
dtrain = xgb.DMatrix(x_train[xgb_features], label=y_train)
dtest = xgb.DMatrix(test[xgb_features], label=test['vol'])
deval= xgb.DMatrix(x_val[xgb_features], label=y_val)

In [None]:
display(x_train[xgb_features].shape)
display(y_train.shape)
display(x_val[xgb_features].shape)
display(y_val.shape)

In [None]:
evals = [(dtrain, 'train'), (deval, 'eval')]

In [None]:
evals_result = {}

In [None]:
model3 = xgb.train(xgb_params,
                    dtrain,
                    num_boost_round=3,#800
                    early_stopping_rounds=1,#10
                    evals=evals,
                    evals_result=evals_result,
                    )

In [None]:
# 検証用データが各クラスに分類される確率を計算する
y_pred3 = model3.predict(dtest)

In [None]:
del model3, xgb_params, xgb_features, dtrain, dtest, evals, x_val, y_val, 
gc.collect()

In [None]:
# y_pred3 の保存、読込
a = np.loadtxt(path + "y_pred.csv")
np.savetxt(path + "y_pred.csv", y_pred3)

In [None]:
display(len(y_pred3))
display(test.head())
display(test.shape)

In [None]:
test3 = test
test3['vol'] = y_pred3

In [None]:
predictions3 = test3[['id', 'date', 'vol']]
predictions3 = pd.pivot(predictions3, index = 'id', columns = 'date', values = 'vol').reset_index()

In [None]:
pre_val3 = predictions3.iloc[:,:29]

In [None]:
pre_eva3 = pd.concat([predictions3.iloc[:,0],predictions3.iloc[:,29:57]], axis=1)
pre_eva3['id'] = pre_eva3['id'].str.replace('_validation', '_evaluation')

In [None]:
del test3, predictions3, 
gc.collect()

In [None]:
pre_val3.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]
pre_eva3.columns = ['id'] + ['F' + str(i + 1) for i in range(28)]

In [None]:
display(pre_val3.head())
display(pre_val3.tail())
display(pre_val3.shape)

display(pre_eva3.head())
display(pre_eva3.tail())
display(pre_eva3.shape)

In [None]:
sta = pd.read_csv(path + "sales_train_evaluation.csv")
sta = sta[["d_1914", "d_1915","d_1916","d_1917","d_1918","d_1919","d_1920","d_1921","d_1922","d_1923","d_1924","d_1925","d_1926","d_1927","d_1928","d_1929","d_1930","d_1931","d_1932","d_1933","d_1934","d_1935","d_1936","d_1937","d_1938","d_1939","d_1940","d_1941"]]
sta.columns = ["F1", "F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","F25","F26","F27","F28"]

for i in ["F1", "F2","F3","F4","F5","F6","F7","F8","F9","F10","F11","F12","F13","F14","F15","F16","F17","F18","F19","F20","F21","F22","F23","F24","F25","F26","F27","F28"]:
    sta[i] = sta[i].astype('float64')

In [None]:
pre_val3_temp = pre_val3.drop("id",axis=1)

In [None]:
xgb_score = []

xgb_score.append(np.sqrt(mean_squared_error(sta, pre_val3_temp)))
xgb_score.append(mean_absolute_error(sta, pre_val3_temp))
xgb_score.append(r2_score(sta, pre_val3_temp))

for i in range(3):
    print(xgb_score[i])

f = open('xgb_score.txt', 'w')
for x in xgb_score:
    f.write(str(x) + "\n")
f.close()

In [None]:
del x_train, y_train, test, pre_val3_temp, pre_val3, xgb_score
gc.collect()

In [None]:
pre_uni3 = pd.concat([sta, pre_eva3], axis=0)

In [None]:
pre_uni3.to_csv('submission_xgb.csv', index = False)

In [None]:
del sta, pre_uni3, pre_eval3
gc.collect()

### データ統合 csv保存

In [None]:
pre_uni  = pd.read_csv(path + "submission_lgb.csv") 
pre_uni2 = pd.read_csv(path + "submission_ctb.csv") 
pre_uni3 = pd.read_csv(path + "submission_xgb.csv") 

In [None]:
pre_uni_am = pre_uni.set_index('id') * 0.2 + pre_uni2.set_index('id') * 0.7 + pre_uni2.set_index('id') * 0.1

In [None]:
# pre_uni_am = pre_uni_am.reset_index()

In [None]:
del pre_uni, pre_uni2, pre_uni3
gc.collect()

In [None]:
display(pre_uni_am.head())
display(pre_uni_am.tail())
display(pre_uni_am.shape)

In [None]:
pre_uni_am.to_csv('submission_uni.csv', index = False)

In [None]:
del pre_uni_am
gc.collect()