In [220]:
import pandas as pd
import numpy as np
import ydata_profiling as yp
import datetime as dt
import sklearn
from tqdm.notebook import trange
import holidays

# 学習データの準備

In [221]:
# Load the raw sales history; the rows keep the default integer index
df_train = pd.read_csv("sales_history.csv")

In [222]:
# Read the test file and collect the item IDs that have to be predicted
df_test = pd.read_csv("test_origin.csv", index_col=0)
items_name = df_test.iloc[:, 0].unique()

# Keep only the training rows whose third column holds one of those item IDs
df_train = df_train.loc[df_train.iloc[:, 2].isin(items_name)]


In [223]:
# Convert the date string into a datetime.date
def ToDate(row):
    """Parse the 'YYYY-MM-DD' string at position 0 of ``row`` into a date.

    Args:
        row: Series whose first element is a date string in '%Y-%m-%d' format.

    Returns:
        The same row object, with its first element replaced by a
        ``datetime.date``.
    """
    parsed = dt.datetime.strptime(row.iloc[0], '%Y-%m-%d')
    row.iloc[0] = parsed.date()
    return row


# Run ToDate on every row (axis="columns" passes one row at a time)
df_train = df_train.apply(ToDate, axis="columns")

In [224]:
# Cast the sales-count column to int
df_train = df_train.astype({"売上個数": 'int'})

In [225]:
# Add zero-filled "day" and "DoW" columns right after the date column
for position, column in enumerate(("day", "DoW"), start=1):
    df_train.insert(position, column, 0)

# Day 0 of the "day" counter. 2018-01-01 is a Monday, so day % 7 yields
# 0 = Monday ... 6 = Sunday.
day1 = dt.date(year=2018, month=1, day=1)


def AddDayDow(row):
    """Fill the "day" and "DoW" slots of one row from its date.

    Args:
        row: Series whose position 0 holds a ``datetime.date``, position 1 is
            the "day" slot and position 2 is the "DoW" slot.

    Returns:
        The same row object: position 1 is set to the signed number of days
        between the date and ``day1``, position 2 to that number modulo 7.
    """
    # Use the signed difference. Taking abs() would mirror a date that lies
    # before day1 onto the other side of it and give it the wrong day of week.
    offset = (row.iloc[0] - day1).days
    row.iloc[1] = offset

    # Python's % never returns a negative result for a positive modulus, so
    # this stays in 0..6 for dates before day1 as well.
    row.iloc[2] = offset % 7

    return row



# Fill "day" and "DoW" for every row
df_train = df_train.apply(AddDayDow, axis="columns")


In [226]:
# Add the Holiday column, initialised to 0 (not a holiday)
df_train.insert(loc=3, column="Holiday", value=0)

_jp_holiday_calendar = None  # created on first use, then shared by every row


def _get_jp_holiday_calendar():
    """Return the shared Japanese holiday calendar, building it on first use."""
    global _jp_holiday_calendar
    if _jp_holiday_calendar is None:
        _jp_holiday_calendar = holidays.JP()
    return _jp_holiday_calendar


def IsHoliday(row):
    """Set the Holiday flag for public holidays and for DoW values 5 and 6.

    Args:
        row: Series with the date at position 0, "DoW" at position 2 and the
            "Holiday" flag at position 3.

    Returns:
        The same row object; position 3 is set to 1 when the date is in the
        Japanese holiday calendar or when "DoW" is 5 or 6, and is left
        unchanged otherwise.
    """
    # Positional access goes through .iloc: integer keys on a Series with
    # string labels are no longer treated as positions by recent pandas.
    is_public_holiday = row.iloc[0] in _get_jp_holiday_calendar()
    is_weekend = row.iloc[2] in (5, 6)

    if is_public_holiday or is_weekend:
        row.iloc[3] = 1

    return row

# Flag holidays row by row
df_train = df_train.apply(IsHoliday, axis="columns")

In [227]:
# Remove the "日付" (date) column
df_train = df_train.drop(columns=["日付"])

In [228]:
# Rename the Japanese column headers to short English names
column_names = {
    '店舗ID': 'shopID',
    '商品ID': 'itemID',
    "商品価格": "price",
    "売上個数": "n",
}
df_train = df_train.rename(columns=column_names)

In [229]:
# Add the Cat column, initialised to 0
df_train.insert(loc=4, column="Cat", value=0)

def AddCat(row):
    """Store the first three digits of the item ID in the "Cat" slot.

    Args:
        row: Series with "Cat" at position 4 and the item ID at position 5.

    Returns:
        The same row object; position 4 is set to the integer formed by the
        first three characters of the item ID's string form.
    """
    # .iloc keeps the access positional on a string-labelled Series, and the
    # local name no longer shadows the builtin id().
    item_id = str(row.iloc[5])
    row.iloc[4] = int(item_id[0:3])

    return row

# Derive Cat for every training row
df_train = df_train.apply(AddCat, axis="columns")

In [230]:
# The running day counter is not written to the output file
df_train = df_train.drop(columns=["day"])

In [231]:
# Write df_train to train_perday.csv without the index column
df_train.to_csv(path_or_buf="train_perday.csv", index=False)

# テストデータの準備

In [232]:
# NOTE(review): df_test is rebound to the DataFrame built from df_list further
# down without being read in between, so this load looks redundant — confirm
# before removing it.
df_test = pd.read_csv("test_origin.csv")

In [233]:
# Day numbers of 2019-12-01 and 2019-12-30, counting from day1 as day 0
dec1 = dt.date(year=2019, month=12, day=1)
dec30 = dt.date(year=2019, month=12, day=30)

firstday, lastday = (int(abs(target - day1).days) for target in (dec1, dec30))

In [234]:
# Build one [day, shopID, itemID] record for every combination of prediction
# day, shop and target item.
N_SHOPS = 18  # shop IDs run from 0 to N_SHOPS - 1

# The day range comes from firstday/lastday computed above instead of repeating
# the literals 699 and 729. Only the outer loop shows a progress bar, so the
# cell no longer prints one extra bar per day, and the comprehension variables
# do not leak into the notebook namespace (the old loop rebound the builtin id).
df_list = [
    [d, s, item_id]
    for d in trange(firstday, lastday + 1)
    for s in range(N_SHOPS)
    for item_id in items_name
]
            

  0%|          | 0/30 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

In [235]:
# Turn the collected records into the test DataFrame
test_columns = ["day", "shopID", "itemID"]
df_test = pd.DataFrame(data=df_list, columns=test_columns)

In [236]:
# Insert a zero-filled "DoW" column right after "day"
df_test.insert(loc=1, column="DoW", value=0)


def AddDow(row):
    """Write ``day % 7`` into the "DoW" slot (position 1) of ``row``.

    Args:
        row: Series with a "day" label and the "DoW" slot at position 1.

    Returns:
        The same row object. Position 1 is set to the remainder as an int when
        the remainder equals one of 0..6, and is left untouched otherwise.
    """
    remainder = row["day"] % 7

    # Only whole-number remainders are written.
    if remainder in range(7):
        row.iloc[1] = int(remainder)

    return row


# Fill "DoW" for every test row
df_test = df_test.apply(AddDow, axis="columns")

In [237]:
# Add the Holiday column, initialised to 0 (not a holiday)
df_test.insert(loc=2, column="Holiday", value=0)

def TestIsHoliday(row):
    """Set the Holiday flag for rows whose "DoW" is 5 or 6.

    Args:
        row: Series with "DoW" at position 1 and "Holiday" at position 2.

    Returns:
        The same row object; position 2 is set to 1 when "DoW" is 5 or 6 and
        is left unchanged otherwise.
    """
    # .iloc keeps the access positional: integer keys on a Series with string
    # labels are no longer treated as positions by recent pandas.
    if row.iloc[1] in (5, 6):
        row.iloc[2] = 1

    return row

# Flag weekends row by row
df_test = df_test.apply(TestIsHoliday, axis="columns")

In [238]:
def Nenmatsu(row):
    """Mark the year-end days 726, 727 and 728 as holidays.

    With 2018-01-01 as day 0 these day numbers are 2019-12-28 to 2019-12-30.

    Args:
        row: Series with "day" at position 0 and "Holiday" at position 2.

    Returns:
        The same row object; position 2 is set to 1 for the three year-end
        days and is left unchanged otherwise.
    """
    # .iloc keeps the access positional on a string-labelled Series.
    if row.iloc[0] in (726, 727, 728):
        row.iloc[2] = 1

    return row

# Apply the year-end holiday override to every test row
df_test = df_test.apply(Nenmatsu, axis="columns")

In [239]:
# Add the Cat column, initialised to 0
df_test.insert(loc=4, column="Cat", value=0)

def AddCat(row):
    """Store the first three digits of the item ID in the "Cat" slot.

    This repeats the AddCat helper defined in the training-data section with
    the same logic.

    Args:
        row: Series with "Cat" at position 4 and the item ID at position 5.

    Returns:
        The same row object; position 4 is set to the integer formed by the
        first three characters of the item ID's string form.
    """
    # .iloc keeps the access positional on a string-labelled Series, and the
    # local name no longer shadows the builtin id().
    item_id = str(row.iloc[5])
    row.iloc[4] = int(item_id[0:3])

    return row

# Derive Cat for every test row
df_test = df_test.apply(AddCat, axis="columns")

In [240]:
# IDs of the items to predict (position 5 of df_test is "itemID")
items_id = df_test.iloc[:, 5].unique()

# NOTE(review): this re-reads the raw sales history and rebinds df_train, so
# the processed training frame built earlier is no longer reachable under that
# name after this cell.
df_train = pd.read_csv("sales_history.csv", index_col=None)

# Mean historical price per item, computed in one grouped pass instead of
# filtering the whole frame once per item. reindex() keeps an entry (NaN) for
# any item that never appears in the sales history.
meanprice_dic = (
    df_train.groupby("商品ID")["商品価格"]
    .mean()
    .reindex(items_id)
    .to_dict()
)

# Create the column as float so the mean prices can be stored without writing
# a float into an all-integer row.
df_test.insert(6, "price", 0.0)


def AddPrice(row):
    """Fill the "price" slot with the mean historical price of the row's item.

    Args:
        row: Series with the item ID at position 5 and "price" at position 6.

    Returns:
        The same row object; position 6 is set to ``meanprice_dic`` looked up
        by the item ID.

    Raises:
        KeyError: if the item ID has no entry in ``meanprice_dic``.
    """
    # .iloc keeps the access positional on a string-labelled Series.
    # NOTE(review): if the row is integer-typed, pandas has to upcast it to
    # hold the float price; newer pandas versions warn about or reject that,
    # so the "price" slot should already be float — confirm against the
    # installed pandas version.
    row.iloc[6] = meanprice_dic[row.iloc[5]]

    return row

# Look up the mean price for every test row
df_test = df_test.apply(AddPrice, axis="columns")



In [241]:
# Display the assembled test features for a visual check
df_test

Unnamed: 0,day,DoW,Holiday,shopID,Cat,itemID,price
0,699.0,6.0,1.0,0.0,100.0,1000001.0,367.813559
1,699.0,6.0,1.0,0.0,100.0,1000002.0,236.966667
2,699.0,6.0,1.0,0.0,100.0,1000003.0,228.661972
3,699.0,6.0,1.0,0.0,100.0,1000004.0,224.025157
4,699.0,6.0,1.0,0.0,100.0,1000005.0,245.250000
...,...,...,...,...,...,...,...
91795,728.0,0.0,1.0,17.0,330.0,3300001.0,1778.424658
91796,728.0,0.0,1.0,17.0,340.0,3400001.0,862.551724
91797,728.0,0.0,1.0,17.0,340.0,3400002.0,1671.180556
91798,728.0,0.0,1.0,17.0,340.0,3400003.0,1141.479714


In [242]:
# The running day counter is not written to the output file
df_test = df_test.drop(columns=["day"])

In [243]:
# Cast every column except "price" to int
int_columns = ["DoW", "Holiday", "shopID", "Cat", "itemID"]
df_test = df_test.astype({column: int for column in int_columns})

In [244]:
# Write the test features to test_perday.csv without the index column
df_test.to_csv(path_or_buf="test_perday.csv", index=False)