In [69]:
import pandas as pd
import numpy as np
import ydata_profiling as yp
import datetime as dt
import sklearn
from tqdm.notebook import trange
import math

# Let pandas display up to 200 rows before it truncates a rendered frame.
pd.set_option('display.max_rows', 200)

In [70]:
# Load the sales history that the training features are built from.
df_train = pd.read_csv("sales_history.csv")

In [71]:
# Product IDs to forecast, taken from the test set.
# The test file is read here directly: `df_test` is only created further down
# in this notebook, so relying on it made this cell fail on a fresh kernel
# (Restart & Run All).  The column is selected by name rather than by position
# so the result does not depend on the column order of the file.
items_id = pd.read_csv("test_origin.csv")["商品ID"].unique()

# Keep only the training rows whose product has to be forecast.
df_train = df_train[df_train["商品ID"].isin(items_id)]

In [72]:
# Convert the date field from text to a date object.
def ToDate(row):
    """Parse the first field of ``row`` into a ``datetime.date``.

    Parameters
    ----------
    row : pandas.Series
        One row of the frame; its first field holds a '%Y-%m-%d' string.

    Returns
    -------
    pandas.Series
        The same row object, with its first field replaced by the parsed date.
    """
    parsed = dt.datetime.strptime(row.iloc[0], '%Y-%m-%d')
    # Only the calendar date is kept; the time part is dropped.
    row.iloc[0] = parsed.date()
    return row


# Run ToDate on every row (axis="columns" is the row-wise direction, same as axis=1).
df_train = df_train.apply(ToDate, axis="columns")

In [73]:
# Cast the units-sold column to integer.
df_train = df_train.astype({"売上個数": 'int'})

In [74]:
# Display the filtered training frame (the last expression of a cell is rendered).
df_train

Unnamed: 0,日付,店舗ID,商品ID,商品価格,売上個数
0,2018-01-01,9,1000001,420,1
1,2018-01-01,6,1000001,420,1
2,2018-01-01,10,1000001,420,1
364,2018-01-01,0,1200007,250,1
365,2018-01-01,15,1200007,130,1
...,...,...,...,...,...
1119551,2019-10-31,0,3200026,850,1
1119553,2019-10-31,10,3300001,2040,1
1119562,2019-10-31,15,3400002,1440,1
1119563,2019-10-31,17,3400003,1190,1


In [75]:
# "Month"列を追加
# Placeholder column at position 1; AddMonth overwrites it row by row.
df_train.insert(loc=1, column="Month", value=0)


def AddMonth(row):
    """Store the calendar month of the row's date in the row's second field.

    Parameters
    ----------
    row : pandas.Series
        One row of the frame; the first field holds a date-like object with a
        ``month`` attribute and the second field is the "Month" placeholder.

    Returns
    -------
    pandas.Series
        The same row object, with its second field set to the month (1-12).
    """
    # Use .iloc for positional access: indexing a label-indexed Series with a
    # bare integer (row[0]) is deprecated in pandas and is not positional in
    # newer releases.
    row.iloc[1] = int(row.iloc[0].month)
    return row

# Fill the "Month" column by running AddMonth on every row.
df_train = df_train.apply(AddMonth, axis="columns")

In [76]:
# "Months"列を追加
# Placeholder column at position 2; AddMonths overwrites it row by row.
df_train.insert(loc=2, column="Months", value=0)

# Reference date: month offsets are counted from January 2018.
t1 = dt.date(2018, 1, 1)


def AddMonths(row):
    """Store the number of whole calendar months between ``t1`` and the row's date.

    The offset is ``(year - t1.year) * 12 + (month - t1.month)``: 0 for the
    month of ``t1``, 1 for the following month, and negative for dates that
    fall before ``t1``.

    Parameters
    ----------
    row : pandas.Series
        One row of the frame; the first field holds a date-like object and the
        third field is the "Months" placeholder.

    Returns
    -------
    pandas.Series
        The same row object, with its third field set to the month offset.
    """
    # Positional access through .iloc: a bare integer key on a label-indexed
    # Series (row[0]) is deprecated in pandas.
    when = row.iloc[0]
    # No abs() on the year difference: taking the absolute value turned a date
    # one month before t1 into offset 23 instead of -1, colliding with a real
    # later month.
    months = (when.year - t1.year) * 12 + (when.month - t1.month)
    row.iloc[2] = int(months)
    return row

# Fill the "Months" column by running AddMonths on every row.
df_train = df_train.apply(AddMonths, axis="columns")

In [77]:
# For every (month offset, shop, item) combination, compute the mean price and
# the total units sold, collected as [month, shop, item, MeanPrice, n] records.
#
# A single groupby replaces the former triple loop, which scanned the whole
# frame twice per combination and rebound the builtin name `list` as a
# variable (breaking any later `list(...)` call in the kernel).
n_months = 22  # month offsets 0..21
n_shops = 18   # shop IDs 0..17

group_keys = ["Months", "店舗ID", "商品ID"]
monthly_stats = df_train.groupby(group_keys).agg(
    MeanPrice=("商品価格", "mean"),
    n=("売上個数", "sum"),
)

# Expand to the full month x shop x item grid, in the same order the nested
# loops produced (month outermost, item innermost).
full_grid = pd.MultiIndex.from_product(
    [range(n_months), range(n_shops), items_id],
    names=group_keys,
)
monthly_stats = monthly_stats.reindex(full_grid)

# Combinations without any sale keep MeanPrice = NaN (filled in later) and
# get a units-sold total of 0, as the sum over an empty selection did.
monthly_stats["n"] = monthly_stats["n"].fillna(0).astype(int)

list_month_shopID_itemID_MeanPrice_n = [
    [*record]
    for record in monthly_stats.reset_index().itertuples(index=False, name=None)
]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

In [78]:
# Build the new training frame from the aggregated records.
feature_columns = ["Months", "shopID", "itemID", "MeanPrice", "n"]
df_train = pd.DataFrame(list_month_shopID_itemID_MeanPrice_n, columns=feature_columns)

In [79]:
# Fallback prices used to fill MeanPrice on rows without sales (n == 0):
# for each item, the mean of its MeanPrice values over all rows of df_train.

meanprice_dic = {
    item: df_train.loc[df_train["itemID"] == item, "MeanPrice"].mean()
    for item in items_id
}

In [80]:
# Fill rows whose MeanPrice is NaN with the per-item value from meanprice_dic.

def MeanPrice(row):
    """Replace a NaN price in the row's fourth field with the item's fallback price.

    Parameters
    ----------
    row : pandas.Series
        One row of the feature frame; the third field is the item ID and the
        fourth field is the mean price.

    Returns
    -------
    pandas.Series
        The same row object; the fourth field is looked up in ``meanprice_dic``
        by item ID when it was NaN, and left unchanged otherwise.
    """
    # .iloc keeps the access positional: reading or assigning a label-indexed
    # Series through a bare integer key (row[3]) is deprecated in pandas.
    if math.isnan(row.iloc[3]):
        row.iloc[3] = meanprice_dic[row.iloc[2]]

    return row

# Fill the missing prices by running MeanPrice on every row.
df_train = df_train.apply(MeanPrice, axis="columns")

In [81]:
def MonthsToMonth(row):
    """Convert the 0-based month offset in ``row["Months"]`` to a calendar month.

    Offsets 0..11 map to months 1..12, offset 12 wraps back to 1, and so on.

    Parameters
    ----------
    row : pandas.Series
        One row of the feature frame, with a "Months" field holding the offset.

    Returns
    -------
    pandas.Series
        The same row object, with "Months" replaced by the month number 1-12.
    """
    # One formula covers every offset.  The former two independent `if`
    # branches turned offset 11 into 12 and then, because 12 >= 12, rewrote it
    # to 1, so December was labelled as January.
    row["Months"] = int(row["Months"]) % 12 + 1

    return row

# Convert every row's month offset to a calendar month.
df_train = df_train.apply(MonthsToMonth, axis="columns")

In [82]:
# Cast the identifier and count columns back to integers.
int_columns = ["Months", "shopID", "itemID", "n"]
df_train[int_columns] = df_train[int_columns].astype(int)

In [85]:
# The column now holds a calendar month, so name it accordingly.
df_train = df_train.rename({"Months": "Month"}, axis="columns")

In [87]:
# Write the training features to CSV, leaving out the row index.
df_train.to_csv("train_meanprice.csv", index=False)

In [100]:
# Load the test set and discard its 'index' column.
df_test = pd.read_csv("test_origin.csv").drop(columns='index')

In [101]:
# Give the test frame the same column layout as the training features.
# - rename/assign/reindex return new frames, so the cell can be re-run
#   (DataFrame.insert raises when the column already exists).
# - MeanPrice starts as the float placeholder 0.0 so that TestMeanPrice can
#   store float prices without an integer-to-float upcast of the row.
# NOTE(review): Month is hard-coded to 10, which is also the last month in the
# training data (2019-10) - confirm this is the intended month for the test rows.
df_test = (
    df_test
    .rename(columns={"商品ID": "itemID", "店舗ID": "shopID"})
    .assign(Month=10, MeanPrice=0.0)
    .reindex(columns=["Month", "shopID", "itemID", "MeanPrice"])
)


def TestMeanPrice(row):
    """Replace the 0 placeholder in the row's fourth field with the item's price.

    Parameters
    ----------
    row : pandas.Series
        One row of the test frame; the third field is the item ID and the
        fourth field is the mean price (0 while still unfilled).

    Returns
    -------
    pandas.Series
        The row with its fourth field looked up in ``meanprice_dic`` by item ID
        when it was 0; otherwise the row is returned unchanged.
    """
    # .iloc keeps the access positional: a bare integer key on a label-indexed
    # Series (row[3]) is deprecated in pandas.
    if row.iloc[3] == 0:
        price = meanprice_dic[row.iloc[2]]
        if row.dtype.kind in "iu":
            # An all-integer row cannot hold a float price; convert it
            # explicitly instead of relying on the deprecated implicit upcast.
            row = row.astype("float64")
        row.iloc[3] = price

    return row


# Fill MeanPrice on every test row.
df_test = df_test.apply(TestMeanPrice, axis="columns")

# Cast the identifier columns back to integers.
id_columns = ["Month", "shopID", "itemID"]
df_test[id_columns] = df_test[id_columns].astype(int)

In [103]:
# Write the test features to CSV, leaving out the row index.
df_test.to_csv("test_meanprice.csv", index=False)