In [48]:
import pandas as pd
import numpy as np
import ydata_profiling as yp
import datetime as dt
import sklearn
from tqdm.notebook import trange

In [49]:
df_train = pd.read_csv("sales_history.csv")
df_test = pd.read_csv("test_origin.csv")
df_test = df_test.drop(columns='index')

In [50]:
# 予測する商品IDをテストデータから取得
items_id = df_test.iloc[:, 0].unique()

# 予測する商品IDが含まれる行を抽出
df_train = df_train[df_train.iloc[:, 2].isin(items_id)]

In [51]:
# 日付をDateTime型に変換
def ToDate(row):
    
    row.iloc[0] = dt.datetime.strptime(row.iloc[0], '%Y-%m-%d')
    row.iloc[0] = row.iloc[0].date()

    return row


df_train = df_train.apply(ToDate, axis=1)

In [52]:
# 売上個数をintに変換
df_train["売上個数"] = df_train["売上個数"].astype('int')

In [53]:
df_train

Unnamed: 0,日付,店舗ID,商品ID,商品価格,売上個数
0,2018-01-01,9,1000001,420,1
1,2018-01-01,6,1000001,420,1
2,2018-01-01,10,1000001,420,1
364,2018-01-01,0,1200007,250,1
365,2018-01-01,15,1200007,130,1
...,...,...,...,...,...
1119551,2019-10-31,0,3200026,850,1
1119553,2019-10-31,10,3300001,2040,1
1119562,2019-10-31,15,3400002,1440,1
1119563,2019-10-31,17,3400003,1190,1


In [54]:
# "Month"列を追加
df_train.insert(1, "Month", 0)


def AddMonth(row):

    month = row[0].month
    row.iloc[1] = int(month)
    return row

df_train = df_train.apply(AddMonth, axis=1)

In [55]:
# "Months"列を追加
df_train.insert(2, "Months", 0)

t1 = dt.date(year=2018, month=1, day=1)


def AddMonths(row):

    months = abs(row[0].year - t1.year)*12 + (row[0].month - t1.month)
    row.iloc[2] = int(months)
    return row

df_train = df_train.apply(AddMonths, axis=1)

In [56]:
# 月・店舗・商品ごとに商品価格の平均と売上個数合計を計算しリストに追加

list_month_shopID_itemID_MeanPrice_n = []

for m in trange(22):
    for s in trange(18):
        for i in items_id:
            list = []
            MeanPrice = df_train.loc[(df_train["Months"]==m) & (df_train["店舗ID"]==s) & (df_train["商品ID"]==i), "商品価格"].mean()
            n = df_train.loc[(df_train["Months"]==m) & (df_train["店舗ID"]==s) & (df_train["商品ID"]==i), "売上個数"].sum()
            list.append(m)
            list.append(s)
            list.append(i)
            list.append(MeanPrice)
            list.append(n)
            list_month_shopID_itemID_MeanPrice_n.append(list)

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

In [57]:
df_train = pd.DataFrame(list_month_shopID_itemID_MeanPrice_n,
                            columns=["Months", "shopID", "itemID", "MeanPrice", "n"]
                            )

In [65]:
df_train

Unnamed: 0,Months,shopID,itemID,MeanPrice,n
0,0.0,0.0,1000001.0,420.0,6.0
1,0.0,0.0,1000002.0,250.0,2.0
2,0.0,0.0,1000003.0,,0.0
3,0.0,0.0,1000004.0,120.0,1.0
4,0.0,0.0,1000005.0,220.0,8.0
...,...,...,...,...,...
67315,21.0,17.0,3300001.0,,0.0
67316,21.0,17.0,3400001.0,850.0,1.0
67317,21.0,17.0,3400002.0,,0.0
67318,21.0,17.0,3400003.0,1190.0,2.0


In [61]:
meanprice_dic = {}

for i in items_id:
    meanprice = df_train.loc[(df_train["itemID"]==i), "MeanPrice"].mean()
    meanprice_dic[i] = meanprice

In [62]:
meanprice_dic

{1000001: 363.369708994709,
 1000002: 237.51288659793815,
 1000003: 226.9418960244648,
 1000004: 221.49193548387098,
 1000005: 245.84453781512605,
 1000006: 244.25837320574163,
 1000007: 241.3551401869159,
 1000008: 244.73333333333332,
 1000009: 222.8421052631579,
 1000010: 330.1928104575163,
 1000011: 243.59375,
 1002291: 411.3731380168388,
 1002377: 355.7831325301205,
 1002503: 341.49122807017545,
 1002582: 668.8386939472399,
 1100001: 422.42592592592587,
 1100002: 167.35185566067918,
 1200001: 508.37632275132273,
 1200002: 535.7174603174603,
 1200003: 493.0416666666667,
 1200004: 504.1940867906533,
 1200005: 266.4240506329114,
 1200006: 335.93283582089555,
 1200007: 291.33088118968516,
 1200008: 290.6962351727695,
 1200009: 292.90579710144925,
 1200010: 242.14285714285714,
 1200011: 442.5261869799314,
 1200012: 490.82062825130055,
 1200013: 503.10144927536237,
 1200014: 501.3157894736842,
 1200015: 326.62017167381975,
 1200016: 288.3773408239701,
 1200017: 337.0987654320988,
 120001

In [64]:
def MeanPrice(row):

    if row[3] == "NaN":
        row[3] = meanprice_dic[row[2]]
    else:
        pass

    return row

df_train = df_train.apply(MeanPrice, axis=1)