In [1]:
import pandas as pd
import numpy as np
import ydata_profiling as yp
import datetime as dt
import sklearn
from tqdm.notebook import trange
import math

In [2]:
# Read the prediction targets and the raw sales history.
df_test = pd.read_csv("test_origin.csv")
df_train = pd.read_csv("sales_history.csv")


# Keep only rows whose sales count is zero or positive.
df_train = df_train.loc[df_train["売上個数"] >= 0]


# Item IDs to predict: the distinct values in the second column of the test data.
items_id = df_test.iloc[:, 1].unique()

# Keep only the sales rows whose third column holds one of those item IDs.
df_train = df_train.loc[df_train.iloc[:, 2].isin(items_id)]


# Convert the date string in the first column to a datetime.date
def ToDate(row):
    """Replace the first element of `row` with a `datetime.date`.

    Meant to be used with `DataFrame.apply(..., axis=1)`.

    Args:
        row: Series whose first element (by position) is a '%Y-%m-%d' string.

    Returns:
        The same Series object, modified in place, with element 0 parsed.
        `datetime.strptime` raises ValueError when the string does not match.
    """
    parsed = dt.datetime.strptime(row.iloc[0], '%Y-%m-%d')
    row.iloc[0] = parsed.date()
    return row

# Parse the date column, one row at a time.
df_train = df_train.apply(ToDate, axis="columns")


# Store the sales count as integers.
df_train = df_train.astype({"売上個数": "int"})


# Add a zero-filled "Month" column in second position; AddMonth fills it in.
df_train.insert(loc=1, column="Month", value=0)

def AddMonth(row):
    """Write the calendar month of the row's date into its second element.

    Meant to be used with `DataFrame.apply(..., axis=1)`. Elements are addressed
    by position through `.iloc`: an integer key passed to `row[...]` on a
    label-indexed Series is treated as a label by current pandas and only
    falls back to positions through a deprecated code path.

    Args:
        row: Series whose element 0 exposes a `.month` attribute
            (e.g. `datetime.date`) and whose element 1 receives the month.

    Returns:
        The same Series object, modified in place.
    """
    row.iloc[1] = int(row.iloc[0].month)
    return row

# Fill the "Month" column row by row.
df_train = df_train.apply(AddMonth, axis="columns")


# Add a zero-filled "Months" column in third position; AddMonths fills it in.
df_train.insert(loc=2, column="Months", value=0)

# Reference date: "Months" counts calendar months elapsed since January 2018.
t1 = dt.date(year=2018, month=1, day=1)


def AddMonths(row):
    """Write the signed month offset from `t1` into the row's third element.

    The offset is `(year - t1.year) * 12 + (month - t1.month)`, so January 2018
    is 0 and October 2019 is 21. Dates before `t1` yield negative offsets; the
    year difference is deliberately not wrapped in `abs()`, which would fold
    earlier years onto later ones (December 2017 would become 23).

    Meant to be used with `DataFrame.apply(..., axis=1)`; elements are
    addressed by position through `.iloc`.

    Args:
        row: Series whose element 0 exposes `.year` and `.month`
            (e.g. `datetime.date`) and whose element 2 receives the offset.

    Returns:
        The same Series object, modified in place.
    """
    date = row.iloc[0]
    months = (date.year - t1.year) * 12 + (date.month - t1.month)
    row.iloc[2] = int(months)
    return row

# Fill the "Months" column row by row.
df_train = df_train.apply(AddMonths, axis=1)


# Size of the (month offset, shop, item) grid that is enumerated below.
N_MONTHS = 22  # month offsets 0-21, i.e. January 2018 to October 2019 relative to t1
N_SHOPS = 18   # shop IDs 0-17 -- assumes shop IDs are consecutive integers; TODO confirm

# Aggregate every (Months, shop, item) group in a single pass instead of
# rebuilding two full-frame boolean masks for each grid cell.
monthly_stats = df_train.groupby(["Months", "店舗ID", "商品ID"]).agg(
    MeanPrice=("商品価格", "mean"),
    n=("売上個数", "sum"),
)
price_by_key = monthly_stats["MeanPrice"].to_dict()
count_by_key = monthly_stats["n"].to_dict()

# One record [Month, shopID, itemID, MeanPrice, n] per grid cell, ordered by
# month offset, then shop, then item (in items_id order).
list_month_shopID_itemID_MeanPrice_n = []

for m in trange(N_MONTHS):
    # Calendar month (1-12) corresponding to the month offset.
    month = (m % 12) + 1
    for s in range(N_SHOPS):
        for i in items_id:
            key = (m, s, i)
            # Cells without any sales keep the result of the empty selection:
            # a NaN mean price and a sales count of 0.
            list_month_shopID_itemID_MeanPrice_n.append(
                [
                    month,
                    s,
                    i,
                    price_by_key.get(key, float("nan")),
                    count_by_key.get(key, 0),
                ]
            )


# Build the aggregated training frame from the collected records.
df_train = pd.DataFrame(
    data=list_month_shopID_itemID_MeanPrice_n,
    columns=["Month", "shopID", "itemID", "MeanPrice", "n"],
)


# Fallback price per item, used to fill MeanPrice where it is NaN (rows with
# n == 0): the mean of that item's MeanPrice column over all of its rows.
meanprice_dic = {
    item: df_train.loc[df_train["itemID"] == item, "MeanPrice"].mean()
    for item in items_id
}


# Fill NaN MeanPrice entries from meanprice_dic
def MeanPrice(row):
    """Replace a NaN price (element 3) with the fallback price of the row's item.

    Meant to be used with `DataFrame.apply(..., axis=1)`. Elements are addressed
    by position through `.iloc`: integer keys passed to `row[...]` on a
    label-indexed Series are labels, and both reading and assigning through the
    positional fallback is deprecated in pandas.

    Args:
        row: Series whose element 2 is the item ID (a key of the module-level
            `meanprice_dic`) and whose element 3 is the price, possibly NaN.

    Returns:
        The same Series object, modified in place when the price was NaN.

    Raises:
        KeyError: if the price is NaN and the item ID is not in `meanprice_dic`.
    """
    if math.isnan(row.iloc[3]):
        row.iloc[3] = meanprice_dic[row.iloc[2]]
    return row

# Fill the missing prices row by row.
df_train = df_train.apply(MeanPrice, axis="columns")


# Add a zero-filled "Cat" column in third position; AddCat fills it in.
df_train.insert(loc=2, column="Cat", value=0)

def AddCat(row):
    """Write the category code (leading digits of the item ID) into element 2.

    The category is the integer formed by the first three characters of the
    item ID's text form. Rows reaching this function through
    `DataFrame.apply(..., axis=1)` on an all-numeric frame are float Series, so
    an integer-valued float ID is converted to `int` first; otherwise the text
    would be e.g. "45.0" and the slice "45." could not be parsed.

    Elements are addressed by position through `.iloc`, since integer keys
    passed to `row[...]` on a label-indexed Series are labels.

    Args:
        row: Series whose element 3 is the item ID and whose element 2
            receives the category code.

    Returns:
        The same Series object, modified in place.

    Raises:
        ValueError: if the first three characters of the ID are not an integer.
    """
    item_id = row.iloc[3]
    # numpy.float64 is a subclass of float, so this also covers pandas scalars.
    if isinstance(item_id, float) and item_id.is_integer():
        item_id = int(item_id)
    row.iloc[2] = int(str(item_id)[0:3])
    return row

# Fill the "Cat" column row by row.
df_train = df_train.apply(AddCat, axis="columns")


# Cast the Month, shopID, Cat and itemID columns to int.
int_columns = ["Month", "shopID", "Cat", "itemID"]
df_train[int_columns] = df_train[int_columns].astype(int)


# Drop the rows with n > 15.
df_train = df_train.loc[df_train["n"] <= 15]

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

In [3]:
# Reload the prediction targets and give them the same feature columns as the
# training frame.
df_test = pd.read_csv("test_origin.csv")

# Every test row gets Month = 12.
# NOTE(review): presumably the month being predicted -- confirm.
df_test.insert(0, "Month", 12)
df_test.insert(1, "Cat", 0)
# Float placeholder: TestMeanPrice overwrites it with float prices, and an
# integer column would force a dtype change on every row assignment.
df_test.insert(2, "MeanPrice", 0.0)
df_test = df_test.rename(columns={"商品ID": "itemID", "店舗ID": "shopID"})
# Keep only the feature columns, in the same order as the training frame.
df_test = df_test.reindex(columns=["Month", "shopID", "Cat", "itemID", "MeanPrice"])


def TestAddCat(row):
    """Write the category code (leading digits of the item ID) into element 2.

    Same transformation as `AddCat`, for the test frame. The category is the
    integer formed by the first three characters of the item ID's text form.
    Rows of an all-numeric frame that contains a float column arrive as float
    Series, so an integer-valued float ID is converted to `int` first;
    otherwise the text would be e.g. "45.0" and the slice "45." could not be
    parsed.

    Elements are addressed by position through `.iloc`, since integer keys
    passed to `row[...]` on a label-indexed Series are labels.

    Args:
        row: Series whose element 3 is the item ID and whose element 2
            receives the category code.

    Returns:
        The same Series object, modified in place.

    Raises:
        ValueError: if the first three characters of the ID are not an integer.
    """
    item_id = row.iloc[3]
    # numpy.float64 is a subclass of float, so this also covers pandas scalars.
    if isinstance(item_id, float) and item_id.is_integer():
        item_id = int(item_id)
    row.iloc[2] = int(str(item_id)[0:3])
    return row


def TestMeanPrice(row):
    """Replace a zero price placeholder (element 4) with the item's fallback price.

    Meant to be used with `DataFrame.apply(..., axis=1)`. Elements are addressed
    by position through `.iloc`, since integer keys passed to `row[...]` on a
    label-indexed Series are labels.

    Args:
        row: Series whose element 3 is the item ID (a key of the module-level
            `meanprice_dic`) and whose element 4 is the price, 0 meaning
            "not set yet".

    Returns:
        The row with the price filled in. An integer-typed row is converted to
        float first (a new Series is returned in that case) because an integer
        Series cannot store a fractional price without changing dtype.

    Raises:
        KeyError: if the price is 0 and the item ID is not in `meanprice_dic`.
    """
    if row.iloc[4] == 0:
        fallback = meanprice_dic[row.iloc[3]]
        if row.dtype.kind in "iu":
            row = row.astype(float)
        row.iloc[4] = fallback
    return row


# Fill in the price first, then the category code, one row at a time.
df_test = (
    df_test
    .apply(TestMeanPrice, axis="columns")
    .apply(TestAddCat, axis="columns")
)


# Cast the Month, shopID, Cat and itemID columns to int.
test_int_columns = ["Month", "shopID", "Cat", "itemID"]
df_test[test_int_columns] = df_test[test_int_columns].astype(int)

In [4]:
# Write both prepared frames to CSV without the index column.
for frame, csv_path in ((df_train, "train_CNN.csv"), (df_test, "test_CNN.csv")):
    frame.to_csv(csv_path, index=False)