In [17]:
import pandas as pd
import numpy as np
import ydata_profiling as yp
import datetime as dt
import sklearn
from tqdm.notebook import trange
import math
import featuretools as ft
from woodwork.logical_types import Categorical

## 学習データの整形

In [18]:
# Load the raw sales history and the test file.
df_train = pd.read_csv("sales_history.csv")
df_test = pd.read_csv("test_origin.csv")


# Keep only rows whose units-sold value is non-negative.
df_train = df_train[df_train["売上個数"] >= 0]


# Item IDs to predict: the distinct values of the test file's second column.
items_id = df_test.iloc[:, 1].unique()

# Keep only the sales rows whose third column holds one of those item IDs.
is_target_item = df_train.iloc[:, 2].isin(items_id)
df_train = df_train[is_target_item]


# 日付をDateTime型に変換
def ToDate(row):
    """Convert the first field of ``row`` from a 'YYYY-MM-DD' string to a date.

    Args:
        row: a pandas Series whose first element (by position) is a date
            string in ``%Y-%m-%d`` format.

    Returns:
        The same Series, with its first element replaced by the matching
        ``datetime.date``.

    Raises:
        ValueError: if the string does not match ``%Y-%m-%d``.
    """
    parsed = dt.datetime.strptime(row.iloc[0], '%Y-%m-%d')
    row.iloc[0] = parsed.date()
    return row

# Parse the date string of every row.
df_train = df_train.apply(ToDate, axis="columns")


# Cast the units-sold column to int.
df_train["売上個数"] = df_train["売上個数"].astype(int)


# Add a "Month" column at position 1, initialised to 0 (filled in below).
df_train.insert(loc=1, column="Month", value=0)

def AddMonth(row):
    """Write the calendar month of the row's date into the row's second field.

    Args:
        row: a pandas Series whose first element (by position) exposes a
            ``.month`` attribute and whose second element is the "Month" slot.

    Returns:
        The same Series, with the second element set to the month as an int.
    """
    # Use .iloc for positional access: plain row[0] on a string-labelled
    # Series relies on a positional fallback that pandas has deprecated.
    month = row.iloc[0].month
    row.iloc[1] = int(month)
    return row

# Fill the "Month" column row by row.
df_train = df_train.apply(AddMonth, axis="columns")


# Add a "Months" column at position 2, initialised to 0 (filled in below).
df_train.insert(loc=2, column="Months", value=0)

# Reference date: month offsets are counted from January 2018.
t1 = dt.date(year=2018, month=1, day=1)


def AddMonths(row):
    """Write the number of months elapsed since ``t1`` into the row's third field.

    The offset is ``(year - t1.year) * 12 + (month - t1.month)``, so the
    reference month maps to 0 and dates before it map to negative offsets.
    (The previous version took ``abs()`` of the year difference, which folded
    earlier years onto later offsets, e.g. 2017-12 became 23.)

    Args:
        row: a pandas Series whose first element (by position) exposes
            ``.year`` and ``.month`` and whose third element is the "Months" slot.

    Returns:
        The same Series, with the third element set to the offset as an int.
    """
    # Use .iloc for positional access: plain row[0] on a string-labelled
    # Series relies on a positional fallback that pandas has deprecated.
    date = row.iloc[0]
    months = (date.year - t1.year) * 12 + (date.month - t1.month)
    row.iloc[2] = int(months)
    return row

df_train = df_train.apply(AddMonths, axis=1)


# Collapse the sales rows to one record per (month offset, shop, item):
# the mean selling price and the total units sold.
# A single groupby replaces the former triple loop, which rescanned the whole
# frame twice per combination and rebound the builtin name `list`.
N_MONTHS = 22  # month offsets 0..21 are aggregated
N_SHOPS = 18   # shop IDs 0..17 are aggregated

monthly_sales = (
    df_train
    .groupby(["Months", "店舗ID", "商品ID"])
    .agg(MeanPrice=("商品価格", "mean"), n=("売上個数", "sum"))
)

# Re-index onto the full month x shop x item grid so that combinations with
# no sales still get a record (MeanPrice = NaN, n = 0), ordered by month,
# then shop, then item.
full_grid = pd.MultiIndex.from_product(
    [range(N_MONTHS), range(N_SHOPS), items_id],
    names=["Months", "店舗ID", "商品ID"],
)
monthly_sales = monthly_sales.reindex(full_grid)
monthly_sales["n"] = monthly_sales["n"].fillna(0).astype("int64")

# Records are [calendar month 1..12, shop ID, item ID, mean price, units sold];
# the month offset is folded back onto a calendar month with `% 12 + 1`.
list_month_shopID_itemID_MeanPrice_n = [
    [months % 12 + 1, shop, item, mean_price, units]
    for (months, shop, item), mean_price, units in zip(
        monthly_sales.index, monthly_sales["MeanPrice"], monthly_sales["n"]
    )
]


# Build the new training frame from the aggregated records.
df_train = pd.DataFrame(list_month_shopID_itemID_MeanPrice_n,
                            columns=["Month", "shopID", "itemID", "MeanPrice", "n"]
                            )


# Fallback price per item, used to fill rows that have no MeanPrice:
# the mean of that item's MeanPrice values over all rows of df_train.
meanprice_dic = {
    item: df_train.loc[df_train["itemID"] == item, "MeanPrice"].mean()
    for item in items_id
}


# meanprice_dicからMeanPriceがNaNの項を埋める
def MeanPrice(row):
    """Fill a missing mean price from the per-item fallback table.

    Args:
        row: a pandas Series whose third element (by position) is the item ID
            and whose fourth element is the mean price.

    Returns:
        The same Series; if the mean price was NaN it is replaced by
        ``meanprice_dic[item ID]``, otherwise the row is returned unchanged.

    Raises:
        KeyError: if the price is NaN and the item ID is not in ``meanprice_dic``.
    """
    # Use .iloc for positional access: plain row[3] on a string-labelled
    # Series relies on a positional fallback that pandas has deprecated.
    if math.isnan(row.iloc[3]):
        row.iloc[3] = meanprice_dic[row.iloc[2]]

    return row

# Fill missing mean prices row by row.
df_train = df_train.apply(MeanPrice, axis="columns")


# Add a "Cat" column at position 2, initialised to 0 (filled in below).
df_train.insert(loc=2, column="Cat", value=0)

def AddCat(row):
    """Write the leading three digits of the item ID into the row's "Cat" slot.

    Args:
        row: a pandas Series whose third element (by position) is the "Cat"
            slot and whose fourth element is the numeric item ID.

    Returns:
        The same Series, with the third element set to the integer formed by
        the first three digits of the item ID (the whole ID if it is shorter).
    """
    # Use .iloc for positional access: plain row[3] on a string-labelled
    # Series relies on a positional fallback that pandas has deprecated.
    # int() first, so a float-typed ID such as 1000001.0 is sliced as
    # "1000001" and not as "1000001.0".
    item_digits = str(int(row.iloc[3]))
    row.iloc[2] = int(item_digits[0:3])

    return row

# Fill the "Cat" column row by row.
df_train = df_train.apply(AddCat, axis="columns")


# Cast the Month, shopID, Cat and itemID columns to int.
id_columns = ["Month", "shopID", "Cat", "itemID"]
df_train = df_train.astype({column: int for column in id_columns})


# Turn the row index into a regular "index" column.
df_train = df_train.reset_index()

  0%|          | 0/22 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

  0%|          | 0/18 [00:00<?, ?it/s]

## テストデータの整形

In [19]:
# Reload the test file, rename its ID columns to the names used in df_train,
# add constant Month (12), Cat (0) and MeanPrice (0) columns, and keep only
# the five model columns in a fixed order.
df_test = (
    pd.read_csv("test_origin.csv")
    .rename(columns={"商品ID": "itemID", "店舗ID": "shopID"})
    .assign(Month=12, Cat=0, MeanPrice=0)
    .reindex(columns=["Month", "shopID", "Cat", "itemID", "MeanPrice"])
)


def TestAddCat(row):
    """Write the leading three digits of the item ID into the row's "Cat" slot.

    Args:
        row: a pandas Series whose third element (by position) is the "Cat"
            slot and whose fourth element is the numeric item ID.

    Returns:
        The same Series, with the third element set to the integer formed by
        the first three digits of the item ID (the whole ID if it is shorter).
    """
    # Use .iloc for positional access: plain row[3] on a string-labelled
    # Series relies on a positional fallback that pandas has deprecated.
    # int() first, so a float-typed ID such as 1000001.0 is sliced as
    # "1000001" and not as "1000001.0".
    item_digits = str(int(row.iloc[3]))
    row.iloc[2] = int(item_digits[0:3])

    return row


def TestMeanPrice(row):
    """Replace a zero placeholder price with the per-item fallback price.

    Args:
        row: a pandas Series whose fourth element (by position) is the item ID
            and whose fifth element is the mean price (0 means "not set").

    Returns:
        A Series with the fifth element set to ``meanprice_dic[item ID]`` when
        it was 0; otherwise the row unchanged. An integer-typed row is
        converted to float64 before the price is written.

    Raises:
        KeyError: if the price is 0 and the item ID is not in ``meanprice_dic``.
    """
    # Use .iloc for positional access: plain row[4] on a string-labelled
    # Series relies on a positional fallback that pandas has deprecated.
    if row.iloc[4] == 0:
        item_id = row.iloc[3]
        # Upcast explicitly: writing a float into an int64 Series otherwise
        # depends on a silent upcast that pandas has deprecated.
        if pd.api.types.is_integer_dtype(row.dtype):
            row = row.astype("float64")
        row.iloc[4] = meanprice_dic[item_id]

    return row


# Fill the fallback price first, then derive the category from the item ID.
df_test = (
    df_test
    .apply(TestMeanPrice, axis="columns")
    .apply(TestAddCat, axis="columns")
)


# Cast the Month, shopID, Cat and itemID columns to int.
test_id_columns = ["Month", "shopID", "Cat", "itemID"]
df_test = df_test.astype({column: int for column in test_id_columns})


# Turn the row index into a regular "index" column.
df_test = df_test.reset_index()

In [20]:
df_test

Unnamed: 0,index,Month,shopID,Cat,itemID,MeanPrice
0,0,12,0,100,1000001,363.369709
1,1,12,1,100,1000001,363.369709
2,2,12,2,100,1000001,363.369709
3,3,12,3,100,1000001,363.369709
4,4,12,4,100,1000001,363.369709
...,...,...,...,...,...,...
3055,3055,12,13,350,3500001,416.063830
3056,3056,12,14,350,3500001,416.063830
3057,3057,12,15,350,3500001,416.063830
3058,3058,12,16,350,3500001,416.063830


## 新しい特徴量を作る

In [21]:
# Feature-generation input: the training frame without the target column "n".
df_train_ft = df_train.drop(columns='n')

In [22]:
df_train_ft

Unnamed: 0,index,Month,shopID,Cat,itemID,MeanPrice
0,0,1,0,100,1000001,420.000000
1,1,1,0,100,1000002,250.000000
2,2,1,0,100,1000003,226.941896
3,3,1,0,100,1000004,120.000000
4,4,1,0,100,1000005,220.000000
...,...,...,...,...,...,...
67315,67315,10,17,330,3300001,1775.952381
67316,67316,10,17,340,3400001,850.000000
67317,67317,10,17,340,3400002,1672.455521
67318,67318,10,17,340,3400003,1190.000000


In [23]:
# Create the featuretools EntitySet (id 'NewFeatures') that the frames are registered in.
es = ft.EntitySet(id='NewFeatures')

In [24]:
# Register the train frame and then the test frame in the EntitySet.
# Each frame uses its "index" column as the index and "Month" as the time
# index, types Cat and itemID as Categorical, and is then normalized on
# itemID into a companion dataframe named "<name>_cat".
for frame_name, frame in (('train', df_train_ft), ('test', df_test)):
    es = es.add_dataframe(dataframe_name=frame_name,
                          dataframe=frame,
                          index='index',
                          time_index='Month',
                          logical_types={
                              'Cat': Categorical,
                              'itemID': Categorical
                          })

    es = es.normalize_dataframe(base_dataframe_name=frame_name,
                                new_dataframe_name=frame_name + '_cat',
                                index='itemID')

In [25]:
es['train']

Unnamed: 0,index,Month,shopID,Cat,itemID,MeanPrice
0,0,1,0,100,1000001,420.000000
1,1,1,0,100,1000002,250.000000
2,2,1,0,100,1000003,226.941896
3,3,1,0,100,1000004,120.000000
4,4,1,0,100,1000005,220.000000
...,...,...,...,...,...,...
36715,36715,12,17,330,3300001,1780.000000
36716,36716,12,17,340,3400001,850.000000
36717,36717,12,17,340,3400002,1530.000000
36718,36718,12,17,340,3400003,1102.500000


In [26]:
# Deep Feature Synthesis settings for the "train" target: pairwise add /
# subtract of numeric columns, count / sum / mean aggregations, depth 2.
train_dfs_settings = {
    "trans_primitives": ['add_numeric', 'subtract_numeric'],
    "agg_primitives": ['count', 'sum', 'mean'],
    "max_depth": 2,
}

feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="train",
    **train_dfs_settings,
)

feature_matrix

Unnamed: 0_level_0,Month,shopID,Cat,itemID,MeanPrice,MeanPrice + Month,MeanPrice + shopID,Month + shopID,MeanPrice - Month,MeanPrice - shopID,...,train_cat.MEAN(train.shopID),train_cat.SUM(train.MeanPrice),train_cat.SUM(train.Month),train_cat.SUM(train.shopID),MeanPrice + train_cat.first_train_time,Month + train_cat.first_train_time,shopID + train_cat.first_train_time,MeanPrice - train_cat.first_train_time,Month - train_cat.first_train_time,shopID - train_cat.first_train_time
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,100,1000001,420.000000,421.000000,420.000000,1.0,419.000000,420.000000,...,8.5,143894.404762,2394.0,3366.0,421.000000,2.0,1.0,419.000000,0.0,-1.0
1,1,0,100,1000002,250.000000,251.000000,250.000000,1.0,249.000000,250.000000,...,8.5,94055.103093,2394.0,3366.0,251.000000,2.0,1.0,249.000000,0.0,-1.0
2,1,0,100,1000003,226.941896,227.941896,226.941896,1.0,225.941896,226.941896,...,8.5,89868.990826,2394.0,3366.0,227.941896,2.0,1.0,225.941896,0.0,-1.0
3,1,0,100,1000004,120.000000,121.000000,120.000000,1.0,119.000000,120.000000,...,8.5,87710.806452,2394.0,3366.0,121.000000,2.0,1.0,119.000000,0.0,-1.0
4,1,0,100,1000005,220.000000,221.000000,220.000000,1.0,219.000000,220.000000,...,8.5,97354.436975,2394.0,3366.0,221.000000,2.0,1.0,219.000000,0.0,-1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36715,12,17,330,3300001,1780.000000,1792.000000,1797.000000,29.0,1768.000000,1763.000000,...,8.5,703277.142857,2394.0,3366.0,1781.000000,13.0,18.0,1779.000000,11.0,16.0
36716,12,17,340,3400001,850.000000,862.000000,867.000000,29.0,838.000000,833.000000,...,8.5,341331.734440,2394.0,3366.0,851.000000,13.0,18.0,849.000000,11.0,16.0
36717,12,17,340,3400002,1530.000000,1542.000000,1547.000000,29.0,1518.000000,1513.000000,...,8.5,662292.386185,2394.0,3366.0,1531.000000,13.0,18.0,1529.000000,11.0,16.0
36718,12,17,340,3400003,1102.500000,1114.500000,1119.500000,29.0,1090.500000,1085.500000,...,8.5,452852.624292,2394.0,3366.0,1103.500000,13.0,18.0,1101.500000,11.0,16.0


In [27]:
# Rebind df_train_ft to the current feature matrix. This is a second name for
# the same DataFrame object, not a copy.
df_train_ft = feature_matrix

In [30]:
df_train_ft

Unnamed: 0_level_0,Month,shopID,Cat,itemID,MeanPrice,MeanPrice + Month,MeanPrice + shopID,Month + shopID,MeanPrice - Month,MeanPrice - shopID,...,train_cat.SUM(train.MeanPrice),train_cat.SUM(train.Month),train_cat.SUM(train.shopID),MeanPrice + train_cat.first_train_time,Month + train_cat.first_train_time,shopID + train_cat.first_train_time,MeanPrice - train_cat.first_train_time,Month - train_cat.first_train_time,shopID - train_cat.first_train_time,n
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,0,100,1000001,420.000000,421.000000,420.000000,1.0,419.000000,420.000000,...,143894.404762,2394.0,3366.0,421.000000,2.0,1.0,419.000000,0.0,-1.0,6.0
1,1,0,100,1000002,250.000000,251.000000,250.000000,1.0,249.000000,250.000000,...,94055.103093,2394.0,3366.0,251.000000,2.0,1.0,249.000000,0.0,-1.0,2.0
2,1,0,100,1000003,226.941896,227.941896,226.941896,1.0,225.941896,226.941896,...,89868.990826,2394.0,3366.0,227.941896,2.0,1.0,225.941896,0.0,-1.0,0.0
3,1,0,100,1000004,120.000000,121.000000,120.000000,1.0,119.000000,120.000000,...,87710.806452,2394.0,3366.0,121.000000,2.0,1.0,119.000000,0.0,-1.0,1.0
4,1,0,100,1000005,220.000000,221.000000,220.000000,1.0,219.000000,220.000000,...,97354.436975,2394.0,3366.0,221.000000,2.0,1.0,219.000000,0.0,-1.0,8.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36715,12,17,330,3300001,1780.000000,1792.000000,1797.000000,29.0,1768.000000,1763.000000,...,703277.142857,2394.0,3366.0,1781.000000,13.0,18.0,1779.000000,11.0,16.0,1.0
36716,12,17,340,3400001,850.000000,862.000000,867.000000,29.0,838.000000,833.000000,...,341331.734440,2394.0,3366.0,851.000000,13.0,18.0,849.000000,11.0,16.0,3.0
36717,12,17,340,3400002,1530.000000,1542.000000,1547.000000,29.0,1518.000000,1513.000000,...,662292.386185,2394.0,3366.0,1531.000000,13.0,18.0,1529.000000,11.0,16.0,2.0
36718,12,17,340,3400003,1102.500000,1114.500000,1119.500000,29.0,1090.500000,1085.500000,...,452852.624292,2394.0,3366.0,1103.500000,13.0,18.0,1101.500000,11.0,16.0,4.0


In [29]:
# Attach the target column "n" from df_train. Column assignment aligns on index
# labels. NOTE(review): this presumably relies on the feature matrix's "index"
# labels matching df_train's row labels - confirm.
# NOTE(review): this cell (In [29]) sits after the In [30] display cell, whose
# output already contains "n"; move it above that cell so Run All reproduces it.
df_train_ft['n'] = df_train['n']

In [33]:
# Run Deep Feature Synthesis again with the same primitives and depth, this
# time with "test" as the target dataframe. Rebinds feature_matrix and
# feature_defs.
# NOTE(review): the train run's aggregate columns are named after its own
# dataframes (e.g. "train_cat.SUM(train.MeanPrice)"); this run presumably names
# them after "test_cat" / "test" and aggregates over test rows only, so the two
# exported files may not share column names - confirm before modelling.
feature_matrix, feature_defs = ft.dfs(entityset=es,
                                      target_dataframe_name="test",
                                      trans_primitives=['add_numeric', 'subtract_numeric'],
                                      agg_primitives=['count', 'sum', 'mean'],
                                      max_depth=2)

In [34]:
# Bind df_test_ft to the current feature matrix (same object, not a copy).
df_test_ft = feature_matrix

In [35]:
# Write the training features to CSV. index=False leaves the frame's index
# out of the file.
df_train_ft.to_csv("train_ft.csv", index=False)

In [36]:
# Write the test features to CSV. index=False leaves the frame's index
# out of the file.
df_test_ft.to_csv("test_ft.csv", index=False)