In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from copy import deepcopy

tqdm.pandas()

ID_SIZE = 20

In [2]:
import os

if os.getcwd() != "/root/restaurants":
    os.chdir("..")
os.getcwd()

'/root/restaurants'

In [3]:
store_info = pd.read_csv("data/raw/air_store_info.csv")
store_info = store_info.rename(
    columns={
        "air_store_id": "store_id",
        "air_genre_name": "genre_name",
        "air_area_name": "area_name",
    }
)

date_info = pd.read_csv("data/raw/date_info.csv")
date_info = date_info.rename(columns={"calendar_date": "date"})
date_info["date"] = date_info["date"].astype("string")

In [4]:
submission = pd.read_csv("data/raw/sample_submission.csv")
data = pd.read_csv("data/processed/data.csv")
data["date"] = pd.to_datetime(data["date"])
data.head()

new_submission = deepcopy(submission)
new_submission["store_id"] = new_submission["id"].str[:ID_SIZE]
new_submission["date"] = new_submission["id"].str[ID_SIZE + 1 :]

new_submission = pd.merge(new_submission, date_info, on="date")
new_submission = pd.merge(new_submission, store_info, on="store_id")

new_submission["date"] = pd.to_datetime(new_submission["date"])
new_submission["year"] = new_submission["date"].dt.year
new_submission["month"] = new_submission["date"].dt.month
new_submission["day"] = new_submission["date"].dt.day

In [5]:
new_submission.head(3)

Unnamed: 0,id,visitors,store_id,date,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,year,month,day
0,air_00a91d42b08b08d9_2017-04-23,0,air_00a91d42b08b08d9,2017-04-23,Sunday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2017,4,23
1,air_00a91d42b08b08d9_2017-04-24,0,air_00a91d42b08b08d9,2017-04-24,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2017,4,24
2,air_00a91d42b08b08d9_2017-04-25,0,air_00a91d42b08b08d9,2017-04-25,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2017,4,25


In [6]:
import pandas as pd

from sktime.transformations.base import BaseTransformer


class MyTransformer(BaseTransformer):
    THREE_WEEKS = 21
    FIVE_WEEKS = 35
    TWO_MONTH = 61
    ONE_QUARTER = 365 // 4
    HALF_YEAR = 365 // 2
    THREE_QUARTERS = 365 * 3 // 4
    YEAR = 365

    LAGS = [
        THREE_WEEKS,
        FIVE_WEEKS,
        TWO_MONTH,
        ONE_QUARTER,
        HALF_YEAR,
        THREE_QUARTERS,
        YEAR,
    ]

    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        self.data = X

        if y is not None:
            self.data["visitors"] = y

        return self

    def get_store_column_lag(self, row, data, column):
        for lag in self.LAGS:
            column_name = f"store_id_{column}_{lag}days_mean"
            is_nan_column_name = f"is_nan_{column_name}"

            start_date = self.date_info["date"]
            end_date = start_date - pd.Timedelta(days=lag)

            value = data.loc[(data["date"] >= end_date), "visitors"].mean()

            if pd.isna(value):
                value = 0
                is_nan = 1
            else:
                is_nan = 0

            row[column_name] = value
            row[is_nan_column_name] = is_nan

        return row

    def get_store_features(self, row):
        store_id_data = self.data[self.data["store_id"] == row["store_id"]]
        holiday_flg_data = store_id_data[
            store_id_data["holiday_flg"] == self.date_info["holiday_flg"]
        ]
        day_of_week_data = store_id_data[
            store_id_data["day_of_week"] == self.date_info["day_of_week"]
        ]

        row = self.get_store_column_lag(row, day_of_week_data, "day_of_week")
        row = self.get_store_column_lag(row, holiday_flg_data, "holiday_flg")

        return row
    
    def get_area_genre_column_feature(self, group, data, column):
        for lag in self.LAGS:
            column_name = f"area_genre_{column}_{lag}days_mean"
            is_nan_column_name = f"is_nan_{column_name}"

            start_date = self.date_info["date"]
            end_date = start_date - pd.Timedelta(days=lag)

            new_data = data[data["date"] >= end_date]
            area_genre_data_mean = new_data.groupby(
                by=["date"], as_index=False
            ).visitors.mean()

            value = area_genre_data_mean.mean()["visitors"]

            if pd.isna(value):
                value = 0
                is_nan = 1
            else:
                is_nan = 0

            group[column_name] = value
            group[is_nan_column_name] = is_nan

        return group

    def get_area_genre_features(self, group, area_genre):
        area_name = area_genre[0]
        genre_name = area_genre[1]

        area_genre_data = self.data[
            (self.data["area_name"] == area_name)
            & (self.data["genre_name"] == genre_name)
        ]
        holiday_flg_data = area_genre_data[
            area_genre_data["holiday_flg"] == self.date_info["holiday_flg"]
        ]
        day_of_week_data = area_genre_data[
            area_genre_data["day_of_week"] == self.date_info["day_of_week"]
        ]

        group = self.get_area_genre_column_feature(group, day_of_week_data, "day_of_week")
        group = self.get_area_genre_column_feature(group, holiday_flg_data, "holiday_flg")

        return group

    def transform(self, X, y=None):
        self.date_info = dict()
        self.date_info["date"] = X.iloc[0]["date"]
        self.date_info["day_of_week"] = X.iloc[0]["day_of_week"]
        self.date_info["holiday_flg"] = X.iloc[0]["holiday_flg"]

        for lag in self.LAGS:
            for column in ["day_of_week", "holiday_flg"]:
                for type in ["store_id", "area_genre"]:
                    column_name = f"{type}_{column}_{lag}days_mean"
                    is_nan_column_name = f"is_nan_{column_name}"

                    X.loc[:, [column_name, is_nan_column_name]] = np.nan

        X_columns = X.columns

        X = X.transform(lambda row: self.get_store_features(row), axis=1)
        X = X.groupby(by=["area_name", "genre_name"], group_keys=False)[
            X_columns
        ].apply(lambda group: self.get_area_genre_features(group, area_genre=group.name))

        return X

    def compute_rolling(self, group, column_name, lag):
        group[column_name] = (
            group[["date", "visitors"]]
            .rolling(f"{lag}D", on="date", min_periods=1)
            .mean()
            .shift()["visitors"]
        )

        return group

    def add_store_features(self, lag, column):
        column_name = f"store_id_{column}_{lag}days_mean"
        is_nan_column_name = f"is_nan_{column_name}"

        data_columns = self.data.columns
        self.data = self.data.groupby(["store_id", column], group_keys=False)[
            data_columns
        ].apply(lambda group: self.compute_rolling(group, column_name, lag))

        self.data[is_nan_column_name] = pd.isna(self.data[column_name]).astype(int)
        self.data[column_name] = self.data[column_name].fillna(0)

        return

    def add_area_genre_features(self, lag, column):
        def area_genre_compute_rolling(area_genre_data):
            area_genre_data_mean = area_genre_data.groupby(by=["date"]).visitors.mean()
        
            area_genre_data = (
                area_genre_data.drop(columns=["visitors"])
                .merge(area_genre_data_mean, on=["date"], how="right")
                .drop_duplicates()
            )

            area_genre_columns = area_genre_data.columns
            area_genre_data = area_genre_data.groupby(column, group_keys=False)[area_genre_columns].apply(lambda group: self.compute_rolling(group, column_name, lag))
        
            return area_genre_data  
        
        column_name = f"area_genre_{column}_{lag}days_mean"
        is_nan_column_name = f"is_nan_{column_name}"

        visitors = self.data[["visitors"]].copy()
        data_columns = self.data.columns
        self.data = self.data.groupby(["area_name", "genre_name"], group_keys=False)[data_columns].apply(area_genre_compute_rolling)
        self.data["visitors"] = visitors["visitors"].values

        self.data[is_nan_column_name] = pd.isna(self.data[column_name]).astype(int)
        self.data[column_name] = self.data[column_name].fillna(0)

        return

    def fit_transform(self, X, y=None):
        print(self._is_fitted)
        if self._is_fitted:
            return self.transform(X)

        else :
            self._is_fitted = True
            self.data = X

            if y is not None:
                self.data["visitors"] = y

            for lag in tqdm(self.LAGS):
                for column in ["day_of_week", "holiday_flg"]:
                    self.add_area_genre_features(lag, column)
                    self.add_store_features(lag, column)

            return self.data.drop(columns=["visitors"])

In [7]:
new_data = data[
        [
            "store_id",
            "date",
            "day_of_week",
            "holiday_flg",
            "genre_name",
            "area_name",
            "latitude",
            "longitude",
            "year",
            "month",
            "day",
            "visitors",
        ]
    ]

In [8]:
tr = MyTransformer()


tr.fit_transform(new_data)
# y_pred = tr.transform(fh)

  0%|          | 0/7 [00:00<?, ?it/s]

Unnamed: 0,store_id,date,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,year,month,...,store_id_holiday_flg_273days_mean,is_nan_store_id_holiday_flg_273days_mean,area_genre_day_of_week_365days_mean,is_nan_area_genre_day_of_week_365days_mean,store_id_day_of_week_365days_mean,is_nan_store_id_day_of_week_365days_mean,area_genre_holiday_flg_365days_mean,is_nan_area_genre_holiday_flg_365days_mean,store_id_holiday_flg_365days_mean,is_nan_store_id_holiday_flg_365days_mean
0,air_00a91d42b08b08d9,2016-07-01,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,...,0.000000,1,0.000000,1,0.000000,1,0.000000,1,0.000000,1
0,air_0241aa3964b7f861,2016-01-03,Sunday,1,Izakaya,Tōkyō-to Taitō-ku Higashiueno,35.712607,139.779996,2016,1,...,0.000000,1,0.000000,1,0.000000,1,0.000000,1,0.000000,1
0,air_034a3d5b40d5b1b1,2016-07-01,Friday,0,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki,34.692337,135.472229,2016,7,...,0.000000,1,0.000000,1,0.000000,1,0.000000,1,0.000000,1
0,air_036d4f1ee7285390,2016-07-01,Friday,0,Cafe/Sweets,Hyōgo-ken Takarazuka-shi Tōyōchō,34.799767,135.360073,2016,7,...,0.000000,1,0.000000,1,0.000000,1,0.000000,1,0.000000,1
0,air_04341b588bde96cd,2016-01-01,Friday,1,Izakaya,Tōkyō-to Nerima-ku Toyotamakita,35.735623,139.651658,2016,1,...,0.000000,1,0.000000,1,0.000000,1,0.000000,1,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7416,air_dbf64f1ce38c7442,2017-04-22,Saturday,0,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Daimyō,33.589216,130.392813,2017,4,...,21.404255,0,20.230517,0,20.710526,0,20.047079,0,21.582031,0
7417,air_e57dd6884595f60d,2017-04-22,Saturday,0,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Daimyō,33.589216,130.392813,2017,4,...,20.282407,0,20.232908,0,19.961538,0,20.047491,0,20.555172,0
7418,air_eb120e6d384a17a8,2017-04-22,Saturday,0,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Daimyō,33.589216,130.392813,2017,4,...,20.330435,0,20.235294,0,20.061224,0,20.047904,0,21.285714,0
7419,air_eca5e0064dc9314a,2017-04-22,Saturday,0,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Daimyō,33.589216,130.392813,2017,4,...,20.873016,0,20.237675,0,20.595238,0,20.048317,0,20.494505,0


In [12]:
X = new_submission.sort_values("date").head(data["store_id"].nunique())

tr.fit_transform(X)

Unnamed: 0,id,visitors,store_id,date,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,...,area_genre_holiday_flg_273days_mean,is_nan_area_genre_holiday_flg_273days_mean,store_id_day_of_week_365days_mean,is_nan_store_id_day_of_week_365days_mean,area_genre_day_of_week_365days_mean,is_nan_area_genre_day_of_week_365days_mean,store_id_holiday_flg_365days_mean,is_nan_store_id_holiday_flg_365days_mean,area_genre_holiday_flg_365days_mean,is_nan_area_genre_holiday_flg_365days_mean
0,air_00a91d42b08b08d9_2017-04-23,0,air_00a91d42b08b08d9,2017-04-23 00:00:00,Sunday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,...,21.001984,0,4.0,0,19.399123,0,21.606061,0,21.049270,0
1053,air_08cb3c4ee6cd6a22_2017-04-23,0,air_08cb3c4ee6cd6a22,2017-04-23 00:00:00,Sunday,0,Izakaya,Hyōgo-ken Kakogawa-shi Kakogawachō Kitazaike,34.75695,134.841177,...,20.612420,0,22.857143,0,19.334501,0,21.925651,0,20.646921,0
30966,air_f8233ad00755c35c_2017-04-23,0,air_f8233ad00755c35c,2017-04-23 00:00:00,Sunday,0,Italian/French,Tōkyō-to Shibuya-ku Shibuya,35.661777,139.704051,...,21.369313,0,18.5,0,20.485333,0,20.374502,0,21.253962,0
3588,air_234d3dbf7f3d5a50_2017-04-23,0,air_234d3dbf7f3d5a50,2017-04-23 00:00:00,Sunday,0,Dining bar,Tōkyō-to Machida-shi Morino,35.546631,139.438619,...,21.337270,0,21.265306,0,24.402244,0,21.827586,0,21.252444,0
21177,air_a563896da3777078_2017-04-23,0,air_a563896da3777078,2017-04-23 00:00:00,Sunday,0,Izakaya,Tōkyō-to Toshima-ku Mejiro,35.7253,139.696188,...,21.520000,0,21.54,0,21.640000,0,21.113712,0,20.759091,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20593,air_a239a44805932bab_2017-04-24,0,air_a239a44805932bab,2017-04-24 00:00:00,Monday,0,Cafe/Sweets,Niigata-ken Kashiwazaki-shi Chūōchō,37.3719,138.558984,...,22.086694,0,25.897436,0,25.474359,0,21.909091,0,21.823151,0
30538,air_f26f36ec4dc5adb0_2017-04-24,0,air_f26f36ec4dc5adb0,2017-04-24 00:00:00,Monday,0,Izakaya,Tōkyō-to Shinjuku-ku Kabukichō,35.69384,139.703549,...,20.559721,0,20.431373,0,20.177358,0,20.335312,0,20.881570,0
5110,air_2d3afcb91762fe01_2017-04-24,0,air_2d3afcb91762fe01,2017-04-24 00:00:00,Monday,0,Izakaya,Fukuoka-ken Fukuoka-shi Hakata Ekimae,33.591358,130.414878,...,20.435067,0,41.0,0,19.604108,0,21.43299,0,20.834881,0
3901,air_258ad2619d7bff9a_2017-04-24,0,air_258ad2619d7bff9a,2017-04-24 00:00:00,Monday,0,Izakaya,Tōkyō-to Kōtō-ku Tōyō,35.672854,139.81741,...,20.279884,0,27.35,0,19.849533,0,21.681275,0,20.666554,0
