In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy

In [2]:
import os

if os.getcwd() != "../data/raw":
    os.chdir("../data/raw")
os.getcwd()

'/root/restaurants/data/raw'

In [3]:
visit_data = pd.read_csv("air_visit_data.csv")
visit_data["visit_date"] = pd.to_datetime(visit_data["visit_date"])
visit_data = visit_data.rename(columns={"air_store_id": "store_id"})
visit_data["visit_date"] = visit_data["visit_date"].dt.date
visit_data.head()

Unnamed: 0,store_id,visit_date,visitors
0,air_ba937bf13d40fb24,2016-01-13,25
1,air_ba937bf13d40fb24,2016-01-14,32
2,air_ba937bf13d40fb24,2016-01-15,29
3,air_ba937bf13d40fb24,2016-01-16,22
4,air_ba937bf13d40fb24,2016-01-18,6


In [4]:
store_info = pd.read_csv("air_store_info.csv")
store_info = store_info.rename(
    columns={
        "air_store_id": "store_id",
        "air_genre_name": "genre_name",
        "air_area_name": "area_name",
    }
)

In [12]:
date_info = pd.read_csv("date_info.csv")
visit_data = visit_data.rename(columns={"visit_date": "date"})
visit_data["date"] = visit_data["date"].astype("string")

date_info = date_info.rename(columns={"calendar_date": "date"})
date_info["date"] = date_info["date"].astype("string")

In [13]:
data = pd.merge(visit_data, date_info, on="date", how="left")
data = pd.merge(data, store_info, on="store_id", how="left")
data["date"] = pd.to_datetime(data["date"])
data["year"] = data["date"].dt.year
data["month"] = data["date"].dt.month
data["day"] = data["date"].dt.day
data = data.sort_values(["store_id", "date"]).reset_index(drop=True)
data.head()

Unnamed: 0,store_id,date,visitors,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,year,month,day
0,air_00a91d42b08b08d9,2016-07-01,35,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,1
1,air_00a91d42b08b08d9,2016-07-02,9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,2
2,air_00a91d42b08b08d9,2016-07-04,20,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,4
3,air_00a91d42b08b08d9,2016-07-05,25,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,5
4,air_00a91d42b08b08d9,2016-07-06,29,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,6


In [14]:
def compute_rolling(group, column_name, lag):
    group[column_name] = (
        group[["date", "visitors"]]
        .rolling(f"{lag}D", on="date", min_periods=1)
        .mean().shift()["visitors"]
    )

    return group

In [15]:
def add_lag(df, lag, column):
    column_name = "lag_" + column + "_" + str(lag)

    if column_name in df.columns:
        return df
    
    df = df.groupby(["store_id", column], group_keys=False).apply(lambda group: compute_rolling(group, column_name, lag))
    
    df['is_nan_' + column_name] = pd.isna(df[column_name]).astype(int)
    df[column_name] = df[column_name].fillna(0)

    return df

In [16]:
def get_area_genre_lag(df, lag, column):
    column_name = "lag_area_genre_" + column + "_" + str(lag)

    if column_name in df.columns:
        return df
    
    def area_genre_compute_rolling(area_genre_data):
        area_genre_data_mean = area_genre_data.groupby(by=["date"]).visitors.mean()
        
        area_genre_data = (
            area_genre_data.drop(columns=["visitors"])
            .merge(area_genre_data_mean, on=["date"], how="right")
            .drop_duplicates()
        )

        area_genre_data = area_genre_data.groupby(column, group_keys=False).apply(lambda group: compute_rolling(group, column_name, lag))
        
        return area_genre_data  
    

    visitors = df[["visitors"]].copy()
    df = df.groupby(["area_name", "genre_name"], group_keys=False).apply(area_genre_compute_rolling)
    df["visitors"] = visitors["visitors"].values

    df['is_nan_' + column_name] = pd.isna(df[column_name]).astype(int)
    df[column_name] = df[column_name].fillna(0)

    return df

In [17]:
THREE_WEEKS = 21
FIVE_WEEKS = 35
TWO_MONTH = 61
ONE_QUARTER = 365 // 4
HALF_YEAR = 365 // 2
THREE_QUARTERS = 365 * 3 // 4
YEAR = 365

LAGS = [
    THREE_WEEKS,
    FIVE_WEEKS,
    TWO_MONTH,
    ONE_QUARTER,
    HALF_YEAR,
    THREE_QUARTERS,
    YEAR,
]

In [20]:
data

Unnamed: 0,store_id,date,visitors,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,year,month,day
0,air_00a91d42b08b08d9,2016-07-01,35,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,1
1,air_00a91d42b08b08d9,2016-07-02,9,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,2
2,air_00a91d42b08b08d9,2016-07-04,20,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,4
3,air_00a91d42b08b08d9,2016-07-05,25,Tuesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,5
4,air_00a91d42b08b08d9,2016-07-06,29,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,6
...,...,...,...,...,...,...,...,...,...,...,...,...
252103,air_fff68b929994bfbd,2017-04-18,6,Tuesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,2017,4,18
252104,air_fff68b929994bfbd,2017-04-19,2,Wednesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,2017,4,19
252105,air_fff68b929994bfbd,2017-04-20,2,Thursday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,2017,4,20
252106,air_fff68b929994bfbd,2017-04-21,4,Friday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,2017,4,21


In [21]:
new_data = deepcopy(data)

for lag in LAGS:
    new_data = get_area_genre_lag(new_data, lag, "day_of_week")
    new_data = get_area_genre_lag(new_data, lag, "holiday_flg")
    new_data = add_lag(new_data, lag, "day_of_week")
    new_data = add_lag(new_data, lag, "holiday_flg")

new_data

  area_genre_data = area_genre_data.groupby(column, group_keys=False).apply(lambda group: compute_rolling(group, column_name, lag))
  area_genre_data = area_genre_data.groupby(column, group_keys=False).apply(lambda group: compute_rolling(group, column_name, lag))
  area_genre_data = area_genre_data.groupby(column, group_keys=False).apply(lambda group: compute_rolling(group, column_name, lag))
  area_genre_data = area_genre_data.groupby(column, group_keys=False).apply(lambda group: compute_rolling(group, column_name, lag))
  area_genre_data = area_genre_data.groupby(column, group_keys=False).apply(lambda group: compute_rolling(group, column_name, lag))
  area_genre_data = area_genre_data.groupby(column, group_keys=False).apply(lambda group: compute_rolling(group, column_name, lag))
  area_genre_data = area_genre_data.groupby(column, group_keys=False).apply(lambda group: compute_rolling(group, column_name, lag))
  area_genre_data = area_genre_data.groupby(column, group_keys=False).apply(

Unnamed: 0,store_id,date,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,year,month,...,is_nan_lag_holiday_flg_273,lag_area_genre_day_of_week_365,is_nan_lag_area_genre_day_of_week_365,visitors,lag_area_genre_holiday_flg_365,is_nan_lag_area_genre_holiday_flg_365,lag_day_of_week_365,is_nan_lag_day_of_week_365,lag_holiday_flg_365,is_nan_lag_holiday_flg_365
0,air_00a91d42b08b08d9,2016-07-01,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,...,1,0.000000,1,24,0.000000,1,0.000000,1,0.000000,1
0,air_0241aa3964b7f861,2016-01-03,Sunday,1,Izakaya,Tōkyō-to Taitō-ku Higashiueno,35.712607,139.779996,2016,1,...,1,0.000000,1,28,0.000000,1,0.000000,1,0.000000,1
0,air_034a3d5b40d5b1b1,2016-07-01,Friday,0,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki,34.692337,135.472229,2016,7,...,1,0.000000,1,9,0.000000,1,0.000000,1,0.000000,1
0,air_036d4f1ee7285390,2016-07-01,Friday,0,Cafe/Sweets,Hyōgo-ken Takarazuka-shi Tōyōchō,34.799767,135.360073,2016,7,...,1,0.000000,1,10,0.000000,1,0.000000,1,0.000000,1
0,air_04341b588bde96cd,2016-01-01,Friday,1,Izakaya,Tōkyō-to Nerima-ku Toyotamakita,35.735623,139.651658,2016,1,...,1,0.000000,1,12,0.000000,1,0.000000,1,0.000000,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7416,air_dbf64f1ce38c7442,2017-04-22,Saturday,0,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Daimyō,33.589216,130.392813,2017,4,...,0,22.411796,0,3,23.089882,0,25.078947,0,22.859375,0
7417,air_e57dd6884595f60d,2017-04-22,Saturday,0,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Daimyō,33.589216,130.392813,2017,4,...,0,22.414295,0,47,23.090176,0,24.730769,0,23.020690,0
7418,air_eb120e6d384a17a8,2017-04-22,Saturday,0,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Daimyō,33.589216,130.392813,2017,4,...,0,22.416789,0,35,23.090470,0,23.285714,0,22.698052,0
7419,air_eca5e0064dc9314a,2017-04-22,Saturday,0,Cafe/Sweets,Fukuoka-ken Fukuoka-shi Daimyō,33.589216,130.392813,2017,4,...,0,22.419278,0,21,23.090765,0,19.785714,0,21.761905,0


In [179]:
new_data.to_csv("../processed/data.csv", index=False)

In [180]:
new_data.head()

Unnamed: 0,store_id,date,day_of_week,holiday_flg,genre_name,area_name,latitude,longitude,year,month,...,is_nan_lag_holiday_flg_273,lag_area_genre_day_of_week_365,is_nan_lag_area_genre_day_of_week_365,visitors,lag_area_genre_holiday_flg_365,is_nan_lag_area_genre_holiday_flg_365,lag_day_of_week_365,is_nan_lag_day_of_week_365,lag_holiday_flg_365,is_nan_lag_holiday_flg_365
0,air_00a91d42b08b08d9,2016-07-01,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2016,7,...,1,0.0,1,24,0.0,1,0.0,1,0.0,1
0,air_0241aa3964b7f861,2016-01-03,Sunday,1,Izakaya,Tōkyō-to Taitō-ku Higashiueno,35.712607,139.779996,2016,1,...,1,0.0,1,28,0.0,1,0.0,1,0.0,1
0,air_034a3d5b40d5b1b1,2016-07-01,Friday,0,Cafe/Sweets,Ōsaka-fu Ōsaka-shi Ōhiraki,34.692337,135.472229,2016,7,...,1,0.0,1,9,0.0,1,0.0,1,0.0,1
0,air_036d4f1ee7285390,2016-07-01,Friday,0,Cafe/Sweets,Hyōgo-ken Takarazuka-shi Tōyōchō,34.799767,135.360073,2016,7,...,1,0.0,1,10,0.0,1,0.0,1,0.0,1
0,air_04341b588bde96cd,2016-01-01,Friday,1,Izakaya,Tōkyō-to Nerima-ku Toyotamakita,35.735623,139.651658,2016,1,...,1,0.0,1,12,0.0,1,0.0,1,0.0,1


: 

In [49]:
# df = deepcopy(data)
df = data[["visitors", "holiday_flg", "year", "month", "day", "store_id", "date"]].head(
    2000
)

# df[["day_of_week", "genre_name", "area_name"]] = df[["day_of_week", "genre_name", "area_name"]].apply(LabelEncoder().fit_transform)
y = data[["store_id", "visitors"]].head(2000).set_index("store_id").squeeze()
# X = extract_relevant_features(df, y, column_id='store_id', column_sort="date")
# X
y

store_id
air_00a91d42b08b08d9    35
air_00a91d42b08b08d9     9
air_00a91d42b08b08d9    20
air_00a91d42b08b08d9    25
air_00a91d42b08b08d9    29
                        ..
air_03963426c9312048    58
air_03963426c9312048     6
air_03963426c9312048    34
air_03963426c9312048    31
air_03963426c9312048    44
Name: visitors, Length: 2000, dtype: int64

In [50]:
from tsfresh import extract_features

extracted_features = extract_features(df, column_id="store_id", column_sort="date")

Feature Extraction: 100%|██████████| 20/20 [00:05<00:00,  3.59it/s]


In [51]:
extracted_features

Unnamed: 0,day__variance_larger_than_standard_deviation,day__has_duplicate_max,day__has_duplicate_min,day__has_duplicate,day__sum_values,day__abs_energy,day__mean_abs_change,day__mean_change,day__mean_second_derivative_central,day__median,...,month__fourier_entropy__bins_5,month__fourier_entropy__bins_10,month__fourier_entropy__bins_100,month__permutation_entropy__dimension_3__tau_1,month__permutation_entropy__dimension_4__tau_1,month__permutation_entropy__dimension_5__tau_1,month__permutation_entropy__dimension_6__tau_1,month__permutation_entropy__dimension_7__tau_1,month__query_similarity_count__query_None__threshold_0.0,month__mean_n_absolute_max__number_of_maxima_7
air_00a91d42b08b08d9,1.0,1.0,1.0,1.0,3519.0,70897.0,2.290043,0.090909,0.0,15.0,...,0.147416,0.233472,0.573694,0.055945,0.084198,0.112641,0.141274,0.170101,,12.0
air_0164b9927d20bcc3,1.0,1.0,1.0,1.0,2338.0,47566.0,2.439189,0.128378,0.0,16.0,...,0.141441,0.263667,0.762383,0.08141,0.122738,0.164491,0.206675,0.249296,,12.0
air_0241aa3964b7f861,1.0,1.0,1.0,1.0,6210.0,127416.0,2.23038,0.048101,0.0,16.0,...,0.125256,0.181214,0.440417,0.0354,0.053206,0.071083,0.089032,0.107053,,12.0
air_0328696196e46f18,1.0,1.0,1.0,1.0,1876.0,39162.0,3.991304,0.165217,0.0,17.0,...,0.318391,0.403106,1.105582,0.10048,0.151699,0.203587,0.256156,0.30942,,11.857143
air_034a3d5b40d5b1b1,1.0,1.0,1.0,1.0,3710.0,73292.0,2.172,0.084,0.0,14.0,...,0.138673,0.219798,0.612541,0.052317,0.078718,0.105283,0.132014,0.158912,,12.0
air_036d4f1ee7285390,1.0,1.0,1.0,1.0,4351.0,88263.0,1.903571,0.075,0.0,16.0,...,0.136002,0.181214,0.451164,0.04751,0.071463,0.095549,0.11977,0.144126,,12.0
air_0382c794b73b51ad,1.0,1.0,1.0,1.0,4715.0,96435.0,2.461279,0.043771,0.0,16.0,...,0.090729,0.136002,0.350689,0.045182,0.067951,0.09084,0.11385,0.136981,,12.0
air_03963426c9312048,1.0,1.0,1.0,1.0,4357.0,89349.0,1.996377,0.083333,0.0,16.0,...,0.045395,0.045395,0.136002,-0.0,-0.0,-0.0,-0.0,-0.0,,10.0


In [52]:
from tsfresh import select_features
from tsfresh.utilities.dataframe_functions import impute

impute(extracted_features)
features_filtered = select_features(extracted_features, y)
features_filtered

 'visitors__query_similarity_count__query_None__threshold_0.0'
 'holiday_flg__friedrich_coefficients__coeff_0__m_3__r_30'
 'holiday_flg__friedrich_coefficients__coeff_1__m_3__r_30'
 'holiday_flg__friedrich_coefficients__coeff_2__m_3__r_30'
 'holiday_flg__friedrich_coefficients__coeff_3__m_3__r_30'
 'holiday_flg__max_langevin_fixed_point__m_3__r_30'
 'holiday_flg__query_similarity_count__query_None__threshold_0.0'
 'year__friedrich_coefficients__coeff_0__m_3__r_30'
 'year__friedrich_coefficients__coeff_1__m_3__r_30'
 'year__friedrich_coefficients__coeff_2__m_3__r_30'
 'year__friedrich_coefficients__coeff_3__m_3__r_30'
 'year__max_langevin_fixed_point__m_3__r_30'
 'year__query_similarity_count__query_None__threshold_0.0'
 'month__friedrich_coefficients__coeff_0__m_3__r_30'
 'month__friedrich_coefficients__coeff_1__m_3__r_30'
 'month__friedrich_coefficients__coeff_2__m_3__r_30'
 'month__friedrich_coefficients__coeff_3__m_3__r_30'
 'month__max_langevin_fixed_point__m_3__r_30'
 'month__quer

AssertionError: X and y must contain the same number of samples.