In [1]:
import pandas as pd
import category_encoders as ce
import numpy as np
from sklearn.ensemble import GradientBoostingRegressor

In [5]:
# load in data
df = pd.read_csv('../data/master.csv', parse_dates=['visit_date', 'calendar_date'])

In [25]:
# sort values
df.sort_values(by=['id', 'visit_date'], inplace=True)

In [26]:
# fill in missing values with 0
df.isnull().sum()

id                  0
visit_date          0
visitors            0
calendar_date       0
day_of_week         0
holiday             0
genre               0
area                0
latitude            0
longitude           0
reserve_visitors    0
dtype: int64

In [27]:
# fill in missing values
df['reserve_visitors'] = df['reserve_visitors'].fillna(0)

In [39]:
# create a training and test set
train = df.groupby('visit_date').apply(lambda x: x.iloc[:-15])
test  = df.groupby('visit_date').apply(lambda x: x.iloc[-15:])

In [61]:
# create X and y
y_train = train['visitors']
X_train = train.drop('visitors', axis=1)

# and for the test
y_test = test['visitors']
X_test = test.drop('visitors', axis=1)

In [30]:
encoder = ce.TargetEncoder()

In [31]:
X_train = encoder.fit_transform(X_train, y_train)

In [32]:
gbm = GradientBoostingRegressor()

In [33]:
X_train.drop(['visit_date', 'calendar_date'], axis=1, inplace=True)

In [34]:
gbm.fit(X_train, y_train)

GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0, criterion='friedman_mse',
                          init=None, learning_rate=0.1, loss='ls', max_depth=3,
                          max_features=None, max_leaf_nodes=None,
                          min_impurity_decrease=0.0, min_impurity_split=None,
                          min_samples_leaf=1, min_samples_split=2,
                          min_weight_fraction_leaf=0.0, n_estimators=100,
                          n_iter_no_change=None, presort='deprecated',
                          random_state=None, subsample=1.0, tol=0.0001,
                          validation_fraction=0.1, verbose=0, warm_start=False)

In [35]:
gbm.score(X_train, y_train)

0.4714414519812945

In [37]:
ore = ce.OrdinalEncoder()
ohe = ce.OneHotEncoder()
te  = ce.TargetEncoder()

In [41]:
ore.fit_transform(X_train)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,visit_date,calendar_date,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
visit_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-01-01,43327,1,2016-01-01,2016-01-01,1,1,1,1,35.735623,139.651658,0.0
2016-01-01,113306,2,2016-01-01,2016-01-01,1,1,1,2,33.589216,130.392813,0.0
2016-01-01,16725,3,2016-01-01,2016-01-01,1,1,1,3,38.269076,140.870403,0.0
2016-01-01,116775,4,2016-01-01,2016-01-01,1,1,1,4,33.319286,130.508374,0.0
2016-01-01,20816,5,2016-01-01,2016-01-01,1,1,2,5,34.766093,135.628100,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2017-04-22,218499,751,2017-04-22,2017-04-22,2,0,2,13,35.646572,139.653247,37.0
2017-04-22,8237,752,2017-04-22,2017-04-22,2,0,1,12,34.386245,132.455018,37.0
2017-04-22,222776,753,2017-04-22,2017-04-22,2,0,6,82,35.629564,139.684992,37.0
2017-04-22,121098,754,2017-04-22,2017-04-22,2,0,5,6,35.693840,139.703549,37.0


In [44]:
ore.category_mapping

[{'col': 'id',
  'mapping': air_04341b588bde96cd      1
  air_05c325d315cc17f5      2
  air_08ba8cd01b3ba010      3
  air_09a845d5b5944b01      4
  air_1f7f8fa557bc0d55      5
                         ... 
  air_cf5ab75a0afb8af9    811
  air_1c0b150f9e696a5f    812
  air_900d755ebd2f7bbd    813
  air_a17f0778617c76e2    814
  NaN                      -2
  Length: 815, dtype: int64,
  'data_type': dtype('O')},
 {'col': 'day_of_week',
  'mapping': Friday       1
  Saturday     2
  Sunday       3
  Monday       4
  Tuesday      5
  Wednesday    6
  Thursday     7
  NaN         -2
  dtype: int64,
  'data_type': dtype('O')},
 {'col': 'genre',
  'mapping': Izakaya                          1
  Bar/Cocktail                     2
  Western food                     3
  Cafe/Sweets                      4
  Dining bar                       5
  Italian/French                   6
  Other                            7
  Japanese food                    8
  Yakiniku/Korean food             9
  Okonomiy

In [46]:
ohe.fit_transform(X_train['genre'], use_cat_names=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,genre_1,genre_2,genre_3,genre_4,genre_5,genre_6,genre_7,genre_8,genre_9,genre_10,genre_11,genre_12,genre_13,genre_14
visit_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016-01-01,43327,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2016-01-01,113306,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2016-01-01,16725,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2016-01-01,116775,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2016-01-01,20816,0,1,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2017-04-22,218499,0,1,0,0,0,0,0,0,0,0,0,0,0,0
2017-04-22,8237,1,0,0,0,0,0,0,0,0,0,0,0,0,0
2017-04-22,222776,0,0,0,0,0,1,0,0,0,0,0,0,0,0
2017-04-22,121098,0,0,0,0,1,0,0,0,0,0,0,0,0,0


In [52]:
df.groupby('day_of_week')['visitors'].mean()

day_of_week
Friday       23.072737
Monday       17.177009
Saturday     26.313688
Sunday       23.873362
Thursday     18.922702
Tuesday      17.672137
Wednesday    19.230121
Name: visitors, dtype: float64

In [53]:
te.fit_transform(X_train, y_train)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,visit_date,calendar_date,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
visit_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-01-01,43327,35.870763,2016-01-01,2016-01-01,23.058297,1,22.994346,18.325632,35.735623,139.651658,0.0
2016-01-01,113306,22.495726,2016-01-01,2016-01-01,23.058297,1,22.994346,20.673379,33.589216,130.392813,0.0
2016-01-01,16725,10.366013,2016-01-01,2016-01-01,23.058297,1,22.994346,20.465089,38.269076,140.870403,0.0
2016-01-01,116775,11.628993,2016-01-01,2016-01-01,23.058297,1,22.994346,12.699479,33.319286,130.508374,0.0
2016-01-01,20816,4.695755,2016-01-01,2016-01-01,23.058297,1,13.390262,6.170648,34.766093,135.628100,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2017-04-22,218499,10.192308,2017-04-22,2017-04-22,26.360766,0,13.390262,19.126473,35.646572,139.653247,37.0
2017-04-22,8237,37.114286,2017-04-22,2017-04-22,26.360766,0,22.994346,23.610277,34.386245,132.455018,37.0
2017-04-22,222776,15.085714,2017-04-22,2017-04-22,26.360766,0,22.344232,13.578850,35.629564,139.684992,37.0
2017-04-22,121098,6.835938,2017-04-22,2017-04-22,26.360766,0,18.661331,19.690496,35.693840,139.703549,37.0


In [62]:
te.transform(X_test, y_test)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,visit_date,calendar_date,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
visit_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-01-01,104888,35.563559,2016-01-01,2016-01-01,23.058297,1,22.994346,20.673379,33.589216,130.392813,0.0
2016-01-01,35175,17.123153,2016-01-01,2016-01-01,23.058297,1,13.390262,23.334203,35.726118,139.716605,0.0
2016-01-01,51870,20.532294,2016-01-01,2016-01-01,23.058297,1,23.563552,19.690496,35.693840,139.703549,0.0
2016-01-01,14599,21.869841,2016-01-01,2016-01-01,23.058297,1,22.619626,22.989831,35.712607,139.779996,0.0
2016-01-01,2823,10.004246,2016-01-01,2016-01-01,23.058297,1,22.619626,23.532743,34.720228,135.265455,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2017-04-22,139767,20.964656,2017-04-22,2017-04-22,26.360766,0,19.913082,24.156822,34.710895,137.725940,37.0
2017-04-22,142424,20.964656,2017-04-22,2017-04-22,26.360766,0,22.344232,20.360123,34.695124,135.197852,37.0
2017-04-22,150322,20.964656,2017-04-22,2017-04-22,26.360766,0,19.760325,26.104597,34.815149,134.685353,37.0
2017-04-22,161396,20.964656,2017-04-22,2017-04-22,26.360766,0,22.994346,19.209442,35.658068,139.751599,37.0


In [58]:
X_test.drop(['visit_date', 'calendar_date'], axis=1, inplace=True)

In [67]:
te.transform(X_test, y_test)

Unnamed: 0_level_0,Unnamed: 1_level_0,id,visit_date,calendar_date,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
visit_date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
2016-01-01,104888,35.563559,2016-01-01,2016-01-01,23.058297,1,22.994346,20.673379,33.589216,130.392813,0.0
2016-01-01,35175,17.123153,2016-01-01,2016-01-01,23.058297,1,13.390262,23.334203,35.726118,139.716605,0.0
2016-01-01,51870,20.532294,2016-01-01,2016-01-01,23.058297,1,23.563552,19.690496,35.693840,139.703549,0.0
2016-01-01,14599,21.869841,2016-01-01,2016-01-01,23.058297,1,22.619626,22.989831,35.712607,139.779996,0.0
2016-01-01,2823,10.004246,2016-01-01,2016-01-01,23.058297,1,22.619626,23.532743,34.720228,135.265455,0.0
...,...,...,...,...,...,...,...,...,...,...,...
2017-04-22,139767,20.964656,2017-04-22,2017-04-22,26.360766,0,19.913082,24.156822,34.710895,137.725940,37.0
2017-04-22,142424,20.964656,2017-04-22,2017-04-22,26.360766,0,22.344232,20.360123,34.695124,135.197852,37.0
2017-04-22,150322,20.964656,2017-04-22,2017-04-22,26.360766,0,19.760325,26.104597,34.815149,134.685353,37.0
2017-04-22,161396,20.964656,2017-04-22,2017-04-22,26.360766,0,22.994346,19.209442,35.658068,139.751599,37.0


In [66]:
train.groupby('')['visitors'].mean()

id
air_00a91d42b08b08d9    26.081897
air_0164b9927d20bcc3     9.248322
air_0241aa3964b7f861     9.896465
air_0328696196e46f18     7.939655
air_034a3d5b40d5b1b1    14.828685
                          ...    
air_f96765e800907c77    37.114286
air_fa12b40b02fecfd8    15.085714
air_fa4ffc9057812fa2     6.835938
air_fab092c35776a9b1    10.056338
air_fb44f566d4f64a4e    23.363636
Name: visitors, Length: 814, dtype: float64

In [68]:
from sklearn.pipeline import make_pipeline

In [69]:
pipe = make_pipeline(te, gbm)

In [71]:
X_train.drop(['visit_date', 'calendar_date'], axis=1, inplace=True)

In [72]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', min_samples_leaf=1,
                               return_df=True, smoothing=1.0, verbose=0)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                           criterion='friedman_mse', init=None,
                                           learning_..., loss='ls',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                      

In [74]:
X_test.drop(['visit_date', 'calendar_date'], axis=1, inplace=True)

In [75]:
pipe.predict(X_test)

array([43.53941762, 22.20944956, 25.12478599, ..., 28.61792205,
       26.00678461, 28.61792205])

In [76]:
pipe1 = make_pipeline(ce.OrdinalEncoder(), GradientBoostingRegressor())
pipe2 = make_pipeline(ce.TargetEncoder(), GradientBoostingRegressor())

In [79]:
pipe1.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('ordinalencoder',
                 OrdinalEncoder(cols=['id', 'day_of_week', 'genre', 'area'],
                                drop_invariant=False, handle_missing='value',
                                handle_unknown='value',
                                mapping=[{'col': 'id', 'data_type': dtype('O'),
                                          'mapping': air_04341b588bde96cd      1
air_05c325d315cc17f5      2
air_08ba8cd01b3ba010      3
air_09a845d5b5944b01      4
air_1f7f8fa557bc0d55      5
                       ... 
air_cf5ab75a0afb8af...
                                           learning_rate=0.1, loss='ls',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_lea

In [81]:
pipe1.score(X_test, y_test)

-0.004895557054887867

In [88]:
df.drop('calendar_date', axis=1, inplace=True)

In [82]:
# define some functions that we can reuse
def create_val_splits(df, val_units=15, return_val=False):
    """Function that will take in a dataset and split it up into training, validation, and test sets"""
    # split into training, validation, and test sets
    df = df.drop('visit_date', axis=1)
    train = df.groupby('id').apply(lambda x: x.iloc[:-val_units]).reset_index(drop=True)
    test  = df.groupby('id').apply(lambda x: x.iloc[-val_units:]).reset_index(drop=True)
    
    if return_val:
        val   = train.groupby('id').apply(lambda x: x.iloc[-val_units:]).reset_index(drop=True)
        train = train.groupby('id').apply(lambda x: x.iloc[:-val_units]).reset_index(drop=True)
        return train, val, test
    else:
        return train, test

In [90]:
pipe = make_pipeline(ce.TargetEncoder(), GradientBoostingRegressor())

In [98]:
df['day']        = df['visit_date'].dt.day
df['year']       = df['visit_date'].dt.year
df['month']      = df['visit_date'].dt.month
df['mov_avg_10'] = df.groupby('id')['visitors'].apply(lambda x: x.rolling(10).mean()).values

In [102]:
df['mov_avg_10'] = df['mov_avg_10'].bfill()

In [104]:
X_train = train.drop('visitors', axis=1)
y_train = train['visitors']

In [105]:
pipe.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('targetencoder',
                 TargetEncoder(cols=['id', 'day_of_week', 'genre', 'area'],
                               drop_invariant=False, handle_missing='value',
                               handle_unknown='value', min_samples_leaf=1,
                               return_df=True, smoothing=1.0, verbose=0)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(alpha=0.9, ccp_alpha=0.0,
                                           criterion='friedman_mse', init=None,
                                           learning_..., loss='ls',
                                           max_depth=3, max_features=None,
                                           max_leaf_nodes=None,
                                           min_impurity_decrease=0.0,
                                           min_impurity_split=None,
                                           min_samples_leaf=1,
                                      

In [106]:
X_val = val.drop('visitors', axis=1)
y_val = val['visitors']

In [107]:
pipe.score(X_val, y_val)

0.6014271300397391

In [103]:
train, val, test = create_val_splits(df, val_units=15, return_val=True)

In [87]:
test

Unnamed: 0,id,visitors,calendar_date,day_of_week,holiday,genre,area,latitude,longitude,reserve_visitors
0,air_00a91d42b08b08d9,35,2017-04-05,Wednesday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,2.0
1,air_00a91d42b08b08d9,29,2017-04-06,Thursday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,8.0
2,air_00a91d42b08b08d9,17,2017-04-07,Friday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,1.0
3,air_00a91d42b08b08d9,9,2017-04-08,Saturday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,33.0
4,air_00a91d42b08b08d9,17,2017-04-10,Monday,0,Italian/French,Tōkyō-to Chiyoda-ku Kudanminami,35.694003,139.753595,0.0
...,...,...,...,...,...,...,...,...,...,...
12430,air_fff68b929994bfbd,6,2017-04-18,Tuesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,0.0
12431,air_fff68b929994bfbd,2,2017-04-19,Wednesday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,0.0
12432,air_fff68b929994bfbd,2,2017-04-20,Thursday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,1.0
12433,air_fff68b929994bfbd,4,2017-04-21,Friday,0,Bar/Cocktail,Tōkyō-to Nakano-ku Nakano,35.708146,139.666288,6.0


In [None]:
pipe = make_pipeline()