In [1]:
import pandas as pd

import numpy as np
import glob, re
import os
from datetime import datetime
from sklearn import preprocessing
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold,StratifiedKFold, TimeSeriesSplit
import matplotlib.pyplot as plt
import pickle
import seaborn as sns
%matplotlib inline
plt.rcParams.update({'font.size': 15,
                     'axes.labelsize':22,
                     'axes.titlesize':25,
                     'ytick.labelsize':22,
                     'xtick.labelsize':15})


## Work on Simple Aggregate Features

In [2]:
! cd ../data/to2ml/; ls

datacamp_round1.csv.gz	optimized_xgb_reg.csv  sess2_round1.csv.gz
final_fe_test.csv.gz	round1.csv.gz	       test_round1.csv
final_fe_train.csv.gz	round2_test.csv.gz     train_round1.csv
lgb_reg.csv		round2_train.csv.gz


In [3]:
path = '../data/to2ml/'
datapath = os.path.join(path,'sess2_round1.csv.gz')

In [4]:
agg_features = pd.read_csv(datapath)

In [5]:
agg_features.head(20)

Unnamed: 0,air_store_id,dow,min,max,mean,sum,median
0,air_00a91d42b08b08d9,0,1,47,22.457143,786,19.0
1,air_00a91d42b08b08d9,1,1,43,24.35,974,24.5
2,air_00a91d42b08b08d9,2,15,52,28.125,1125,28.0
3,air_00a91d42b08b08d9,3,15,47,29.868421,1135,30.0
4,air_00a91d42b08b08d9,4,17,57,36.5,1460,35.5
5,air_00a91d42b08b08d9,5,3,99,14.973684,569,11.0
6,air_00a91d42b08b08d9,6,2,2,2.0,2,2.0
7,air_0164b9927d20bcc3,0,2,19,7.5,150,6.0
8,air_0164b9927d20bcc3,1,1,24,9.56,239,8.0
9,air_0164b9927d20bcc3,2,2,27,9.678571,271,8.0


In [6]:
agg_features.shape

(5741, 7)

In [10]:
agg_features.dow.unique()

array([0, 1, 2, 3, 4, 5, 6])

### Prepare test data

In [11]:
test = pd.read_csv('../data/sample_submission.csv.gz')

In [12]:
test.head()

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,0
1,air_00a91d42b08b08d9_2017-04-24,0
2,air_00a91d42b08b08d9_2017-04-25,0
3,air_00a91d42b08b08d9_2017-04-26,0
4,air_00a91d42b08b08d9_2017-04-27,0


In [13]:
test.shape

(32019, 2)

In [14]:
test['dow'] = pd.to_datetime(test['id'].apply(lambda x: x.rsplit('_',1)[-1])).dt.dayofweek

test['air_store_id'] = test['id'].apply(lambda x: x.rsplit('_',1)[0])

In [15]:
test['dow'] = pd.to_datetime(test['id'].apply(lambda x: x[-10:])).dt.dayofweek
test['air_store_id'] = test['id'].apply(lambda x: x[:-11])

In [16]:
test.head()

Unnamed: 0,id,visitors,dow,air_store_id
0,air_00a91d42b08b08d9_2017-04-23,0,6,air_00a91d42b08b08d9
1,air_00a91d42b08b08d9_2017-04-24,0,0,air_00a91d42b08b08d9
2,air_00a91d42b08b08d9_2017-04-25,0,1,air_00a91d42b08b08d9
3,air_00a91d42b08b08d9_2017-04-26,0,2,air_00a91d42b08b08d9
4,air_00a91d42b08b08d9_2017-04-27,0,3,air_00a91d42b08b08d9


In [17]:
test.merge(agg_features,how='left',on=['air_store_id','dow'])

Unnamed: 0,id,visitors,dow,air_store_id,min,max,mean,sum,median
0,air_00a91d42b08b08d9_2017-04-23,0,6,air_00a91d42b08b08d9,2.0,2.0,2.000000,2.0,2.0
1,air_00a91d42b08b08d9_2017-04-24,0,0,air_00a91d42b08b08d9,1.0,47.0,22.457143,786.0,19.0
2,air_00a91d42b08b08d9_2017-04-25,0,1,air_00a91d42b08b08d9,1.0,43.0,24.350000,974.0,24.5
3,air_00a91d42b08b08d9_2017-04-26,0,2,air_00a91d42b08b08d9,15.0,52.0,28.125000,1125.0,28.0
4,air_00a91d42b08b08d9_2017-04-27,0,3,air_00a91d42b08b08d9,15.0,47.0,29.868421,1135.0,30.0
...,...,...,...,...,...,...,...,...,...
32014,air_fff68b929994bfbd_2017-05-27,0,5,air_fff68b929994bfbd,2.0,18.0,7.439024,305.0,7.0
32015,air_fff68b929994bfbd_2017-05-28,0,6,air_fff68b929994bfbd,1.0,10.0,4.000000,120.0,3.5
32016,air_fff68b929994bfbd_2017-05-29,0,0,air_fff68b929994bfbd,1.0,14.0,4.200000,147.0,4.0
32017,air_fff68b929994bfbd_2017-05-30,0,1,air_fff68b929994bfbd,1.0,11.0,4.526316,172.0,4.0


In [18]:
['id']+agg_features.columns.tolist()

['id', 'air_store_id', 'dow', 'min', 'max', 'mean', 'sum', 'median']

In [19]:
test_df = test.merge(agg_features,how='left',on=['air_store_id','dow'])[['id']+agg_features.columns.tolist()].drop(['air_store_id'],axis=1)

In [20]:
test_df

Unnamed: 0,id,dow,min,max,mean,sum,median
0,air_00a91d42b08b08d9_2017-04-23,6,2.0,2.0,2.000000,2.0,2.0
1,air_00a91d42b08b08d9_2017-04-24,0,1.0,47.0,22.457143,786.0,19.0
2,air_00a91d42b08b08d9_2017-04-25,1,1.0,43.0,24.350000,974.0,24.5
3,air_00a91d42b08b08d9_2017-04-26,2,15.0,52.0,28.125000,1125.0,28.0
4,air_00a91d42b08b08d9_2017-04-27,3,15.0,47.0,29.868421,1135.0,30.0
...,...,...,...,...,...,...,...
32014,air_fff68b929994bfbd_2017-05-27,5,2.0,18.0,7.439024,305.0,7.0
32015,air_fff68b929994bfbd_2017-05-28,6,1.0,10.0,4.000000,120.0,3.5
32016,air_fff68b929994bfbd_2017-05-29,0,1.0,14.0,4.200000,147.0,4.0
32017,air_fff68b929994bfbd_2017-05-30,1,1.0,11.0,4.526316,172.0,4.0


In [21]:
test_df.to_csv('../data/to2ml/test_round1.csv.gz',index=False,compression='gzip')

In [22]:
feature_cols = ['dow','max','min','median','mean','sum']

In [23]:
test_X = test_df[feature_cols]

In [24]:
test_X.shape

(32019, 6)

In [25]:
test_X.head()

Unnamed: 0,dow,max,min,median,mean,sum
0,6,2.0,2.0,2.0,2.0,2.0
1,0,47.0,1.0,19.0,22.457143,786.0
2,1,43.0,1.0,24.5,24.35,974.0
3,2,52.0,15.0,28.0,28.125,1125.0
4,3,47.0,15.0,30.0,29.868421,1135.0


### Form Train Data

In [26]:
air_visitors = pd.read_csv('../data/air_visit_data.csv.gz')

In [27]:
air_visitors.head()

Unnamed: 0,air_store_id,visit_date,visitors
0,air_ba937bf13d40fb24,2016-01-13,25
1,air_ba937bf13d40fb24,2016-01-14,32
2,air_ba937bf13d40fb24,2016-01-15,29
3,air_ba937bf13d40fb24,2016-01-16,22
4,air_ba937bf13d40fb24,2016-01-18,6


In [28]:
air_visitors['dow'] = pd.to_datetime(air_visitors['visit_date']).dt.dayofweek

In [29]:
air_visitors.head()

Unnamed: 0,air_store_id,visit_date,visitors,dow
0,air_ba937bf13d40fb24,2016-01-13,25,2
1,air_ba937bf13d40fb24,2016-01-14,32,3
2,air_ba937bf13d40fb24,2016-01-15,29,4
3,air_ba937bf13d40fb24,2016-01-16,22,5
4,air_ba937bf13d40fb24,2016-01-18,6,0


In [30]:
agg_features.merge(air_visitors,how='left',on=['air_store_id','dow'])

Unnamed: 0,air_store_id,dow,min,max,mean,sum,median,visit_date,visitors
0,air_00a91d42b08b08d9,0,1,47,22.457143,786,19.0,2016-07-04,20
1,air_00a91d42b08b08d9,0,1,47,22.457143,786,19.0,2016-07-11,25
2,air_00a91d42b08b08d9,0,1,47,22.457143,786,19.0,2016-07-25,16
3,air_00a91d42b08b08d9,0,1,47,22.457143,786,19.0,2016-08-01,16
4,air_00a91d42b08b08d9,0,1,47,22.457143,786,19.0,2016-08-08,27
...,...,...,...,...,...,...,...,...,...
252103,air_fff68b929994bfbd,6,1,10,4.000000,120,3.5,2017-03-19,2
252104,air_fff68b929994bfbd,6,1,10,4.000000,120,3.5,2017-03-26,3
252105,air_fff68b929994bfbd,6,1,10,4.000000,120,3.5,2017-04-02,2
252106,air_fff68b929994bfbd,6,1,10,4.000000,120,3.5,2017-04-09,5


In [31]:
train_data = agg_features.merge(air_visitors,how='left',on=['air_store_id','dow']).drop(['air_store_id','visit_date'],axis=1)

In [32]:
train_data.head()

Unnamed: 0,dow,min,max,mean,sum,median,visitors
0,0,1,47,22.457143,786,19.0,20
1,0,1,47,22.457143,786,19.0,25
2,0,1,47,22.457143,786,19.0,16
3,0,1,47,22.457143,786,19.0,16
4,0,1,47,22.457143,786,19.0,27


In [33]:
train_data.to_csv('../data/to2ml/train_round1.csv.gz',index=False,compression='gzip')

In [34]:
y = train_data.visitors.values
train = train_data.drop(['visitors'],axis=1)[feature_cols]

In [35]:
train.head()

Unnamed: 0,dow,max,min,median,mean,sum
0,0,47,1,19.0,22.457143,786
1,0,47,1,19.0,22.457143,786
2,0,47,1,19.0,22.457143,786
3,0,47,1,19.0,22.457143,786
4,0,47,1,19.0,22.457143,786


In [36]:
test_X.head()

Unnamed: 0,dow,max,min,median,mean,sum
0,6,2.0,2.0,2.0,2.0,2.0
1,0,47.0,1.0,19.0,22.457143,786.0
2,1,43.0,1.0,24.5,24.35,974.0
3,2,52.0,15.0,28.0,28.125,1125.0
4,3,47.0,15.0,30.0,29.868421,1135.0


In [37]:
test_X.shape

(32019, 6)

In [38]:
test_df.head()

Unnamed: 0,id,dow,min,max,mean,sum,median
0,air_00a91d42b08b08d9_2017-04-23,6,2.0,2.0,2.0,2.0,2.0
1,air_00a91d42b08b08d9_2017-04-24,0,1.0,47.0,22.457143,786.0,19.0
2,air_00a91d42b08b08d9_2017-04-25,1,1.0,43.0,24.35,974.0,24.5
3,air_00a91d42b08b08d9_2017-04-26,2,15.0,52.0,28.125,1125.0,28.0
4,air_00a91d42b08b08d9_2017-04-27,3,15.0,47.0,29.868421,1135.0,30.0


In [39]:
test_df.fillna(0,inplace=True)

In [40]:
train.head()

Unnamed: 0,dow,max,min,median,mean,sum
0,0,47,1,19.0,22.457143,786
1,0,47,1,19.0,22.457143,786
2,0,47,1,19.0,22.457143,786
3,0,47,1,19.0,22.457143,786
4,0,47,1,19.0,22.457143,786


In [41]:
test_df[train.columns].head()

Unnamed: 0,dow,max,min,median,mean,sum
0,6,2.0,2.0,2.0,2.0,2.0
1,0,47.0,1.0,19.0,22.457143,786.0
2,1,43.0,1.0,24.5,24.35,974.0
3,2,52.0,15.0,28.0,28.125,1125.0
4,3,47.0,15.0,30.0,29.868421,1135.0


### Apply Base Regression Models

In [42]:
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor

In [43]:
scaler = preprocessing.MinMaxScaler()
scaled_train = scaler.fit_transform(train)
scaler = preprocessing.MinMaxScaler()
scaled_test = scaler.fit_transform(test_df[train.columns])

In [44]:
scaled_train.shape,y.shape,scaled_test.shape

((252108, 6), (252108,), (32019, 6))

In [45]:
model_lin_reg = LinearRegression(n_jobs=-1,normalize=True)
model_knn_reg = KNeighborsRegressor(n_jobs=-1,n_neighbors=6)

In [46]:
model1 = model_lin_reg.fit(scaled_train, np.log1p(y))

In [47]:
model2 = model_knn_reg.fit(scaled_train, np.log1p(y))

In [48]:
lin_reg_test_pred = model1.predict(scaled_test)

In [49]:
knn_reg_test_pred = model2.predict(scaled_test)

In [50]:
sub = pd.DataFrame()

In [51]:
sub['id'] = test_df['id']
sub['visitors'] = np.expm1(lin_reg_test_pred)
sub.to_csv('../data/tokaggle/linear_reg.csv',index=False)

In [52]:
sub.head()

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,6.067645
1,air_00a91d42b08b08d9_2017-04-24,16.815221
2,air_00a91d42b08b08d9_2017-04-25,19.411701
3,air_00a91d42b08b08d9_2017-04-26,22.146742
4,air_00a91d42b08b08d9_2017-04-27,24.076294


In [194]:
! kaggle competitions submit -c recruit-restaurant-visitor-forecasting -f submissions/linear_reg.csv -m "LinearReg session2"

100%|███████████████████████████████████████| 1.54M/1.54M [00:04<00:00, 374kB/s]
Successfully submitted to Recruit Restaurant Visitor Forecasting

In [128]:
sub['id'] = test_df['id']
sub['visitors'] = np.expm1(knn_reg_test_pred)
sub.to_csv('../data/tokaggle/knn_reg.csv',index=False)

In [129]:
! kaggle competitions submit -c recruit-restaurant-visitor-forecasting -f submissions/knn_reg.csv -m "DataCamp KNNReg"

100%|███████████████████████████████████████| 1.54M/1.54M [00:05<00:00, 298kB/s]
Successfully submitted to Recruit Restaurant Visitor Forecasting

### Mean Submission

pd.read_csv('../data/air_visit_data.csv.gz')

In [136]:
m = pd.read_csv('../data/air_visit_data.csv.gz').visitors.mean()

In [137]:
m

20.973761245180636

In [138]:
sub  = pd.read_csv('../data/sample_submission.csv.gz')

In [139]:
sub.visitors = m

In [140]:
sub

Unnamed: 0,id,visitors
0,air_00a91d42b08b08d9_2017-04-23,20.973761
1,air_00a91d42b08b08d9_2017-04-24,20.973761
2,air_00a91d42b08b08d9_2017-04-25,20.973761
3,air_00a91d42b08b08d9_2017-04-26,20.973761
4,air_00a91d42b08b08d9_2017-04-27,20.973761
...,...,...
32014,air_fff68b929994bfbd_2017-05-27,20.973761
32015,air_fff68b929994bfbd_2017-05-28,20.973761
32016,air_fff68b929994bfbd_2017-05-29,20.973761
32017,air_fff68b929994bfbd_2017-05-30,20.973761


In [141]:
sub.to_csv('../data/tokaggle/submit_mean.csv',index=False)

In [142]:
! kaggle competitions submit -c recruit-restaurant-visitor-forecasting -f submit_mean.csv -m "Mean Submissionon 2 Sessi"

100%|███████████████████████████████████████| 1.56M/1.56M [00:05<00:00, 282kB/s]
Successfully submitted to Recruit Restaurant Visitor Forecasting

In [204]:
! kaggle competitions submissions -c recruit-restaurant-visitor-forecasting 

fileName               date                 description                status    publicScore  privateScore  
---------------------  -------------------  -------------------------  --------  -----------  ------------  
lgb_reg.csv            2019-11-23 14:44:20  DataCamp Sess2 LGBM        complete  0.71537      0.74051       
lgb_reg.csv            2019-11-23 14:33:14  DataCamp Sess2 LGBM        complete  0.71569      0.74065       
lgb_reg.csv            2019-11-23 14:25:32  DataCamp Sess2 LGBM        complete  0.71097      0.72792       
linear_reg.csv         2019-11-23 13:01:40  LinearReg session2         complete  0.57544      0.60338       
submit_mean.csv        2019-11-23 11:38:45  Mean Submissionon 2 Sessi  complete  0.88649      0.87066       
sample_submission.csv  2019-11-23 11:25:41  zero sub                   complete  2.91459      2.94249       
knn_reg.csv            2019-11-23 09:44:29  DataCamp KNNReg            complete  0.57603      0.60369       
linear_reg.csv     