In [None]:
import time
import numpy as np
import pandas as pd
from dateutil.parser import parse
from datetime import date, timedelta
from sklearn.preprocessing import LabelEncoder

from sklearn import tree, neighbors, datasets, linear_model,metrics, preprocessing, ensemble
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV, KFold, TimeSeriesSplit
from sklearn.metrics import plot_confusion_matrix, make_scorer
from sklearn.svm import SVR
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
import datetime

In [None]:
# Show every column when a DataFrame is displayed (None removes the column limit).
pd.set_option('display.max_columns', None)

### Import dataset

In [None]:
# Read the raw CSV tables. The AIR/HPG store-id columns are renamed to a shared
# `store_id` so the tables can later be joined on one key name.
air_reserve = pd.read_csv('air_reserve.csv')
air_reserve = air_reserve.rename(columns={'air_store_id': 'store_id'})

hpg_reserve = pd.read_csv('hpg_reserve.csv')
hpg_reserve = hpg_reserve.rename(columns={'hpg_store_id': 'store_id'})

air_store = pd.read_csv('air_store_info.csv')
air_store = air_store.rename(columns={'air_store_id': 'store_id'})

hpg_store = pd.read_csv('hpg_store_info.csv')
hpg_store = hpg_store.rename(columns={'hpg_store_id': 'store_id'})

air_visit = pd.read_csv('air_visit_data.csv')
air_visit = air_visit.rename(columns={'air_store_id': 'store_id'})

# HPG -> AIR id mapping, indexed by the HPG id (the column itself is kept as well).
store_id_map = pd.read_csv('store_id_relation.csv')
store_id_map = store_id_map.set_index('hpg_store_id', drop=False)

# Calendar table: align the date column name with the visit tables and drop the
# textual weekday column.
date_info = pd.read_csv('date_info.csv')
date_info = date_info.rename(columns={'calendar_date': 'visit_date'})
date_info = date_info.drop(columns='day_of_week')

In [None]:
# submission: each id ends with a 10-character date preceded by one separator
# character; everything before that separator is the store id.
submission = pd.read_csv('sample_submission.csv')
submission['visit_date'] = submission['id'].str.slice(start=-10)
submission['store_id'] = submission['id'].str.slice(stop=-11)

## New Features

### Reserve date diff 

In [None]:
# Drop the `hpg_store_id` index so the name refers only to the column; otherwise the
# merge key below would match both an index level and a column.
store_id_map = store_id_map.reset_index(drop=True)

# Keep only HPG reservations whose store has an AIR counterpart in the id map.
hpg_reserve = pd.merge(hpg_reserve, store_id_map, how='inner', left_on='store_id', right_on='hpg_store_id')

hpg_reserve['visit_datetime'] = pd.to_datetime(hpg_reserve['visit_datetime'])
hpg_reserve['visit_date'] = hpg_reserve['visit_datetime'].dt.date
hpg_reserve['reserve_datetime'] = pd.to_datetime(hpg_reserve['reserve_datetime'])
hpg_reserve['reserve_date'] = hpg_reserve['reserve_datetime'].dt.date

# Whole calendar days between reservation and visit. Computed column-wise on the
# midnight-normalised timestamps instead of a Python-level apply over every row.
hpg_reserve['reserve_datediff'] = (
    hpg_reserve['visit_datetime'].dt.normalize()
    - hpg_reserve['reserve_datetime'].dt.normalize()
).dt.days

# One row per (AIR store, visit date): summed lead time and summed reserved visitors.
hpg_reserve = (
    hpg_reserve
    .groupby(['air_store_id', 'visit_date'], as_index=False)[['reserve_datediff', 'reserve_visitors']]
    .sum()
    .rename(columns={'air_store_id': 'store_id'})
)

In [None]:
air_reserve['visit_datetime'] = pd.to_datetime(air_reserve['visit_datetime'])
air_reserve['visit_date'] = air_reserve['visit_datetime'].dt.date
air_reserve['reserve_datetime'] = pd.to_datetime(air_reserve['reserve_datetime'])
air_reserve['reserve_date'] = air_reserve['reserve_datetime'].dt.date

# Whole calendar days between reservation and visit. Computed column-wise on the
# midnight-normalised timestamps instead of a Python-level apply over every row.
air_reserve['reserve_datediff'] = (
    air_reserve['visit_datetime'].dt.normalize()
    - air_reserve['reserve_datetime'].dt.normalize()
).dt.days

# One row per (store, visit date): summed lead time and summed reserved visitors.
air_reserve = (
    air_reserve
    .groupby(['store_id', 'visit_date'], as_index=False)[['reserve_datediff', 'reserve_visitors']]
    .sum()
)

### Time Frame

In [None]:
def weekinmonth(dates):
    """Return the 1-based week-of-month index for each date.

    The index advances every 7 slots counted from the Monday-based weekday
    position of the first day of the month.

    Parameters:
        dates (pd.Series): Datetime-like series (accessed through ``.dt``).
    Returns:
        pd.Series: Week number within the month, starting at 1.
    """
    days_since_month_start = dates.dt.day - 1
    month_start = dates - pd.to_timedelta(days_since_month_start, unit='d')
    # Weekday of the 1st of the month (Monday == 0) shifts the day index.
    weekday_offset = month_start.dt.weekday
    return (days_since_month_start + weekday_offset) // 7 + 1

In [None]:
# air_visit: derive calendar features from the visit date, then store the date
# back as plain `datetime.date` objects.
air_visit = (
    air_visit
    .assign(visit_date=lambda d: pd.to_datetime(d['visit_date'], errors='coerce'))
    .assign(
        week_of_month=lambda d: weekinmonth(d['visit_date']),
        dow=lambda d: d['visit_date'].dt.dayofweek,
        year=lambda d: d['visit_date'].dt.year,
        month=lambda d: d['visit_date'].dt.month,
        day_of_year=lambda d: d['visit_date'].dt.dayofyear,
        week_of_year=lambda d: d['visit_date'].dt.isocalendar().week,
        day_in_month=lambda d: d['visit_date'].dt.day,
        ttl_days_in_month=lambda d: d['visit_date'].dt.days_in_month,
    )
    .assign(visit_date=lambda d: d['visit_date'].dt.date)
)

In [None]:
# submission: split the id into date and store parts, derive the same calendar
# features as for air_visit, then store the date as plain `datetime.date` objects.
submission = (
    submission
    .assign(
        visit_date=lambda d: d['id'].map(lambda x: str(x).split('_')[2]),
        air_store_id=lambda d: d['id'].map(lambda x: '_'.join(x.split('_')[:2])),
    )
    .assign(visit_date=lambda d: pd.to_datetime(d['visit_date']))
    .assign(
        week_of_month=lambda d: weekinmonth(d['visit_date']),
        dow=lambda d: d['visit_date'].dt.dayofweek,
        year=lambda d: d['visit_date'].dt.year,
        month=lambda d: d['visit_date'].dt.month,
        day_of_year=lambda d: d['visit_date'].dt.dayofyear,
        week_of_year=lambda d: d['visit_date'].dt.isocalendar().week,
        day_in_month=lambda d: d['visit_date'].dt.day,
        ttl_days_in_month=lambda d: d['visit_date'].dt.days_in_month,
    )
    .assign(visit_date=lambda d: d['visit_date'].dt.date)
)

### Min, Mean, Median, Max Visitors and Observation Count per Store and Day of Week

In [None]:
# Build the full store x day-of-week grid (7 rows per store), one block per weekday.
unique_stores = air_visit['store_id'].unique()
per_dow_frames = []
for dow in range(7):
    per_dow_frames.append(pd.DataFrame({'store_id': unique_stores, 'dow': dow}))
stores = pd.concat(per_dow_frames, axis=0, ignore_index=True)

In [None]:
# Visitor statistics per (store, day of week), computed in a single groupby pass
# and attached with one left merge, so store/dow pairs without observations stay
# in `stores` with NaN statistics.
visitor_stats = (
    air_visit
    .groupby(['store_id', 'dow'])['visitors']
    .agg(
        min_visitors='min',
        mean_visitors='mean',
        median_visitors='median',
        max_visitors='max',
        count_observations='count',
    )
    .reset_index()
)
stores = pd.merge(stores, visitor_stats, how='left', on=['store_id', 'dow'])
stores

Unnamed: 0,store_id,dow,min_visitors,mean_visitors,median_visitors,max_visitors,count_observations
0,air_ba937bf13d40fb24,0,2.0,13.754386,12.0,34.0,57.0
1,air_25e9888d30b386df,0,1.0,1.666667,2.0,2.0,3.0
2,air_8e4360a64dbd4c50,0,2.0,24.428571,23.5,47.0,42.0
3,air_35512c42db0868da,0,2.0,8.189655,8.0,21.0,58.0
4,air_a271c9ba19e81d17,0,8.0,21.666667,20.0,44.0,42.0
...,...,...,...,...,...,...,...
5798,air_168441ada3e878e1,6,22.0,52.300000,51.0,86.0,40.0
5799,air_6c952e3c6e590945,6,1.0,13.203125,14.0,38.0,64.0
5800,air_0f2f96335f274801,6,1.0,4.650000,4.0,18.0,40.0
5801,air_c7d30ab0e07f31d5,6,1.0,8.444444,9.0,24.0,27.0


In [None]:
# Attach the static store attributes to every store/dow row.
stores = pd.merge(stores, air_store, how='left', on=['store_id'])

# Area feature: split the area name on the first two spaces into city, district
# and the remaining text. `n` is passed by keyword because pandas 2.x no longer
# accepts it positionally.
stores[['city', 'district', 'area']] = stores['air_area_name'].str.split(' ', n=2, expand=True)

In [None]:
# Parse the calendar dates, derive the numeric weekday (Monday == 0), then store
# the date as plain `datetime.date` objects so it matches the visit tables.
date_info = date_info.assign(visit_date=lambda d: pd.to_datetime(d['visit_date']))
date_info = date_info.assign(
    day_of_week=lambda d: d['visit_date'].dt.dayofweek,
    visit_date=lambda d: d['visit_date'].dt.date,
)

In [None]:
# Attach the calendar information (holiday flag, weekday) to both sets.
train = air_visit.merge(date_info, how='left', on=['visit_date'])
test = submission.merge(date_info, how='left', on=['visit_date'])

In [None]:
# Attach the per-store / per-weekday statistics and store attributes.
store_keys = ['store_id', 'dow']
train = train.merge(stores, how='left', on=store_keys)
test = test.merge(stores, how='left', on=store_keys)

In [None]:
# Attach AIR then HPG reservation aggregates; the overlapping reservation column
# names receive pandas' default _x (AIR) / _y (HPG) suffixes.
train = (
    train
    .merge(air_reserve, how='left', on=['store_id', 'visit_date'])
    .merge(hpg_reserve, how='left', on=['store_id', 'visit_date'])
)

In [None]:
# Same reservation joins for the test set (AIR -> _x suffix, HPG -> _y suffix).
test = (
    test
    .merge(air_reserve, how='left', on=['store_id', 'visit_date'])
    .merge(hpg_reserve, how='left', on=['store_id', 'visit_date'])
)

In [None]:
# Missing values (e.g. days without reservations) become 0; the raw id columns of
# the submission are no longer needed once `store_id` / `visit_date` exist.
train = train.fillna(0)
test = test.fillna(0).drop(columns=['id', 'air_store_id'])

### Number of restaurants of the same category within 1 km 

In [None]:
from geopy.distance import distance

def num_near_rest(df, n):
    """Count, for every store, the stores located strictly closer than ``n`` km.

    Parameters:
        df (pd.DataFrame): Frame with ``store_id``, ``latitude`` and ``longitude``.
        n (float): Radius in kilometres.
    Returns:
        pd.DataFrame: Copy of ``df`` with an added ``coordinate`` column of
        (latitude, longitude) tuples and a ``restaurant_within_{n}km`` column.
        The pairwise loop also compares each store with itself.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    out = df.copy()
    out["coordinate"] = tuple(zip(out.latitude, out.longitude))
    location = out[["store_id", "coordinate"]].drop_duplicates()
    coordinate_list = location["coordinate"].tolist()

    # Full pairwise distance matrix in km (row = origin, column = other store).
    dist = [[distance(p, pp).km for pp in coordinate_list] for p in coordinate_list]
    dist_df = pd.DataFrame(dist, columns=location["store_id"])

    # Column-wise count of distances below the radius -> Series indexed by store_id.
    within = (dist_df < n).sum()
    return pd.merge(out, within.to_frame(name="restaurant_within_{0}km".format(n)),
                    how='left', on='store_id')

In [None]:
# feature: number of same-genre restaurants within n km

def near_rest_per_genre(df, n):
    """Count, for every store, the same-genre stores located strictly closer than ``n`` km.

    Parameters:
        df (pd.DataFrame): Frame with ``store_id``, ``latitude``, ``longitude``
            and ``air_genre_name``.
        n (float): Radius in kilometres.
    Returns:
        pd.DataFrame: Copy of ``df`` with a ``coordinate`` column of
        (latitude, longitude) tuples and a ``same_genre_restaurant_within_{n}km``
        column. The pairwise loop also compares each store with itself.
    """
    # Work on a copy so the caller's frame is not modified as a side effect.
    out = df.copy()
    out["coordinate"] = tuple(zip(out.latitude, out.longitude))
    each_store = out[["store_id", "coordinate", "air_genre_name"]].drop_duplicates()
    column_name = "same_genre_restaurant_within_{0}km".format(n)

    # Collect one small frame per genre and concatenate once at the end instead
    # of growing a DataFrame inside the loop.
    per_genre_counts = []
    for genre in out["air_genre_name"].unique():
        each_genre = each_store[each_store["air_genre_name"] == genre]
        coordinate_list = each_genre["coordinate"].tolist()

        # Pairwise distances (km) between the stores of this genre only.
        dist = [[distance(p, pp).km for pp in coordinate_list] for p in coordinate_list]
        dist_df = pd.DataFrame(dist, columns=each_genre["store_id"])
        per_genre_counts.append((dist_df < n).sum().to_frame(name=column_name))

    genre_df = pd.concat(per_genre_counts)
    return pd.merge(out, genre_df, how='left', on='store_id')

In [None]:
# Neighbour counts within 1 km: first over all restaurants, then same genre only.
train_1 = num_near_rest(train, n=1)
test_1 = num_near_rest(test, n=1)
train_2 = near_rest_per_genre(train_1, n=1)
test_2 = near_rest_per_genre(test_1, n=1)

In [None]:
# Plain aliases for the next stage (no copy is made).
train_3, test_3 = train_2, test_2

### Adjusted dow

In [None]:
# Re-label the weekday of holidays and of the working days right next to them.
def adjust_dow(df):
    """Return a copy of ``df`` with ``day_of_week`` adjusted around holidays.

    Rules, applied in this order:
      1. Non-holiday day right after a holiday with weekday 1-4 (Tue-Fri) -> 0 (Monday).
      2. Non-holiday day right before a holiday with weekday 0-3 (Mon-Thu) -> 4 (Friday).
      3. Every date flagged as a holiday anywhere in ``df`` -> 5 (Saturday).
    Rule 2 is evaluated after rule 1 has been applied, and rule 3 overrides both.

    Parameters:
        df (pd.DataFrame): Needs ``visit_date``, ``holiday_flg`` and ``day_of_week``.
    Returns:
        pd.DataFrame: New frame in which ``day_of_week`` is renamed to
        ``adjusted_dow``. The input frame is left untouched.
    """
    one_day = datetime.timedelta(1)
    holidays = df.loc[df['holiday_flg'] == 1, 'visit_date'].unique()
    day_before = holidays - one_day
    day_after = holidays + one_day

    # Copy first: the original wrote through .loc into the caller's frame.
    adjusted = df.copy()
    working_day = adjusted['holiday_flg'] == 0

    after_mask = (adjusted['visit_date'].isin(day_after)
                  & adjusted['day_of_week'].isin([1, 2, 3, 4])
                  & working_day)
    adjusted.loc[after_mask, 'day_of_week'] = 0

    # Built after the previous assignment on purpose: it must see the updated weekdays.
    before_mask = (adjusted['visit_date'].isin(day_before)
                   & adjusted['day_of_week'].isin([0, 1, 2, 3])
                   & working_day)
    adjusted.loc[before_mask, 'day_of_week'] = 4

    adjusted.loc[adjusted['visit_date'].isin(holidays), 'day_of_week'] = 5
    return adjusted.rename(columns={'day_of_week': "adjusted_dow"})

In [None]:
# Holiday-adjusted weekday for both sets (train is evaluated first).
train_df, test_df = adjust_dow(train_3), adjust_dow(test_3)

### Min, Max, Mean, and Std of the number of visitors in the same genre / same month / same dow

In [None]:
# Visitor statistics per (genre, day of week, month), learned from the training
# rows only. Named aggregation with the group keys in the index, followed by one
# reset_index, yields exactly the three key columns plus the four statistics on
# every pandas version. The previous `as_index=False` + list aggregation +
# reset_index combination depends on the pandas version and can add a stray
# `index` column that would then be merged into the feature set.
tmp = (
    train_df
    .groupby(['air_genre_name', 'dow', 'month'])['visitors']
    .agg(
        genre_dow_mon_min='min',
        genre_dow_mon_max='max',
        genre_dow_mon_mean='mean',
        genre_dow_mon_std='std',
    )
    .fillna(0)  # std is NaN for groups with a single observation
    .reset_index()
)
train_df = pd.merge(train_df, tmp, how='left', on=['air_genre_name', 'dow', 'month'])
test_df = pd.merge(test_df, tmp, how='left', on=['air_genre_name', 'dow', 'month'])

### Weather for each restaurant per day

In [None]:
# feature: weather
weather_data = pd.read_csv('WeatherData.csv', parse_dates=['calendar_date'])
weather_data.columns = weather_data.columns.str.replace('area_name', 'station_id')

# Store -> nearest weather station lookups; the HPG columns are renamed so both
# tables can be stacked under the same column names.
hpg_nearest = pd.read_csv('hpg_store_info_with_nearest_active_station.csv')
air_nearest = pd.read_csv('air_store_info_with_nearest_active_station.csv')
hpg_nearest.rename(columns={'hpg_store_id':'air_store_id',
                            'hpg_genre_name': 'air_genre_name',
                            'hpg_area_name': 'air_area_name'}, inplace=True)
nearest = pd.concat([air_nearest, hpg_nearest])
nearest = nearest.rename(columns = {'air_store_id':'store_id'}).copy()

# Select the lookup columns by name rather than by position ([0, 7]): the merge
# needs `store_id` and must contribute `station_id`.
store_station = nearest[['store_id', 'station_id']]
train_weather = pd.merge(train_df, store_station, how='left', on='store_id')
test_weather = pd.merge(test_df, store_station, how='left', on='store_id')
train_weather['visit_date'] = pd.to_datetime(train_weather['visit_date'])
test_weather['visit_date'] = pd.to_datetime(test_weather['visit_date'])

# add weather
# Forward-fill gaps; `ffill()` replaces the deprecated `interpolate(method='pad')`.
# NOTE(review): the fill runs over the whole table in row order, so a gap at the
# start of one station's rows is filled from the preceding rows whatever station
# they belong to - confirm the CSV ordering makes this acceptable.
weather_station_data_filled = weather_data.ffill()
weather_station_data_filled = weather_station_data_filled[['station_id','calendar_date','precipitation','avg_temperature','hours_sunlight','avg_wind_speed',
                      'high_temperature','low_temperature','solar_radiation','avg_humidity','cloud_cover']]

train_weather2 = pd.merge(train_weather, weather_station_data_filled, how='left',
                          left_on=['station_id', 'visit_date'], right_on=['station_id','calendar_date'])

test_weather2 = pd.merge(test_weather, weather_station_data_filled, how='left',
                          left_on=['station_id', 'visit_date'], right_on=['station_id','calendar_date'])

In [None]:
# Plain aliases for the final feature frames (no copy is made).
train_final, test_final = train_weather2, test_weather2



### Lag features


In [None]:
# Stack train and test so that lags for the test rows can look back into train.
# ignore_index=True gives every row a unique label; without it train and test
# share labels and the `.loc` assignment below would also overwrite train rows.
combined = pd.concat([train_final, test_final], ignore_index=True)

# Lags are row-based within each store (42 and 168 rows back in the stacked order).
combined['lag_42'] = combined.groupby('store_id')['visitors'].shift(42)
combined['lag_168'] = combined.groupby('store_id')['visitors'].shift(168)

# Special case for one store: its missing 42-row lag in the test period is filled
# with its mean visitors over 2017-04-12 .. 2017-04-22.
# NOTE(review): presumably this store has too little history before the test
# period - confirm.
test_start = '2017-04-23'
is_special_store = combined['store_id'] == 'air_900d755ebd2f7bbd'
in_reference_window = (combined['visit_date'] >= '2017-04-12') & (combined['visit_date'] <= '2017-04-22')
fill = combined.loc[is_special_store & in_reference_window, 'visitors'].mean()

# One combined boolean mask instead of chained boolean indexing.
needs_fill = combined['lag_42'].isna() & is_special_store & (combined['visit_date'] >= test_start)
combined.loc[needs_fill, 'lag_42'] = fill

# Forward-fill what is still missing; `ffill()` replaces the deprecated
# `interpolate(method='pad')`.
combined_filled = combined.ffill()

# Split back by date. `.copy()` makes both frames independent of `combined_filled`,
# so later column assignments do not raise SettingWithCopyWarning.
train_final = combined_filled[combined_filled['visit_date'] < test_start].dropna().copy()
test_final = combined_filled[combined_filled['visit_date'] >= test_start].copy()

In [None]:
# Sanity check: (rows, columns) of the final train and test frames.
train_final.shape,test_final.shape

((251940, 49), (32019, 49))

### Final features

In [None]:
# Remove the join helpers that must not be fed to the models. Rebinding the names
# to the result of `drop` (instead of inplace=True on a slice) avoids the
# SettingWithCopyWarning and leaves each name pointing at an independent frame.
helper_columns = ['visit_date', 'coordinate', 'station_id', 'calendar_date']
train_final = train_final.drop(columns=helper_columns)
test_final = test_final.drop(columns=helper_columns)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


### Final Feature processing

In [None]:
# week_of_year is cast to float; the string-valued columns become pandas categoricals.
train_final['week_of_year'] = train_final['week_of_year'].astype(float)
test_final['week_of_year'] = test_final['week_of_year'].astype(float)

categorical_columns = ['store_id', 'air_genre_name', 'air_area_name', 'city', 'district', 'area']
for column in categorical_columns:
    train_final[column] = train_final[column].astype('category')
    test_final[column] = test_final[column].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: htt

In [None]:
# LabelEncoder
# Each column gets ONE encoder fitted on the union of the train and test values,
# and both frames are transformed with it. Fitting separately on train and on test
# assigns the same integer to different categories whenever the two value sets
# differ, which silently scrambles these features at prediction time.
for column in ['store_id', 'air_genre_name', 'air_area_name', 'city', 'district', 'area']:
    lbl = LabelEncoder()
    lbl.fit(pd.concat([train_final[column], test_final[column]], ignore_index=True))
    train_final[column] = lbl.transform(train_final[column])
    test_final[column] = lbl.transform(test_final[column])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  import sys
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: h

In [None]:
# Every column except the target is used as a model input.
predictors = [column for column in train_final.columns if column != 'visitors']

In [None]:
# Row-wise scaling of the training features: Normalizer rescales each sample (row)
# independently (unit L2 norm by default).
normalizer = preprocessing.Normalizer()
normalized_X_train = normalizer.fit_transform(train_final[predictors])

# Apply the same row-wise scaling to the test rows.
# NOTE(review): Normalizer learns no per-feature statistics, so nothing like the
# training data's min/max is reused here; if min-max scaling was intended,
# preprocessing.MinMaxScaler is the matching transformer. The normalized arrays
# are not referenced by any later cell visible in this notebook.
normalized_X_test = normalizer.transform(test_final[predictors])

In [None]:
# Train on log1p(visitors); predictions are mapped back with np.expm1 when the
# submission is written. NOTE: re-running this cell applies the transform again.
train_final['visitors'] = np.log1p(train_final['visitors'])

## Modelling

### KNN

In [None]:
# k-nearest-neighbours regressor (k = 4) using all available cores.
model2 = neighbors.KNeighborsRegressor(n_neighbors=4, n_jobs=-1)

In [None]:
# Fit on the full training frame (target is in log1p space).
model2.fit(X=train_final[predictors], y=train_final['visitors'].values)

In [None]:
predicted_train_y = model2.predict(train_final[predictors])
# `visitors` already holds log1p(visitors), so the RMSLE on the original scale is
# the plain RMSE on these values. Applying mean_squared_log_error here would take
# the logarithm a second time.
print('RMSLE score on training data:', round(
    np.sqrt(metrics.mean_squared_error(train_final['visitors'], predicted_train_y)), 3))

In [None]:
# KNN predictions for the test rows, in the log1p space of the training target.
pred2 = model2.predict(test_final[predictors])

### GradientBoosting

In [None]:
# Gradient boosting with row subsampling; random_state fixes the sampling.
model1 = ensemble.GradientBoostingRegressor(
    n_estimators=180,
    learning_rate=0.2,
    max_depth=5,
    subsample=0.78,
    random_state=3,
)

In [None]:
# Fit on the full training frame (target is in log1p space).
model1.fit(X=train_final[predictors], y=train_final['visitors'].values)

In [None]:
predicted_train_y = model1.predict(train_final[predictors])
# `visitors` already holds log1p(visitors), so the RMSLE on the original scale is
# the plain RMSE on these values. Applying mean_squared_log_error here would take
# the logarithm a second time.
print('RMSLE score on training data:', round(
    np.sqrt(metrics.mean_squared_error(train_final['visitors'], predicted_train_y)), 3))

In [None]:
# Gradient-boosting predictions for the test rows, in the log1p space of the target.
pred1 = model1.predict(test_final[predictors])

In [None]:
# Write the gradient-boosting predictions, mapped back from log1p space with expm1.
sub = pd.read_csv('sample_submission.csv').assign(visitors=np.expm1(pred1))
sub['visitors'] = sub['visitors'].fillna(0)
sub.to_csv('submission.csv', index=False)

### XGBoost

In [None]:
import xgboost as xgb

In [None]:
# XGBoost regressor with row and column subsampling; fitted on the log1p target.
model4 = xgb.XGBRegressor(
    n_estimators=280,
    learning_rate=0.2,
    max_depth=5,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=3,
)
model4.fit(X=train_final[predictors], y=train_final['visitors'])

In [None]:
predicted_train_y = model4.predict(train_final[predictors])
# `visitors` already holds log1p(visitors), so the RMSLE on the original scale is
# the plain RMSE on these values. Applying mean_squared_log_error here would take
# the logarithm a second time.
print('RMSLE score on training data:', round(
    np.sqrt(metrics.mean_squared_error(train_final['visitors'], predicted_train_y)), 3))

In [None]:
# XGBoost predictions for the test rows, in the log1p space of the training target.
pred4 = model4.predict(test_final[predictors])

In [None]:
# Write the XGBoost predictions, mapped back from log1p space with expm1.
visitors_xgb = np.expm1(pred4)
sub = pd.read_csv('sample_submission.csv')
sub['visitors'] = visitors_xgb
sub['visitors'] = sub['visitors'].fillna(0)
sub.to_csv('submission.csv', index=False)

#### Bayesian Optimizer

In [None]:
# Hyper-parameter search space for XGBoost.
# NOTE(review): BayesSearchCV is not imported in any cell visible in this notebook;
# it is provided by scikit-optimize (`from skopt import BayesSearchCV`). Add that
# import to the imports cell before running this cell.
params={'max_depth':[3,10],
      'gamma':[0.1,1],
      'n_estimators':[100,200],
      'subsample':[0.8],
      'eta':[0.1],
      'eval_metric':['rmse']}
# Time-ordered folds: each validation fold lies after its training fold.
tscv = TimeSeriesSplit(n_splits=4)
bayes_search = BayesSearchCV(estimator=xgb.XGBRegressor(), search_spaces=params,
                             n_jobs=-1, cv=tscv, verbose=1)
# Fit on `train_final`: `predictors` was built from its columns (lag and weather
# features, encoded categoricals) and its target is log1p-transformed like for
# every other model. `train_df` has neither the lag columns nor the encoded values.
bayes_search.fit(train_final[predictors], train_final['visitors'])

### LightGBM GridSearch

In [None]:
import lightgbm as lgb

In [None]:
# LightGBM grid: 3 x 3 x 3 = 27 combinations (the remaining entries are fixed).
param_grid = dict(
    num_leaves=[30, 60, 120],
    subsample=[0.8],
    learning_rate=[0.07, 0.06, 0.05],
    max_depth=[5, 6, 7],
    metric=['rmse'],
    objective=['regression'],
    boosting_type=['gbdt'],
    min_child_weight=[16],
)

In [None]:
# Time-ordered folds: each validation fold lies after its training fold.
tscv = TimeSeriesSplit(n_splits=4)
model = lgb.LGBMRegressor()
# The target is already log1p(visitors), so (negated) RMSE on it is the RMSLE on
# the original scale. 'neg_mean_squared_log_error' would take the logarithm a
# second time and raises as soon as a prediction is negative.
model_clf = GridSearchCV(estimator=model, param_grid=param_grid,
                         scoring='neg_root_mean_squared_error', cv=tscv, n_jobs=-1, verbose=0)

In [None]:
# Run the grid search on the training frame (target in log1p space). This is the
# long-running step of the notebook.
fitted_model = model_clf.fit(train_final[predictors], train_final['visitors'])



KeyboardInterrupt: ignored

In [None]:
# Test predictions from the fitted grid search, in the log1p space of the target.
pred = fitted_model.predict(test_final[predictors])

In [None]:
predicted_train_y = fitted_model.predict(train_final[predictors])
# Negative log-space predictions would correspond to negative visitor counts; clip at 0.
predicted_train_y = np.clip(predicted_train_y, 0, None)
# RMSLE score on training data: `visitors` already holds log1p(visitors), so this
# is the plain RMSE on these values (mean_squared_log_error would log twice).
print('RMSLE score on training data:', round(
    np.sqrt(metrics.mean_squared_error(train_final['visitors'], predicted_train_y)), 3))

In [None]:
sub = pd.read_csv('sample_submission.csv')
# The model was fitted on log1p(visitors), so its predictions must be mapped back
# with expm1 before they are written, as in the other submission cells.
sub['visitors'] = np.expm1(pred)
sub['visitors'] = sub['visitors'].fillna(0)
sub.to_csv(r'submission.csv', index=False)

In [None]:
# Best mean cross-validated score found by the search (with a `neg_*` scorer the
# value is negated, so closer to 0 is better).
fitted_model.best_score_

### Run final LightGBM with best set of params


In [None]:
# params from Japanese notebook
params = dict(
    objective='regression',
    num_leaves=60,
    learning_rate=0.01,
    n_estimators=10000,
)

In [None]:
# Wrap the training features and the log1p target in a LightGBM Dataset.
lgb_train = lgb.Dataset(train_final[predictors], train_final['visitors'])
# lgb_test = lgb.Dataset(test_final[predictors], test_final['visitors'])

# Train with 2300 passed as the number of boosting rounds.
# NOTE(review): `params` also carries n_estimators=10000, which LightGBM treats as
# an alias for the number of boosting rounds and may therefore take precedence
# over the 2300 given here - confirm which value is actually used.
gbm = lgb.train(params, lgb_train, 2300)
# Test predictions, in the log1p space of the training target.
pred = gbm.predict(test_final[predictors])

In [None]:
predicted_train_y = gbm.predict(train_final[predictors])
# Negative log-space predictions would correspond to negative visitor counts; clip at 0.
predicted_train_y = np.clip(predicted_train_y, 0, None)
# RMSLE score on training data: `visitors` already holds log1p(visitors), so this
# is the plain RMSE on these values (mean_squared_log_error would log twice).
print('RMSLE score on training data:', round(
    np.sqrt(metrics.mean_squared_error(train_final['visitors'], predicted_train_y)), 3))

In [None]:
# Plot the feature importances of the trained LightGBM booster (tall figure so
# that all feature names fit).
lgb.plot_importance(gbm, figsize=(10,18))

In [None]:
# Write the final LightGBM predictions, mapped back from log1p space with expm1.
sub = pd.read_csv('sample_submission.csv').assign(visitors=np.expm1(pred))
sub['visitors'] = sub['visitors'].fillna(0)
sub.to_csv('submission.csv', index=False)

# Stacking of KNN, Gradient Boosting, LightGBM and XGBoost

In [None]:
# Initialise the four base learners to be stacked (KNN, gradient boosting,
# LightGBM, XGBoost).
import lightgbm as lgb
from sklearn.svm import SVR

knn = neighbors.KNeighborsRegressor(n_jobs=-1, n_neighbors=4)
gb = ensemble.GradientBoostingRegressor(learning_rate=0.2, random_state=3,
                    n_estimators=180, subsample=0.78, max_depth=5)
lgbr = lgb.LGBMRegressor(objective='regression',
                                         num_leaves=60,learning_rate=0.01,
                                         n_estimators=10000)
xgbr = xgb.XGBRegressor(max_depth=3,gamma=0.1, n_estimators=200,
                        subsample=0.8,eta=0.1,eval_metric='rmse')

X, y = train_final[predictors], train_final['visitors']

# Out-of-fold predictions of every base learner with TimeSeriesSplit: each model is
# fitted on the earlier rows of a split and predicts the later ones.
# `y.iloc[id_test]` is required because the split yields POSITIONS, while `y` keeps
# the non-contiguous labels left by the earlier dropna; `y[id_test]` would look
# them up as labels. `np.vstack` replaces the deprecated alias `np.row_stack`.
cross_val_predict = np.vstack([
    np.column_stack([
        knn.fit(X.iloc[id_train], y.iloc[id_train]).predict(X.iloc[id_test]),
        gb.fit(X.iloc[id_train], y.iloc[id_train]).predict(X.iloc[id_test]),
        lgbr.fit(X.iloc[id_train], y.iloc[id_train]).predict(X.iloc[id_test]),
        xgbr.fit(X.iloc[id_train], y.iloc[id_train]).predict(X.iloc[id_test]),
        y.iloc[id_test]  # last column: the true (log1p) labels of this fold
    ])
    for id_train,id_test in TimeSeriesSplit(n_splits=3).split(X)
])  # (test_size*n_splits, n_models_to_stack+1)

# Final fit of all base learners on all the available data.
knn.fit(X,y)
gb.fit(X,y)
lgbr.fit(X,y)
xgbr.fit(X,y)

# Meta-learner: an SVR fitted on the out-of-fold predictions (all columns but the
# last) against the true labels (last column).
stacking = SVR()
stacking.fit(cross_val_predict[:,:-1], cross_val_predict[:,-1])

# Predictions on the unseen test rows: base-learner outputs, in the same column
# order as above, are fed to the meta-learner. Results are in log1p space.
pred = stacking.predict(
    np.column_stack([
        knn.predict(test_final[predictors]),
        gb.predict(test_final[predictors]),
        lgbr.predict(test_final[predictors]),
        xgbr.predict(test_final[predictors])
    ])
)

In [None]:
# Write the stacked predictions, mapped back from log1p space with expm1.
visitors_stacked = np.expm1(pred)
sub = pd.read_csv('sample_submission.csv')
sub['visitors'] = visitors_stacked
sub['visitors'] = sub['visitors'].fillna(0)
sub.to_csv('submission.csv', index=False)