## モデリングと予測

In [0]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from datetime import datetime, timedelta
warnings.filterwarnings('ignore')
%matplotlib inline 


### 前処理

In [0]:
st = pd.read_csv('./store.csv')
train_df = pd.read_csv('./train.csv')
test_df = pd.read_csv('./test.csv')

In [0]:
test_df[test_df['Open'].isna()]
test_df = test_df.fillna(1)

In [0]:
cols1 = ['CompetitionDistance','CompetitionOpenSinceMonth','CompetitionOpenSinceYear']
st[cols1] = st[cols1].fillna(st[cols1].mean()).astype(int)

st['Promo2SinceYear'] = st['Promo2SinceYear'].fillna(2016)
st['Promo2SinceWeek'] = st['Promo2SinceWeek'].fillna(0)
st['PromoInterval'] = st['PromoInterval'].fillna('NoPromo')


In [0]:
def preprocess_df(dataframe):
  dataframe = pd.merge(dataframe,st,on='Store',how='left')
  dataframe['Date'] =  dataframe['Date'].astype('datetime64')
  dataframe['StateHoliday']=dataframe['StateHoliday'].astype(str)
  return dataframe

df = preprocess_df(train_df)

df['SalesPerCustomer'] = df['Sales']/df['Customers']
df = df[df['SalesPerCustomer']<=30] 


In [44]:
test_df = pd.merge(test_df,st,on='Store',how='left')
test_df.columns

Index(['Id', 'Store', 'DayOfWeek', 'Date', 'Open', 'Promo', 'StateHoliday',
       'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance',
       'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2',
       'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval'],
      dtype='object')

In [45]:
 # 1、ライバルの今までの存在期間
df['CompeLife'] = 12*(df['Date'].dt.year -  df['CompetitionOpenSinceYear']) +(df['Date'].dt.month -  df['CompetitionOpenSinceMonth'])
df = df.drop(['CompetitionOpenSinceYear','CompetitionOpenSinceMonth'],axis = 1)

#2、プロモーションの持続期間
df['PromoLife'] = 52*(df['Date'].dt.year -  df['Promo2SinceYear']) +(df['Date'].dt.month*4 -  df['Promo2SinceWeek'])
df = df.drop(['Promo2SinceYear','Promo2SinceWeek'],axis=1)

test_df['Date'] =  pd.to_datetime(test_df['Date'])
test_df['StateHoliday']=test_df['StateHoliday'].astype(str)

test_df['CompeLife'] = 12*(test_df['Date'].dt.year -  test_df['CompetitionOpenSinceYear']) +(test_df['Date'].dt.month -  test_df['CompetitionOpenSinceMonth'])
test_df = test_df.drop(['CompetitionOpenSinceYear','CompetitionOpenSinceMonth'],axis = 1)

test_df['PromoLife'] = 52*(test_df['Date'].dt.year -  test_df['Promo2SinceYear']) +(test_df['Date'].dt.month*4 -  test_df['Promo2SinceWeek'])
test_df = test_df.drop(['Promo2SinceYear','Promo2SinceWeek'],axis=1)
test_df = test_df.drop('Id',axis=1)

test_df.head()

Unnamed: 0,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,Promo2,PromoInterval,CompeLife,PromoLife
0,1,4,2015-09-17,1.0,1,0,0,c,a,1270,0,NoPromo,84,-16.0
1,3,4,2015-09-17,1.0,1,0,0,a,a,14130,1,"Jan,Apr,Jul,Oct",105,230.0
2,7,4,2015-09-17,1.0,1,0,0,a,c,24000,0,NoPromo,29,-16.0
3,8,4,2015-09-17,1.0,1,0,0,a,a,7520,0,NoPromo,11,-16.0
4,9,4,2015-09-17,1.0,1,0,0,a,c,2030,0,NoPromo,181,-16.0


### 特徴量作成
・ライバルの今までの存在期間  
・プロモーションの持続期間  
・過去120日、過去180日、過去1年、過去2年間の売上  
・holidayやpromotionがこの店舗に対する影響度    
　holidayにおける売上/普段の売上      
　promotionにおける売上/普段の売上      


In [0]:
def feature_engineering(df):
    #月、年、何周目、何シーズン目、
  df['Month'] = df['Date'].dt.month
  df['Year'] = df['Date'].dt.year
  df['Week']=df['Date'].dt.week
  df['Quarter']=df['Date'].dt.quarter
  df['Day']=df['Date'].dt.day
  df['DayOfYear']=df['Date'].dt.dayofyear

  
  #promo2進行中であるかどうか
  df['InPromo2'] = 0
  df.loc[(df['PromoInterval']=='Jan,Apr,Jul,Oct')  &  (df['Month'].isin([1,4,7,10])) , 'InPromo2'] = 1 
  df.loc[(df['PromoInterval']=='Feb,May,Aug,Nov') & (df['Month'].isin([2,5,8,11])) , 'InPromo2'] = 1 
  df.loc[(df['PromoInterval']=='Mar,Jun,Sept,Dec') & (df['Month'].isin([3,6,9,12])) , 'InPromo2'] = 1 
  df = df.drop('PromoInterval',axis=1)
  
  return df
df = feature_engineering(df)
test_df = feature_engineering(test_df)

In [47]:
df['Date'].max()
df['Date'].min()

Timestamp('2013-01-01 00:00:00')

In [48]:
start_dt = df['Date'].min()+timedelta(days=180)
print(start_dt)
dates = df.loc[df['Date']>=start_dt,'Date'].unique()
dates = pd.to_datetime(dates)

2013-06-30 00:00:00


In [0]:
def calculate_ndays_past_mean(df,days):
  sales_col_name = 'SalesBefore'+str(days)
  cust_col_name = 'CustBefore'+str(days)

  df_past_means = pd.DataFrame({
   'Date':[],
   'Store':[],
       sales_col_name:[],
      cust_col_name:[]
  })

  for date in dates :
    start_date = date- timedelta(days=days)
    
    if start_date < dates.min():
      start_date = dates.min()
    
    df_past_means_temp = df[(df['Date'] < date) & (df['Date'] >=start_date)].groupby('Store').mean()
    df_past_means_temp = df_past_means_temp.reset_index()[['Store','Sales','Customers']].rename(columns = {
        'Sales':sales_col_name,
       'Customers':cust_col_name 

    })
    df_past_means_temp['Date'] = date
    df_past_means = pd.concat([df_past_means,df_past_means_temp])
    
  return df_past_means

In [0]:
#過去120日、過去180日、過去1年、過去2年間の売上
for days in [120,180,365,730]:
  past_means=calculate_ndays_past_mean(df,days)
  df = pd.merge(df,past_means,on=['Store','Date'])


In [0]:
def make_store_features(df):
  
  store_features= df.groupby('Store').mean().reset_index()[['Store','Sales','Customers']]\
  .rename(columns={
      'Sales':'SalesMeanAll',
          'Customers':'CustomerMean'
  })
  
  max_date = df['Date'].max()

  for day in  [120,180,365,730]:
    before_n = max_date-timedelta(days=day)
    feature_name_sales =  'SalesBefore'+str(day)
    feature_name_sales_per = 'CustBefore'+str(day)

    feature_value = df.loc[df['Date']>=before_n]\
    .groupby('Store').mean().reset_index()[['Store','Sales','Customers']]
    store_features[feature_name_sales] = feature_value['Sales']
    store_features[feature_name_sales_per] = feature_value['Customers']

# ・holidayやpromotionがこの店舗に対する影響度    
# 　holidayにおける売上/普段の売上      
# 　promotionにおける売上/普段の売上      
  
  for feature in ['SchoolHoliday','Promo','InPromo2']:


    feature_true_sales = df[df[feature]==1].groupby('Store').mean()\
    .reset_index()[['Store','Sales']]


    feature_true_sales['FeatureFalse'] = df[df[feature]==0].groupby('Store').mean()\
    .reset_index()['Sales']

    col_name = feature + 'Influence'

    feature_true_sales[col_name] = feature_true_sales['Sales']/feature_true_sales['FeatureFalse']
    store_features[col_name] =  feature_true_sales[col_name] 

    store_features = store_features.fillna(store_features.mean())  
  
  return store_features


store_features = make_store_features(df)


In [0]:
store_features_to_join = store_features[['Store','SalesMeanAll','CustomerMean','SchoolHolidayInfluence','PromoInfluence','InPromo2Influence']]
df = pd.merge(df,store_features_to_join,on = 'Store',how='left')
test_df = pd.merge(test_df,store_features,on = 'Store',how='left')


カテゴリ変数のダミー化

In [0]:
categoritial_features = ['Open', 'Promo',
       'StateHoliday', 'SchoolHoliday', 'Promo2']

one_hot_target = ['DayOfWeek','StoreType', 'Assortment','StateHoliday','Month', 'Year', 'Quarter', 'Day']

In [0]:
df = pd.get_dummies(df, columns = one_hot_target)

test_df = pd.get_dummies(test_df, columns = one_hot_target)

In [55]:
df.columns

Index(['Store', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'SchoolHoliday',
       'CompetitionDistance', 'Promo2', 'SalesPerCustomer', 'CompeLife',
       'PromoLife', 'Week', 'DayOfYear', 'InPromo2', 'CustBefore120',
       'SalesBefore120', 'CustBefore180', 'SalesBefore180', 'CustBefore365',
       'SalesBefore365', 'CustBefore730', 'SalesBefore730', 'SalesMeanAll',
       'CustomerMean', 'SchoolHolidayInfluence', 'PromoInfluence',
       'InPromo2Influence', 'DayOfWeek_1', 'DayOfWeek_2', 'DayOfWeek_3',
       'DayOfWeek_4', 'DayOfWeek_5', 'DayOfWeek_6', 'DayOfWeek_7',
       'StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d',
       'Assortment_a', 'Assortment_b', 'Assortment_c', 'StateHoliday_0',
       'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c', 'Month_1',
       'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7',
       'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Year_2013',
       'Year_2014', 'Year_2015', 'Quarter_1', 'Quarter

In [0]:
df = df.dropna(how='any')

In [0]:
from sklearn.model_selection import train_test_split

col_to_drop = ['Store', 'Date'
               , 'Sales','Customers','SalesPerCustomer','StateHoliday_c','StateHoliday_b']


# features = test_df.columns.drop(col_to_drop)
features = df.columns.drop(col_to_drop)


X_train_val = df.loc[\
    (df['Date']>'2013-01-01')&\
                (df['Date']<'2015-06-30'),features]
y_train_val = df.loc[\
    (df['Date']>'2013-01-01')&\
                (df['Date']<'2015-06-30'),'Sales']


X_test = df.loc[df['Date']>='2015-06-30',features]
y_test =  df.loc[df['Date']>='2015-06-30','Sales']

X_train_all = df[features]
y_train_all = df['Sales']



X_train,X_val,y_train,y_val = train_test_split(
    X_train_val, y_train_val, test_size=0.2, random_state=0,
)


In [58]:
df.columns

Index(['Store', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'SchoolHoliday',
       'CompetitionDistance', 'Promo2', 'SalesPerCustomer', 'CompeLife',
       'PromoLife', 'Week', 'DayOfYear', 'InPromo2', 'CustBefore120',
       'SalesBefore120', 'CustBefore180', 'SalesBefore180', 'CustBefore365',
       'SalesBefore365', 'CustBefore730', 'SalesBefore730', 'SalesMeanAll',
       'CustomerMean', 'SchoolHolidayInfluence', 'PromoInfluence',
       'InPromo2Influence', 'DayOfWeek_1', 'DayOfWeek_2', 'DayOfWeek_3',
       'DayOfWeek_4', 'DayOfWeek_5', 'DayOfWeek_6', 'DayOfWeek_7',
       'StoreType_a', 'StoreType_b', 'StoreType_c', 'StoreType_d',
       'Assortment_a', 'Assortment_b', 'Assortment_c', 'StateHoliday_0',
       'StateHoliday_a', 'StateHoliday_b', 'StateHoliday_c', 'Month_1',
       'Month_2', 'Month_3', 'Month_4', 'Month_5', 'Month_6', 'Month_7',
       'Month_8', 'Month_9', 'Month_10', 'Month_11', 'Month_12', 'Year_2013',
       'Year_2014', 'Year_2015', 'Quarter_1', 'Quarter

In [0]:
def rmspe(y,y_pred):
  return np.sqrt((np.mean((y-y_pred)/y)**2)*100)

R2score でモデル精度を判定 

In [0]:
# X_train.sample

In [61]:
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
import lightgbm as lgb
import time


lgb_params = {
    'task' : 'train',
        'boosting_type' : 'gbdt',
        'objective' : 'regression',
'metric' : {'rmse','mse'},
               'verbose' : 0
             }
lgb_train = lgb.Dataset(X_train,label=y_train)
lgb_eval = lgb.Dataset(X_val, y_val, reference=lgb_train)


evals_result = {}
md = lgb.train(lgb_params, 
               lgb_train,
               valid_sets = lgb_eval,
               num_boost_round = 1000,
               early_stopping_rounds=100)



print(md.best_iteration)


[1]	valid_0's l2: 8.39881e+06	valid_0's rmse: 2898.07
Training until validation scores don't improve for 100 rounds.
[2]	valid_0's l2: 7.21329e+06	valid_0's rmse: 2685.76
[3]	valid_0's l2: 6.24353e+06	valid_0's rmse: 2498.71
[4]	valid_0's l2: 5.4475e+06	valid_0's rmse: 2333.99
[5]	valid_0's l2: 4.79825e+06	valid_0's rmse: 2190.49
[6]	valid_0's l2: 4.25453e+06	valid_0's rmse: 2062.65
[7]	valid_0's l2: 3.81405e+06	valid_0's rmse: 1952.96
[8]	valid_0's l2: 3.44534e+06	valid_0's rmse: 1856.16
[9]	valid_0's l2: 3.13471e+06	valid_0's rmse: 1770.51
[10]	valid_0's l2: 2.88082e+06	valid_0's rmse: 1697.3
[11]	valid_0's l2: 2.6623e+06	valid_0's rmse: 1631.66
[12]	valid_0's l2: 2.47565e+06	valid_0's rmse: 1573.42
[13]	valid_0's l2: 2.3183e+06	valid_0's rmse: 1522.6
[14]	valid_0's l2: 2.18055e+06	valid_0's rmse: 1476.67
[15]	valid_0's l2: 2.06314e+06	valid_0's rmse: 1436.36
[16]	valid_0's l2: 1.96038e+06	valid_0's rmse: 1400.14
[17]	valid_0's l2: 1.87618e+06	valid_0's rmse: 1369.74
[18]	valid_0's l

In [62]:

ypred = md.predict(X_val)

print( 'lgb : ', r2_score(y_val,ypred))
print('rmspe:',rmspe(y_val,ypred))

lgb :  0.9513187425139384
rmspe: 0.10822627297478563


結果は悪くない、更にパラメータチューニングする必要があります。  


### LightGBMのパラメータチューニング

In [63]:
from sklearn.model_selection import GridSearchCV

param_test1={
        'max_depth': [13,15,17]
}

lgb_model = lgb.LGBMRegressor(num_boost_round = 800,scale_pos_weight=1,seed=27)


gsearch1 = GridSearchCV(lgb_model,param_grid=param_test1,n_jobs=-1,iid=False,cv=3,scoring = 'r2')
gsearch1.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=-1, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1,
                                     num_boost_round=1000, num_leaves=31,
                                     objective=None, random_state=None,
                                     reg_alpha=0.0, reg_lambda=0.0,
                                     scale_pos_weight=1, seed=27, silent=True,
                                     subsample=1.0, subsample_for_bin=200000,
                                     subsample_freq=0),
             iid=False, n_jobs=-1, param_grid={'max_depth': [13, 15, 17]},
        

In [64]:
gsearch1.best_params_, gsearch1.best_score_


({'max_depth': 15}, 0.949212997551852)

In [65]:
from sklearn.model_selection import GridSearchCV

param_test2={
        'learning_rate': [0.15,0.2,0.25]
}

lgb_model = lgb.LGBMRegressor(num_boost_round=800,max_depth=15, scale_pos_weight=1,seed=27)


gsearch2 = GridSearchCV(lgb_model,param_grid=param_test2,n_jobs=-1,iid=False,cv=3,scoring='r2')
gsearch2.fit(X_train,y_train)

GridSearchCV(cv=3, error_score='raise-deprecating',
             estimator=LGBMRegressor(boosting_type='gbdt', class_weight=None,
                                     colsample_bytree=1.0,
                                     importance_type='split', learning_rate=0.1,
                                     max_depth=15, min_child_samples=20,
                                     min_child_weight=0.001, min_split_gain=0.0,
                                     n_estimators=100, n_jobs=-1,
                                     num_boost_round=1000, num_leaves=31,
                                     objective=None, random_state=None,
                                     reg_alpha=0.0, reg_lambda=0.0,
                                     scale_pos_weight=1, seed=27, silent=True,
                                     subsample=1.0, subsample_for_bin=200000,
                                     subsample_freq=0),
             iid=False, n_jobs=-1,
             param_grid={'learning_rate': [0.15,

In [66]:
gsearch2.best_params_, gsearch2.best_score_


({'learning_rate': 0.25}, 0.9529839754288233)

In [67]:
lgb_model = lgb.LGBMRegressor(num_boost_round=800,max_depth=15,\
                              eta=0.2, scale_pos_weight=1,seed=27)
lgb_model.fit(X_train,y_train)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              eta=0.2, importance_type='split', learning_rate=0.1, max_depth=15,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_boost_round=1000, num_leaves=31,
              objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
              scale_pos_weight=1, seed=27, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

In [68]:
ypred2 = lgb_model.predict(X_val)

print( 'lgb : ', r2_score(y_val,ypred2))
print('rmspe:',rmspe(y_val,ypred2))

lgb :  0.9516169301339064
rmspe: 0.10733717012233343


In [69]:
# dtest = lgb.Dataset(X_test, label=y_test)
ypred_t = lgb_model.predict(X_test)

print( 'lgb : ', r2_score(y_test,ypred_t))
print('rmspe:',rmspe(y_test,ypred_t))

lgb :  0.9082698773744339
rmspe: 0.02262760212712538


# 結果の提出

In [70]:
model_lgb_final = lgb.LGBMRegressor(num_boost_round=800,max_depth=15,\
                              eta=0.2, scale_pos_weight=1,seed=27)  

model_lgb_final.fit(X_train_all,y_train_all)

LGBMRegressor(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
              eta=0.2, importance_type='split', learning_rate=0.1, max_depth=15,
              min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
              n_estimators=100, n_jobs=-1, num_boost_round=1000, num_leaves=31,
              objective=None, random_state=None, reg_alpha=0.0, reg_lambda=0.0,
              scale_pos_weight=1, seed=27, silent=True, subsample=1.0,
              subsample_for_bin=200000, subsample_freq=0)

In [0]:
li_test =set(test_df.columns)
li_df = set(df.columns)
li_new_features = li_df -  li_test
for i in list(li_new_features):
  test_df[i]=0

In [0]:
y_pred_all = model_lgb_final.predict(test_df[features])
submit_df = pd.DataFrame({
    'Id':pd.read_csv('test.csv')['Id'],
    'Sales':y_pred_all
}
)

In [0]:
submit_df.to_csv('./submit_0818.csv',index=False)