In [16]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.family'] = 'Gulim'

import warnings
warnings.filterwarnings(action='ignore')

In [17]:
from workalendar.asia import SouthKorea
import pendulum

In [18]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## 전처리
- 일자에서 월과 일을 분리
- 요일을 레이블 인코딩화(EDA로 요일의 중요도 순 파악)
- 월 별, 일 별 중식 석식 수요 차이 파악

In [19]:
train['월'] = pd.DatetimeIndex(train['일자']).month
test['월'] = pd.DatetimeIndex(test['일자']).month
train['주'] = pd.DatetimeIndex(train['일자']).week
test['주'] = pd.DatetimeIndex(test['일자']).week
train['일'] = pd.DatetimeIndex(train['일자']).day
test['일'] = pd.DatetimeIndex(test['일자']).day

train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

train['식사가능자수'] = train['본사정원수'] - train['본사휴가자수'] - train['현본사소속재택근무자수']
test['식사가능자수'] = test['본사정원수'] - test['본사휴가자수'] - test['현본사소속재택근무자수']

# train['중식참여율'] = train['중식계'] / train['식사가능자수']

In [20]:
month_rank4dinner = {
    1: 11,
    2: 2,
    3: 1,
    4: 4,
    5: 7,
    6: 6,
    7: 10,
    8: 8,
    9: 5,
    10: 3,
    11: 9,
    12: 12
}
train['월(석식)'] = train['월'].map(month_rank4dinner)
test['월(석식)'] = test['월'].map(month_rank4dinner)

month_rank4lunch = {
    1: 3,
    2: 1,
    3: 2,
    4: 6,
    5: 7,
    6: 8,
    7: 10,
    8: 9,
    9: 5,
    10: 4,
    11: 11,
    12: 12
}
train['월(중식)'] = train['월'].map(month_rank4lunch)
test['월(중식)'] = test['월'].map(month_rank4lunch)

weekday_rank4dinner = {
    '월': 1,
    '화': 2,
    '수': 4,
    '목': 3,
    '금': 5,
}

weekday_rank4lunch = {
    '월': 1,
    '화': 2,
    '수': 3,
    '목': 4,
    '금': 5,
}

train['요일(석식)'] = train['요일'].map(weekday_rank4dinner)
test['요일(석식)'] = test['요일'].map(weekday_rank4dinner)

train['요일(중식)'] = train['요일'].map(weekday_rank4lunch)
test['요일(중식)'] = test['요일'].map(weekday_rank4lunch)

In [21]:
rank = pd.DataFrame(range(1,53))
week_rank_lunch = pd.pivot_table(train,values='중식계',index='주').sort_values(by='중식계').reset_index().drop('중식계',axis=1)
week_rank_dinner = pd.pivot_table(train,values='석식계',index='주').sort_values(by='석식계').reset_index().drop('석식계',axis=1)


week_rank4lunch = {}
for i in range(len(rank)):
    week_rank4lunch[week_rank_lunch['주'][i]] = rank[0][i]


week_rank4dinner = {}
for i in range(len(rank)):
    week_rank4dinner[week_rank_dinner['주'][i]] = rank[0][i]
    
    
train['주(중식)'] = train['주'].map(week_rank4lunch)
test['주(중식)'] = test['주'].map(week_rank4lunch)

train['주(석식)'] = train['주'].map(week_rank4dinner)
test['주(석식)'] = test['주'].map(week_rank4dinner)

In [22]:
def is_holiday(date):
    holidays = list(map(str, pd.Series(np.array(SouthKorea().holidays(int(date[:4])))[:, 0])))
    
    yesterday = str(np.datetime64(date) - 1)
    tomorrow = str(np.datetime64(date) + 1)

    if tomorrow in holidays and yesterday in holidays:
        return 3
    if tomorrow in holidays:
        return 2
    elif yesterday in holidays:
        return 1
    else : 
        return 0

def week_of_month(x):
    dt = pendulum.parse(x)
    
    wom = dt.week_of_month
    if wom < 0:
        wom += 52
    return wom
    

df = pd.concat([train[['본사정원수', '일자']], test[['본사정원수', '일자']]])
df['년월'] = df['일자'].apply(lambda x : x[:7])
df = df[['년월', '본사정원수']].groupby(by=['년월'], as_index=False).mean()

def member_change(date):
    this_month = date[:7]
    last_month = str(np.datetime64(this_month) - 1)
    
    this_month_member = int(df[df['년월'] == this_month]['본사정원수'])
    last_month_member = int(df[df['년월'] == last_month]['본사정원수'])
    
    
    return  this_month_member - last_month_member

train['공휴일전후'] = train['일자'].apply(is_holiday)
test['공휴일전후'] = test['일자'].apply(is_holiday)

train['몇주차'] = train['일자'].apply(week_of_month)
test['몇주차'] = test['일자'].apply(week_of_month)

train = train[train['일자'] > '2016-03']
train['인원변화'] = train['일자'].apply(member_change)
test['인원변화'] = test['일자'].apply(member_change)

## 공휴일 변수 생성

In [23]:
train.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계', '월', '주', '일',
       '출근', '휴가비율', '출장비율', '야근비율', '재택비율', '식사가능자수', '월(석식)', '월(중식)',
       '요일(석식)', '요일(중식)', '주(중식)', '주(석식)', '공휴일전후', '몇주차', '인원변화'],
      dtype='object')

In [24]:
# 메뉴 변수 없이 사용할떄 해당 코드 사용['공휴일전후', '몇주차', '인원변화']

lunch_train = train[['공휴일전후', '몇주차', '인원변화', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]
lunch_test = test[['공휴일전후', '몇주차', '인원변화', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

dinner_train= train[['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]
dinner_test = test[['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

In [25]:
lunch_train.columns

Index(['공휴일전후', '몇주차', '인원변화', '요일(중식)', '월(중식)', '일', '주(중식)', '출근', '휴가비율',
       '출장비율', '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수'],
      dtype='object')

In [26]:
print(lunch_train.shape)
print(lunch_test.shape)

(1187, 16)
(50, 16)


In [27]:
print(dinner_train.shape)
print(dinner_test.shape)

(1187, 16)
(50, 16)


In [28]:
cat_features = [f for f in lunch_train.columns if lunch_train[f].dtype == 'object']

def column_index(df, cat_features):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, cat_features, sorter=sidx)]

cat_features_idx = column_index(lunch_train, cat_features)    
print("Cat features are: %s" % [f for f in cat_features])
print(cat_features_idx)

Cat features are: []
[]


In [42]:
y_lunch = np.array(train[['중식계']])


y_dinner = np.array(train[['석식계']])

#### 분포 확인 및 분포 조정

# 중식 예측모델

In [43]:
from lightgbm import LGBMRegressor

import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.model_selection import train_test_split, KFold

In [59]:
reg_candidate = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 5, 10, 100]

# space 정의, Hyperparameter의 이름을 key 값으로 입력
space={'max_depth': hp.quniform("max_depth", 5, 15, 1),
       'learning_rate': hp.quniform ('learning_rate', 0.01, 0.05, 0.005),
       'reg_alpha' : hp.choice('reg_alpha', reg_candidate),
       'reg_lambda' : hp.choice('reg_lambda', reg_candidate),
       'max_depth': hp.quniform("max_depth", 1, 20, 1),
       'subsample': hp.quniform('subsample', 0.6, 1, 0.05),
       'num_leaves': hp.quniform("num_leaves", 2, 256, 2),
       'colsample_bytree' : hp.quniform('colsample_bytree', 0.6, 1, 0.05),
       'subsample_freq': hp.quniform("subsample_freq", 1, 10, 1),
       'min_child_samples': hp.quniform("min_child_samples", 5, 100, 5),
       'max_bin': hp.quniform("max_bin", 200, 500, 100),
       'n_estimators': hp.quniform('n_estimators', 200, 10000, 200)
      }

def objective_lunch(space):
    n_fold = 4
    kf = KFold(n_splits=n_fold, random_state=42, shuffle=True)
    mae = 0

    for train_index, test_index in kf.split(lunch_train):
        X_train, X_valid = lunch_train.iloc[train_index], lunch_train.iloc[test_index]
        y_train, y_valid = y_lunch[train_index], y_lunch[test_index]

        model = LGBMRegressor(n_estimators =int(space['n_estimators']), 
                              max_depth = int(space['max_depth']), 
                              learning_rate = space['learning_rate'],
                              reg_alpha = space['reg_alpha'],
                              reg_lambda = space['reg_lambda'],
                              colsample_bytree = space['colsample_bytree'],
                              num_leaves = int(space['num_leaves']),
                              subsample = space['subsample'],
                              subsample_freq = int(space['subsample_freq']),
                              min_child_samples = int(space['min_child_samples']),
                              max_bin = int(space['max_bin']),
                              random_state=42, 
                              )

        evaluation = [(X_train, y_train), (X_valid, y_valid)]

        model.fit(X_train, y_train,
                  eval_set=evaluation, 
                  eval_metric="mae",
                  early_stopping_rounds=100,
                  verbose=0)

        pred = model.predict(X_valid)
        mae += MAE(y_valid, pred)/n_fold
    
    return {'loss': mae, 'status': STATUS_OK, 'model': model}

In [61]:
trials = Trials()

best_L = fmin(fn=objective_lunch,
              space=space,
              algo=tpe.suggest,
              max_evals=1000,
              trials=trials,
              rstate= np.random.seed(42))

# best_L['max_depth'] = int(best_L['max_depth'])
# best_L['min_child_weight'] = int(best_L['min_child_weight'])
# best_L['n_estimators'] = int(best_L['n_estimators'])
# best_L['reg_alpha'] = reg_candidate[int(best_L['reg_alpha'])]
# best_L['reg_lambda'] = reg_candidate[int(best_L['reg_lambda'])]
# best_L['random_state'] = 42
print(best_L)

100%|█████████████████████████████████████████████| 1000/1000 [20:33<00:00,  1.23s/trial, best loss: 64.87415835306953]


KeyError: 'min_child_weight'

In [89]:
best_L['n_estimators'] =int(best_L['n_estimators'])
best_L['max_depth'] = int(best_L['max_depth'])
best_L['learning_rate'] = best_L['learning_rate']
best_L['reg_alpha'] = best_L['reg_alpha']
best_L['reg_lambda'] = best_L['reg_lambda']
best_L['colsample_bytree'] = best_L['colsample_bytree']
best_L['num_leaves'] = int(best_L['num_leaves'])
best_L['subsample'] = best_L['subsample']
best_L['subsample_freq'] = int(best_L['subsample_freq'])
best_L['min_child_samples'] = int(best_L['min_child_samples'])
best_L['max_bin'] = int(best_L['max_bin'])
best_L['random_state'] = 42
best_L

{'colsample_bytree': 1.0,
 'learning_rate': 0.03,
 'max_bin': 500,
 'max_depth': 3,
 'min_child_samples': 5,
 'n_estimators': 3800,
 'num_leaves': 192,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'subsample': 0.6,
 'subsample_freq': 8,
 'random_state': 42}

In [103]:
n_fold = 12
kf = KFold(n_splits=n_fold, random_state=42, shuffle=True)
mae = 0
lunch_pred = np.array([0]*50).astype('float64')

for train_index, test_index in kf.split(lunch_train):
    X_train, X_valid = lunch_train.iloc[train_index], lunch_train.iloc[test_index]
    y_train, y_valid = y_lunch[train_index], y_lunch[test_index]

    model = LGBMRegressor(**best_L)

    evaluation = [(X_train, y_train), (X_valid, y_valid)]

    model.fit(X_train, y_train,
              eval_set=evaluation, 
              eval_metric="mae",
              early_stopping_rounds=100,
              verbose=0)

    pred = model.predict(X_valid)
    mae += MAE(y_valid, pred)/n_fold
    
    lunch_pred += model.predict(lunch_test)/n_fold
    
print(mae)

64.5178111504131


In [88]:
lunch_pred

array([1006.20830477,  924.78541004,  634.31806768, 1293.68943466,
       1067.48369494, 1016.67449057,  962.2796037 ,  697.37758196,
       1288.3125234 , 1079.45719681,  849.87186624, 1292.15376788,
       1123.97555609, 1090.90522573,  941.29604359,  678.61670739,
       1238.89025445, 1066.49550133,  943.12806947,  888.86095364,
        606.85703237, 1159.79817032, 1020.80130802,  938.85652552,
        671.8927145 , 1310.14620387, 1156.27337253, 1029.92694838,
        943.10527776,  703.82809543, 1265.01750497, 1029.26461239,
       1017.35885328,  913.58017519,  657.9436712 , 1233.61477279,
       1003.73335645,  917.45531516,  845.43491634,  614.15734095,
       1217.58349532, 1012.54034195,  953.91216617,  856.16507513,
        636.37143761, 1227.69826118, 1023.58153816,  938.88585661,
        868.40183704,  622.47725378])

In [91]:
def objective_dinner(space):
    n_fold = 4
    kf = KFold(n_splits=n_fold, random_state=42, shuffle=True)
    mae = 0

    for train_index, test_index in kf.split(dinner_train):
        X_train, X_valid = dinner_train.iloc[train_index], dinner_train.iloc[test_index]
        y_train, y_valid = y_dinner[train_index], y_dinner[test_index]

        model = LGBMRegressor(n_estimators =int(space['n_estimators']), 
                              max_depth = int(space['max_depth']), 
                              learning_rate = space['learning_rate'],
                              reg_alpha = space['reg_alpha'],
                              reg_lambda = space['reg_lambda'],
                              colsample_bytree = space['colsample_bytree'],
                              num_leaves = int(space['num_leaves']),
                              subsample = space['subsample'],
                              subsample_freq = int(space['subsample_freq']),
                              min_child_samples = int(space['min_child_samples']),
                              max_bin = int(space['max_bin']),
                              random_state=42, 
                              )

        evaluation = [(X_train, y_train), (X_valid, y_valid)]

        model.fit(X_train, y_train,
                  eval_set=evaluation, 
                  eval_metric="mae",
                  early_stopping_rounds=100,
                  verbose=0)

        pred = model.predict(X_valid)
        mae += MAE(y_valid, pred)/n_fold
    
    return {'loss': mae, 'status': STATUS_OK, 'model': model}

In [92]:
trials = Trials()

best_D = fmin(fn=objective_dinner,
              space=space,
              algo=tpe.suggest,
              max_evals=1000,
              trials=trials,
              rstate= np.random.seed(42))

best_D['n_estimators'] =int(best_D['n_estimators'])
best_D['max_depth'] = int(best_D['max_depth'])
best_D['learning_rate'] = best_D['learning_rate']
best_D['reg_alpha'] = best_D['reg_alpha']
best_D['reg_lambda'] = best_D['reg_lambda']
best_D['colsample_bytree'] = best_D['colsample_bytree']
best_D['num_leaves'] = int(best_D['num_leaves'])
best_D['subsample'] = best_D['subsample']
best_D['subsample_freq'] = int(best_D['subsample_freq'])
best_D['min_child_samples'] = int(best_D['min_child_samples'])
best_D['max_bin'] = int(best_D['max_bin'])
best_D['random_state'] = 42
print(best_D)

100%|█████████████████████████████████████████████| 1000/1000 [19:42<00:00,  1.18s/trial, best loss: 54.01905622689223]
{'colsample_bytree': 0.9, 'learning_rate': 0.03, 'max_bin': 200, 'max_depth': 6, 'min_child_samples': 10, 'n_estimators': 400, 'num_leaves': 228, 'reg_alpha': 3, 'reg_lambda': 3, 'subsample': 0.65, 'subsample_freq': 10, 'random_state': 42}


In [94]:
mae = 0
dinner_pred = np.array([0]*50).astype('float64')

for train_index, test_index in kf.split(dinner_train):
    X_train, X_valid = dinner_train.iloc[train_index], dinner_train.iloc[test_index]
    y_train, y_valid = y_dinner[train_index], y_dinner[test_index]

    model = LGBMRegressor(**best_L)

    evaluation = [(X_train, y_train), (X_valid, y_valid)]

    model.fit(X_train, y_train,
              eval_set=evaluation, 
              eval_metric="mae",
              early_stopping_rounds=100,
              verbose=0)

    pred = model.predict(X_valid)
    mae += MAE(y_valid, pred)/n_fold
    
    dinner_pred += model.predict(dinner_test)/n_fold
    
print(mae)

56.103102153250475


In [96]:
dinner_pred

array([333.00637137, 430.73622136, 257.9416168 , 536.81350052,
       502.9112677 , 427.53007218, 467.8864336 , 364.74809982,
       619.92014959, 544.27200424, 209.86711857, 679.98307823,
       651.4348321 , 424.68787114, 528.64550848, 374.80033818,
       661.24256253, 617.60384353, 386.68338904, 503.53345018,
       304.96746972, 616.17709591, 452.67340615, 560.65509133,
       370.12992074, 698.41615296, 692.67068615, 461.90991194,
       558.27474237, 366.51762332, 695.70657757, 604.81910021,
       429.21486329, 508.10665447, 322.7080471 , 633.650023  ,
       602.75261776, 368.97280456, 470.81844781, 283.12657153,
       654.05436561, 630.56746666, 284.77708847, 456.93157758,
       314.94430083, 611.47957236, 590.15359255, 412.86747139,
       486.50898647, 314.27379273])

In [98]:
submission = pd.read_csv('../data/sample_submission.csv')

submission.iloc[:,1] = lunch_pred
submission.iloc[:,2] = dinner_pred
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1006.208305,333.006371
1,2021-01-28,924.78541,430.736221
2,2021-01-29,634.318068,257.941617
3,2021-02-01,1293.689435,536.813501
4,2021-02-02,1067.483695,502.911268


In [99]:
def compare_ans(DIR):
    answer = pd.read_csv(DIR)

    lunch_answer = np.array(answer.iloc[:,1])
    dinner_answer = np.array(answer.iloc[:,2])
    
    lunch_MAE = abs(submission.iloc[:,1] - lunch_answer).mean()
    dinner_MAE = abs(submission.iloc[:,2] - dinner_answer).mean()
    
    print("lunch_MAE : ", lunch_MAE)
    print("dinner_MAE : ", dinner_MAE)
    print("total_MAE : ", (lunch_MAE+dinner_MAE)/2)
    
    
compare_ans('../submission/20210701_lgbm_knfold_drop.csv')
compare_ans('../submission/20210715_pycaret_(2).csv')
compare_ans('../submission/20210719_best_data_ensemble.csv')
compare_ans('../submission/20210719_pycaret.csv')

lunch_MAE :  9.35133481665588
dinner_MAE :  21.41925496447046
total_MAE :  15.38529489056317
lunch_MAE :  58.99531348841612
dinner_MAE :  30.948559722159125
total_MAE :  44.971936605287624
lunch_MAE :  54.77214865162563
dinner_MAE :  26.76814218725579
total_MAE :  40.77014541944071
lunch_MAE :  31.069570573967003
dinner_MAE :  26.76814218725579
total_MAE :  28.918856380611395


In [100]:
import glob

files = glob.glob('../submission/*')

for file in files[:-1]:
    print(file, compare_ans(file), end="\n\n" )

lunch_MAE :  14.586279294849696
dinner_MAE :  29.354016302483068
total_MAE :  21.970147798666382
../submission\20210624_pycaret_(4-2)_제출해봐야할거.csv None

lunch_MAE :  46.18212811113167
dinner_MAE :  27.717074850537756
total_MAE :  36.949601480834716
../submission\20210625_cat.csv None

lunch_MAE :  37.703014300425345
dinner_MAE :  25.419208939411497
total_MAE :  31.56111161991842
../submission\20210625_cat_10fold.csv None

lunch_MAE :  37.375638681997884
dinner_MAE :  25.992076019521893
total_MAE :  31.683857350759887
../submission\20210627_cat_15_KJH.csv None

lunch_MAE :  57.003558261651904
dinner_MAE :  58.387851244392095
total_MAE :  57.695704753022
../submission\20210627_cat_after_2020.csv None

lunch_MAE :  37.375638681997884
dinner_MAE :  25.992076019521893
total_MAE :  31.683857350759887
../submission\20210627_cat_KJH.csv None

lunch_MAE :  37.375638681997884
dinner_MAE :  25.992076019521893
total_MAE :  31.683857350759887
../submission\20210627_pycaret_KJH.csv None

lunch_MAE : 

# 저장

In [102]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_hyperOpt_lgbm.csv', index =False)

오늘 날짜 : 20210722
