In [1]:
import pandas as pd
import numpy as np
from pycaret.regression import setup, compare_models, blend_models,tune_model,predict_model,get_config, finalize_model

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.family'] = 'Gulim'

In [2]:
from workalendar.asia import SouthKorea
import pendulum

In [3]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## 전처리
- 일자에서 월과 일을 분리
- 요일을 레이블 인코딩화(EDA로 요일의 중요도 순 파악)
- 월 별, 일 별 중식 석식 수요 차이 파악

In [4]:
train['월'] = pd.DatetimeIndex(train['일자']).month
test['월'] = pd.DatetimeIndex(test['일자']).month
train['주'] = pd.DatetimeIndex(train['일자']).week
test['주'] = pd.DatetimeIndex(test['일자']).week
train['일'] = pd.DatetimeIndex(train['일자']).day
test['일'] = pd.DatetimeIndex(test['일자']).day

train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

In [5]:
month_rank4dinner = {
    1: 11,
    2: 2,
    3: 1,
    4: 4,
    5: 7,
    6: 6,
    7: 10,
    8: 8,
    9: 5,
    10: 3,
    11: 9,
    12: 12
}
train['월(석식)'] = train['월'].map(month_rank4dinner)
test['월(석식)'] = test['월'].map(month_rank4dinner)

month_rank4lunch = {
    1: 3,
    2: 1,
    3: 2,
    4: 6,
    5: 7,
    6: 8,
    7: 10,
    8: 9,
    9: 5,
    10: 4,
    11: 11,
    12: 12
}
train['월(중식)'] = train['월'].map(month_rank4lunch)
test['월(중식)'] = test['월'].map(month_rank4lunch)

weekday_rank4dinner = {
    '월': 1,
    '화': 2,
    '수': 4,
    '목': 3,
    '금': 5,
}

weekday_rank4lunch = {
    '월': 1,
    '화': 2,
    '수': 3,
    '목': 4,
    '금': 5,
}

train['요일(석식)'] = train['요일'].map(weekday_rank4dinner)
test['요일(석식)'] = test['요일'].map(weekday_rank4dinner)

train['요일(중식)'] = train['요일'].map(weekday_rank4lunch)
test['요일(중식)'] = test['요일'].map(weekday_rank4lunch)

In [6]:
rank = pd.DataFrame(range(1,53))
week_rank_lunch = pd.pivot_table(train,values='중식계',index='주').sort_values(by='중식계').reset_index().drop('중식계',axis=1)
week_rank_dinner = pd.pivot_table(train,values='석식계',index='주').sort_values(by='석식계').reset_index().drop('석식계',axis=1)


week_rank4lunch = {}
for i in range(len(rank)):
    week_rank4lunch[week_rank_lunch['주'][i]] = rank[0][i]


week_rank4dinner = {}
for i in range(len(rank)):
    week_rank4dinner[week_rank_dinner['주'][i]] = rank[0][i]
    
    
train['주(중식)'] = train['주'].map(week_rank4lunch)
test['주(중식)'] = test['주'].map(week_rank4lunch)

train['주(석식)'] = train['주'].map(week_rank4dinner)
test['주(석식)'] = test['주'].map(week_rank4dinner)

In [7]:
def is_holiday(date):
    holidays = list(map(str, pd.Series(np.array(SouthKorea().holidays(int(date[:4])))[:, 0])))
    
    yesterday = str(np.datetime64(date) - 1)
    tomorrow = str(np.datetime64(date) + 1)

    if tomorrow in holidays and yesterday in holidays:
        return 'S'
    if tomorrow in holidays:
        return 'T'
    elif yesterday in holidays:
        return 'Y'
    else : 
        return 'N'

def week_of_month(x):
    dt = pendulum.parse(x)
    
    wom = dt.week_of_month
    if wom < 0:
        wom += 52
    return wom
    

df = pd.concat([train[['본사정원수', '일자']], test[['본사정원수', '일자']]])
df['년월'] = df['일자'].apply(lambda x : x[:7])
df = df[['년월', '본사정원수']].groupby(by=['년월'], as_index=False).mean()

def member_change(date):
    this_month = date[:7]
    last_month = str(np.datetime64(this_month) - 1)
    
    this_month_member = int(df[df['년월'] == this_month]['본사정원수'])
    last_month_member = int(df[df['년월'] == last_month]['본사정원수'])
    
    
    return  this_month_member - last_month_member

train['공휴일전후'] = train['일자'].apply(is_holiday)
test['공휴일전후'] = test['일자'].apply(is_holiday)

train['몇주차'] = train['일자'].apply(week_of_month)
test['몇주차'] = test['일자'].apply(week_of_month)

train = train[train['일자'] > '2016-03']
train['인원변화'] = train['일자'].apply(member_change)
test['인원변화'] = test['일자'].apply(member_change)

## 공휴일 변수 생성

In [8]:
# 메뉴 변수 없이 사용할떄 해당 코드 사용['공휴일전후', '몇주차', '인원변화']

lunch_train = train[['공휴일전후', '몇주차', '인원변화', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','중식계']]
lunch_test = test[['공휴일전후', '몇주차', '인원변화', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수']]

dinner_train= train[['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','석식계']]
dinner_test = test[['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수']]

In [30]:
lunch_train.columns

Index(['공휴일전후', '몇주차', '인원변화', '요일(중식)', '월(중식)', '일', '주(중식)', '출근', '휴가비율',
       '출장비율', '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '중식계'],
      dtype='object')

In [9]:
print(lunch_train.shape)
print(lunch_test.shape)

(1187, 15)
(50, 14)


In [10]:
print(dinner_train.shape)
print(dinner_test.shape)

(1187, 15)
(50, 14)


#### 분포 확인 및 분포 조정

# 중식 예측모델

In [11]:
lunch_regression_model = setup(data=lunch_train, target='중식계',
                               train_size=0.8,
                               n_jobs=-1,
                               fold=8,
                               numeric_imputation = 'mean',
                               normalize = True)

Unnamed: 0,Description,Value
0,session_id,6832
1,Target,중식계
2,Original Data,"(1187, 15)"
3,Missing Values,False
4,Numeric Features,10
5,Categorical Features,4
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(949, 37)"


In [12]:
# 최고 성능 모델 선정(2개)
lunch_regression_best_models = compare_models(fold=8, sort='MAE', n_select=2, exclude=['huber','llar','lar','par','lasso'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,67.3802,7941.1659,88.864,0.8133,0.1124,0.0828,1.0025
gbr,Gradient Boosting Regressor,67.3868,7927.4191,88.7261,0.8138,0.1098,0.0816,0.0475
rf,Random Forest Regressor,70.379,9105.6355,94.9577,0.7864,0.1188,0.0863,0.1562
ridge,Ridge Regression,71.0115,8990.0336,94.4018,0.789,0.1175,0.0859,0.0088
br,Bayesian Ridge,71.0158,8971.3659,94.3025,0.7894,0.1173,0.0858,0.0075
lr,Linear Regression,71.206,9050.8275,94.7295,0.7876,0.118,0.0862,1.125
xgboost,Extreme Gradient Boosting,71.218,8696.8768,93.0697,0.7949,0.1165,0.0869,0.2812
lightgbm,Light Gradient Boosting Machine,72.4009,9026.3989,94.639,0.7883,0.1194,0.0885,0.335
et,Extra Trees Regressor,74.0893,10341.3493,101.4138,0.7567,0.1273,0.091,0.13
ada,AdaBoost Regressor,84.3945,12037.7379,109.4364,0.7164,0.1357,0.1039,0.0388


In [13]:
lunch_regression_best_models_tuned = [tune_model(i, optimize='MAE') for i in lunch_regression_best_models]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,60.1716,6450.0668,80.3123,0.8474,0.1026,0.0762
1,77.4621,10300.5201,101.4915,0.7656,0.1134,0.0879
2,66.2129,6663.2828,81.6289,0.8206,0.1088,0.0823
3,69.7933,9616.6043,98.0643,0.7753,0.1211,0.0843
4,66.8674,8714.4451,93.3512,0.8042,0.1035,0.0743
5,71.133,8026.2782,89.5895,0.818,0.1174,0.0907
6,63.8945,7374.2564,85.8735,0.8185,0.1006,0.0758
7,74.0669,9766.4902,98.8256,0.7782,0.1274,0.0914
Mean,68.7002,8363.993,91.1421,0.8035,0.1118,0.0829
SD,5.2209,1371.9578,7.5572,0.0263,0.009,0.0064


In [14]:
lunch_model = blend_models(estimator_list=lunch_regression_best_models_tuned, fold=8, optimize='MAE')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,61.1785,6497.564,80.6075,0.8463,0.104,0.0779
1,79.0957,10672.3322,103.307,0.7571,0.1152,0.0901
2,63.5752,6103.8204,78.127,0.8357,0.1033,0.079
3,63.738,7852.8562,88.6163,0.8165,0.1115,0.0776
4,66.9868,8410.2968,91.7077,0.811,0.1041,0.0761
5,72.6029,8805.9864,93.8402,0.8003,0.1304,0.0955
6,59.8092,6859.7682,82.8237,0.8312,0.0986,0.0716
7,73.1563,9558.3911,97.767,0.7829,0.1322,0.0927
Mean,67.5178,8095.1269,89.5995,0.8101,0.1124,0.0826
SD,6.3386,1478.6014,8.1883,0.0276,0.0119,0.0083


In [15]:
pred = predict_model(lunch_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,69.9204,8171.7839,90.3979,0.8363,0.1206,0.0912


### 테스트 데이터 예측

In [16]:
submission = pd.read_csv('../data/sample_submission.csv')

In [17]:
final_model_lunch = finalize_model(lunch_model)
prep_pipe_lunch = get_config('prep_pipe')
prep_pipe_lunch.steps.append(['trained_model', final_model_lunch])

In [18]:
pred_lunch = prep_pipe_lunch.predict(lunch_test)

submission.iloc[:,1] = pred_lunch
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,966.858545,0
1,2021-01-28,941.295149,0
2,2021-01-29,624.248693,0
3,2021-02-01,1269.35777,0
4,2021-02-02,1060.346,0


# 석식 예측모델

In [20]:
dinner_regression_model = setup(data=dinner_train,
                                target='석식계',
                                train_size=0.8,
                                n_jobs=-1,
                                fold = 8,
                                numeric_imputation = 'mean',
                                normalize = True)

Unnamed: 0,Description,Value
0,session_id,2813
1,Target,석식계
2,Original Data,"(1187, 15)"
3,Missing Values,False
4,Numeric Features,10
5,Categorical Features,4
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(949, 37)"


In [21]:
dinner_regression_best_models = compare_models(fold=8,
                                               n_select=3,
                                               exclude=['huber','llar','lar','par','lasso'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,52.1478,6105.8447,77.6201,0.673,0.9286,0.1126,0.955
gbr,Gradient Boosting Regressor,55.6045,6476.8438,80.1388,0.6567,0.9569,0.1183,0.0438
lightgbm,Light Gradient Boosting Machine,56.5975,6692.1966,81.3313,0.6422,0.9231,0.1212,0.0662
xgboost,Extreme Gradient Boosting,55.6523,7113.5637,83.5703,0.6256,0.929,0.1182,0.33
rf,Random Forest Regressor,56.7041,7044.2903,83.6056,0.6242,0.946,0.1212,0.1425
et,Extra Trees Regressor,56.7482,7281.4227,84.7498,0.6078,0.8694,0.1228,0.1175
knn,K Neighbors Regressor,63.3665,8269.9235,90.6755,0.5626,0.9967,0.1364,0.0088
br,Bayesian Ridge,67.7775,9339.0929,96.4239,0.5108,1.076,0.1331,0.0075
ridge,Ridge Regression,68.194,9382.312,96.6346,0.5084,1.0743,0.1347,0.0062
lr,Linear Regression,68.3995,9423.5525,96.8463,0.5064,1.0745,0.1353,0.0062


In [22]:
dinner_regression_best_models_tuned = [tune_model(i, optimize='MAE') for i in dinner_regression_best_models]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,67.3479,8332.6801,91.2835,0.4976,0.7442,0.1407
1,68.2543,8341.7278,91.3331,0.5929,1.2187,0.1423
2,54.062,6978.5773,83.5379,0.6036,0.766,0.1177
3,54.4582,6590.5485,81.1822,0.7016,1.277,0.1061
4,59.7522,8349.6229,91.3763,0.4002,0.7144,0.1364
5,63.887,8567.4434,92.5605,0.6373,1.2339,0.1407
6,54.2373,5239.3727,72.3835,0.7452,1.0816,0.1124
7,55.4757,5551.8053,74.5104,0.7165,0.7677,0.1396
Mean,59.6843,7243.9722,84.7709,0.6119,0.9754,0.1295
SD,5.6585,1262.123,7.6068,0.1094,0.2337,0.0139


In [23]:
dinner_model = blend_models(estimator_list=dinner_regression_best_models, fold=5, optimize='MAE')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,55.0733,6027.0212,77.6339,0.6852,0.8795,0.1197
1,49.503,5888.2175,76.7347,0.6817,0.8512,0.1024
2,50.3903,5608.0451,74.8869,0.6675,0.9968,0.1013
3,54.0509,6819.3825,82.5796,0.7088,1.0909,0.1176
4,53.6883,5038.5017,70.9824,0.7418,0.9103,0.1239
Mean,52.5412,5876.2336,76.5635,0.697,0.9457,0.113
SD,2.1847,580.7117,3.777,0.026,0.0875,0.0093


In [24]:
pred = predict_model(dinner_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,55.3418,7282.8706,85.3397,0.6133,1.0494,0.1119


## 테스트

### 테스트 데이터 예측

In [25]:
final_model_dinner = finalize_model(dinner_model)
prep_pipe_dinner = get_config('prep_pipe')
prep_pipe_dinner.steps.append(['trained_model', final_model_dinner])

In [26]:
pred_dinner = prep_pipe_dinner.predict(dinner_test)
submission.iloc[:,2] = pred_dinner
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,966.858545,285.491704
1,2021-01-28,941.295149,417.762737
2,2021-01-29,624.248693,221.44805
3,2021-02-01,1269.35777,508.5544
4,2021-02-02,1060.346,439.432134


# 저장

In [27]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_pycaret_ADD_FEATURE.csv', index =False)

오늘 날짜 : 20210628
