In [84]:
import pandas as pd
import numpy as np
from pycaret.regression import setup, compare_models, blend_models,tune_model,predict_model,get_config, finalize_model

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.family'] = 'Gulim'

In [85]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

In [86]:
train.head()

Unnamed: 0,일자,요일,본사정원수,본사휴가자수,본사출장자수,본사시간외근무명령서승인건수,현본사소속재택근무자수,조식메뉴,중식메뉴,석식메뉴,중식계,석식계
0,2016-02-01,월,2601,50,150,238,0.0,모닝롤/찐빵 우유/두유/주스 계란후라이 호두죽/쌀밥 (쌀:국내산) 된장찌개 쥐...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 오징어찌개 쇠불고기 (쇠고기:호주산) 계란찜 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 육개장 자반고등어구이 두부조림 건파래무침 ...",1039.0,331.0
1,2016-02-02,화,2601,50,173,319,0.0,모닝롤/단호박샌드 우유/두유/주스 계란후라이 팥죽/쌀밥 (쌀:국내산) 호박젓국찌...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 김치찌개 가자미튀김 모둠소세지구이 마늘쫑무...","콩나물밥*양념장 (쌀,현미흑미:국내산) 어묵국 유산슬 (쇠고기:호주산) 아삭고추무...",867.0,560.0
2,2016-02-03,수,2601,56,180,111,0.0,모닝롤/베이글 우유/두유/주스 계란후라이 표고버섯죽/쌀밥 (쌀:국내산) 콩나물국...,"카레덮밥 (쌀,현미흑미:국내산) 팽이장국 치킨핑거 (닭고기:국내산) 쫄면야채무침 ...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 청국장찌개 황태양념구이 (황태:러시아산) 고기...",1017.0,573.0
3,2016-02-04,목,2601,104,220,355,0.0,"모닝롤/토마토샌드 우유/두유/주스 계란후라이 닭죽/쌀밥 (쌀,닭:국내산) 근대국...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 쇠고기무국 주꾸미볶음 부추전 시금치나물 ...","미니김밥*겨자장 (쌀,현미흑미:국내산) 우동 멕시칸샐러드 군고구마 무피클 포...",978.0,525.0
4,2016-02-05,금,2601,278,181,34,0.0,모닝롤/와플 우유/두유/주스 계란후라이 쇠고기죽/쌀밥 (쌀:국내산) 재첩국 방...,"쌀밥/잡곡밥 (쌀,현미흑미:국내산) 떡국 돈육씨앗강정 (돼지고기:국내산) 우엉잡채...","쌀밥/잡곡밥 (쌀,현미흑미:국내산) 차돌박이찌개 (쇠고기:호주산) 닭갈비 (닭고기:...",925.0,330.0


## 전처리
- 일자에서 월과 일을 분리
- 요일을 레이블 인코딩화(EDA로 요일의 중요도 순 파악)
- 월 별, 일 별 중식 석식 수요 차이 파악

In [87]:
train['월'] = pd.DatetimeIndex(train['일자']).month.astype(str)
test['월'] = pd.DatetimeIndex(test['일자']).month.astype(str)
train['주'] = pd.DatetimeIndex(train['일자']).week.astype(str)
test['주'] = pd.DatetimeIndex(test['일자']).week.astype(str)
train['일'] = pd.DatetimeIndex(train['일자']).day.astype(str)
test['일'] = pd.DatetimeIndex(test['일자']).day.astype(str)

train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

## 공휴일 변수 생성

In [213]:
train['공휴일전후'] = 'N'
test['공휴일전후'] = 'N'

[4h, 17s, 152s, 221s, 226s, 245h, 309s, 310s,311s,330s,379s, 421h,502h,511s,565s,650h,651s,705s,709s,732h,815s,864s,951e,971h, 1038s,1129h, 1187e, ]

In [214]:
train['공휴일전후'][4] = 'H'
train['공휴일전후'][17] = 'S'
train['공휴일전후'][152] = 'S'
train['공휴일전후'][245] = 'H'
train['공휴일전후'][310] = 'S'
train['공휴일전후'][311] = 'S'
train['공휴일전후'][330] = 'S'
train['공휴일전후'][379] = 'S'
train['공휴일전후'][421] = 'H'
train['공휴일전후'][502] = 'H'
train['공휴일전후'][511] = 'S'
train['공휴일전후'][565] = 'S'
train['공휴일전후'][650] = 'H'
train['공휴일전후'][651] = 'S'
train['공휴일전후'][705] = 'C'
train['공휴일전후'][709] = 'S'
train['공휴일전후'][732] = 'H'
train['공휴일전후'][815] = 'S'
train['공휴일전후'][864] = 'S'
train['공휴일전후'][951] = 'C'
train['공휴일전후'][971] = 'H'
train['공휴일전후'][1038] = 'S'
train['공휴일전후'][1129] = 'H'
train['공휴일전후'][1187] = 'C'


test['공휴일전후'][10] = 'H'


In [215]:
train[train['공휴일전후'] != 'N'][need]

Unnamed: 0,일자,요일,중식계,석식계,휴가비율
4,2016-02-05,금,925.0,330.0,0.106882
17,2016-02-29,월,869.0,344.0,0.228758
152,2016-09-13,화,820.0,238.0,0.184543
245,2017-01-26,목,844.0,147.0,0.136819
310,2017-05-04,목,383.0,262.0,0.464164
311,2017-05-08,월,792.0,308.0,0.271141
330,2017-06-05,월,760.0,306.0,0.239048
379,2017-08-14,월,692.0,275.0,0.326276
421,2017-10-20,금,533.0,416.0,0.071834
502,2018-02-14,수,850.0,0.0,0.154414


## 텍스트(메뉴) 전처리

### train/test에서 중복 메뉴만 변수로서 사용

In [216]:
menu_train = train[['조식메뉴','중식메뉴','석식메뉴']]
menu_test = test[['조식메뉴','중식메뉴','석식메뉴']]
print(menu_train.shape)
print(menu_test.shape)

(1205, 3)
(50, 3)


In [217]:
def clean_split(df):
    df = df.split()
    for j in df:
        if '(' in j:
            del df[df.index(j)]    
    return df

In [218]:
menu_train['조식메뉴_prepro'] = menu_train['조식메뉴'].apply(clean_split)
menu_train['중식메뉴_prepro'] = menu_train['중식메뉴'].apply(clean_split)
menu_train['석식메뉴_prepro'] = menu_train['석식메뉴'].apply(clean_split)

menu_test['조식메뉴_prepro'] = menu_test['조식메뉴'].apply(clean_split)
menu_test['중식메뉴_prepro'] = menu_test['중식메뉴'].apply(clean_split)
menu_test['석식메뉴_prepro'] = menu_test['석식메뉴'].apply(clean_split)

In [219]:
lunch_train_f = [menu[:3] for menu in menu_train.중식메뉴_prepro]
lunch_test_f = [menu[:3] for menu in menu_test.중식메뉴_prepro]

dinner_train_f = [menu[:3] for menu in menu_train.석식메뉴_prepro]
dinner_test_f = [menu[:3] for menu in menu_test.석식메뉴_prepro]

In [220]:
lunch_train_menu = pd.DataFrame(lunch_train_f, columns = ['밥', '국', '메인'])
lunch_test_menu = pd.DataFrame(lunch_test_f, columns = ['밥', '국', '메인'])

dinner_train_menu = pd.DataFrame(dinner_train_f, columns = ['밥', '국', '메인'])
dinner_test_menu = pd.DataFrame(dinner_test_f, columns = ['밥', '국', '메인'])

## 최종 데이터 셋 구축

In [221]:
train.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계', '월', '주', '일',
       '출근', '휴가비율', '출장비율', '야근비율', '재택비율', '공휴일전후'],
      dtype='object')

In [253]:
lunch_train = pd.concat([train[['요일','월','일','주','본사시간외근무명령서승인건수', '본사출장자수', '본사휴가자수', '본사정원수','현본사소속재택근무자수', '공휴일전후','출근', '휴가비율', '출장비율', '야근비율', '재택비율','중식계']]],axis=1)
lunch_test = pd.concat([test[['요일','월','일','주','본사시간외근무명령서승인건수', '본사출장자수', '본사휴가자수', '본사정원수','현본사소속재택근무자수', '공휴일전후','출근', '휴가비율', '출장비율', '야근비율', '재택비율']]],axis=1)


dinner_train = pd.concat([train[['요일','월','일','주','본사시간외근무명령서승인건수', '본사출장자수', '본사휴가자수', '본사정원수','현본사소속재택근무자수', '공휴일전후','출근', '휴가비율', '출장비율', '야근비율', '재택비율','석식계']]],axis=1)
dinner_test = pd.concat([test[['요일','월','일','주','본사시간외근무명령서승인건수', '본사출장자수', '본사휴가자수', '본사정원수','현본사소속재택근무자수', '공휴일전후','출근', '휴가비율', '출장비율', '야근비율', '재택비율']]],axis=1)

In [254]:
print(lunch_train.shape)
print(lunch_test.shape)

(1205, 16)
(50, 15)


In [255]:
print(dinner_train.shape)
print(dinner_test.shape)

(1205, 16)
(50, 15)


#### 분포 확인 및 분포 조정

In [256]:
drop_index = dinner_train[dinner_train['석식계']==0].index

dinner_train.iloc[drop_index]

Unnamed: 0,요일,월,일,주,본사시간외근무명령서승인건수,본사출장자수,본사휴가자수,본사정원수,현본사소속재택근무자수,공휴일전후,출근,휴가비율,출장비율,야근비율,재택비율,석식계
204,수,11,30,48,0,207,68,2689,0.0,N,2414.0,0.025288,0.07698,0.0,0.0,0.0
224,수,12,28,52,0,225,166,2705,0.0,N,2314.0,0.061368,0.083179,0.0,0.0,0.0
244,수,1,25,4,0,203,79,2697,0.0,N,2415.0,0.029292,0.075269,0.0,0.0,0.0
262,수,2,22,8,0,252,75,2632,0.0,N,2305.0,0.028495,0.095745,0.0,0.0,0.0
281,수,3,22,12,0,235,53,2627,0.0,N,2339.0,0.020175,0.089456,0.0,0.0,0.0
306,수,4,26,17,0,304,45,2626,0.0,N,2277.0,0.017136,0.115765,0.0,0.0,0.0
327,수,5,31,22,0,265,43,2637,0.0,N,2329.0,0.016306,0.100493,0.0,0.0,0.0
346,수,6,28,26,0,259,58,2648,0.0,N,2331.0,0.021903,0.09781,0.0,0.0,0.0
366,수,7,26,30,0,246,254,2839,0.0,N,2339.0,0.089468,0.08665,0.0,0.0,0.0
392,금,9,1,35,45,303,177,2642,0.0,N,2162.0,0.066995,0.114686,0.020814,0.0,0.0


In [257]:
dinner_train.drop(drop_index, inplace=True)

print(dinner_train.shape)

(1162, 16)


# 중식 예측모델

In [260]:
cat_features = [f for f in lunch_train.columns if lunch_train[f].dtype == 'object']

def column_index(df, cat_features):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, cat_features, sorter=sidx)]

cat_features_idx = column_index(lunch_train, cat_features)    
print("Cat features are: %s" % [f for f in cat_features])
print(cat_features_idx)

Cat features are: ['요일', '월', '일', '주', '공휴일전후']
[0 1 2 3 9]


In [261]:
lunch_regression_model = setup(data=lunch_train, target='중식계', train_size=0.8,n_jobs=-1,fold = 5,
                               categorical_features = cat_features,
                               numeric_imputation = 'mean',
                               normalize = True)

Unnamed: 0,Description,Value
0,session_id,6123
1,Target,중식계
2,Original Data,"(1205, 16)"
3,Missing Values,False
4,Numeric Features,10
5,Categorical Features,5
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(964, 112)"


In [262]:
# 최고 성능 모델 선정(2개)
lunch_regression_best_models = compare_models(sort='MAE', fold=5, n_select=2)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,65.8251,7891.45,88.5893,0.8246,0.112,0.0811,0.984
gbr,Gradient Boosting Regressor,68.7135,8500.2164,91.963,0.8111,0.1154,0.0846,0.066
rf,Random Forest Regressor,71.0057,9303.3238,96.0589,0.7933,0.1228,0.0882,0.18
lightgbm,Light Gradient Boosting Machine,73.0409,9232.6812,95.9453,0.7939,0.1211,0.0895,0.29
xgboost,Extreme Gradient Boosting,73.2768,9530.1688,97.3396,0.7869,0.1221,0.09,0.414
lasso,Lasso Regression,73.4458,9544.7748,97.5237,0.7869,0.1231,0.0904,0.97
et,Extra Trees Regressor,73.8724,10150.2939,100.4686,0.7732,0.1283,0.0918,0.198
br,Bayesian Ridge,74.2193,9512.2765,97.4432,0.787,0.1239,0.091,0.012
ridge,Ridge Regression,74.7017,9709.5193,98.4373,0.7829,0.1272,0.0918,0.01
omp,Orthogonal Matching Pursuit,75.0396,10392.7903,101.5861,0.7699,0.1415,0.0942,0.008


In [263]:
lunch_regression_best_models_tuned = [tune_model(model, optimize='MAE') for model in lunch_regression_best_models]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,73.692,9989.9877,99.9499,0.7816,0.1165,0.0879
1,65.9684,7339.5791,85.6713,0.8229,0.1053,0.0815
2,65.4595,7385.062,85.9364,0.8112,0.1076,0.0796
3,66.5717,7440.2522,86.2569,0.8342,0.1075,0.0794
4,71.8473,9525.486,97.5986,0.8213,0.1338,0.0927
Mean,68.7078,8336.0734,91.0826,0.8142,0.1141,0.0842
SD,3.3858,1170.4746,6.3268,0.0179,0.0106,0.0053


In [264]:
lunch_model = blend_models(estimator_list=lunch_regression_best_models_tuned, fold=5, optimize='MAE')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,70.5373,9396.3852,96.935,0.7946,0.1144,0.0851
1,65.7135,7454.7948,86.3412,0.8201,0.106,0.0811
2,64.2963,7059.7095,84.0221,0.8195,0.1064,0.079
3,66.4116,7196.0997,84.8298,0.8397,0.1072,0.0793
4,69.7032,9345.2629,96.6709,0.8246,0.1316,0.0897
Mean,67.3324,8090.4504,89.7598,0.8197,0.1131,0.0828
SD,2.3907,1053.2206,5.7993,0.0145,0.0097,0.0041


In [266]:
pred = predict_model(lunch_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,129.4112,23557.6845,153.4851,0.398,0.1704,0.1576


### 테스트 데이터 예측

In [267]:
submission = pd.read_csv('../data/sample_submission.csv')

In [268]:
final_model_lunch = finalize_model(lunch_model)
prep_pipe_lunch = get_config('prep_pipe')
prep_pipe_lunch.steps.append(['trained_model', final_model_lunch])

In [272]:
answer = pd.read_csv('../submission/제출해야될것.csv')['중식계']

abs(answer - pred_lunch).mean()

36.93714801900568

In [269]:
pred_lunch = prep_pipe_lunch.predict(lunch_test)

submission.iloc[:,1] = pred_lunch
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1030.240513,0
1,2021-01-28,951.382904,0
2,2021-01-29,616.514112,0
3,2021-02-01,1224.098803,0
4,2021-02-02,976.909941,0


# 석식 예측모델

In [273]:
dinner_regression_model = setup(data=dinner_train, target='석식계', train_size=0.8,n_jobs=-1,fold = 5,
                               categorical_features = cat_features,
                               numeric_imputation = 'mean',
                               normalize = True)

Unnamed: 0,Description,Value
0,session_id,8008
1,Target,석식계
2,Original Data,"(1162, 16)"
3,Missing Values,False
4,Numeric Features,10
5,Categorical Features,5
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(929, 111)"


In [274]:
dinner_regression_best_models = compare_models(sort='MAE', fold=5, n_select=3, exclude=['huber','llar','lar','par','lasso'])

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,45.3093,4009.9611,63.0331,0.6611,0.1515,0.1091,0.968
et,Extra Trees Regressor,46.3996,3985.8867,63.0041,0.6629,0.1555,0.1123,0.192
gbr,Gradient Boosting Regressor,47.448,4342.4268,65.627,0.633,0.1573,0.1127,0.064
rf,Random Forest Regressor,47.4622,4213.911,64.7584,0.6436,0.1581,0.1145,0.176
lightgbm,Light Gradient Boosting Machine,47.5172,4335.0982,65.6906,0.6326,0.1572,0.1139,0.308
xgboost,Extreme Gradient Boosting,49.1034,4497.8227,66.909,0.6197,0.1578,0.1154,0.44
knn,K Neighbors Regressor,50.2549,4701.9756,68.4417,0.6022,0.1678,0.1228,0.018
br,Bayesian Ridge,50.9913,4612.1862,67.7951,0.6102,0.1628,0.1216,0.014
ridge,Ridge Regression,51.4196,4717.5196,68.5121,0.6019,0.1645,0.1222,1.46
omp,Orthogonal Matching Pursuit,53.6566,5194.5775,71.94,0.5614,0.1733,0.1279,0.012


In [284]:
dinner_regression_best_models_tuned[0].get_feature_importance()

array([9.85390119e+00, 1.23191220e+01, 3.81041743e+00, 9.88517015e+00,
       4.57404490e+00, 2.89904811e+01, 7.81228759e+00, 1.13223587e+00,
       1.06109230e+00, 4.45137387e-02, 1.03815628e-01, 1.51786296e-01,
       2.76575759e+00, 5.13065869e-01, 7.58060966e-02, 2.89718741e+00,
       8.76355001e-01, 9.30572351e-01, 0.00000000e+00, 1.51908275e-01,
       1.97410203e-01, 5.36938932e-01, 3.84719331e-01, 3.81884536e-01,
       0.00000000e+00, 4.44316560e-02, 1.53654545e-01, 0.00000000e+00,
       0.00000000e+00, 1.13998746e-01, 0.00000000e+00, 4.35084226e-02,
       0.00000000e+00, 0.00000000e+00, 6.18264789e-02, 0.00000000e+00,
       2.41251630e-02, 3.60940865e-01, 4.59604235e-02, 1.45910205e-01,
       1.45824829e-01, 3.03944925e-02, 2.52825336e-02, 7.65229388e-03,
       0.00000000e+00, 0.00000000e+00, 7.73281793e-02, 5.71009241e-02,
       2.58346707e-02, 2.50265449e-01, 5.49365467e-02, 1.45098424e-01,
       6.75187491e-03, 1.97856732e-01, 3.37633342e-02, 0.00000000e+00,
      

In [276]:
dinner_regression_best_models_tuned = [tune_model(model, optimize='MAE') for model in lunch_regression_best_models]

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,46.8623,4212.9006,64.9069,0.6442,0.1453,0.1043
1,45.9561,3506.0913,59.2123,0.7001,0.1317,0.1023
2,50.6349,5521.6006,74.3075,0.5356,0.1786,0.1193
3,51.2972,4425.864,66.5272,0.6458,0.1688,0.1312
4,48.3624,4400.1674,66.3338,0.6088,0.1756,0.122
Mean,48.6226,4413.3248,66.2575,0.6269,0.16,0.1158
SD,2.0727,646.815,4.8237,0.0542,0.0184,0.011


In [277]:
dinner_model = blend_models(estimator_list=dinner_regression_best_models, fold=5, optimize='MAE')

Unnamed: 0,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,41.7308,3224.2788,56.7827,0.7277,0.1315,0.0948
1,42.866,3112.8588,55.793,0.7338,0.1228,0.0948
2,47.8771,5041.9565,71.0067,0.576,0.1673,0.1155
3,46.2995,3877.7496,62.2716,0.6897,0.1547,0.1173
4,45.0357,3971.0852,63.0165,0.647,0.1701,0.1157
Mean,44.7618,3845.5858,61.7741,0.6748,0.1493,0.1076
SD,2.2327,688.6216,5.4354,0.0584,0.019,0.0105


In [278]:
pred = predict_model(dinner_model)

Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE
0,Voting Regressor,107.0931,17300.2967,131.5306,-0.4582,0.2718,0.2552


## 테스트

### 테스트 데이터 예측

In [279]:
final_model_dinner = finalize_model(dinner_model)
prep_pipe_dinner = get_config('prep_pipe')
prep_pipe_dinner.steps.append(['trained_model', final_model_dinner])

In [280]:
pred_dinner = prep_pipe_dinner.predict(dinner_test)
submission.iloc[:,2] = pred_dinner
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1030.240513,383.74234
1,2021-01-28,951.382904,391.616347
2,2021-01-29,616.514112,247.968474
3,2021-02-01,1224.098803,550.17926
4,2021-02-02,976.909941,466.274002


# 저장

In [35]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_pycaret_KJH.csv', index =False)

오늘 날짜 : 20210627


In [285]:
def plot_feature_importance(importance, names, model_type):
    
    feature_importance = np.array(importance)
    feature_names = np.array(names)
    
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)
    
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    plt.figure(figsize=(10,8))

    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])

    plt.title(model_type + ' Feature Importance')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Names')

In [294]:
len(dinner_regression_best_models_tuned[0].get_feature_importance())

111

In [292]:

plot_feature_importance(dinner_regression_best_models_tuned[0].get_feature_importance(),dinner_train.columns,"CATBOOST")

ValueError: arrays must all be same length