In [1]:
import pandas as pd
import numpy as np
from pycaret.regression import setup, compare_models, blend_models,tune_model,predict_model,get_config, finalize_model

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.family'] = 'Gulim'

In [2]:
from workalendar.asia import SouthKorea
import pendulum

In [3]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## 전처리
- 일자에서 월과 일을 분리
- 요일을 레이블 인코딩화(EDA로 요일의 중요도 순 파악)
- 월 별, 일 별 중식 석식 수요 차이 파악

In [4]:
train['월'] = pd.DatetimeIndex(train['일자']).month
test['월'] = pd.DatetimeIndex(test['일자']).month
train['주'] = pd.DatetimeIndex(train['일자']).week
test['주'] = pd.DatetimeIndex(test['일자']).week
train['일'] = pd.DatetimeIndex(train['일자']).day
test['일'] = pd.DatetimeIndex(test['일자']).day

train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

In [5]:
month_rank4dinner = {
    1: 11,
    2: 2,
    3: 1,
    4: 4,
    5: 7,
    6: 6,
    7: 10,
    8: 8,
    9: 5,
    10: 3,
    11: 9,
    12: 12
}
train['월(석식)'] = train['월'].map(month_rank4dinner)
test['월(석식)'] = test['월'].map(month_rank4dinner)

month_rank4lunch = {
    1: 3,
    2: 1,
    3: 2,
    4: 6,
    5: 7,
    6: 8,
    7: 10,
    8: 9,
    9: 5,
    10: 4,
    11: 11,
    12: 12
}
train['월(중식)'] = train['월'].map(month_rank4lunch)
test['월(중식)'] = test['월'].map(month_rank4lunch)

weekday_rank4dinner = {
    '월': 1,
    '화': 2,
    '수': 4,
    '목': 3,
    '금': 5,
}

weekday_rank4lunch = {
    '월': 1,
    '화': 2,
    '수': 3,
    '목': 4,
    '금': 5,
}

train['요일(석식)'] = train['요일'].map(weekday_rank4dinner)
test['요일(석식)'] = test['요일'].map(weekday_rank4dinner)

train['요일(중식)'] = train['요일'].map(weekday_rank4lunch)
test['요일(중식)'] = test['요일'].map(weekday_rank4lunch)

In [6]:
rank = pd.DataFrame(range(1,53))
week_rank_lunch = pd.pivot_table(train,values='중식계',index='주').sort_values(by='중식계').reset_index().drop('중식계',axis=1)
week_rank_dinner = pd.pivot_table(train,values='석식계',index='주').sort_values(by='석식계').reset_index().drop('석식계',axis=1)


week_rank4lunch = {}
for i in range(len(rank)):
    week_rank4lunch[week_rank_lunch['주'][i]] = rank[0][i]


week_rank4dinner = {}
for i in range(len(rank)):
    week_rank4dinner[week_rank_dinner['주'][i]] = rank[0][i]
    
    
train['주(중식)'] = train['주'].map(week_rank4lunch)
test['주(중식)'] = test['주'].map(week_rank4lunch)

train['주(석식)'] = train['주'].map(week_rank4dinner)
test['주(석식)'] = test['주'].map(week_rank4dinner)

In [7]:
def is_holiday(date):
    holidays = list(map(str, pd.Series(np.array(SouthKorea().holidays(int(date[:4])))[:, 0])))
    
    yesterday = str(np.datetime64(date) - 1)
    tomorrow = str(np.datetime64(date) + 1)

    if tomorrow in holidays and yesterday in holidays:
        return 3
    if tomorrow in holidays:
        return 2
    elif yesterday in holidays:
        return 1
    else : 
        return 0

def week_of_month(x):
    dt = pendulum.parse(x)
    
    wom = dt.week_of_month
    if wom < 0:
        wom += 52
    return wom
    

df = pd.concat([train[['본사정원수', '일자']], test[['본사정원수', '일자']]])
df['년월'] = df['일자'].apply(lambda x : x[:7])
df = df[['년월', '본사정원수']].groupby(by=['년월'], as_index=False).mean()

def member_change(date):
    this_month = date[:7]
    last_month = str(np.datetime64(this_month) - 1)
    
    this_month_member = int(df[df['년월'] == this_month]['본사정원수'])
    last_month_member = int(df[df['년월'] == last_month]['본사정원수'])
    
    
    return  this_month_member - last_month_member

train['공휴일전후'] = train['일자'].apply(is_holiday)
test['공휴일전후'] = test['일자'].apply(is_holiday)

train['몇주차'] = train['일자'].apply(week_of_month)
test['몇주차'] = test['일자'].apply(week_of_month)

train = train[train['일자'] > '2016-03']
train['인원변화'] = train['일자'].apply(member_change)
test['인원변화'] = test['일자'].apply(member_change)

## 공휴일 변수 생성

In [8]:
# 메뉴 변수 없이 사용할떄 해당 코드 사용['공휴일전후', '몇주차', '인원변화']

lunch_train = train[['공휴일전후', '몇주차', '인원변화', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','중식계']]
lunch_test = test[['공휴일전후', '몇주차', '인원변화', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수']]

dinner_train= train[['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','석식계']]
dinner_test = test[['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수']]

In [9]:
lunch_train.columns

Index(['공휴일전후', '몇주차', '인원변화', '요일(중식)', '월(중식)', '일', '주(중식)', '출근', '휴가비율',
       '출장비율', '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '중식계'],
      dtype='object')

In [10]:
print(lunch_train.shape)
print(lunch_test.shape)

(1187, 15)
(50, 14)


In [11]:
print(dinner_train.shape)
print(dinner_test.shape)

(1187, 15)
(50, 14)


In [12]:
cat_features = [f for f in lunch_train.columns if lunch_train[f].dtype == 'object']

def column_index(df, cat_features):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, cat_features, sorter=sidx)]

cat_features_idx = column_index(lunch_train, cat_features)    
print("Cat features are: %s" % [f for f in cat_features])
print(cat_features_idx)

Cat features are: []
[]


In [13]:
y_lunch = train[['중식계']]
y_dinner = train[['석식계']]

drop_index = dinner_train[dinner_train['석식계']==0].index

dinner_train.drop(drop_index, inplace=True)
y_dinner.drop(drop_index, inplace=True)

print(dinner_train.shape)

(1144, 15)


In [14]:
lunch_train.drop(columns=['중식계'], inplace=True)
dinner_train.drop(columns=['석식계'], inplace=True)

#### 분포 확인 및 분포 조정

# 중식 예측모델

In [15]:
from automl_alex import LightGBMRegressor
from sklearn.metrics import mean_absolute_error
import sklearn

In [16]:
lunch_model = LightGBMRegressor(random_state=42)

lunch_model.opt(lunch_train,  y_lunch,
            verbose=3,  cold_start=120,folds=8,opt_lvl=3,early_stoping=120, auto_parameters=False,#metric=sklearn.metrics.roc_auc_score,
      timeout=1100,metric=sklearn.metrics.mean_absolute_error # optimization time in seconds,
      )

[32m15:31:27[0m | [1mregression optimize: minimize[0m
[32m15:31:27[0m | [1m##################################################[0m
[32m15:31:28[0m | [1m> Step 1: calc parameters and pruned score: get test 10 trials[0m
[32m15:31:34[0m | [1m One iteration ~ 0.6 sec[0m
[32m15:31:34[0m | [1m Possible iters ~ 1712.0[0m
[32m15:31:34[0m | [1m--------------------------------------------------[0m
[32m15:31:34[0m | [1m  Pruned Threshold Score: 71.5295[0m
[32m15:31:34[0m | [1m##################################################[0m
[32m15:31:34[0m | [1m> Step 2: Full opt with Threshold Score Pruner[0m
[32m15:31:34[0m | [1m##################################################[0m
[32m15:31:34[0m | [1m> Start optimization with the parameters:[0m
[32m15:31:34[0m | [1mCV_Folds = 8[0m
[32m15:31:34[0m | [1mScore_CV_Folds = 2[0m
[32m15:31:34[0m | [1mFeature_Selection = False[0m
[32m15:31:34[0m | [1mOpt_lvl = 3[0m
[32m15:31:34[0m | [1mCold_start = 120

Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations


[32m15:35:09[0m | [1mBest Score: 63.952 mean_absolute_error[0m


Finished loading model, total used 300 iterations


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_lgbm_bagging_fraction,params_lgbm_bagging_freq,params_lgbm_feature_fraction,params_lgbm_learning_rate,params_lgbm_min_child_samples,params_lgbm_num_iterations,params_lgbm_num_leaves,state
0,0,77.5172,2021-06-29 15:31:28.123238,2021-06-29 15:31:28.463454,0 days 00:00:00.340216,0.8,3.0,0.5,0.2537,33,300,7,COMPLETE
1,1,77.6122,2021-06-29 15:31:28.531646,2021-06-29 15:31:28.974350,0 days 00:00:00.442704,0.4,10.0,1.0,0.077254,29,400,57,COMPLETE
2,2,70.2852,2021-06-29 15:31:29.029918,2021-06-29 15:31:29.401662,0 days 00:00:00.371744,0.7,4.0,0.7,0.01866,5,700,3,COMPLETE
3,3,68.9396,2021-06-29 15:31:29.457393,2021-06-29 15:31:29.853595,0 days 00:00:00.396202,0.7,3.0,0.9,0.027011,7,700,3,COMPLETE
4,4,69.6791,2021-06-29 15:31:29.903856,2021-06-29 15:31:30.824328,0 days 00:00:00.920472,0.5,11.0,0.4,0.011712,19,1000,18,COMPLETE
5,5,66.6292,2021-06-29 15:31:30.879914,2021-06-29 15:31:32.397655,0 days 00:00:01.517741,0.8,3.0,0.7,0.028181,2,600,45,COMPLETE
6,6,69.2391,2021-06-29 15:31:32.463917,2021-06-29 15:31:32.775636,0 days 00:00:00.311719,0.8,7.0,0.6,0.220382,4,700,2,COMPLETE
7,7,75.5249,2021-06-29 15:31:32.826365,2021-06-29 15:31:33.134354,0 days 00:00:00.307989,1.0,,,0.270517,39,1000,3,COMPLETE
8,8,78.5434,2021-06-29 15:31:33.195339,2021-06-29 15:31:33.847415,0 days 00:00:00.652076,0.5,5.0,0.4,0.229996,2,600,19,COMPLETE
9,9,72.7738,2021-06-29 15:31:33.905840,2021-06-29 15:31:34.467833,0 days 00:00:00.561993,0.5,3.0,0.7,0.167551,7,900,5,COMPLETE


In [17]:
predicts_LGBM_lunch = lunch_model.predict(lunch_test)

In [21]:
dinner_model = LightGBMRegressor(random_state=42)

dinner_model.opt(dinner_train,  y_dinner,
            verbose=3,  cold_start=120,folds=8,opt_lvl=3,early_stoping=120, auto_parameters=False,#metric=sklearn.metrics.roc_auc_score,
      timeout=1100,metric=sklearn.metrics.mean_absolute_error # optimization time in seconds,
      )

[32m15:36:28[0m | [1mregression optimize: minimize[0m
[32m15:36:28[0m | [1m##################################################[0m
[32m15:36:28[0m | [1m> Step 1: calc parameters and pruned score: get test 10 trials[0m
[32m15:36:39[0m | [1m One iteration ~ 1.0 sec[0m
[32m15:36:39[0m | [1m Possible iters ~ 1065.0[0m
[32m15:36:39[0m | [1m--------------------------------------------------[0m
[32m15:36:39[0m | [1m  Pruned Threshold Score: 44.2185[0m
[32m15:36:39[0m | [1m##################################################[0m
[32m15:36:39[0m | [1m> Step 2: Full opt with Threshold Score Pruner[0m
[32m15:36:39[0m | [1m##################################################[0m
[32m15:36:39[0m | [1m> Start optimization with the parameters:[0m
[32m15:36:39[0m | [1mCV_Folds = 8[0m
[32m15:36:39[0m | [1mScore_CV_Folds = 2[0m
[32m15:36:39[0m | [1mFeature_Selection = False[0m
[32m15:36:39[0m | [1mOpt_lvl = 3[0m
[32m15:36:39[0m | [1mCold_start = 120

Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations


[32m15:39:19[0m | [1mBest Score: 40.0092 mean_absolute_error[0m


Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_lgbm_bagging_fraction,params_lgbm_bagging_freq,params_lgbm_feature_fraction,params_lgbm_learning_rate,params_lgbm_min_child_samples,params_lgbm_num_iterations,params_lgbm_num_leaves,state
0,0,46.2868,2021-06-29 15:36:28.702114,2021-06-29 15:36:29.150195,0 days 00:00:00.448081,0.8,3.0,0.5,0.2537,33,300,7,COMPLETE
1,1,44.7242,2021-06-29 15:36:29.257544,2021-06-29 15:36:29.970193,0 days 00:00:00.712649,0.4,10.0,1.0,0.077254,29,400,57,COMPLETE
2,2,43.7127,2021-06-29 15:36:30.066336,2021-06-29 15:36:30.614964,0 days 00:00:00.548628,0.7,4.0,0.7,0.01866,5,700,3,COMPLETE
3,3,42.7316,2021-06-29 15:36:30.701897,2021-06-29 15:36:31.245009,0 days 00:00:00.543112,0.7,3.0,0.9,0.027011,7,700,3,COMPLETE
4,4,43.1386,2021-06-29 15:36:31.333414,2021-06-29 15:36:32.972624,0 days 00:00:01.639210,0.5,11.0,0.4,0.011712,19,1000,18,COMPLETE
5,5,42.2816,2021-06-29 15:36:33.054824,2021-06-29 15:36:35.604924,0 days 00:00:02.550100,0.8,3.0,0.7,0.028181,2,600,45,COMPLETE
6,6,42.5269,2021-06-29 15:36:35.697853,2021-06-29 15:36:36.099644,0 days 00:00:00.401791,0.8,7.0,0.6,0.220382,4,700,2,COMPLETE
7,7,46.5734,2021-06-29 15:36:36.183777,2021-06-29 15:36:36.701977,0 days 00:00:00.518200,1.0,,,0.270517,39,1000,3,COMPLETE
8,8,49.4282,2021-06-29 15:36:36.788898,2021-06-29 15:36:38.066847,0 days 00:00:01.277949,0.5,5.0,0.4,0.229996,2,600,19,COMPLETE
9,9,46.5861,2021-06-29 15:36:38.153039,2021-06-29 15:36:38.938802,0 days 00:00:00.785763,0.5,3.0,0.7,0.167551,7,900,5,COMPLETE


In [22]:
predicts_LGBM_dinner = dinner_model.predict(dinner_test)

In [23]:
submission = pd.read_csv('../data/sample_submission.csv')

In [24]:
submission.iloc[:,1] = predicts_LGBM_lunch
submission.iloc[:,2] = predicts_LGBM_dinner
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1007.779865,392.469928
1,2021-01-28,938.584518,402.056005
2,2021-01-29,613.736675,249.632361
3,2021-02-01,1251.524352,514.816537
4,2021-02-02,1059.736324,455.112564


# 저장

In [25]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_lgbm_autoML.csv', index =False)

오늘 날짜 : 20210629
