In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.family'] = 'Gulim'

RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

In [2]:
from workalendar.asia import SouthKorea
import pendulum

In [3]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## 전처리
- 일자에서 월과 일을 분리
- 요일을 레이블 인코딩화(EDA로 요일의 중요도 순 파악)
- 월 별, 일 별 중식 석식 수요 차이 파악

In [4]:
train['월'] = pd.DatetimeIndex(train['일자']).month
test['월'] = pd.DatetimeIndex(test['일자']).month
train['주'] = pd.DatetimeIndex(train['일자']).week
test['주'] = pd.DatetimeIndex(test['일자']).week
train['일'] = pd.DatetimeIndex(train['일자']).day
test['일'] = pd.DatetimeIndex(test['일자']).day

train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

train['식사가능자수'] = train['본사정원수'] - train['본사휴가자수'] - train['현본사소속재택근무자수']
test['식사가능자수'] = test['본사정원수'] - test['본사휴가자수'] - test['현본사소속재택근무자수']

# train['중식참여율'] = train['중식계'] / train['식사가능자수']

  train['주'] = pd.DatetimeIndex(train['일자']).week
  test['주'] = pd.DatetimeIndex(test['일자']).week


In [5]:
month_rank4dinner = {
    1: 11,
    2: 2,
    3: 1,
    4: 4,
    5: 7,
    6: 6,
    7: 10,
    8: 8,
    9: 5,
    10: 3,
    11: 9,
    12: 12
}
train['월(석식)'] = train['월'].map(month_rank4dinner)
test['월(석식)'] = test['월'].map(month_rank4dinner)

month_rank4lunch = {
    1: 3,
    2: 1,
    3: 2,
    4: 6,
    5: 7,
    6: 8,
    7: 10,
    8: 9,
    9: 5,
    10: 4,
    11: 11,
    12: 12
}
train['월(중식)'] = train['월'].map(month_rank4lunch)
test['월(중식)'] = test['월'].map(month_rank4lunch)

weekday_rank4dinner = {
    '월': 1,
    '화': 2,
    '수': 4,
    '목': 3,
    '금': 5,
}

weekday_rank4lunch = {
    '월': 1,
    '화': 2,
    '수': 3,
    '목': 4,
    '금': 5,
}

train['요일(석식)'] = train['요일'].map(weekday_rank4dinner)
test['요일(석식)'] = test['요일'].map(weekday_rank4dinner)

train['요일(중식)'] = train['요일'].map(weekday_rank4lunch)
test['요일(중식)'] = test['요일'].map(weekday_rank4lunch)

In [6]:
rank = pd.DataFrame(range(1,53))
week_rank_lunch = pd.pivot_table(train,values='중식계',index='주').sort_values(by='중식계').reset_index().drop('중식계',axis=1)
week_rank_dinner = pd.pivot_table(train,values='석식계',index='주').sort_values(by='석식계').reset_index().drop('석식계',axis=1)


week_rank4lunch = {}
for i in range(len(rank)):
    week_rank4lunch[week_rank_lunch['주'][i]] = rank[0][i]


week_rank4dinner = {}
for i in range(len(rank)):
    week_rank4dinner[week_rank_dinner['주'][i]] = rank[0][i]
    
    
train['주(중식)'] = train['주'].map(week_rank4lunch)
test['주(중식)'] = test['주'].map(week_rank4lunch)

train['주(석식)'] = train['주'].map(week_rank4dinner)
test['주(석식)'] = test['주'].map(week_rank4dinner)

In [7]:
def is_holiday(date):
    holidays = list(map(str, pd.Series(np.array(SouthKorea().holidays(int(date[:4])))[:, 0])))
    
    yesterday = str(np.datetime64(date) - 1)
    tomorrow = str(np.datetime64(date) + 1)

    if tomorrow in holidays and yesterday in holidays:
        return 3
    if tomorrow in holidays:
        return 2
    elif yesterday in holidays:
        return 1
    else : 
        return 0

def week_of_month(x):
    dt = pendulum.parse(x)
    
    wom = dt.week_of_month
    if wom < 0:
        wom += 52
    return wom
    

df = pd.concat([train[['본사정원수', '일자']], test[['본사정원수', '일자']]])
df['년월'] = df['일자'].apply(lambda x : x[:7])
df = df[['년월', '본사정원수']].groupby(by=['년월'], as_index=False).mean()

def member_change(date):
    this_month = date[:7]
    last_month = str(np.datetime64(this_month) - 1)
    
    this_month_member = int(df[df['년월'] == this_month]['본사정원수'])
    last_month_member = int(df[df['년월'] == last_month]['본사정원수'])
    
    
    return  this_month_member - last_month_member

train['공휴일전후'] = train['일자'].apply(is_holiday)
test['공휴일전후'] = test['일자'].apply(is_holiday)

train['몇주차'] = train['일자'].apply(week_of_month)
test['몇주차'] = test['일자'].apply(week_of_month)

train = train[train['일자'] > '2016-03']
train['인원변화'] = train['일자'].apply(member_change)
test['인원변화'] = test['일자'].apply(member_change)

train['day2']="0"
train.loc[train['일']>9, 'day2'] = "1"
train.loc[train['일']>19, 'day2'] = "2"

test['day2']="0"
test.loc[test['일']>9, 'day2'] = "1"
test.loc[test['일']>19, 'day2'] = "2"

## 공휴일 변수 생성

In [8]:
train.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계', '월', '주', '일',
       '출근', '휴가비율', '출장비율', '야근비율', '재택비율', '식사가능자수', '월(석식)', '월(중식)',
       '요일(석식)', '요일(중식)', '주(중식)', '주(석식)', '공휴일전후', '몇주차', '인원변화', 'day2'],
      dtype='object')

In [9]:
# 메뉴 변수 없이 사용할떄 해당 코드 사용['공휴일전후', '몇주차', '인원변화']

lunch_train = train[['공휴일전후', '몇주차', '인원변화', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]
lunch_test = test[['공휴일전후', '몇주차', '인원변화', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

dinner_train= train[['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]
dinner_test = test[['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

In [10]:
lunch_train.columns

Index(['공휴일전후', '몇주차', '인원변화', '요일(중식)', '월(중식)', '일', '주(중식)', '출근', '휴가비율',
       '출장비율', '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수'],
      dtype='object')

In [11]:
print(lunch_train.shape)
print(lunch_test.shape)

(1187, 16)
(50, 16)


In [12]:
print(dinner_train.shape)
print(dinner_test.shape)

(1187, 16)
(50, 16)


In [13]:
cat_features = [f for f in lunch_train.columns if lunch_train[f].dtype == 'object']

def column_index(df, cat_features):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, cat_features, sorter=sidx)]

cat_features_idx = column_index(lunch_train, cat_features)    
print("Cat features are: %s" % [f for f in cat_features])
print(cat_features_idx)

Cat features are: []
[]


In [14]:
y_lunch = train[['중식계']]
y_dinner = train[['석식계']]

#### 분포 확인 및 분포 조정

# 중식 예측모델

In [15]:
from automl_alex import LightGBMRegressor, CatBoost, AutoMLRegressor
from sklearn.metrics import mean_absolute_error as MAE
import sklearn

In [17]:
lunch_model = CatBoost(type_of_estimator="regression", random_state=42)

lunch_model.opt(lunch_train, y_lunch,
                verbose=3,
                cold_start=120,
                folds=12,
                opt_lvl=3,
                early_stoping=120,
                auto_parameters=False,
                timeout=1100,
                metric=sklearn.metrics.mean_absolute_error # optimization time in seconds,
            )

[32m11:57:54[0m | [1mregression optimize: minimize[0m
[32m11:57:54[0m | [1m##################################################[0m
[32m11:57:55[0m | [1m> Step 1: calc parameters and pruned score: get test 10 trials[0m
Custom logger is already specified. Specify more than one logger at same time is not thread safe.[32m12:00:39[0m | [1m One iteration ~ 16.4 sec[0m
[32m12:00:39[0m | [1m Possible iters ~ 66.0[0m
[32m12:00:39[0m | [33m[1m! Not enough time to find the optimal parameters. 
                     Possible iters < 100. 
                     Please, Increase the 'timeout' parameter for normal optimization.[0m
[32m12:00:39[0m | [1m--------------------------------------------------[0m
[32m12:00:39[0m | [1m  Pruned Threshold Score: 69.3406[0m
[32m12:00:39[0m | [1m##################################################[0m
[32m12:00:39[0m | [1m> Step 2: Full opt with Threshold Score Pruner[0m
[32m12:00:39[0m | [1m####################################

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_cb_bagging_temperature,params_cb_depth,params_cb_min_child_samples,params_cb_objective,params_cb_subsample,state
0,0,68.3913,2021-07-16 11:57:55.232549,2021-07-16 11:58:33.109979,0 days 00:00:37.877430,8,10,38,RMSE,0.6,COMPLETE
1,1,67.3388,2021-07-16 11:58:33.191680,2021-07-16 11:58:44.161965,0 days 00:00:10.970285,0,8,61,MAE,1.0,COMPLETE
2,2,68.2534,2021-07-16 11:58:44.233455,2021-07-16 11:58:50.737365,0 days 00:00:06.503910,4,7,31,MAE,0.3,COMPLETE
3,3,70.2232,2021-07-16 11:58:50.812131,2021-07-16 11:59:09.365409,0 days 00:00:18.553278,2,9,46,Quantile,0.6,COMPLETE
4,4,69.7512,2021-07-16 11:59:09.451158,2021-07-16 11:59:55.868437,0 days 00:00:46.417279,10,10,7,Quantile,0.9,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...
89,89,127.9266,2021-07-16 12:12:33.356242,2021-07-16 12:12:35.443328,0 days 00:00:02.087086,3,6,48,MAPE,1.0,PRUNED
90,90,120.2608,2021-07-16 12:12:35.517620,2021-07-16 12:12:36.936693,0 days 00:00:01.419073,0,5,94,MAPE,0.8,PRUNED
91,91,69.5574,2021-07-16 12:12:36.996097,2021-07-16 12:12:38.455584,0 days 00:00:01.459487,1,5,21,RMSE,0.9,PRUNED
92,92,69.7314,2021-07-16 12:12:38.520633,2021-07-16 12:12:48.743553,0 days 00:00:10.222920,4,9,40,Quantile,0.4,PRUNED


In [18]:
predicts_Auto_lunch = lunch_model.predict(lunch_test)

In [28]:
dinner_model = CatBoost(type_of_estimator="regression", random_state=42)

dinner_model.opt(dinner_train, y_dinner,
                 verbose=3,
                 cold_start=120,
                 folds=12,
                 opt_lvl=3,
                 early_stoping=120,
                 auto_parameters=False,#metric=sklearn.metrics.roc_auc_score,
                 timeout=1100,
                 metric=sklearn.metrics.mean_absolute_error # optimization time in seconds,
             )

[32m12:52:41[0m | [1mregression optimize: minimize[0m
[32m12:52:41[0m | [1m##################################################[0m
[32m12:52:41[0m | [1m> Step 1: calc parameters and pruned score: get test 10 trials[0m
[32m12:55:11[0m | [1m One iteration ~ 15.0 sec[0m
[32m12:55:11[0m | [1m Possible iters ~ 73.0[0m
[32m12:55:11[0m | [33m[1m! Not enough time to find the optimal parameters. 
                     Possible iters < 100. 
                     Please, Increase the 'timeout' parameter for normal optimization.[0m
[32m12:55:11[0m | [1m--------------------------------------------------[0m
[32m12:55:11[0m | [1m  Pruned Threshold Score: 68.7502[0m
[32m12:55:11[0m | [1m##################################################[0m
[32m12:55:11[0m | [1m> Step 2: Full opt with Threshold Score Pruner[0m
[32m12:55:11[0m | [1m##################################################[0m
[32m12:55:11[0m | [1m> Start optimization with the parameters:[0m
[32m12:

Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_cb_bagging_temperature,params_cb_depth,params_cb_min_child_samples,params_cb_objective,params_cb_subsample,state
0,0,66.8533,2021-07-16 12:52:41.566379,2021-07-16 12:53:18.177758,0 days 00:00:36.611379,8,10,38,RMSE,0.6,COMPLETE
1,1,67.9914,2021-07-16 12:53:18.267076,2021-07-16 12:53:28.288886,0 days 00:00:10.021810,0,8,61,MAE,1.0,COMPLETE
2,2,68.6498,2021-07-16 12:53:28.351707,2021-07-16 12:53:34.564641,0 days 00:00:06.212934,4,7,31,MAE,0.3,COMPLETE
3,3,68.8507,2021-07-16 12:53:34.635276,2021-07-16 12:53:52.137279,0 days 00:00:17.502003,2,9,46,Quantile,0.6,COMPLETE
4,4,71.7005,2021-07-16 12:53:52.201062,2021-07-16 12:54:34.175312,0 days 00:00:41.974250,10,10,7,Quantile,0.9,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...
62,62,66.8436,2021-07-16 13:06:48.124130,2021-07-16 13:06:50.047466,0 days 00:00:01.923336,10,4,59,RMSE,1.0,COMPLETE
63,63,63.7105,2021-07-16 13:06:50.159426,2021-07-16 13:06:53.304653,0 days 00:00:03.145227,10,5,69,MAE,0.9,COMPLETE
64,64,75.0257,2021-07-16 13:06:53.433557,2021-07-16 13:07:36.072642,0 days 00:00:42.639085,0,10,94,Quantile,0.1,COMPLETE
65,65,64.3060,2021-07-16 13:07:36.175028,2021-07-16 13:07:40.368823,0 days 00:00:04.193795,10,6,60,MAE,0.9,COMPLETE


In [29]:
dinner_train.columns, dinner_test.columns, 

(Index(['공휴일전후', '몇주차', '인원변화', '요일(석식)', '월(석식)', '일', '주(석식)', '출근', '휴가비율',
        '출장비율', '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수'],
       dtype='object'),
 Index(['공휴일전후', '몇주차', '인원변화', '요일(석식)', '월(석식)', '일', '주(석식)', '출근', '휴가비율',
        '출장비율', '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수'],
       dtype='object'))

In [30]:
predicts_Auto_dinner = dinner_model.predict(dinner_test)

In [31]:
submission = pd.read_csv('../data/sample_submission.csv')

In [32]:
submission.iloc[:,1] = predicts_Auto_lunch
submission.iloc[:,2] = predicts_Auto_dinner
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,961.954478,333.077168
1,2021-01-28,913.409475,426.284691
2,2021-01-29,614.251658,209.834634
3,2021-02-01,1264.453764,551.838492
4,2021-02-02,1064.51957,497.719656


In [34]:
def compare_ans(DIR):
    answer = pd.read_csv(DIR)

    lunch_answer = np.array(answer.iloc[:,1])
    dinner_answer = np.array(answer.iloc[:,2])
    
    lunch_MAE = abs(predicts_Auto_lunch - lunch_answer).mean()
    dinner_MAE = abs(predicts_Auto_dinner - dinner_answer).mean()
    
    print("lunch_MAE : ", lunch_MAE)
    print("dinner_MAE : ", dinner_MAE)
    print("total_MAE : ", (lunch_MAE+dinner_MAE)/2)
    
    
compare_ans('../submission/20210715_lgbm_autoML.csv')
compare_ans('../submission/20210715_lgbm_autoML_ensenble.csv')

compare_ans('../submission/20210715_lgbm_sort_autoML.csv')

lunch_MAE :  24.191534996312008
dinner_MAE :  24.554900372545195
total_MAE :  24.373217684428603
lunch_MAE :  39.05159470045701
dinner_MAE :  21.11452758269141
total_MAE :  30.08306114157421
lunch_MAE :  29.634882771873407
dinner_MAE :  23.359800064721917
total_MAE :  26.497341418297662


In [None]:
import math

math.cos(math.pi*68.44/180)*69.5

# 저장

In [35]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_cat_autoML.csv', index =False)

오늘 날짜 : 20210716


In [None]:
answer = pd.read_csv('../submission/20210715_pycaret_(2).csv')

In [None]:
best_submission = pd.read_csv('../data/sample_submission.csv')
best_submission.iloc[:,1:] = submission.iloc[:,1:]*5/9 +  answer.iloc[:,1:]*4/9
best_submission.to_csv(f'../submission/{today}_lgbm_autoML_ensenble.csv', index =False)