In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.family'] = 'Gulim'

RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

In [2]:
from workalendar.asia import SouthKorea
import pendulum

In [3]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## 전처리
- 일자에서 월과 일을 분리
- 요일을 레이블 인코딩화(EDA로 요일의 중요도 순 파악)
- 월 별, 일 별 중식 석식 수요 차이 파악

In [4]:
train['월'] = str(pd.DatetimeIndex(train['일자']).month)
test['월'] = str(pd.DatetimeIndex(test['일자']).month)
train['주'] = str(pd.DatetimeIndex(train['일자']).week)
test['주'] = str(pd.DatetimeIndex(test['일자']).week)
train['일'] = pd.DatetimeIndex(train['일자']).day
test['일'] = pd.DatetimeIndex(test['일자']).day

train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

train['식사가능자수'] = train['본사정원수'] - train['본사휴가자수'] - train['현본사소속재택근무자수']
test['식사가능자수'] = test['본사정원수'] - test['본사휴가자수'] - test['현본사소속재택근무자수']

  train['주'] = str(pd.DatetimeIndex(train['일자']).week)
  test['주'] = str(pd.DatetimeIndex(test['일자']).week)


In [5]:
def is_holiday(date):
    holidays = list(map(str, pd.Series(np.array(SouthKorea().holidays(int(date[:4])))[:, 0])))
    
    yesterday = str(np.datetime64(date) - 1)
    tomorrow = str(np.datetime64(date) + 1)

    if tomorrow in holidays and yesterday in holidays:
        return '3'
    if tomorrow in holidays:
        return '2'
    elif yesterday in holidays:
        return '1'
    else : 
        return '0'

def week_of_month(x):
    dt = pendulum.parse(x)
    
    wom = dt.week_of_month
    if wom < 0:
        wom += 52
    return str(wom)
    

df = pd.concat([train[['본사정원수', '일자']], test[['본사정원수', '일자']]])
df['년월'] = df['일자'].apply(lambda x : x[:7])
df = df[['년월', '본사정원수']].groupby(by=['년월'], as_index=False).mean()

def member_change(date):
    this_month = date[:7]
    last_month = str(np.datetime64(this_month) - 1)
    
    this_month_member = int(df[df['년월'] == this_month]['본사정원수'])
    last_month_member = int(df[df['년월'] == last_month]['본사정원수'])
    
    
    return  this_month_member - last_month_member

train['공휴일전후'] = train['일자'].apply(is_holiday)
test['공휴일전후'] = test['일자'].apply(is_holiday)

train['몇주차'] = train['일자'].apply(week_of_month)
test['몇주차'] = test['일자'].apply(week_of_month)

train = train[train['일자'] > '2016-03']
train['인원변화'] = train['일자'].apply(member_change)
test['인원변화'] = test['일자'].apply(member_change)

train['day2']=0
train.loc[train['일']>9, 'day2'] = 1
train.loc[train['일']>19, 'day2'] = 2

test['day2']=0
test.loc[test['일']>9, 'day2'] = 1
test.loc[test['일']>19, 'day2'] = 2

## 공휴일 변수 생성

In [6]:
train.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계', '월', '주', '일',
       '출근', '휴가비율', '출장비율', '야근비율', '재택비율', '식사가능자수', '공휴일전후', '몇주차', '인원변화',
       'day2'],
      dtype='object')

In [8]:
# 메뉴 변수 없이 사용할떄 해당 코드 사용['공휴일전후', '몇주차', '인원변화']

lunch_train = train[['공휴일전후', '몇주차', '인원변화', '요일','월','day2','주','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]
lunch_test = test[['공휴일전후', '몇주차', '인원변화', '요일','월','day2','주','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

dinner_train= train[['공휴일전후', '몇주차', '인원변화', '요일','월','day2','주','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]
dinner_test = test[['공휴일전후', '몇주차', '인원변화', '요일','월','day2','주','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

In [9]:
lunch_train.columns

Index(['공휴일전후', '몇주차', '인원변화', '요일', '월', 'day2', '주', '출근', '휴가비율', '출장비율',
       '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수'],
      dtype='object')

In [10]:
print(lunch_train.shape)
print(lunch_test.shape)

(1187, 16)
(50, 16)


In [11]:
print(dinner_train.shape)
print(dinner_test.shape)

(1187, 16)
(50, 16)


In [12]:
cat_features = [f for f in lunch_train.columns if lunch_train[f].dtype == 'object']

def column_index(df, cat_features):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, cat_features, sorter=sidx)]

cat_features_idx = column_index(lunch_train, cat_features)    
print("Cat features are: %s" % [f for f in cat_features])
print(cat_features_idx)

Cat features are: ['공휴일전후', '몇주차', '요일', '월', '주']
[0 1 3 4 6]


In [13]:
y_lunch = train[['중식계']]
y_dinner = train[['석식계']]

#### 분포 확인 및 분포 조정

# 중식 예측모델

In [14]:
from automl_alex import LightGBMRegressor, CatBoostRegressor, AutoMLRegressor
from sklearn.metrics import mean_absolute_error as MAE
import sklearn

In [15]:
lunch_model = AutoMLRegressor(random_state=42, metric=MAE)

lunch_model.fit(lunch_train, y_lunch,
                verbose=3,
                folds=12,
                opt_lvl=3,
                early_stoping=120,
                auto_parameters=False,
                timeout=1100
               )

[32m14:32:14[0m | [1m> Start Fit Base Model[0m
[32m14:32:33[0m | [1m##################################################[0m
[32m14:32:33[0m | [1m> Start Fit Models 2[0m
[32m14:32:33[0m | [1m##################################################[0m
[32m14:32:33[0m | [1m##################################################[0m
[32m14:32:34[0m | [1m> Step 1: calc parameters and pruned score: get test 10 trials[0m
[32m14:33:06[0m | [1m One iteration ~ 3.2 sec[0m
[32m14:33:06[0m | [1m Possible iters ~ 304.0[0m
[32m14:33:06[0m | [1m--------------------------------------------------[0m
[32m14:33:06[0m | [1m  Pruned Threshold Score: 76.079[0m
[32m14:33:06[0m | [1m##################################################[0m
[32m14:33:06[0m | [1m> Step 2: Full opt with Threshold Score Pruner[0m
[32m14:33:06[0m | [1m##################################################[0m
[32m14:33:06[0m | [1m> Start optimization with the parameters:[0m
[32m14:33:06[0m | [1m

<automl_alex.automl_alex.AutoMLRegressor at 0x24a15aa6c10>

In [16]:
predicts_Auto_lunch = lunch_model.predict(lunch_test)

In [17]:
dinner_model = AutoMLRegressor(random_state=42, metric=MAE)

dinner_model.fit(dinner_train, y_dinner,         
                 verbose=3,
                 folds=12,
                 opt_lvl=3,
                 early_stoping=120,
                 auto_parameters=False,
                 timeout=1100
                )

[32m14:50:07[0m | [1m> Start Fit Base Model[0m
[32m14:50:26[0m | [1m##################################################[0m
[32m14:50:26[0m | [1m> Start Fit Models 2[0m
[32m14:50:26[0m | [1m##################################################[0m
[32m14:50:26[0m | [1m##################################################[0m
[32m14:50:26[0m | [1m> Step 1: calc parameters and pruned score: get test 10 trials[0m
[32m14:50:58[0m | [1m One iteration ~ 3.2 sec[0m
[32m14:50:58[0m | [1m Possible iters ~ 302.0[0m
[32m14:50:58[0m | [1m--------------------------------------------------[0m
[32m14:50:58[0m | [1m  Pruned Threshold Score: 72.1806[0m
[32m14:50:58[0m | [1m##################################################[0m
[32m14:50:58[0m | [1m> Step 2: Full opt with Threshold Score Pruner[0m
[32m14:50:58[0m | [1m##################################################[0m
[32m14:50:58[0m | [1m> Start optimization with the parameters:[0m
[32m14:50:58[0m | [1

<automl_alex.automl_alex.AutoMLRegressor at 0x24a170d3460>

In [18]:
lunch_train.columns, lunch_test.columns, 

(Index(['공휴일전후', '몇주차', '인원변화', '요일', '월', 'day2', '주', '출근', '휴가비율', '출장비율',
        '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수'],
       dtype='object'),
 Index(['공휴일전후', '몇주차', '인원변화', '요일', '월', 'day2', '주', '출근', '휴가비율', '출장비율',
        '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수'],
       dtype='object'))

In [19]:
predicts_Auto_dinner = dinner_model.predict(dinner_test)

In [20]:
submission = pd.read_csv('../data/sample_submission.csv')

In [21]:
submission.iloc[:,1] = predicts_Auto_lunch
submission.iloc[:,2] = predicts_Auto_dinner
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1017.253521,368.439814
1,2021-01-28,919.18436,411.866025
2,2021-01-29,617.991654,244.698275
3,2021-02-01,1267.863473,503.58246
4,2021-02-02,1064.466851,430.450536


# 저장

In [25]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_autoML.csv', index =False)

오늘 날짜 : 20210716


In [26]:
lunch_model = AutoMLRegressor(random_state=42, metric=MAE)

lunch_model.fit(lunch_train, y_lunch,
                verbose=3,
                folds=12,
                opt_lvl=5,
                early_stoping=120,
                auto_parameters=False,
                timeout=1100
               )

[32m15:07:04[0m | [1m> Start Fit Base Model[0m
[32m15:07:22[0m | [1m##################################################[0m
[32m15:07:22[0m | [1m> Start Fit Models 2[0m
[32m15:07:22[0m | [1m##################################################[0m
[32m15:07:22[0m | [1m##################################################[0m
[32m15:07:23[0m | [1m> Step 1: calc parameters and pruned score: get test 10 trials[0m
[32m15:08:12[0m | [1m One iteration ~ 4.9 sec[0m
[32m15:08:12[0m | [1m Possible iters ~ 195.0[0m
[32m15:08:12[0m | [1m--------------------------------------------------[0m
[32m15:08:12[0m | [1m  Pruned Threshold Score: 136.8445[0m
[32m15:08:12[0m | [1m##################################################[0m
[32m15:08:12[0m | [1m> Step 2: Full opt with Threshold Score Pruner[0m
[32m15:08:12[0m | [1m##################################################[0m
[32m15:08:12[0m | [1m> Start optimization with the parameters:[0m
[32m15:08:12[0m | [



















Optimize: : 119it [06:59,  3.52s/it, | Model: LightGBM | OptScore: 72.8929 | Best mean_absolute_error: 67.7803 ]


KeyboardInterrupt: 

In [None]:
predicts_Auto_lunch_5 = lunch_model.predict(lunch_test)

In [None]:
dinner_model = AutoMLRegressor(random_state=42, metric=MAE)

dinner_model.fit(dinner_train, y_dinner,         
                 verbose=3,
                 folds=12,
                 opt_lvl=5,
                 early_stoping=120,
                 auto_parameters=False,
                 timeout=1100
                )

In [None]:
predicts_Auto_dinner_5 = dinner_model.predict(dinner_test)

In [None]:
submission2 = pd.read_csv('../data/sample_submission.csv')

submission2.iloc[:,1] = predicts_Auto_lunch_5
submission2.iloc[:,2] = predicts_Auto_dinner_5
submission2.head()

In [None]:
submission2.to_csv(f'../submission/{today}_autoML_5.csv', index =False)