In [5]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.family'] = 'Gulim'

In [6]:
from workalendar.asia import SouthKorea
import pendulum

In [7]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## 전처리
- 일자에서 월과 일을 분리
- 요일을 레이블 인코딩화(EDA로 요일의 중요도 순 파악)
- 월 별, 일 별 중식 석식 수요 차이 파악

In [8]:
train['월'] = pd.DatetimeIndex(train['일자']).month
test['월'] = pd.DatetimeIndex(test['일자']).month
train['주'] = pd.DatetimeIndex(train['일자']).week
test['주'] = pd.DatetimeIndex(test['일자']).week
train['일'] = pd.DatetimeIndex(train['일자']).day
test['일'] = pd.DatetimeIndex(test['일자']).day

train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

train['식사가능자수'] = train['본사정원수'] - train['본사휴가자수'] - train['현본사소속재택근무자수']
test['식사가능자수'] = test['본사정원수'] - test['본사휴가자수'] - test['현본사소속재택근무자수']

# train['중식참여율'] = train['중식계'] / train['식사가능자수']

  train['주'] = pd.DatetimeIndex(train['일자']).week
  test['주'] = pd.DatetimeIndex(test['일자']).week


In [9]:
month_rank4dinner = {
    1: 11,
    2: 2,
    3: 1,
    4: 4,
    5: 7,
    6: 6,
    7: 10,
    8: 8,
    9: 5,
    10: 3,
    11: 9,
    12: 12
}
train['월(석식)'] = train['월'].map(month_rank4dinner)
test['월(석식)'] = test['월'].map(month_rank4dinner)

month_rank4lunch = {
    1: 3,
    2: 1,
    3: 2,
    4: 6,
    5: 7,
    6: 8,
    7: 10,
    8: 9,
    9: 5,
    10: 4,
    11: 11,
    12: 12
}
train['월(중식)'] = train['월'].map(month_rank4lunch)
test['월(중식)'] = test['월'].map(month_rank4lunch)

weekday_rank4dinner = {
    '월': 1,
    '화': 2,
    '수': 4,
    '목': 3,
    '금': 5,
}

weekday_rank4lunch = {
    '월': 1,
    '화': 2,
    '수': 3,
    '목': 4,
    '금': 5,
}

train['요일(석식)'] = train['요일'].map(weekday_rank4dinner)
test['요일(석식)'] = test['요일'].map(weekday_rank4dinner)

train['요일(중식)'] = train['요일'].map(weekday_rank4lunch)
test['요일(중식)'] = test['요일'].map(weekday_rank4lunch)

In [10]:
rank = pd.DataFrame(range(1,53))
week_rank_lunch = pd.pivot_table(train,values='중식계',index='주').sort_values(by='중식계').reset_index().drop('중식계',axis=1)
week_rank_dinner = pd.pivot_table(train,values='석식계',index='주').sort_values(by='석식계').reset_index().drop('석식계',axis=1)


week_rank4lunch = {}
for i in range(len(rank)):
    week_rank4lunch[week_rank_lunch['주'][i]] = rank[0][i]


week_rank4dinner = {}
for i in range(len(rank)):
    week_rank4dinner[week_rank_dinner['주'][i]] = rank[0][i]
    
    
train['주(중식)'] = train['주'].map(week_rank4lunch)
test['주(중식)'] = test['주'].map(week_rank4lunch)

train['주(석식)'] = train['주'].map(week_rank4dinner)
test['주(석식)'] = test['주'].map(week_rank4dinner)

In [11]:
def is_holiday(date):
    holidays = list(map(str, pd.Series(np.array(SouthKorea().holidays(int(date[:4])))[:, 0])))
    
    yesterday = str(np.datetime64(date) - 1)
    tomorrow = str(np.datetime64(date) + 1)

    if tomorrow in holidays and yesterday in holidays:
        return 3
    if tomorrow in holidays:
        return 2
    elif yesterday in holidays:
        return 1
    else : 
        return 0

def week_of_month(x):
    dt = pendulum.parse(x)
    
    wom = dt.week_of_month
    if wom < 0:
        wom += 52
    return wom
    

df = pd.concat([train[['본사정원수', '일자']], test[['본사정원수', '일자']]])
df['년월'] = df['일자'].apply(lambda x : x[:7])
df = df[['년월', '본사정원수']].groupby(by=['년월'], as_index=False).mean()

def member_change(date):
    this_month = date[:7]
    last_month = str(np.datetime64(this_month) - 1)
    
    this_month_member = int(df[df['년월'] == this_month]['본사정원수'])
    last_month_member = int(df[df['년월'] == last_month]['본사정원수'])
    
    
    return  this_month_member - last_month_member

train['공휴일전후'] = train['일자'].apply(is_holiday)
test['공휴일전후'] = test['일자'].apply(is_holiday)

train['몇주차'] = train['일자'].apply(week_of_month)
test['몇주차'] = test['일자'].apply(week_of_month)

train = train[train['일자'] > '2016-03']
train['인원변화'] = train['일자'].apply(member_change)
test['인원변화'] = test['일자'].apply(member_change)

## 공휴일 변수 생성

In [19]:
train.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계', '월', '주', '일',
       '출근', '휴가비율', '출장비율', '야근비율', '재택비율', '식사가능자수', '중식참여율', '월(석식)',
       '월(중식)', '요일(석식)', '요일(중식)', '주(중식)', '주(석식)', '공휴일전후', '몇주차', '인원변화'],
      dtype='object')

In [35]:
# 메뉴 변수 없이 사용할떄 해당 코드 사용['공휴일전후', '몇주차', '인원변화']

lunch_train = train[['공휴일전후', '몇주차', '인원변화', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]
lunch_test = test[['공휴일전후', '몇주차', '인원변화', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

dinner_train= train[['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]
dinner_test = test[['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

In [21]:
lunch_train.columns

Index(['공휴일전후', '몇주차', '인원변화', '요일(중식)', '월(중식)', '일', '주(중식)', '출근', '휴가비율',
       '출장비율', '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수',
       '본사정원수'],
      dtype='object')

In [22]:
print(lunch_train.shape)
print(lunch_test.shape)

(1187, 17)
(50, 17)


In [23]:
print(dinner_train.shape)
print(dinner_test.shape)

(1187, 17)
(50, 17)


In [24]:
cat_features = [f for f in lunch_train.columns if lunch_train[f].dtype == 'object']

def column_index(df, cat_features):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, cat_features, sorter=sidx)]

cat_features_idx = column_index(lunch_train, cat_features)    
print("Cat features are: %s" % [f for f in cat_features])
print(cat_features_idx)

Cat features are: []
[]


In [25]:
y_lunch = train[['중식계']]
y_dinner = train[['석식계']]

#### 분포 확인 및 분포 조정

# 중식 예측모델

In [29]:
from automl_alex import LightGBMRegressor
from sklearn.metrics import mean_absolute_error
import sklearn

In [30]:
lunch_model = LightGBMRegressor(random_state=42)

lunch_model.opt(lunch_train,  y_lunch,
            verbose=3,  cold_start=120,folds=8,opt_lvl=3,early_stoping=120, auto_parameters=False,#metric=sklearn.metrics.roc_auc_score,
      timeout=1100,metric=sklearn.metrics.mean_absolute_error # optimization time in seconds,
      )

[32m13:02:21[0m | [1mregression optimize: minimize[0m
[32m13:02:21[0m | [1m##################################################[0m
[32m13:02:22[0m | [1m> Step 1: calc parameters and pruned score: get test 10 trials[0m
[32m13:02:28[0m | [1m One iteration ~ 0.6 sec[0m
[32m13:02:28[0m | [1m Possible iters ~ 1775.0[0m
[32m13:02:28[0m | [1m--------------------------------------------------[0m
[32m13:02:28[0m | [1m  Pruned Threshold Score: 71.9028[0m
[32m13:02:28[0m | [1m##################################################[0m
[32m13:02:28[0m | [1m> Step 2: Full opt with Threshold Score Pruner[0m
[32m13:02:28[0m | [1m##################################################[0m
[32m13:02:28[0m | [1m> Start optimization with the parameters:[0m
[32m13:02:28[0m | [1mCV_Folds = 8[0m
[32m13:02:28[0m | [1mScore_CV_Folds = 2[0m
[32m13:02:28[0m | [1mFeature_Selection = False[0m
[32m13:02:28[0m | [1mOpt_lvl = 3[0m
[32m13:02:28[0m | [1mCold_start = 120

Finished loading model, total used 1000 iterations
Finished loading model, total used 1000 iterations
Finished loading model, total used 1000 iterations
Finished loading model, total used 1000 iterations
Finished loading model, total used 1000 iterations
Finished loading model, total used 1000 iterations


[32m13:04:33[0m | [1mBest Score: 63.6644 mean_absolute_error[0m


Finished loading model, total used 1000 iterations
Finished loading model, total used 1000 iterations


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_lgbm_bagging_fraction,params_lgbm_bagging_freq,params_lgbm_feature_fraction,params_lgbm_learning_rate,params_lgbm_min_child_samples,params_lgbm_num_iterations,params_lgbm_num_leaves,state
0,0,74.8096,2021-07-01 13:02:22.328043,2021-07-01 13:02:22.690179,0 days 00:00:00.362136,0.8,3.0,0.5,0.253700,33,300,7,COMPLETE
1,1,77.1228,2021-07-01 13:02:22.758491,2021-07-01 13:02:23.159955,0 days 00:00:00.401464,0.4,10.0,1.0,0.077254,29,400,57,COMPLETE
2,2,69.5350,2021-07-01 13:02:23.208010,2021-07-01 13:02:23.542026,0 days 00:00:00.334016,0.7,4.0,0.7,0.018660,5,700,3,COMPLETE
3,3,67.5431,2021-07-01 13:02:23.594282,2021-07-01 13:02:23.903080,0 days 00:00:00.308798,0.7,3.0,0.9,0.027011,7,700,3,COMPLETE
4,4,68.6008,2021-07-01 13:02:23.952232,2021-07-01 13:02:24.735050,0 days 00:00:00.782818,0.5,11.0,0.4,0.011712,19,1000,18,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
233,233,66.0136,2021-07-01 13:04:29.707968,2021-07-01 13:04:30.119617,0 days 00:00:00.411649,0.7,9.0,0.4,0.072467,2,1000,2,COMPLETE
234,234,66.1150,2021-07-01 13:04:30.198913,2021-07-01 13:04:30.589697,0 days 00:00:00.390784,0.7,9.0,0.4,0.069766,2,1000,2,COMPLETE
235,235,66.5454,2021-07-01 13:04:30.667780,2021-07-01 13:04:31.038271,0 days 00:00:00.370491,0.7,9.0,0.4,0.053288,2,1000,2,COMPLETE
236,236,65.9221,2021-07-01 13:04:31.105655,2021-07-01 13:04:31.516874,0 days 00:00:00.411219,0.7,9.0,0.4,0.065815,2,1000,2,COMPLETE


In [31]:
predicts_LGBM_lunch = lunch_model.predict(lunch_test)

In [32]:
dinner_model = LightGBMRegressor(random_state=42)

dinner_model.opt(dinner_train,  y_dinner,
            verbose=3,  cold_start=120,folds=8,opt_lvl=3,early_stoping=120, auto_parameters=False,#metric=sklearn.metrics.roc_auc_score,
      timeout=1100,metric=sklearn.metrics.mean_absolute_error # optimization time in seconds,
      )

[32m13:04:33[0m | [1mregression optimize: minimize[0m
[32m13:04:33[0m | [1m##################################################[0m
[32m13:04:33[0m | [1m> Step 1: calc parameters and pruned score: get test 10 trials[0m
[32m13:04:40[0m | [1m One iteration ~ 0.7 sec[0m
[32m13:04:40[0m | [1m Possible iters ~ 1537.0[0m
[32m13:04:40[0m | [1m--------------------------------------------------[0m
[32m13:04:40[0m | [1m  Pruned Threshold Score: 61.7311[0m
[32m13:04:40[0m | [1m##################################################[0m
[32m13:04:40[0m | [1m> Step 2: Full opt with Threshold Score Pruner[0m
[32m13:04:40[0m | [1m##################################################[0m
[32m13:04:40[0m | [1m> Start optimization with the parameters:[0m
[32m13:04:40[0m | [1mCV_Folds = 8[0m
[32m13:04:40[0m | [1mScore_CV_Folds = 2[0m
[32m13:04:40[0m | [1mFeature_Selection = False[0m
[32m13:04:40[0m | [1mOpt_lvl = 3[0m
[32m13:04:40[0m | [1mCold_start = 120

Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations


[32m13:06:28[0m | [1mBest Score: 53.1557 mean_absolute_error[0m


Finished loading model, total used 600 iterations


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_lgbm_bagging_fraction,params_lgbm_bagging_freq,params_lgbm_feature_fraction,params_lgbm_learning_rate,params_lgbm_min_child_samples,params_lgbm_num_iterations,params_lgbm_num_leaves,state
0,0,61.1484,2021-07-01 13:04:33.723520,2021-07-01 13:04:34.068895,0 days 00:00:00.345375,0.8,3.0,0.5,0.253700,33,300,7,COMPLETE
1,1,61.1083,2021-07-01 13:04:34.124859,2021-07-01 13:04:34.533735,0 days 00:00:00.408876,0.4,10.0,1.0,0.077254,29,400,57,COMPLETE
2,2,62.3138,2021-07-01 13:04:34.606914,2021-07-01 13:04:34.952385,0 days 00:00:00.345471,0.7,4.0,0.7,0.018660,5,700,3,COMPLETE
3,3,61.0402,2021-07-01 13:04:35.018077,2021-07-01 13:04:35.379691,0 days 00:00:00.361614,0.7,3.0,0.9,0.027011,7,700,3,COMPLETE
4,4,56.4423,2021-07-01 13:04:35.438289,2021-07-01 13:04:36.517238,0 days 00:00:01.078949,0.5,11.0,0.4,0.011712,19,1000,18,COMPLETE
...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,126,55.0308,2021-07-01 13:06:15.004138,2021-07-01 13:06:16.656314,0 days 00:00:01.652176,1.0,,,0.012248,2,600,37,COMPLETE
127,127,55.2912,2021-07-01 13:06:16.733187,2021-07-01 13:06:18.099837,0 days 00:00:01.366650,1.0,,,0.013185,2,600,34,COMPLETE
128,128,55.0185,2021-07-01 13:06:18.175042,2021-07-01 13:06:19.878066,0 days 00:00:01.703024,1.0,,,0.010064,2,600,39,COMPLETE
129,129,53.9956,2021-07-01 13:06:19.966517,2021-07-01 13:06:21.429692,0 days 00:00:01.463175,1.0,,,0.011544,2,600,31,COMPLETE


In [33]:
dinner_test['중식참여율'] = predicts_LGBM_lunch / dinner_test['식사가능자수']

In [36]:
predicts_LGBM_dinner = dinner_model.predict(dinner_test)

In [37]:
submission = pd.read_csv('../data/sample_submission.csv')

In [38]:
submission.iloc[:,1] = predicts_LGBM_lunch
submission.iloc[:,2] = predicts_LGBM_dinner
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,963.657635,317.671462
1,2021-01-28,916.444205,404.326876
2,2021-01-29,627.465356,236.054252
3,2021-02-01,1277.057998,528.280716
4,2021-02-02,1008.165386,434.64267


In [39]:
answer = pd.read_csv('../submission/20210630_lgbm_autoML.csv')

lunch_answer = np.array(answer.iloc[:,1])
dinner_answer = np.array(answer.iloc[:,2])

abs(predicts_LGBM_lunch - lunch_answer).mean(), abs(predicts_LGBM_dinner - dinner_answer).mean()

(25.891266998454604, 18.630703416949338)

# 저장

In [40]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_lgbm_autoML.csv', index =False)

오늘 날짜 : 20210701
