In [1]:
import pandas as pd
import numpy as np
from pycaret.regression import setup, compare_models, blend_models,tune_model,predict_model,get_config, finalize_model

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.family'] = 'Gulim'

from workalendar.asia import SouthKorea
import pendulum

RuntimeError: module compiled against API version 0xe but this version of numpy is 0xd

In [2]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## 전처리
- 일자에서 월과 일을 분리
- 요일을 레이블 인코딩화(EDA로 요일의 중요도 순 파악)
- 월 별, 일 별 중식 석식 수요 차이 파악

In [3]:
train['월'] = pd.DatetimeIndex(train['일자']).month
test['월'] = pd.DatetimeIndex(test['일자']).month
train['주'] = pd.DatetimeIndex(train['일자']).week
test['주'] = pd.DatetimeIndex(test['일자']).week
train['일'] = pd.DatetimeIndex(train['일자']).day
test['일'] = pd.DatetimeIndex(test['일자']).day

train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

train['식사가능자수'] = train['본사정원수'] - train['본사휴가자수'] - train['현본사소속재택근무자수']
test['식사가능자수'] = test['본사정원수'] - test['본사휴가자수'] - test['현본사소속재택근무자수']

In [4]:
weekday = {
    '월': 1,
    '화': 2,
    '수': 3,
    '목': 4,
    '금': 5
}

train['요일'] = train['요일'].map(weekday)
test['요일'] = test['요일'].map(weekday)

### 요일 레이블 인코딩

In [5]:
weekday_rank4dinner = {
    1: 1,
    2: 2,
    3: 5,
    4: 3,
    5: 4,
}

train['요일(석식)'] = train['요일'].map(weekday_rank4dinner)
test['요일(석식)'] = test['요일'].map(weekday_rank4dinner)

train['요일(중식)'] = train['요일']
test['요일(중식)'] = test['요일']

### 월 레이블 인코딩

In [6]:
month_rank4lunch = {
    1: 3,
    2: 1,
    3: 2,
    4: 6,
    5: 7,
    6: 8,
    7: 10,
    8: 9,
    9: 5,
    10: 4,
    11: 11,
    12: 12
}
train['월(중식)'] = train['월'].map(month_rank4lunch)
test['월(중식)'] = test['월'].map(month_rank4lunch)

In [7]:
month_rank4dinner = {
    1: 11,
    2: 2,
    3: 1,
    4: 4,
    5: 8,
    6: 6,
    7: 9,
    8: 7,
    9: 5,
    10: 3,
    11: 10,
    12: 12
}
train['월(석식)'] = train['월'].map(month_rank4dinner)
test['월(석식)'] = test['월'].map(month_rank4dinner)

## 주 EDA

In [8]:
rank = pd.DataFrame(range(1,53))
week_rank_lunch = pd.pivot_table(train,values='중식계',index='주').sort_values(by='중식계').reset_index().drop('중식계',axis=1)
week_rank_dinner = pd.pivot_table(train,values='석식계',index='주').sort_values(by='석식계').reset_index().drop('석식계',axis=1)


week_rank4lunch = {}
for i in range(len(rank)):
    week_rank4lunch[week_rank_lunch['주'][i]] = rank[0][i]


week_rank4dinner = {}
for i in range(len(rank)):
    week_rank4dinner[week_rank_dinner['주'][i]] = rank[0][i]
    
    
train['주(중식)'] = train['주'].map(week_rank4lunch)
test['주(중식)'] = test['주'].map(week_rank4lunch)

train['주(석식)'] = train['주'].map(week_rank4dinner)
test['주(석식)'] = test['주'].map(week_rank4dinner)

## 공휴일 변수 생성

In [9]:
def is_holiday(date):
    holidays = list(map(str, pd.Series(np.array(SouthKorea().holidays(int(date[:4])))[:, 0])))
    
    yesterday = str(np.datetime64(date) - 1)
    tomorrow = str(np.datetime64(date) + 1)

    if tomorrow in holidays and yesterday in holidays:
        return 3
    if tomorrow in holidays:
        return 1
    elif yesterday in holidays:
        return 2
    else : 
        return 0

def week_of_month(x):
    dt = pendulum.parse(x)
    
    wom = dt.week_of_month
    if wom < 0:
        wom += 52
    return wom
    

df = pd.concat([train[['본사정원수', '일자']], test[['본사정원수', '일자']]])
df['년월'] = df['일자'].apply(lambda x : x[:7])
df = df[['년월', '본사정원수']].groupby(by=['년월'], as_index=False).mean()

def member_change(date):
    this_month = date[:7]
    last_month = str(np.datetime64(this_month) - 1)
    
    this_month_member = int(df[df['년월'] == this_month]['본사정원수'])
    last_month_member = int(df[df['년월'] == last_month]['본사정원수'])
    
    
    return  this_month_member - last_month_member

train['공휴일전후'] = train['일자'].apply(is_holiday)
test['공휴일전후'] = test['일자'].apply(is_holiday)

train['몇주차'] = train['일자'].apply(week_of_month)
test['몇주차'] = test['일자'].apply(week_of_month)

### train/test에서 중복 메뉴만 변수로서 사용

In [10]:
menu_train = train[['조식메뉴','중식메뉴','석식메뉴']]
menu_test = test[['조식메뉴','중식메뉴','석식메뉴']]
print(menu_train.shape)
print(menu_test.shape)

(1205, 3)
(50, 3)


In [11]:
def clean_split(df):
    df = df.split()
    for j in df:
        if '(' in j:
            del df[df.index(j)]
    for i in df:
        if '쌀밥' in i:
            del df[df.index(i)] 
    for q in df:
        if '김치' in q:
            del df[df.index(q)]
    
    return df

In [12]:
menu_train['조식메뉴_prepro'] = menu_train['조식메뉴'].apply(clean_split)
menu_train['중식메뉴_prepro'] = menu_train['중식메뉴'].apply(clean_split)
menu_train['석식메뉴_prepro'] = menu_train['석식메뉴'].apply(clean_split)

menu_test['조식메뉴_prepro'] = menu_test['조식메뉴'].apply(clean_split)
menu_test['중식메뉴_prepro'] = menu_test['중식메뉴'].apply(clean_split)
menu_test['석식메뉴_prepro'] = menu_test['석식메뉴'].apply(clean_split)

In [13]:
lunch_train= menu_train.중식메뉴_prepro.tolist()
lunch_test= menu_test.중식메뉴_prepro.tolist()
dinner_train = menu_train.석식메뉴_prepro.tolist()
dinner_test = menu_test.석식메뉴_prepro.tolist()

## 벡터화

In [14]:
lunch_list_train = []
for i in lunch_train:
    if len(i) == 0:
        continue
    else:
        string = i[0]
    for w in i[1:]:
        string += " "
        string += w
    lunch_list_train.append(string)

In [15]:
lunch_list_test = []
for i in lunch_test:
    if len(i) == 0:
        continue
    else:
        string = i[0]
    for w in i[1:]:
        string += " "
        string += w
    lunch_list_test.append(string)


In [16]:
dinner_list_train = []
for i in dinner_train:
    if len(i) == 0:
        i = ['.']
        string = i[0]
    else:
        string = i[0]
    for w in i[1:]:
        string += " "
        string += w
    dinner_list_train.append(string)

In [17]:
dinner_list_test = []
for i in dinner_test:
    if len(i) == 0:
        i = ['.']
        string = i[0]
    else:
        string = i[0]
    for w in i[1:]:
        string += " "
        string += w
    dinner_list_test.append(string)

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
lunch_matrix_train = vectorizer.fit_transform(lunch_list_train)
lunch_df_train = pd.DataFrame(lunch_matrix_train.toarray(), columns=vectorizer.get_feature_names())

vectorizer = TfidfVectorizer()
dinner_matrix_train = vectorizer.fit_transform(dinner_list_train)
dinner_df_train = pd.DataFrame(dinner_matrix_train.toarray(), columns=vectorizer.get_feature_names())

vectorizer = TfidfVectorizer()
lunch_matrix_test = vectorizer.fit_transform(lunch_list_test)
lunch_df_test = pd.DataFrame(lunch_matrix_test.toarray(), columns=vectorizer.get_feature_names())

vectorizer = TfidfVectorizer()
dinner_matrix_test = vectorizer.fit_transform(dinner_list_test)
dinner_df_test = pd.DataFrame(dinner_matrix_test.toarray(), columns=vectorizer.get_feature_names())

In [19]:
# 중복 메뉴 선정
lunch_intersection = list(set(lunch_df_train.columns) & set(lunch_df_test.columns))
print(len(lunch_intersection))
dinner_intersection = list(set(dinner_df_train.columns) & set(dinner_df_test.columns))
print(len(dinner_intersection))

146
109


## 최종 데이터 셋 구축

In [20]:
lunch_train = pd.concat([train[['식사가능자수','공휴일전후','일자', '몇주차', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','중식계']],lunch_df_train[lunch_intersection]],axis=1)
lunch_test = pd.concat([test[['식사가능자수','공휴일전후','일자', '몇주차', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수']],lunch_df_test[lunch_intersection]],axis=1)


dinner_train = pd.concat([train[['식사가능자수','공휴일전후','일자', '몇주차', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','석식계']],dinner_df_train[dinner_intersection]],axis=1)
dinner_test = pd.concat([test[['식사가능자수','공휴일전후','일자', '몇주차', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수']],dinner_df_test[dinner_intersection]],axis=1)

In [21]:
lunch_train = lunch_train[lunch_train['일자'] > '2016-03']
lunch_train['인원변화'] = lunch_train['일자'].apply(member_change)

dinner_train = dinner_train[dinner_train['일자'] > '2016-03']
dinner_train['인원변화'] = dinner_train['일자'].apply(member_change)

dinner_test['인원변화'] = dinner_test['일자'].apply(member_change)
lunch_test['인원변화'] = lunch_test['일자'].apply(member_change)

In [22]:
print(lunch_train.shape)
print(lunch_test.shape)

(1187, 162)
(50, 161)


In [23]:
print(dinner_train.shape)
print(dinner_test.shape)

(1187, 125)
(50, 124)


# 중식 예측모델

In [24]:
selected_lunch_feature = ['식사가능자수','공휴일전후','인원변화', '몇주차', '요일(중식)', '주(중식)', '출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수', '콩나물불고기', '동태매운탕', '파인d', '쇠고기숙주볶음', '삼색유자청무침', 
                       '소불고기', '바나나', '황도d', '콩나물파채무침', '버섯매운탕', '돈육간장불고기', '양상추샐러드', 
                       '석박지', '수박', '오리대패불고기', '오이생채', '흑임자d', '고등어구이', '닭볶음탕', '청경채찜', 
                       '우렁된장찌개', '요거트d']

lunch_train_ols = lunch_train[selected_lunch_feature]
lunch_y         = lunch_train[["중식계"]]

lunch_test_ols  = lunch_test[selected_lunch_feature]

In [None]:
drop_columns = [[],['식사가능자수'], ['본사휴가자수'], ['콩나물불고기', '동태매운탕', '파인d', '쇠고기숙주볶음', '삼색유자청무침', 
                       '소불고기', '바나나', '황도d', '콩나물파채무침', '버섯매운탕', '돈육간장불고기', '양상추샐러드', 
                       '석박지', '수박', '오리대패불고기', '오이생채', '흑임자d', '고등어구이', '닭볶음탕', '청경채찜', 
                       '우렁된장찌개', '요거트d'], ['인원변화']]
ks = [4,8,10,15]

In [25]:
from automl_alex import LightGBMRegressor
from sklearn.metrics import mean_absolute_error
import sklearn

In [26]:
random_seed = 0
pred_lunch = np.array([0]*50).astype(np.float64)

for dc in drop_columns:
    for k in ks:
        print(dc)
        
        lunch_model = LightGBMRegressor(random_state=random_seed)
        
        X_train = lunch_train_ols.drop(columns=dc)
        X_test = lunch_train_ols.drop(columns=dc)
        
        lunch_model.opt(X_train, y_lunch,
                        verbose=3,
                        cold_start=120,
                        folds=k,
                        opt_lvl=3,
                        early_stoping=120,
                        auto_parameters=False,
                        timeout=1100,
                        metric=MAE
                       )

        pred_lunch += lunch_model.predict(X_test)

        clear_output(True)
                
        random_seed += 1

[32m22:03:58[0m | [1mregression optimize: minimize[0m
[32m22:03:58[0m | [1m##################################################[0m
[32m22:03:59[0m | [1m> Step 1: calc parameters and pruned score: get test 10 trials[0m
[32m22:04:06[0m | [1m One iteration ~ 0.7 sec[0m
[32m22:04:06[0m | [1m Possible iters ~ 1652.0[0m
[32m22:04:06[0m | [1m--------------------------------------------------[0m
[32m22:04:06[0m | [1m  Pruned Threshold Score: 77.5968[0m
[32m22:04:06[0m | [1m##################################################[0m
[32m22:04:06[0m | [1m> Step 2: Full opt with Threshold Score Pruner[0m
[32m22:04:06[0m | [1m##################################################[0m
[32m22:04:06[0m | [1m> Start optimization with the parameters:[0m
[32m22:04:06[0m | [1mCV_Folds = 20[0m
[32m22:04:06[0m | [1mScore_CV_Folds = 2[0m
[32m22:04:06[0m | [1mFeature_Selection = False[0m
[32m22:04:06[0m | [1mOpt_lvl = 3[0m
[32m22:04:06[0m | [1mCold_start = 12

Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations
Finished loading model, total used 600 iterations


[32m22:08:22[0m | [1mBest Score: 67.8209 mean_absolute_error[0m


Finished loading model, total used 600 iterations


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_lgbm_bagging_fraction,params_lgbm_bagging_freq,params_lgbm_feature_fraction,params_lgbm_learning_rate,params_lgbm_min_child_samples,params_lgbm_num_iterations,params_lgbm_num_leaves,state
0,0,74.2052,2021-07-14 22:03:59.673909,2021-07-14 22:04:00.312480,0 days 00:00:00.638571,0.8,3.0,0.5,0.2537,33,300,7,COMPLETE
1,1,81.0884,2021-07-14 22:04:00.390882,2021-07-14 22:04:00.748759,0 days 00:00:00.357877,0.4,10.0,1.0,0.077254,29,400,57,COMPLETE
2,2,77.7724,2021-07-14 22:04:00.816123,2021-07-14 22:04:01.137669,0 days 00:00:00.321546,0.7,4.0,0.7,0.01866,5,700,3,COMPLETE
3,3,77.4213,2021-07-14 22:04:01.203727,2021-07-14 22:04:01.552105,0 days 00:00:00.348378,0.7,3.0,0.9,0.027011,7,700,3,COMPLETE
4,4,76.3044,2021-07-14 22:04:01.621389,2021-07-14 22:04:02.516105,0 days 00:00:00.894716,0.5,11.0,0.4,0.011712,19,1000,18,COMPLETE
5,5,72.8994,2021-07-14 22:04:02.692818,2021-07-14 22:04:04.123100,0 days 00:00:01.430282,0.8,3.0,0.7,0.028181,2,600,45,COMPLETE
6,6,77.2006,2021-07-14 22:04:04.192467,2021-07-14 22:04:04.570937,0 days 00:00:00.378470,0.8,7.0,0.6,0.220382,4,700,2,COMPLETE
7,7,82.4136,2021-07-14 22:04:04.641471,2021-07-14 22:04:04.994184,0 days 00:00:00.352713,1.0,,,0.270517,39,1000,3,COMPLETE
8,8,86.1178,2021-07-14 22:04:05.060537,2021-07-14 22:04:05.689257,0 days 00:00:00.628720,0.5,5.0,0.4,0.229996,2,600,19,COMPLETE
9,9,80.5612,2021-07-14 22:04:05.773043,2021-07-14 22:04:06.252709,0 days 00:00:00.479666,0.5,3.0,0.7,0.167551,7,900,5,COMPLETE


# 석식 예측모델

In [27]:
selected_dinner_feature=['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','오므라이스']

dinner_train_ols = dinner_train[selected_dinner_feature]
dinner_y         = dinner_train[["석식계"]]

dinner_test_ols  = dinner_test[selected_dinner_feature]

In [28]:
dinner_model = LightGBMRegressor(random_state=42)

dinner_model.opt(dinner_train_ols,  dinner_y,
            verbose=3,  cold_start=120,folds=20,opt_lvl=3,early_stoping=120, auto_parameters=False,#metric=sklearn.metrics.roc_auc_score,
      timeout=1100,metric=sklearn.metrics.mean_absolute_error # optimization time in seconds,
      )

[32m22:08:23[0m | [1mregression optimize: minimize[0m
[32m22:08:23[0m | [1m##################################################[0m
[32m22:08:23[0m | [1m> Step 1: calc parameters and pruned score: get test 10 trials[0m
[32m22:08:30[0m | [1m One iteration ~ 0.7 sec[0m
[32m22:08:30[0m | [1m Possible iters ~ 1606.0[0m
[32m22:08:30[0m | [1m--------------------------------------------------[0m
[32m22:08:30[0m | [1m  Pruned Threshold Score: 61.206[0m
[32m22:08:30[0m | [1m##################################################[0m
[32m22:08:30[0m | [1m> Step 2: Full opt with Threshold Score Pruner[0m
[32m22:08:30[0m | [1m##################################################[0m
[32m22:08:30[0m | [1m> Start optimization with the parameters:[0m
[32m22:08:30[0m | [1mCV_Folds = 20[0m
[32m22:08:30[0m | [1mScore_CV_Folds = 2[0m
[32m22:08:30[0m | [1mFeature_Selection = False[0m
[32m22:08:30[0m | [1mOpt_lvl = 3[0m
[32m22:08:30[0m | [1mCold_start = 120

Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations
Finished loading model, total used 300 iterations


[32m22:12:34[0m | [1mBest Score: 48.9114 mean_absolute_error[0m


Finished loading model, total used 300 iterations


Unnamed: 0,number,value,datetime_start,datetime_complete,duration,params_lgbm_bagging_fraction,params_lgbm_bagging_freq,params_lgbm_feature_fraction,params_lgbm_learning_rate,params_lgbm_min_child_samples,params_lgbm_num_iterations,params_lgbm_num_leaves,state
0,0,59.7758,2021-07-14 22:08:23.436009,2021-07-14 22:08:24.089572,0 days 00:00:00.653563,0.8,3.0,0.5,0.2537,33,300,7,COMPLETE
1,1,68.7302,2021-07-14 22:08:24.174835,2021-07-14 22:08:24.563654,0 days 00:00:00.388819,0.4,10.0,1.0,0.077254,29,400,57,COMPLETE
2,2,58.6271,2021-07-14 22:08:24.642761,2021-07-14 22:08:25.022938,0 days 00:00:00.380177,0.7,4.0,0.7,0.01866,5,700,3,COMPLETE
3,3,57.8531,2021-07-14 22:08:25.098543,2021-07-14 22:08:25.455583,0 days 00:00:00.357040,0.7,3.0,0.9,0.027011,7,700,3,COMPLETE
4,4,59.5972,2021-07-14 22:08:25.531235,2021-07-14 22:08:26.347051,0 days 00:00:00.815816,0.5,11.0,0.4,0.011712,19,1000,18,COMPLETE
5,5,56.0303,2021-07-14 22:08:26.452654,2021-07-14 22:08:27.946552,0 days 00:00:01.493898,0.8,3.0,0.7,0.028181,2,600,45,COMPLETE
6,6,65.3568,2021-07-14 22:08:28.049101,2021-07-14 22:08:28.446039,0 days 00:00:00.396938,0.8,7.0,0.6,0.220382,4,700,2,COMPLETE
7,7,62.7872,2021-07-14 22:08:28.521525,2021-07-14 22:08:28.907678,0 days 00:00:00.386153,1.0,,,0.270517,39,1000,3,COMPLETE
8,8,62.6362,2021-07-14 22:08:28.983910,2021-07-14 22:08:29.655409,0 days 00:00:00.671499,0.5,5.0,0.4,0.229996,2,600,19,COMPLETE
9,9,67.8467,2021-07-14 22:08:29.744220,2021-07-14 22:08:30.195173,0 days 00:00:00.450953,0.5,3.0,0.7,0.167551,7,900,5,COMPLETE


In [29]:
predicts_LGBM_lunch = lunch_model.predict(lunch_test_ols)
predicts_LGBM_dinner = dinner_model.predict(dinner_test_ols)

In [30]:
submission = pd.read_csv('../data/sample_submission.csv')

submission.iloc[:,1] = predicts_LGBM_lunch
submission.iloc[:,2] = predicts_LGBM_dinner
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1003.369876,344.148592
1,2021-01-28,953.946179,419.615368
2,2021-01-29,614.93658,209.807867
3,2021-02-01,1264.52702,553.696388
4,2021-02-02,1064.007291,468.465924


### 테스트 데이터 예측

# 저장

In [31]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_lgbm_autoML_20.csv', index =False)

오늘 날짜 : 20210714
