In [1]:
import pandas as pd
import numpy as np
from pycaret.regression import setup, compare_models, blend_models,tune_model,predict_model,get_config, finalize_model

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.family'] = 'Gulim'

In [2]:
from sklearn.model_selection import KFold
from catboost import CatBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import GradientBoostingRegressor

In [3]:
from workalendar.asia import SouthKorea
import pendulum

In [4]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## 전처리
- 일자에서 월과 일을 분리
- 요일을 레이블 인코딩화(EDA로 요일의 중요도 순 파악)
- 월 별, 일 별 중식 석식 수요 차이 파악

In [6]:
train['월'] = pd.DatetimeIndex(train['일자']).month.astype(str)
test['월'] = pd.DatetimeIndex(test['일자']).month.astype(str)
train['주'] = pd.DatetimeIndex(train['일자']).week.astype(str)
test['주'] = pd.DatetimeIndex(test['일자']).week.astype(str)
train['일'] = pd.DatetimeIndex(train['일자']).day.astype(str)
test['일'] = pd.DatetimeIndex(test['일자']).day.astype(str)

train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

## 공휴일 변수 생성

In [7]:
def is_holiday(date):
    holidays = list(map(str, pd.Series(np.array(SouthKorea().holidays(int(date[:4])))[:, 0])))
    
    yesterday = str(np.datetime64(date) - 1)
    tomorrow = str(np.datetime64(date) + 1)

    if tomorrow in holidays and yesterday in holidays:
        return 3 #'S'
    if tomorrow in holidays:
        return 2 # 'T'
    elif yesterday in holidays:
        return 1 #'Y'
    else : 
        return 0 #'N'

def week_of_month(x):
    dt = pendulum.parse(x)
    
    wom = dt.week_of_month
    if wom < 0:
        wom += 52
    return wom
    

df = pd.concat([train[['본사정원수', '일자']], test[['본사정원수', '일자']]])
df['년월'] = df['일자'].apply(lambda x : x[:7])
df = df[['년월', '본사정원수']].groupby(by=['년월'], as_index=False).mean()

def member_change(date):
    this_month = date[:7]
    last_month = str(np.datetime64(this_month) - 1)
    
    this_month_member = int(df[df['년월'] == this_month]['본사정원수'])
    last_month_member = int(df[df['년월'] == last_month]['본사정원수'])
    
    
    return  this_month_member - last_month_member

train['공휴일전후'] = train['일자'].apply(is_holiday)
test['공휴일전후'] = test['일자'].apply(is_holiday)

train['몇주차'] = train['일자'].apply(week_of_month)
test['몇주차'] = test['일자'].apply(week_of_month)

train = train[train['일자'] > '2016-03']
train['인원변화'] = train['일자'].apply(member_change)
test['인원변화'] = test['일자'].apply(member_change)

In [8]:
month_rank4dinner = {
    1: 11,
    2: 2,
    3: 1,
    4: 4,
    5: 7,
    6: 6,
    7: 10,
    8: 8,
    9: 5,
    10: 3,
    11: 9,
    12: 12
}
train['월(석식)'] = train['월'].map(month_rank4dinner)
test['월(석식)'] = test['월'].map(month_rank4dinner)

month_rank4lunch = {
    1: 3,
    2: 1,
    3: 2,
    4: 6,
    5: 7,
    6: 8,
    7: 10,
    8: 9,
    9: 5,
    10: 4,
    11: 11,
    12: 12
}
train['월(중식)'] = train['월'].map(month_rank4lunch)
test['월(중식)'] = test['월'].map(month_rank4lunch)

weekday_rank4dinner = {
    '월': 1,
    '화': 2,
    '수': 4,
    '목': 3,
    '금': 5,
}

weekday_rank4lunch = {
    '월': 1,
    '화': 2,
    '수': 3,
    '목': 4,
    '금': 5,
}

rank = pd.DataFrame(range(1,53))
week_rank_lunch = pd.pivot_table(train,values='중식계',index='주').sort_values(by='중식계').reset_index().drop('중식계',axis=1)
week_rank_dinner = pd.pivot_table(train,values='석식계',index='주').sort_values(by='석식계').reset_index().drop('석식계',axis=1)


week_rank4lunch = {}
for i in range(len(rank)):
    week_rank4lunch[week_rank_lunch['주'][i]] = rank[0][i]


week_rank4dinner = {}
for i in range(len(rank)):
    week_rank4dinner[week_rank_dinner['주'][i]] = rank[0][i]
    
    
train['주(중식)'] = train['주'].map(week_rank4lunch)
test['주(중식)'] = test['주'].map(week_rank4lunch)

train['주(석식)'] = train['주'].map(week_rank4dinner)
test['주(석식)'] = test['주'].map(week_rank4dinner)

train['요일(석식)'] = train['요일'].map(weekday_rank4dinner)
test['요일(석식)'] = test['요일'].map(weekday_rank4dinner)

train['요일(중식)'] = train['요일'].map(weekday_rank4lunch)
test['요일(중식)'] = test['요일'].map(weekday_rank4lunch)

## 최종 데이터 셋 구축

In [9]:
train.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계', '월', '주', '일',
       '출근', '휴가비율', '출장비율', '야근비율', '재택비율', '공휴일전후', '몇주차', '인원변화', '월(석식)',
       '월(중식)', '주(중식)', '주(석식)', '요일(석식)', '요일(중식)'],
      dtype='object')

In [10]:
lunch_train = train.drop(columns=['본사정원수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수','일자', '요일','주', '월', '석식계', '요일(석식)','조식메뉴', '중식메뉴', '석식메뉴','주(석식)', '월(석식)'])
lunch_test = test.drop(columns=['본사정원수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수','일자', '요일','주', '월', '요일(석식)', '월(석식)', '조식메뉴', '중식메뉴','주(석식)' , '석식메뉴'])


dinner_train = train.drop(columns=['본사정원수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수','일자', '요일','주', '월', '중식계', '요일(중식)','조식메뉴', '중식메뉴', '석식메뉴','주(중식)' , '월(중식)'])
dinner_test = test.drop(columns=['본사정원수', '본사시간외근무명령서승인건수', '현본사소속재택근무자수','일자', '요일', '주','월', '요일(중식)', '월(중식)', '조식메뉴', '중식메뉴','주(중식)' , '석식메뉴'])

In [11]:
print(lunch_train.shape)
print(lunch_test.shape)

(1187, 15)
(50, 14)


In [12]:
print(dinner_train.shape)
print(dinner_test.shape)

(1187, 15)
(50, 14)


In [13]:
lunch_train.columns

Index(['본사휴가자수', '본사출장자수', '중식계', '일', '출근', '휴가비율', '출장비율', '야근비율', '재택비율',
       '공휴일전후', '몇주차', '인원변화', '월(중식)', '주(중식)', '요일(중식)'],
      dtype='object')

#### 분포 확인 및 분포 조정

In [14]:
drop_index = dinner_train[dinner_train['석식계']==0].index

dinner_train.iloc[drop_index]

Unnamed: 0,본사휴가자수,본사출장자수,석식계,일,출근,휴가비율,출장비율,야근비율,재택비율,공휴일전후,몇주차,인원변화,월(석식),주(석식),요일(석식)
222,206,194,328.0,26,2305.0,0.076155,0.071719,0.027332,0.0,1,5,16,,1,1
242,108,202,509.0,23,2387.0,0.040044,0.074898,0.165899,0.0,0,5,-8,,4,1
262,75,252,0.0,22,2305.0,0.028495,0.095745,0.0,0.0,0,4,-65,,47,4
280,55,222,647.0,21,2350.0,0.020936,0.084507,0.193191,0.0,0,4,-5,,41,2
299,82,220,479.0,17,2324.0,0.031226,0.083778,0.186747,0.0,0,4,-1,,39,1
324,164,302,462.0,26,2171.0,0.062192,0.114524,0.027637,0.0,0,4,11,,28,5
345,59,256,575.0,27,2333.0,0.022281,0.096677,0.132447,0.0,0,5,11,,7,2
364,237,211,486.0,24,2391.0,0.08348,0.074322,0.167712,0.0,0,5,191,,6,1
384,74,213,604.0,22,2358.0,0.027977,0.080529,0.205259,0.0,0,4,-194,,46,2
410,70,265,0.0,27,2307.0,0.026495,0.100303,0.0,0.0,0,5,-3,,16,4


In [15]:
dinner_train.drop(drop_index, inplace=True)

print(dinner_train.shape)

(1144, 15)


# 중식 예측모델

In [16]:
y_lunch = np.array(lunch_train['중식계'])
y_lunch = y_lunch.reshape(-1)
lunch_train.drop(columns=['중식계'], inplace=True)

y_dinner = np.array(dinner_train['석식계'])
y_dinner = y_dinner.reshape(-1)
dinner_train.drop(columns=['석식계'], inplace=True)

In [17]:
cat_features = [f for f in lunch_train.columns if lunch_train[f].dtype == 'object']

def column_index(df, cat_features):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, cat_features, sorter=sidx)]

cat_features_idx = column_index(lunch_train, cat_features)    
print("Cat features are: %s" % [f for f in cat_features])
print(cat_features_idx)

Cat features are: ['일']
[2]


In [20]:
n_fold = 8

kfold = KFold(n_splits=n_fold, shuffle=False)
# for each fold

lunch_models_cat = [CatBoostRegressor(n_estimators = 3000, loss_function='MAE') for _ in range(n_fold)]
lunch_preds_cat = []

lunch_models_gbr = [GradientBoostingRegressor() for _ in range(n_fold)]
lunch_preds_gbr = []

lunch_trues = []

for idx, (tr_idx, val_idx) in enumerate(kfold.split(lunch_train, y_lunch)):
    X_tr, X_val = lunch_train.iloc[tr_idx], lunch_train.iloc[val_idx]
    y_tr, y_val = y_lunch[tr_idx], y_lunch[val_idx]

    temp_model_cat = lunch_models_cat[idx]
    temp_model_cat.fit(X_tr, y_tr,
                       eval_set=[(X_tr, y_tr), (X_val, y_val)],
                       cat_features=cat_features,
                       early_stopping_rounds=50,
                       verbose=400)
    
    lunch_models_cat[idx] = temp_model_cat
    
    pred_cat = temp_model_cat.predict(X_val)
    lunch_preds_cat.extend(pred_cat)

    temp_model_gbr = lunch_models_gbr[idx]
    temp_model_gbr.fit(X_tr, y_tr)
    
    lunch_models_gbr[idx] = temp_model_gbr
    
    pred_gbr = temp_model_gbr.predict(X_val)
    
    lunch_preds_gbr.extend(pred_lgb)
    
    true = y_val
    lunch_trues.extend(true)

0:	learn: 168.5447004	test: 168.5447004	test1: 138.1675830	best: 138.1675830 (0)	total: 16ms	remaining: 48s
Stopped by overfitting detector  (50 iterations wait)

bestTest = 71.39108893
bestIteration = 163

Shrink model to first 164 iterations.


ValueError: Input contains NaN, infinity or a value too large for dtype('float32').

### 테스트 데이터 예측

In [None]:
lunch_preds_cat = np.array(lunch_preds_cat)
lunch_preds_xgb = np.array(lunch_preds_xgb)
lunch_preds_lgb = np.array(lunch_preds_lgb)

lunch_trues = np.array(lunch_trues)

abs(lunch_trues-lunch_preds_cat).mean(), abs(lunch_trues-lunch_preds_xgb).mean(), abs(lunch_trues-lunch_preds_lgb).mean()

In [None]:
lunch = np.array([0]*50).astype(np.float64)

for model in lunch_models_cat:
    lunch += model.predict(lunch_test)
    
for model in lunch_models_xgb:
    lunch += model.predict(lunch_test)
    
for model in lunch_models_lgb:
    lunch += model.predict(lunch_test)
    
lunch /= n_fold * 3

# 석식 예측모델

In [None]:
dinner_models_cat = [CatBoostRegressor(n_estimators = 3000, loss_function='MAE') for _ in range(n_fold)]
dinner_preds_cat = []

dinner_models_lgb = [GradientBoostingRegressor() for _ in range(n_fold)]
dinner_preds_lgb = []

dinner_trues = []

for idx, (tr_idx, val_idx) in enumerate(kfold.split(dinner_train, y_dinner)):
    X_tr, X_val = dinner_train.iloc[tr_idx], dinner_train.iloc[val_idx]
    y_tr, y_val = y_dinner[tr_idx], y_dinner[val_idx]

    temp_model_cat = dinner_models_cat[idx]
    temp_model_cat.fit(X_tr, y_tr,
                   eval_set=[(X_tr, y_tr), (X_val, y_val)],
                   early_stopping_rounds=50,
                   verbose=400)
    
    dinner_models_cat[idx] = temp_model_cat
    
    pred_cat = temp_model_cat.predict(X_val)
    dinner_preds_cat.extend(pred_cat)

    temp_model_xgb = dinner_models_xgb[idx]
    temp_model_xgb.fit(X_tr, y_tr,
                   eval_set=[(X_tr, y_tr), (X_val, y_val)],
                   early_stopping_rounds=50,
                   verbose=400)
    
    dinner_models_xgb[idx] = temp_model_xgb
    
    pred_xgb = temp_model_xgb.predict(X_val)
    dinner_preds_xgb.extend(pred_xgb)

    temp_model_lgb = dinner_models_lgb[idx]
    temp_model_lgb.fit(X_tr, y_tr)
#                    eval_set=[(X_tr, y_tr), (X_val, y_val)],
#                    early_stopping_rounds=50,
#                    verbose=400)
    
    dinner_models_lgb[idx] = temp_model_lgb
    
    pred_lgb = temp_model_lgb.predict(X_val)
    
    dinner_preds_lgb.extend(pred_lgb)
    
    true = y_val
    dinner_trues.extend(true)

In [None]:
dinner_preds_cat = np.array(dinner_preds_cat)
dinner_preds_xgb = np.array(dinner_preds_xgb)
dinner_preds_lgb = np.array(dinner_preds_lgb)

dinner_trues = np.array(dinner_trues)

abs(dinner_trues-dinner_preds_cat).mean(), abs(dinner_trues-dinner_preds_xgb).mean(), abs(dinner_trues-dinner_preds_lgb).mean()

In [None]:
singles = set()
for idx, model in enumerate(dinner_models_xgb):
    singles.add((f'model_{idx}', model))

In [None]:
dinner = np.array([0]*50).astype(np.float64)

for model in dinner_models_cat:
    dinner += model.predict(dinner_test)
    
for model in dinner_models_xgb:
    dinner += model.predict(dinner_test)
    
for model in dinner_models_lgb:
    dinner += model.predict(dinner_test)
    
dinner /= n_fold * 3

## 테스트

### 테스트 데이터 예측

In [None]:
submission = pd.read_csv('../data/sample_submission.csv')
submission.iloc[:,1] = lunch
submission.iloc[:,2] = dinner

submission.head()

In [None]:
answer = pd.read_csv('../submission/제출해야될것.csv')

lunch_answer = np.array(answer.iloc[:,1])
dinner_answer = np.array(answer.iloc[:,2])

abs(lunch - lunch_answer).mean(), abs(dinner - dinner_answer).mean()

In [None]:
submission.head(11)

# 저장

In [None]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_cat_xgb_8.csv', index =False)

In [None]:
def plot_feature_importance(importance, names, model_type):
    
    feature_importance = np.array(importance)
    feature_names = np.array(names)
    
    data={'feature_names':feature_names,'feature_importance':feature_importance}
    fi_df = pd.DataFrame(data)
    
    fi_df.sort_values(by=['feature_importance'], ascending=False,inplace=True)

    plt.figure(figsize=(10,8))

    sns.barplot(x=fi_df['feature_importance'], y=fi_df['feature_names'])

    plt.title(model_type + ' Feature Importance')
    plt.xlabel('Feature Importance')
    plt.ylabel('Feature Names')

In [None]:
for i in range(n_fold):
    plot_feature_importance(dinner_models[0].get_feature_importance(),dinner_train.iloc[:,:-1].columns,"CATBOOST")

In [None]:
lunch_train