In [31]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.family'] = 'Gulim'

In [32]:
from workalendar.asia import SouthKorea
import pendulum

In [33]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## 전처리
- 일자에서 월과 일을 분리
- 요일을 레이블 인코딩화(EDA로 요일의 중요도 순 파악)
- 월 별, 일 별 중식 석식 수요 차이 파악

In [34]:
train['월'] = pd.DatetimeIndex(train['일자']).month
test['월'] = pd.DatetimeIndex(test['일자']).month
train['주'] = pd.DatetimeIndex(train['일자']).week
test['주'] = pd.DatetimeIndex(test['일자']).week
train['일'] = pd.DatetimeIndex(train['일자']).day
test['일'] = pd.DatetimeIndex(test['일자']).day

train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

train['식사가능자수'] = train['본사정원수'] - train['본사휴가자수'] - train['현본사소속재택근무자수']
test['식사가능자수'] = test['본사정원수'] - test['본사휴가자수'] - test['현본사소속재택근무자수']

# train['중식참여율'] = train['중식계'] / train['식사가능자수']

  train['주'] = pd.DatetimeIndex(train['일자']).week
  test['주'] = pd.DatetimeIndex(test['일자']).week


In [35]:
def is_holiday(date):
    holidays = list(map(str, pd.Series(np.array(SouthKorea().holidays(int(date[:4])))[:, 0])))
    
    yesterday = str(np.datetime64(date) - 1)
    tomorrow = str(np.datetime64(date) + 1)

    if tomorrow in holidays and yesterday in holidays:
        return 3
    if tomorrow in holidays:
        return 2
    elif yesterday in holidays:
        return 1
    else : 
        return 0

def week_of_month(x):
    dt = pendulum.parse(x)
    
    wom = dt.week_of_month
    if wom < 0:
        wom += 52
    return wom
    

df = pd.concat([train[['본사정원수', '일자']], test[['본사정원수', '일자']]])
df['년월'] = df['일자'].apply(lambda x : x[:7])
df = df[['년월', '본사정원수']].groupby(by=['년월'], as_index=False).mean()

def member_change(date):
    this_month = date[:7]
    last_month = str(np.datetime64(this_month) - 1)
    
    this_month_member = int(df[df['년월'] == this_month]['본사정원수'])
    last_month_member = int(df[df['년월'] == last_month]['본사정원수'])
    
    
    return  this_month_member - last_month_member

train['공휴일전후'] = train['일자'].apply(is_holiday)
test['공휴일전후'] = test['일자'].apply(is_holiday)

train['몇주차'] = train['일자'].apply(week_of_month)
test['몇주차'] = test['일자'].apply(week_of_month)

train = train[train['일자'] > '2016-03']
train['인원변화'] = train['일자'].apply(member_change)
test['인원변화'] = test['일자'].apply(member_change)

In [36]:
weekday = {
    '월': 1,
    '화': 2,
    '수': 3,
    '목': 4,
    '금': 5
}

train['요일'] = train['요일'].map(weekday)
test['요일'] = test['요일'].map(weekday)

## 공휴일 변수 생성

In [37]:
train.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계', '월', '주', '일',
       '출근', '휴가비율', '출장비율', '야근비율', '재택비율', '식사가능자수', '공휴일전후', '몇주차', '인원변화'],
      dtype='object')

In [52]:
# 메뉴 변수 없이 사용할떄 해당 코드 사용['공휴일전후', '몇주차', '인원변화']

X_data = train[['공휴일전후', '몇주차', '인원변화', '요일','월','일','주','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]
X_test = test[['공휴일전후', '몇주차', '인원변화', '요일','월','일','주','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

In [53]:
cat_features = [f for f in X_train.columns if X_train[f].dtype == 'object']

def column_index(df, cat_features):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, cat_features, sorter=sidx)]

cat_features_idx = column_index(X_train, cat_features)    
print("Cat features are: %s" % [f for f in cat_features])
print(cat_features_idx)

Cat features are: []
[]


In [54]:
y_data = np.array(train[['중식계', '석식계']])

y_data

array([[1127.,  631.],
       [1000.,  473.],
       [ 837.,  673.],
       ...,
       [ 579.,  217.],
       [1145.,  502.],
       [1015.,  480.]])

#### 분포 확인 및 분포 조정

# 중식 예측모델

In [55]:
from sklearn.multioutput import MultiOutputRegressor
from xgboost import XGBRegressor

import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.metrics import mean_absolute_error as MAE
from sklearn.model_selection import KFold

In [80]:
reg_candidate = [1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1, 5, 10, 100]

# space 정의, Hyperparameter의 이름을 key 값으로 입력
space={'max_depth': hp.quniform("max_depth", 5, 15, 1),
       'learning_rate': hp.quniform ('learning_rate', 0.01, 0.05, 0.005),
       'reg_alpha' : hp.choice('reg_alpha', reg_candidate),
       'reg_lambda' : hp.choice('reg_lambda', reg_candidate),
       'subsample': hp.quniform('subsample', 0.6, 1, 0.05),
       'colsample_bytree' : hp.quniform('colsample_bytree', 0.6, 1, 0.05),
       'min_child_weight' : hp.quniform('min_child_weight', 1, 10, 1),
       'n_estimators': hp.quniform('n_estimators', 200, 1500, 100)
      }
y_lunch = np.array(y_lunch).reshape(-1,1)

def hyperparameter_tuning(space):
    n_fold = 4
    kf = KFold(n_splits=n_fold, random_state=42, shuffle=True)
    mae_log = 0
    
    for train_index, test_index in kf.split(X_data):
        X_train, X_valid = X_data.iloc[train_index], X_data.iloc[test_index]
        y_train, y_valid = y_data[train_index], y_data[test_index]        
        
        model=XGBRegressor(n_estimators =int(space['n_estimators']), 
                           max_depth = int(space['max_depth']), 
                           learning_rate = space['learning_rate'],
                           reg_alpha = space['reg_alpha'],
                           reg_lambda = space['reg_lambda'],
                           subsample = space['subsample'],
                           colsample_bytree = space['colsample_bytree'], 
                           min_child_weight = int(space['min_child_weight']),
                           random_state=42, 
                          )
        
        clf = MultiOutputRegressor(model)
        clf.fit(X_train, y_train, verbose=0)

        pred = clf.predict(X_valid)
        mae= MAE(y_valid, pred)   
        
        mae_log += mae/n_fold
    
    return {'loss':mae_log, 'status': STATUS_OK, 'model': model}

In [82]:
# Trials 객체 선언합니다.
trials = Trials()
# best에 최적의 하이퍼 파라미터를 return 받습니다.
best = fmin(fn=hyperparameter_tuning,
              space=space,
              algo=tpe.suggest,
              max_evals=200, # 최대 반복 횟수를 지정합니다.
              trials=trials,
              rstate= np.random.seed(42))

# 최적화된 결과를 int로 변환해야하는 파라미터는 타입 변환을 수행합니다.
best['max_depth'] = int(best['max_depth'])
best['min_child_weight'] = int(best['min_child_weight'])
best['n_estimators'] = int(best['n_estimators'])
best['reg_alpha'] = reg_candidate[int(best['reg_alpha'])]
best['reg_lambda'] = reg_candidate[int(best['reg_lambda'])]
best['random_state'] = 42
print(best)

  0%|                                                                          | 0/200 [00:00<?, ?trial/s, best loss=?]







  0%|▏                                                | 1/200 [00:14<48:10, 14.52s/trial, best loss: 62.10857594681997]







  1%|▌                                                 | 2/200 [00:25<41:41, 12.64s/trial, best loss: 61.7690167959811]







  2%|▊                                                 | 3/200 [00:32<32:09,  9.80s/trial, best loss: 61.7690167959811]







  2%|█                                                 | 4/200 [00:43<33:51, 10.37s/trial, best loss: 61.7690167959811]







  2%|█▎                                                | 5/200 [00:55<35:44, 11.00s/trial, best loss: 61.7690167959811]







  3%|█▌                                                | 6/200 [01:11<41:03, 12.70s/trial, best loss: 61.7690167959811]







  4%|█▋                                               | 7/200 [01:21<38:12, 11.88s/trial, best loss: 61.51887460451227]







  4%|█▋                                               | 7/200 [01:39<45:40, 14.20s/trial, best loss: 61.51887460451227]


KeyboardInterrupt: 

In [83]:
X_train, X_valid, y_train, y_valid = train_test_split(lunch_train, y_lunch, test_size=0.2)

lunch_model = XGBRegressor(**lunch_param)
lunch_model.fit(
            X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            early_stopping_rounds=100,
            verbose=False,
        )

NameError: name 'train_test_split' is not defined

In [52]:
submission = pd.read_csv('../data/sample_submission.csv')

submission.iloc[:,1] = lunch_predict
submission.iloc[:,2] = dinner_predict
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,912.140625,314.359772
1,2021-01-28,899.830078,390.585327
2,2021-01-29,675.871826,219.946182
3,2021-02-01,1162.267822,508.577972
4,2021-02-02,995.791382,460.730774


In [53]:
def compare_ans(DIR):
    answer = pd.read_csv(DIR)

    lunch_answer = np.array(answer.iloc[:,1])
    dinner_answer = np.array(answer.iloc[:,2])
    
    lunch_MAE = abs(submission.iloc[:,1] - lunch_answer).mean()
    dinner_MAE = abs(submission.iloc[:,2] - dinner_answer).mean()
    
    print("lunch_MAE : ", lunch_MAE)
    print("dinner_MAE : ", dinner_MAE)
    print("total_MAE : ", (lunch_MAE+dinner_MAE)/2)

In [2]:
import glob

files = glob.glob('../submission/*')

for file in files[:-1]:
    print(file, compare_ans(file), end="\n\n" )

NameError: name 'compare_ans' is not defined

In [1]:
print(files)

NameError: name 'files' is not defined

In [None]:
import math

math.cos(math.pi*68.44/180)*69.5

# 저장

In [None]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_optuna_AutoML.csv', index =False)