In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.rcParams['font.family'] = 'Gulim'

In [3]:
from workalendar.asia import SouthKorea
import pendulum

In [4]:
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## 전처리
- 일자에서 월과 일을 분리
- 요일을 레이블 인코딩화(EDA로 요일의 중요도 순 파악)
- 월 별, 일 별 중식 석식 수요 차이 파악

In [5]:
train['월'] = pd.DatetimeIndex(train['일자']).month
test['월'] = pd.DatetimeIndex(test['일자']).month
train['주'] = pd.DatetimeIndex(train['일자']).week
test['주'] = pd.DatetimeIndex(test['일자']).week
train['일'] = pd.DatetimeIndex(train['일자']).day
test['일'] = pd.DatetimeIndex(test['일자']).day

train['출근'] = train['본사정원수']-(train['본사휴가자수']+train['본사출장자수']+train['현본사소속재택근무자수'])
train['휴가비율'] = train['본사휴가자수']/train['본사정원수']
train['출장비율'] = train['본사출장자수']/train['본사정원수']
train['야근비율'] = train['본사시간외근무명령서승인건수']/train['출근']
train['재택비율'] = train['현본사소속재택근무자수']/train['본사정원수']

test['출근'] = test['본사정원수']-(test['본사휴가자수']+test['본사출장자수']+test['현본사소속재택근무자수'])
test['휴가비율'] = test['본사휴가자수']/test['본사정원수']
test['출장비율'] = test['본사출장자수']/test['본사정원수']
test['야근비율'] = test['본사시간외근무명령서승인건수']/test['출근']
test['재택비율'] = test['현본사소속재택근무자수']/test['본사정원수']

train['식사가능자수'] = train['본사정원수'] - train['본사휴가자수'] - train['현본사소속재택근무자수']
test['식사가능자수'] = test['본사정원수'] - test['본사휴가자수'] - test['현본사소속재택근무자수']

# train['중식참여율'] = train['중식계'] / train['식사가능자수']

  train['주'] = pd.DatetimeIndex(train['일자']).week
  test['주'] = pd.DatetimeIndex(test['일자']).week


In [6]:
month_rank4dinner = {
    1: 11,
    2: 2,
    3: 1,
    4: 4,
    5: 7,
    6: 6,
    7: 10,
    8: 8,
    9: 5,
    10: 3,
    11: 9,
    12: 12
}
train['월(석식)'] = train['월'].map(month_rank4dinner)
test['월(석식)'] = test['월'].map(month_rank4dinner)

month_rank4lunch = {
    1: 3,
    2: 1,
    3: 2,
    4: 6,
    5: 7,
    6: 8,
    7: 10,
    8: 9,
    9: 5,
    10: 4,
    11: 11,
    12: 12
}
train['월(중식)'] = train['월'].map(month_rank4lunch)
test['월(중식)'] = test['월'].map(month_rank4lunch)

weekday_rank4dinner = {
    '월': 1,
    '화': 2,
    '수': 4,
    '목': 3,
    '금': 5,
}

weekday_rank4lunch = {
    '월': 1,
    '화': 2,
    '수': 3,
    '목': 4,
    '금': 5,
}

train['요일(석식)'] = train['요일'].map(weekday_rank4dinner)
test['요일(석식)'] = test['요일'].map(weekday_rank4dinner)

train['요일(중식)'] = train['요일'].map(weekday_rank4lunch)
test['요일(중식)'] = test['요일'].map(weekday_rank4lunch)

In [7]:
rank = pd.DataFrame(range(1,53))
week_rank_lunch = pd.pivot_table(train,values='중식계',index='주').sort_values(by='중식계').reset_index().drop('중식계',axis=1)
week_rank_dinner = pd.pivot_table(train,values='석식계',index='주').sort_values(by='석식계').reset_index().drop('석식계',axis=1)


week_rank4lunch = {}
for i in range(len(rank)):
    week_rank4lunch[week_rank_lunch['주'][i]] = rank[0][i]


week_rank4dinner = {}
for i in range(len(rank)):
    week_rank4dinner[week_rank_dinner['주'][i]] = rank[0][i]
    
    
train['주(중식)'] = train['주'].map(week_rank4lunch)
test['주(중식)'] = test['주'].map(week_rank4lunch)

train['주(석식)'] = train['주'].map(week_rank4dinner)
test['주(석식)'] = test['주'].map(week_rank4dinner)

In [8]:
def is_holiday(date):
    holidays = list(map(str, pd.Series(np.array(SouthKorea().holidays(int(date[:4])))[:, 0])))
    
    yesterday = str(np.datetime64(date) - 1)
    tomorrow = str(np.datetime64(date) + 1)

    if tomorrow in holidays and yesterday in holidays:
        return 3
    if tomorrow in holidays:
        return 2
    elif yesterday in holidays:
        return 1
    else : 
        return 0

def week_of_month(x):
    dt = pendulum.parse(x)
    
    wom = dt.week_of_month
    if wom < 0:
        wom += 52
    return wom
    

df = pd.concat([train[['본사정원수', '일자']], test[['본사정원수', '일자']]])
df['년월'] = df['일자'].apply(lambda x : x[:7])
df = df[['년월', '본사정원수']].groupby(by=['년월'], as_index=False).mean()

def member_change(date):
    this_month = date[:7]
    last_month = str(np.datetime64(this_month) - 1)
    
    this_month_member = int(df[df['년월'] == this_month]['본사정원수'])
    last_month_member = int(df[df['년월'] == last_month]['본사정원수'])
    
    
    return  this_month_member - last_month_member

train['공휴일전후'] = train['일자'].apply(is_holiday)
test['공휴일전후'] = test['일자'].apply(is_holiday)

train['몇주차'] = train['일자'].apply(week_of_month)
test['몇주차'] = test['일자'].apply(week_of_month)

train = train[train['일자'] > '2016-03']
train['인원변화'] = train['일자'].apply(member_change)
test['인원변화'] = test['일자'].apply(member_change)

## 공휴일 변수 생성

In [9]:
train.columns

Index(['일자', '요일', '본사정원수', '본사휴가자수', '본사출장자수', '본사시간외근무명령서승인건수',
       '현본사소속재택근무자수', '조식메뉴', '중식메뉴', '석식메뉴', '중식계', '석식계', '월', '주', '일',
       '출근', '휴가비율', '출장비율', '야근비율', '재택비율', '식사가능자수', '월(석식)', '월(중식)',
       '요일(석식)', '요일(중식)', '주(중식)', '주(석식)', '공휴일전후', '몇주차', '인원변화'],
      dtype='object')

In [10]:
# 메뉴 변수 없이 사용할떄 해당 코드 사용['공휴일전후', '몇주차', '인원변화']

lunch_train = train[['공휴일전후', '몇주차', '인원변화', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]
lunch_test = test[['공휴일전후', '몇주차', '인원변화', '요일(중식)','월(중식)','일','주(중식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

dinner_train= train[['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]
dinner_test = test[['공휴일전후', '몇주차', '인원변화', '요일(석식)','월(석식)','일','주(석식)','출근', '휴가비율', '출장비율', '야근비율', '재택비율','본사출장자수','본사휴가자수','식사가능자수','본사시간외근무명령서승인건수']]

In [11]:
lunch_train.columns

Index(['공휴일전후', '몇주차', '인원변화', '요일(중식)', '월(중식)', '일', '주(중식)', '출근', '휴가비율',
       '출장비율', '야근비율', '재택비율', '본사출장자수', '본사휴가자수', '식사가능자수', '본사시간외근무명령서승인건수'],
      dtype='object')

In [12]:
print(lunch_train.shape)
print(lunch_test.shape)

(1187, 16)
(50, 16)


In [13]:
print(dinner_train.shape)
print(dinner_test.shape)

(1187, 16)
(50, 16)


In [14]:
cat_features = [f for f in lunch_train.columns if lunch_train[f].dtype == 'object']

def column_index(df, cat_features):
    cols = df.columns.values
    sidx = np.argsort(cols)
    return sidx[np.searchsorted(cols, cat_features, sorter=sidx)]

cat_features_idx = column_index(lunch_train, cat_features)    
print("Cat features are: %s" % [f for f in cat_features])
print(cat_features_idx)

Cat features are: []
[]


In [15]:
y_lunch = train[['중식계']]
y_dinner = train[['석식계']]

#### 분포 확인 및 분포 조정

# 중식 예측모델

In [19]:
import optuna
from lightgbm import LGBMRegressor
from optuna import Trial
from optuna.samplers import TPESampler
from sklearn.metrics import log_loss
from sklearn.model_selection import train_test_split

In [20]:
# from automl_alex import LightGBMRegressor, CatBoostRegressor, AutoMLRegressor
from sklearn.metrics import mean_absolute_error as MAE
import sklearn

In [30]:
def objective_lunch(trial: Trial) -> float:
    params_lgb = {
        "random_state": 42,
        "verbosity": -1,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "metric": "mae",
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 3e-5),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    
    X_train, X_valid, y_train, y_valid = train_test_split(lunch_train, y_lunch, test_size=0.2)

    model = LGBMRegressor(**params_lgb)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=100,
        verbose=False,
    
    )

    lgb_pred = model.predict(X_valid)
    log_score = MAE(y_valid, lgb_pred)
    
    return log_score

In [39]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(objective_lunch, n_trials=1000)
lunch_param = study.best_trial.params

print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

Best Score: 56.90992958536936
Best trial: {'reg_alpha': 1.622564509910422e-05, 'reg_lambda': 0.0727906280606794, 'max_depth': 12, 'num_leaves': 7, 'colsample_bytree': 0.7041252632987905, 'subsample': 0.9930902085857553, 'subsample_freq': 5, 'min_child_samples': 24, 'max_bin': 437}


In [41]:
X_train, X_valid, y_train, y_valid = train_test_split(lunch_train, y_lunch, test_size=0.2)

lunch_model = LGBMRegressor(**lunch_param)
lunch_model.fit(
            X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            early_stopping_rounds=100,
            verbose=False,
        )

LGBMRegressor(boosting_type='gbdt', class_weight=None,
       colsample_bytree=0.7041252632987905, importance_type='split',
       learning_rate=0.1, max_bin=437, max_depth=12, min_child_samples=24,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=7, objective=None, random_state=None,
       reg_alpha=1.622564509910422e-05, reg_lambda=0.0727906280606794,
       silent=True, subsample=0.9930902085857553, subsample_for_bin=200000,
       subsample_freq=5)

In [45]:
lunch_predict = lunch_model.predict(lunch_test)
lunch_predict

array([1014.9777623 ,  909.47280049,  620.73854459, 1275.52885314,
       1120.1193701 , 1127.0910706 ,  986.24555525,  708.89722304,
       1283.51582216, 1122.94235589,  796.14195167, 1245.84667052,
       1162.00161252, 1141.89353195,  993.62469507,  752.00117324,
       1275.99392291, 1094.0339315 ,  927.92435468,  868.64589183,
        586.10241297, 1189.69317269, 1063.32115747,  961.21331004,
        642.78917573, 1277.47589973, 1129.91660887, 1066.60266004,
        965.59432887,  717.72467543, 1272.38249029, 1040.64620536,
       1042.94201421,  940.76896329,  685.8227375 , 1234.42374383,
       1020.99966125,  906.05714052,  852.16730766,  596.73486267,
       1213.45282297, 1059.7517666 ,  952.15550966,  839.7069533 ,
        606.02660743, 1229.69615668, 1032.93963096,  955.71522891,
        867.748619  ,  595.39489614])

In [51]:
def objective_dinner(trial: Trial) -> float:
    params_lgb = {
        "random_state": 42,
        "verbosity": -1,
        "learning_rate": 0.05,
        "n_estimators": 10000,
        "metric": "mae",
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-8, 3e-5),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-8, 9e-2),
        "max_depth": trial.suggest_int("max_depth", 1, 20),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.4, 1.0),
        "subsample": trial.suggest_float("subsample", 0.3, 1.0),
        "subsample_freq": trial.suggest_int("subsample_freq", 1, 10),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
        "max_bin": trial.suggest_int("max_bin", 200, 500),
    }
    
    X_train, X_valid, y_train, y_valid = train_test_split(dinner_train, y_dinner, test_size=0.2)

    model = LGBMRegressor(**params_lgb)
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_valid, y_valid)],
        early_stopping_rounds=100,
        verbose=False,
    
    )

    lgb_pred = model.predict(X_valid)
    log_score = MAE(y_valid, lgb_pred)
    
    return log_score

In [52]:
sampler = TPESampler(seed=42)
study = optuna.create_study(
    study_name="lgbm_parameter_opt",
    direction="minimize",
    sampler=sampler,
)
study.optimize(objective_dinner, n_trials=1000)
dinner_param = study.best_trial.params

print("Best Score:", study.best_value)
print("Best trial:", study.best_trial.params)

Best Score: 45.37486993081729
Best trial: {'reg_alpha': 1.718625207505066e-06, 'reg_lambda': 0.03536226882919215, 'max_depth': 13, 'num_leaves': 149, 'colsample_bytree': 0.7238257801656021, 'subsample': 0.9792806922458114, 'subsample_freq': 3, 'min_child_samples': 26, 'max_bin': 423}


In [53]:
X_train, X_valid, y_train, y_valid = train_test_split(dinner_train, y_dinner, test_size=0.2)

dinner_model = LGBMRegressor(**dinner_param)
dinner_model.fit(
            X_train,
            y_train,
            eval_set=[(X_train, y_train), (X_valid, y_valid)],
            early_stopping_rounds=100,
            verbose=False,
        )

LGBMRegressor(boosting_type='gbdt', class_weight=None,
       colsample_bytree=0.7238257801656021, importance_type='split',
       learning_rate=0.1, max_bin=423, max_depth=13, min_child_samples=26,
       min_child_weight=0.001, min_split_gain=0.0, n_estimators=100,
       n_jobs=-1, num_leaves=149, objective=None, random_state=None,
       reg_alpha=1.718625207505066e-06, reg_lambda=0.03536226882919215,
       silent=True, subsample=0.9792806922458114, subsample_for_bin=200000,
       subsample_freq=3)

In [54]:
dinner_predict = dinner_model.predict(dinner_test)

In [58]:
submission = pd.read_csv('../data/sample_submission.csv')

submission.iloc[:,1] = lunch_predict
submission.iloc[:,2] = dinner_predict
submission.head()

Unnamed: 0,일자,중식계,석식계
0,2021-01-27,1014.977762,296.478621
1,2021-01-28,909.4728,416.123827
2,2021-01-29,620.738545,201.016733
3,2021-02-01,1275.528853,525.25684
4,2021-02-02,1120.11937,445.076138


In [62]:
def compare_ans(DIR):
    answer = pd.read_csv(DIR)

    lunch_answer = np.array(answer.iloc[:,1])
    dinner_answer = np.array(answer.iloc[:,2])
    
    lunch_MAE = abs(submission.iloc[:,1] - lunch_answer).mean()
    dinner_MAE = abs(submission.iloc[:,2] - dinner_answer).mean()
    
    print("lunch_MAE : ", lunch_MAE)
    print("dinner_MAE : ", dinner_MAE)
    print("total_MAE : ", (lunch_MAE+dinner_MAE)/2)
    
    
compare_ans('../submission/20210701_lgbm_knfold_drop.csv')
compare_ans('../submission/20210715_pycaret_(2).csv')
compare_ans('../submission/20210719_best_data_ensemble.csv')
compare_ans('../submission/20210719_best_data_ensemble.csv')
compare_ans('../submission/2021071')

lunch_MAE :  27.932682612863115
dinner_MAE :  25.158157576966723
total_MAE :  26.545420094914917
lunch_MAE :  57.73127840756888
dinner_MAE :  41.33405452565974
total_MAE :  49.53266646661431
lunch_MAE :  52.07852964790631
dinner_MAE :  29.854066446514597
total_MAE :  40.96629804721045


In [70]:
import glob

files = glob.glob('../submission/*')

for file in files[:-1]:
    print(file, compare_ans(file), end="\n\n" )

lunch_MAE :  28.054411287262766
dinner_MAE :  33.613523594133746
total_MAE :  30.833967440698256
../submission\20210624_pycaret_(4-2)_제출해봐야할거.csv None

lunch_MAE :  55.113359637347266
dinner_MAE :  37.351950671878306
total_MAE :  46.232655154612786
../submission\20210625_cat.csv None

lunch_MAE :  50.24057511286847
dinner_MAE :  39.66195212739441
total_MAE :  44.95126362013144
../submission\20210625_cat_10fold.csv None

lunch_MAE :  51.229281587926096
dinner_MAE :  38.820982925102236
total_MAE :  45.02513225651417
../submission\20210627_cat_15_KJH.csv None

lunch_MAE :  67.26376485036631
dinner_MAE :  59.41562966586005
total_MAE :  63.33969725811318
../submission\20210627_cat_after_2020.csv None

lunch_MAE :  51.229281587926096
dinner_MAE :  38.820982925102236
total_MAE :  45.02513225651417
../submission\20210627_cat_KJH.csv None

lunch_MAE :  51.229281587926096
dinner_MAE :  38.820982925102236
total_MAE :  45.02513225651417
../submission\20210627_pycaret_KJH.csv None

lunch_MAE :  42.

IndexError: single positional indexer is out-of-bounds

In [24]:
import math

math.cos(math.pi*68.44/180)*69.5

25.539537282882215

# 저장

In [71]:
import datetime
today = str(datetime.datetime.now().date()).replace("-","")
print("오늘 날짜 : " + today)

submission.to_csv(f'../submission/{today}_optuna_AutoML.csv', index =False)

오늘 날짜 : 20210720
