In [None]:
# When running on Google Colaboratory, uncomment and run the following cells.

In [None]:
# # Running the lines below prompts for an authorization code.
# # Click the printed link, sign in with your Google account,
# # then copy the authorization code and paste it here.
# import os
# from google.colab import drive
# drive.mount('/content/drive')

In [None]:
# working_dir = 'MLSys_100Knocks' # ** Change this if the folder you created has a different path. **
# path = f'/content/drive/MyDrive/{working_dir}/MainChapter/chapter08'
# os.chdir(path)

In [None]:
# Enable ipywidgets for Jupyter
# for jupyter notebook (when using a virtualenv)
#!jupyter nbextension enable --user --py widgetsnbextension

# for jupyter lab
#!jupyter labextension install @jupyter-widgets/jupyterlab-manager

# 8장 구현한 머신러닝 모델로 새로운 데이터를 예측하는 테크닉 10



### 테크닉 71: 폴더를 만들고 데이터 로딩을 준비하자

In [None]:
import os

# Folder layout used by the rest of the notebook: three data sub-folders
# (input, output, master) under `data`, plus a separate folder for models.
data_dir = 'data'
input_dir, output_dir, master_dir = (
    os.path.join(data_dir, sub_dir)
    for sub_dir in ('0_input', '1_output', '99_master')
)
model_dir = 'models'

# exist_ok=True keeps this cell safe to re-run when the folders already exist.
for folder in (input_dir, output_dir, master_dir, model_dir):
    os.makedirs(folder, exist_ok=True)

##### **반드시 데이터와 모델을 해당 폴더에 넣어주십시오.**

### 테크닉 72: 예측할 신규 데이터를 로딩하자

In [None]:
import pandas as pd

# Master tables are read from the master folder; they are merged into the
# order data later by data_processing().
m_area_file, m_store_file = 'm_area.csv', 'm_store.csv'
m_area, m_store = (
    pd.read_csv(os.path.join(master_dir, file_name))
    for file_name in (m_area_file, m_store_file)
)

In [None]:
# `datetime` is imported here because a later cell uses it to compute the
# report month.
import datetime

# Target year-month (YYYYMM) of the new order data to predict from.
tg_ym = '202103'
target_file = 'tbl_order_' + tg_ym + '.csv'
target_data = pd.read_csv(os.path.join(input_dir, target_file))

# Parse the acceptance timestamps once and reuse them for both bounds.
order_accept_dates = pd.to_datetime(target_data['order_accept_date'])
max_date = order_accept_dates.max()
min_date = order_accept_dates.min()

# min()/max() return NaT when the column holds no valid date, and
# NaT.strftime() fails with an unhelpful message, so report that case clearly.
if pd.isna(min_date) or pd.isna(max_date):
    raise ValueError(f'{target_file}에 유효한 order_accept_date가 없습니다')

max_str_date = max_date.strftime('%Y%m')
min_str_date = min_date.strftime('%Y%m')

# Every order must fall inside the target month: both the earliest and the
# latest acceptance date have to format to tg_ym.
if tg_ym == min_str_date and tg_ym == max_str_date:
    print('날짜가 일치합니다')
else:
    # ValueError is a subclass of Exception, so existing handlers still match.
    raise ValueError('날짜가 일치하지 않습니다')

### 테크닉 73: 신규 데이터를 매장별로 집계하자

In [None]:
def calc_delta(t):
    """Return the elapsed time between two timestamps, in minutes.

    Args:
        t: A two-item iterable ``(start, end)`` whose items support
            subtraction and yield an object with ``total_seconds()``.

    Returns:
        ``(end - start)`` expressed in minutes; negative when ``end``
        precedes ``start``.
    """
    start, end = t
    return (end - start).total_seconds() / 60


def _count_orders_by_store(orders, column_name):
    """Count non-null ``order_id`` values per ``store_name`` as a one-column frame."""
    counts = orders.groupby(['store_name'])[['order_id']].count()
    counts.columns = [column_name]
    return counts


def data_processing(order_data):
    """Aggregate raw order rows into one feature row per store.

    Reads the module-level master tables ``m_store`` (joined on ``store_id``)
    and ``m_area`` (joined on ``area_cd``).

    Args:
        order_data: Order-level DataFrame. The code reads the columns
            ``store_id``, ``order_id``, ``takeout_flag``, ``status``,
            ``order_accept_date`` and ``delivered_date``. ``store_name`` and
            ``area_cd`` must be available after the ``m_store`` merge
            (presumably supplied by ``m_store`` - confirm against the CSV).

    Returns:
        DataFrame indexed by ``store_name`` with the columns ``order``,
        ``order_fin``, ``order_cancel``, ``order_delivery``, ``order_takeout``,
        ``order_weekday``, ``order_weekend``, one ``order_time_<hour>`` column
        per acceptance hour present in the data, and ``delta_avg`` (mean
        minutes from acceptance to delivery over non-cancelled orders).
        A count is NaN where a store has no matching orders.
    """
    # store_id 999 is excluded up front
    # (NOTE(review): presumably a dummy/maintenance store - confirm).
    order_data = order_data.loc[order_data['store_id'] != 999]
    order_data = pd.merge(order_data, m_store, on='store_id', how='left')
    order_data = pd.merge(order_data, m_area, on='area_cd', how='left')

    # Human-readable labels; codes not listed here stay NaN.
    order_data.loc[order_data['takeout_flag'] == 0, 'takeout_name'] = 'delivery'
    order_data.loc[order_data['takeout_flag'] == 1, 'takeout_name'] = 'takeout'
    order_data.loc[order_data['status'] == 0, 'status_name'] = '주문 접수'
    order_data.loc[order_data['status'] == 1, 'status_name'] = '결제 완료'
    order_data.loc[order_data['status'] == 2, 'status_name'] = '배달 완료'
    order_data.loc[order_data['status'] == 9, 'status_name'] = '주문 취소'

    accepted_at = pd.to_datetime(order_data['order_accept_date'])
    delivered_at = pd.to_datetime(order_data['delivered_date'])
    order_data['order_accept_datetime'] = accepted_at
    order_data['delivered_datetime'] = delivered_at
    # Vectorised minutes between acceptance and delivery (replaces the
    # row-by-row apply, which yields the same values but is much slower).
    order_data['delta'] = (delivered_at - accepted_at).dt.total_seconds() / 60
    order_data['order_accept_hour'] = accepted_at.dt.hour
    order_data['order_accept_weekday'] = accepted_at.dt.weekday
    # pandas weekday: Monday=0 ... Sunday=6, so >= 5 means Saturday/Sunday.
    order_data.loc[order_data['order_accept_weekday'] >= 5,
                   'weekday_info'] = '휴일'
    order_data.loc[order_data['order_accept_weekday'] < 5,
                   'weekday_info'] = '평일'

    is_finished = order_data['status_name'].isin(['배달 완료', '결제 완료'])
    is_cancelled = order_data['status_name'] == '주문 취소'

    store_frames = [
        _count_orders_by_store(order_data, 'order'),
        _count_orders_by_store(order_data.loc[is_finished], 'order_fin'),
        _count_orders_by_store(order_data.loc[is_cancelled], 'order_cancel'),
        _count_orders_by_store(
            order_data.loc[order_data['takeout_name'] == 'delivery'],
            'order_delivery'),
        _count_orders_by_store(
            order_data.loc[order_data['takeout_name'] == 'takeout'],
            'order_takeout'),
        _count_orders_by_store(
            order_data.loc[order_data['weekday_info'] == '평일'],
            'order_weekday'),
        _count_orders_by_store(
            order_data.loc[order_data['weekday_info'] == '휴일'],
            'order_weekend'),
    ]

    # One count column per acceptance hour, in order of first appearance.
    for hour in order_data['order_accept_hour'].unique():
        store_frames.append(_count_orders_by_store(
            order_data.loc[order_data['order_accept_hour'] == hour],
            f'order_time_{hour}'))

    # Select 'delta' BEFORE aggregating: calling mean() on the whole grouped
    # frame raises TypeError on pandas >= 2.0 because of the string columns.
    store_delta = order_data.loc[~is_cancelled].groupby(
        ['store_name'])[['delta']].mean()
    store_delta.columns = ['delta_avg']
    store_frames.append(store_delta)

    return pd.concat(store_frames, axis=1)

In [None]:
# Aggregate per store, then move store_name from the index into a regular
# column so it can be used as a feature source and as a merge key later.
store_data = data_processing(target_data).reset_index(drop=False)
# Snapshot taken before any dummy columns are added; a later cell turns it
# into the actual-results table.
actual_data = store_data.copy()

### 테크닉 74: 신규 데이터의 범주형 변수에 대응하자

In [None]:
# One-hot encode the store name and drop one fixed dummy column.
# NOTE(review): presumably the same reference category that was dropped when
# the models were trained - confirm against the training notebook.
category_data = pd.get_dummies(
    store_data['store_name'], prefix='store', prefix_sep='_',
).drop(columns='store_가덕해안로점')
store_data = pd.concat([store_data, category_data], axis=1)
store_data.head(3)

### 테크닉 75: 모델 투입 직전의 형식으로 정리하자

In [None]:
# The list of feature columns is stored next to the models, in the 'X_cols'
# column of X_cols.csv.
X_cols_name = 'X_cols.csv'
X_cols = pd.read_csv(os.path.join(model_dir, X_cols_name))['X_cols']

In [None]:
# Select exactly the columns listed in X_cols, in that order; a missing
# column raises KeyError here rather than inside the model.
X = store_data.loc[:, X_cols].copy()
X.head(3)

### 테크닉 76: 모델 파일을 로딩하자

In [None]:
import pickle

model_weekday_name = 'model_y_weekday_GradientBoosting.pickle'
model_weekend_name = 'model_y_weekend_GradientBoosting.pickle'

model_weekday_path = os.path.join(model_dir, model_weekday_name)
model_weekend_path = os.path.join(model_dir, model_weekend_name)

# NOTE: pickle.load can execute arbitrary code, so only load model files
# that come from a trusted source.
loaded_models = []
for model_path in (model_weekday_path, model_weekend_path):
    with open(model_path, mode='rb') as f:
        loaded_models.append(pickle.load(f))
model_weekday, model_weekend = loaded_models

for loaded_model in (model_weekday, model_weekend):
    print(loaded_model)

### 테크닉 77: 신규 데이터를 예측하자

In [None]:
# Predicted labels from the weekday and weekend models for the same features.
pred_weekday, pred_weekend = (
    model.predict(X) for model in (model_weekday, model_weekend)
)
pred_weekend[:10]

In [None]:
# Per-class probabilities from both models (one column per class).
pred_proba_weekday, pred_proba_weekend = (
    model.predict_proba(X) for model in (model_weekday, model_weekend)
)
pred_proba_weekend[:10]

In [None]:
# Keep only the probability in column 1 (the second class) as the score.
# The values are taken straight from the models instead of slicing the
# existing arrays in place: slicing made this cell fail with IndexError when
# run a second time, because the arrays were already one-dimensional.
pred_proba_weekday = model_weekday.predict_proba(X)[:, 1]
pred_proba_weekend = model_weekend.predict_proba(X)[:, 1]
pred_proba_weekend[:10]

In [None]:
# Combine labels and scores into one frame, then attach the store name
# (aligned on the row index) and the target year-month.
pred = pd.DataFrame({
    'pred_weekday': pred_weekday,
    'pred_weekend': pred_weekend,
    'score_weekday': pred_proba_weekday,
    'score_weekend': pred_proba_weekend,
}).assign(store_name=store_data['store_name'], year_month=tg_ym)
pred.head(3)

### 테크닉 78: 예측 결과를 히트맵으로 그리자

In [None]:
# Scores only, indexed by store name, ready to be drawn as a heatmap.
viz_cols = ['store_name', 'score_weekday', 'score_weekend']
pred_viz = pred[viz_cols].set_index('store_name')
pred_viz

In [None]:
import os
import sys

# `plt` conventionally refers to matplotlib.pyplot; the bare matplotlib
# package was previously imported under that alias.
import matplotlib.pyplot as plt
import seaborn as sns

# Korean font handling so that store names render in the figure.
if os.name == 'nt':  # Windows
    plt.rc('font', family='Malgun Gothic')
elif sys.platform == 'darwin':  # macOS
    # 'AppleGothic' is the macOS Korean font (was misspelled 'AllieGothic').
    plt.rc('font', family='AppleGothic')
# NOTE(review): other platforms (e.g. Linux / Colab) keep matplotlib's default
# font; a Korean font has to be installed and selected separately there.

# Stores on the x-axis (first 20), weekday/weekend scores on the y-axis.
sns.heatmap(pred_viz[:20].T)

### 테크닉 79: 실적 데이터를 만들자

In [None]:
# Keep the store name plus the actual order metrics, and suffix every metric
# column with the target year-month (e.g. order -> order_<tg_ym>).
target_cols = [
    'store_name', 'order', 'order_fin', 'order_cancel', 'order_delivery',
    'order_takeout', 'order_weekday', 'order_weekend', 'delta_avg']
actual_data = actual_data[target_cols]
rename_cols = [f'{col}_{tg_ym}' for col in target_cols if col != 'store_name']
actual_cols = ['store_name', *rename_cols]
actual_data.columns = actual_cols
actual_data.head(3)

### 테크닉 80: 현장용 보고서를 만들어 출력하자

In [None]:
# Translate each score into a four-level label using the same thresholds for
# weekday and weekend: >= 0.75, [0.5, 0.75), [0.25, 0.5) and < 0.25.
for score_col, label_col in (('score_weekday', '주문 예측/평일'),
                             ('score_weekend', '주문 예측/휴일')):
    score = pred[score_col]
    pred.loc[score >= 0.75, label_col] = '증가(큼)'
    pred.loc[(score >= 0.5) & (score < 0.75), label_col] = '증가'
    pred.loc[(score >= 0.25) & (score < 0.5), label_col] = '감소'
    pred.loc[score < 0.25, label_col] = '감소(큼)'

In [None]:
# Report = prediction labels and scores, followed by this month's actual
# figures for the same store (left join keeps every predicted store).
report_cols = ['store_name', '주문 예측/평일', '주문 예측/휴일',
               'score_weekday', 'score_weekend']
report = pred[report_cols].merge(actual_data, on='store_name', how='left')
report.head(3)

In [None]:
from dateutil.relativedelta import relativedelta

# The report file is named after the month following the target month,
# formatted as YYYYMM.
next_month = datetime.datetime.strptime(tg_ym, '%Y%m') + relativedelta(months=1)
pred_ym = next_month.strftime('%Y%m')

report_name = f'report_pred_{pred_ym}.xlsx'
report.to_excel(os.path.join(output_dir, report_name), index=False)