In [None]:
##########################################################################
######################                              ######################
######################   feature engineering 파트   ######################
######################                              ######################
##########################################################################

import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from datetime import datetime, timedelta

def feature_engineering(merge_data):
    """Turn wide per-date sales data into a long per-item feature table.

    Every column of ``merge_data`` that is not one of the known
    date/weather/holiday/calendar columns is treated as a product column.
    The product columns are stacked into ``item_id`` / ``sold`` rows, a
    next-row ``target`` is attached, and lag, rolling and holiday look-ahead
    features are computed per item.

    Parameters
    ----------
    merge_data : pandas.DataFrame
        Wide frame with the columns ``date``, ``weather_tmp``,
        ``weather_hum``, ``weather_rain_mm`` and ``holiday_name`` plus one
        column per product. NOTE: the calendar columns ``year``, ``month``,
        ``quarter``, ``weekday`` and ``week`` are added to this frame in
        place.

    Returns
    -------
    feat_data : pandas.DataFrame
        Long-format feature table. Rows containing any missing value
        (including zero sales, which are converted to NaN) are dropped.
    cat_features : list of str
        Names of the columns intended to be used as categorical features.
    """
    # Calendar features: parse the date column once and reuse it.
    dates = pd.to_datetime(merge_data['date'])
    merge_data['year'] = dates.dt.year
    merge_data['month'] = dates.dt.month
    merge_data['quarter'] = dates.dt.quarter
    merge_data['weekday'] = dates.dt.weekday
    # ISO week-of-year (previously this duplicated ``weekday``). Cast to the
    # dtype of ``year`` so it matches the other calendar columns.
    merge_data['week'] = (
        dates.dt.isocalendar().week.astype(merge_data['year'].dtype)
    )

    # Non-product columns
    category_col = ['date', 'weather_tmp',
                    'weather_hum', 'weather_rain_mm',
                    'holiday_name', 'year',
                    'month', 'quarter',
                    'weekday', 'week']

    # Product columns: a list in original column order. A set is not a valid
    # indexer in recent pandas and has no deterministic order.
    category_set = set(category_col)
    prd_col = [col for col in merge_data.columns if col not in category_set]

    # Wide -> long. A fresh positional index is used as the join key so the
    # result does not depend on the caller's index (name or duplicates).
    wide = merge_data.reset_index(drop=True)
    long_sales = wide[prd_col].stack().dropna()
    long_sales.index.names = ['row_id', 'item_id']
    stacked = long_sales.rename('sold').reset_index()
    stacked = (
        stacked.join(wide[category_col], on='row_id')
        .drop(columns='row_id')
    )

    # Target: zero sales count as missing; the target is the next row's
    # sales of the same item.
    stacked['sold'] = stacked['sold'].mask(stacked['sold'] == 0)
    stacked['target'] = stacked.groupby('item_id')['sold'].shift(-1)
    # Drops every row with any missing value, including the last row of
    # each item, which has no target.
    stacked = stacked.dropna()

    # Lag features: sales 2..21 rows back for the same item.
    sold_by_item = stacked.groupby('item_id')['sold']
    for r in range(2, 22):
        stacked[f'lag_day_{r}'] = sold_by_item.shift(r)

    # Rolling mean / std of the 2-row lag, per item. The group level is
    # dropped so the result aligns with ``stacked`` on its row index.
    for r in [3, 5, 7, 14, 21, 28]:
        window = stacked.groupby('item_id')['lag_day_2'].rolling(r)
        stacked[f'ma_day_{r}'] = window.mean().reset_index(level=0, drop=True)
        stacked[f'std_day_{r}'] = window.std().reset_index(level=0, drop=True)

    # Holiday look-ahead: holiday_name of the row r steps ahead for the same
    # item. Grouping by date here would shift across items within one day,
    # which carries no look-ahead information.
    holiday_by_item = stacked.groupby('item_id')['holiday_name']
    for r in range(1, 4):
        stacked[f'holiday_before_{r}'] = holiday_by_item.shift(-r)

    # Rows without a full lag / rolling / look-ahead history are discarded.
    feat_data = stacked.dropna()

    # Categorical features
    cat_features = ['holiday_name', 'year', 'month', 'quarter', 'weekday', 'week']
    cat_features.extend(f'holiday_before_{r}' for r in range(1, 4))

    return feat_data, cat_features