# BUILD FEATURES

### LOAD LIBRARIES

In [None]:
import os
import gc
import warnings
import pandas as pd
import numpy as np

# Suppress every warning for the rest of the session.
warnings.filterwarnings("ignore")

# Let pandas render up to 500 columns and 500 rows before truncating output.
pd.options.display.max_columns = 500
pd.options.display.max_rows = 500

### GLOBAL VARIABLES

In [None]:
# Directory holding the input pickle read in the LOAD DATASET step.
INPUT_PATH = '../../data/processed'
# Directory the feature pickle is written to in the SAVE DATASET step.
OUTPUT_PATH = '../../data/features'
# Base name (without the ".pkl" extension) of the output file.
OUTPUT_FILE_NAME = 'features_v006'
# Base name (without the ".pkl" extension) of the input file.
INPUT_FILE_NAME = 'dataproc_v005'
# Number of rows each per-id demand series is shifted by before lag/rolling
# features are computed. NOTE(review): presumably the forecast horizon in
# days -- confirm against the modelling code.
DAYS_PRED = 28

### FUNCTIONS

In [None]:
def reduce_mem_usage(df, verbose=False):
    """Downcast numeric columns of ``df`` in place to smaller dtypes.

    Columns matched by ``select_dtypes(include=["int"])`` are passed through
    ``pd.to_numeric(..., downcast="integer")`` and those matched by
    ``select_dtypes(include=["float"])`` through ``downcast="float"``. All
    other columns are left untouched.

    Args:
        df: DataFrame to shrink; it is modified in place.
        verbose: When true, print the final memory usage in Mb and the
            percentage saved.

    Returns:
        The same ``df`` object, which makes the function usable with
        ``DataFrame.pipe``.
    """
    bytes_per_mb = 1024 ** 2
    mem_before = df.memory_usage().sum() / bytes_per_mb

    # Resolve both column sets before any dtype is changed.
    downcast_plan = (
        ("integer", df.select_dtypes(include=["int"]).columns),
        ("float", df.select_dtypes(include=["float"]).columns),
    )
    for target, columns in downcast_plan:
        for name in columns:
            df[name] = pd.to_numeric(df[name], downcast=target)

    mem_after = df.memory_usage().sum() / bytes_per_mb
    if verbose:
        saved_pct = 100 * (mem_before - mem_after) / mem_before
        print(
            "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
                mem_after, saved_pct
            )
        )
    return df

In [None]:
def add_demand_features(df, days_pred=None):
    """Add lagged and rolling ``demand`` features, computed per ``id``.

    Every feature is derived from the ``demand`` series of a single ``id``
    after shifting it forward by the horizon, so a row only sees values that
    lie at least ``days_pred`` rows earlier within the same ``id``.
    NOTE(review): this assumes rows are already in chronological order inside
    each ``id``; nothing here sorts them.

    Columns added, in this order:
        * ``demand_shift_t{h}`` for ``h`` in ``days_pred .. days_pred + 5``
        * ``demand_rolling_{std,mean}_t{w}``
        * ``demand_rolling_acum_mean_t{w}`` -- expanding mean that only emits
          a value once ``w`` observations are available
        * ``demand_rolling_{min,max,skew,kurt}_t{w}``
      with ``w`` in ``[5, 10, 30, 70, 90, 120, 180]``.

    Args:
        df: DataFrame with at least the columns ``id`` and ``demand``. It is
            modified in place.
        days_pred: Number of rows to shift by before computing features.
            Defaults to the module-level ``DAYS_PRED`` when omitted.

    Returns:
        The same ``df`` object with the feature columns appended.
    """
    horizon = DAYS_PRED if days_pred is None else days_pred
    windows = [5, 10, 30, 70, 90, 120, 180]

    # (column label, statistic of the lagged series for a given window).
    # The list order defines the order in which columns are appended.
    lagged_stats = [
        ("rolling_std", lambda s, w: s.rolling(w).std()),
        ("rolling_mean", lambda s, w: s.rolling(w).mean()),
        ("rolling_acum_mean", lambda s, w: s.expanding(min_periods=w).mean()),
        ("rolling_min", lambda s, w: s.rolling(w).min()),
        ("rolling_max", lambda s, w: s.rolling(w).max()),
        ("rolling_skew", lambda s, w: s.rolling(w).skew()),
        ("rolling_kurt", lambda s, w: s.rolling(w).kurt()),
    ]

    # Build the grouping once; it only depends on ``id`` and ``demand``,
    # neither of which is changed by the assignments below.
    demand_by_id = df.groupby(["id"])["demand"]

    # The lambdas below are invoked immediately by ``transform``, so closing
    # over the loop variables is safe.
    for diff in range(6):
        shift = horizon + diff
        df[f"demand_shift_t{shift}"] = demand_by_id.transform(
            lambda x: x.shift(shift)
        )

    for label, stat in lagged_stats:
        for window in windows:
            df[f"demand_{label}_t{window}"] = demand_by_id.transform(
                lambda x: stat(x.shift(horizon), window)
            )

    return df

In [None]:
def add_demand_smooth_features(df, days_pred=None):
    """Add lagged and rolling ``demand_smoothed`` features, computed per ``id``.

    Every feature is derived from the ``demand_smoothed`` series of a single
    ``id`` after shifting it forward by the horizon, so a row only sees values
    that lie at least ``days_pred`` rows earlier within the same ``id``.
    NOTE(review): this assumes rows are already in chronological order inside
    each ``id``; nothing here sorts them.

    Columns added, in this order:
        * ``demand_smoothed_shift_t{h}`` for ``h`` in
          ``days_pred .. days_pred + 5``
        * ``demand_smoothed_rolling_{std,mean}_t{w}``
        * ``demand_smoothed_rolling_acum_mean_t{w}`` -- expanding mean that
          only emits a value once ``w`` observations are available
        * ``demand_smoothed_rolling_{min,max,skew,kurt}_t{w}``
      with ``w`` in ``[5, 10, 30, 70, 90, 120, 180]``, followed by
        * ``demand_smoothed_rolling_{q10,q50,q90}_t{w}``
      with ``w`` in ``[10, 50, 100, 180, 360]``.

    Args:
        df: DataFrame with at least the columns ``id`` and
            ``demand_smoothed``. It is modified in place.
        days_pred: Number of rows to shift by before computing features.
            Defaults to the module-level ``DAYS_PRED`` when omitted.

    Returns:
        The same ``df`` object with the feature columns appended.
    """
    horizon = DAYS_PRED if days_pred is None else days_pred
    stat_windows = [5, 10, 30, 70, 90, 120, 180]
    quantile_windows = [10, 50, 100, 180, 360]

    # (column label, windows, statistic of the lagged series for a window).
    # The list order defines the order in which columns are appended.
    feature_plan = [
        ("rolling_std", stat_windows, lambda s, w: s.rolling(w).std()),
        ("rolling_mean", stat_windows, lambda s, w: s.rolling(w).mean()),
        (
            "rolling_acum_mean",
            stat_windows,
            lambda s, w: s.expanding(min_periods=w).mean(),
        ),
        ("rolling_min", stat_windows, lambda s, w: s.rolling(w).min()),
        ("rolling_max", stat_windows, lambda s, w: s.rolling(w).max()),
        ("rolling_skew", stat_windows, lambda s, w: s.rolling(w).skew()),
        ("rolling_kurt", stat_windows, lambda s, w: s.rolling(w).kurt()),
        ("rolling_q10", quantile_windows, lambda s, w: s.rolling(w).quantile(0.1)),
        ("rolling_q50", quantile_windows, lambda s, w: s.rolling(w).median()),
        ("rolling_q90", quantile_windows, lambda s, w: s.rolling(w).quantile(0.9)),
    ]

    # Build the grouping once; it only depends on ``id`` and
    # ``demand_smoothed``, neither of which is changed below.
    smoothed_by_id = df.groupby(["id"])["demand_smoothed"]

    # The lambdas below are invoked immediately by ``transform``, so closing
    # over the loop variables is safe.
    for diff in range(6):
        shift = horizon + diff
        df[f"demand_smoothed_shift_t{shift}"] = smoothed_by_id.transform(
            lambda x: x.shift(shift)
        )

    for label, windows, stat in feature_plan:
        for window in windows:
            df[f"demand_smoothed_{label}_t{window}"] = smoothed_by_id.transform(
                lambda x: stat(x.shift(horizon), window)
            )

    return df

In [None]:
def add_price_features(df):
    """Add per-``id`` price-change and price-volatility features.

    Columns present in the returned frame:
        * ``price_change_t1`` -- relative difference between the previous
          row's ``sell_price`` (same ``id``) and the current one.
        * ``price_change_t365`` -- relative difference between the maximum
          ``sell_price`` over the 365 preceding rows (same ``id``) and the
          current one.
        * ``rolling_price_std_t7`` / ``rolling_price_std_t30`` -- rolling
          standard deviation of ``sell_price`` over 7 and 30 rows, including
          the current row.

    Args:
        df: DataFrame with at least the columns ``id`` and ``sell_price``.
            All intermediate and final columns are written into it.

    Returns:
        A new DataFrame equal to ``df`` without the two helper columns
        ``rolling_price_max_t365`` and ``shift_price_t1``.
    """
    helper_columns = ["rolling_price_max_t365", "shift_price_t1"]

    def per_id(func):
        # Apply ``func`` to the price series of each id separately.
        return df.groupby(["id"])["sell_price"].transform(func)

    # Change relative to the previous row's price.
    df["shift_price_t1"] = per_id(lambda prices: prices.shift(1))
    previous_price = df["shift_price_t1"]
    df["price_change_t1"] = (previous_price - df["sell_price"]) / previous_price

    # Change relative to the highest price over the 365 preceding rows.
    df["rolling_price_max_t365"] = per_id(
        lambda prices: prices.shift(1).rolling(365).max()
    )
    yearly_max = df["rolling_price_max_t365"]
    df["price_change_t365"] = (yearly_max - df["sell_price"]) / yearly_max

    # Short- and medium-term price volatility (current row included).
    df["rolling_price_std_t7"] = per_id(lambda prices: prices.rolling(7).std())
    df["rolling_price_std_t30"] = per_id(lambda prices: prices.rolling(30).std())

    return df.drop(columns=helper_columns)

In [None]:

def add_time_features(df, dt_col):
    """Add calendar features derived from the datetime column ``dt_col``.

    ``dt_col`` is first converted with ``pd.to_datetime`` (in place). The
    columns ``year`` (int16) and ``quarter``, ``month``, ``week``, ``day``,
    ``dayofweek`` (int8) are then added, plus ``is_weekend`` (int8), which is
    1 when ``dayofweek`` is 5 or 6 and 0 otherwise.

    ``week`` is the ISO week number. It is read from ``dt.isocalendar()``
    because the ``Series.dt.week`` accessor no longer exists in pandas 2.x;
    the old accessor is only used as a fallback on pandas versions that
    predate ``isocalendar``.

    Args:
        df: DataFrame containing ``dt_col``. It is modified in place.
        dt_col: Name of the column holding dates (anything accepted by
            ``pd.to_datetime``).

    Returns:
        The same ``df`` object with the calendar columns appended.
    """
    df[dt_col] = pd.to_datetime(df[dt_col])
    dates = df[dt_col].dt

    attrs = [
        "year",
        "quarter",
        "month",
        "week",
        "day",
        "dayofweek",
    ]

    for attr in attrs:
        dtype = np.int16 if attr == "year" else np.int8
        if attr == "week" and hasattr(dates, "isocalendar"):
            values = dates.isocalendar().week
        else:
            values = getattr(dates, attr)
        df[attr] = values.astype(dtype)

    df["is_weekend"] = df["dayofweek"].isin([5, 6]).astype(np.int8)
    return df

### LOAD DATASET

In [None]:
# Load the processed dataset that all feature builders below operate on.
print("Reading files...")
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# that come from a trusted source.
data = pd.read_pickle(f'{INPUT_PATH}/{INPUT_FILE_NAME}.pkl')


### MAKE FEATURES

In [None]:
dt_col = "date"

# Run the feature builders one after another, downcasting numeric columns
# after each stage. ``data`` is rebound at every step so the previous frame
# can be released before the next stage allocates new columns.
data = reduce_mem_usage(add_demand_features(data))
data = reduce_mem_usage(add_demand_smooth_features(data))
data = reduce_mem_usage(add_price_features(data))
data = reduce_mem_usage(add_time_features(data, dt_col))
data = data.sort_values("date")

# Quick sanity report on the resulting frame.
first_date = data[dt_col].min()
last_date = data[dt_col].max()
print("start date:", first_date)
print("end date:", last_date)
print("data shape:", data.shape)

### SAVE DATASET

In [None]:
# Persist the feature table. Create the target directory first so the write
# does not fail on a fresh checkout where it does not exist yet.
os.makedirs(OUTPUT_PATH, exist_ok=True)
data.to_pickle(f'{OUTPUT_PATH}/{OUTPUT_FILE_NAME}.pkl')

In [None]:
# Reorder the frame in place by id, then by date (both ascending) for the
# inspection cells that follow.
data.sort_values(by=["id", "date"], inplace=True)

In [None]:
# Display the first five rows of the feature table.
data.head(5)

In [None]:
# Display every row belonging to one example id.
data.loc[data["id"] == "FOODS_1_001_CA_1_validation"]

In [None]:
# Display the smoothed demand rounded to integers, with negative values
# raised to 0. The result is only shown, not stored.
data["demand_smoothed"].round().astype(int).clip(lower=0)