# MAKE SVD SMOOTH DATASET

### LOAD LIBRARIES

In [None]:
import os
import gc
import warnings
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from numpy.linalg import svd
from matplotlib import pyplot as plt
warnings.filterwarnings("ignore")
pd.set_option("display.max_columns", 500)
pd.set_option("display.max_rows", 500)

### GLOBAL VARIABLES

In [None]:
# Directory the raw CSV inputs are read from.
INPUT_PATH = '../../data/raw'
# Directory the processed pickles are written to.
OUTPUT_PATH = '../../data/processed'
# Base name (without extension) of the processed dataset pickle.
OUTPUT_FILE_NAME = 'dataproc_v005'
# First day number kept after reshaping: 1941 minus two 365-day years (= 1211).
# Original author's note: only 2 years of training data are kept,
# from 2014-05-23 to 2016-05-24 (dates cannot be confirmed from this file).
D_THRESH = 1941 - int(365 * 2)
# Number of day columns in each submission block.
DAYS_PRED = 28
# Number of singular values kept by the SVD smoothing.
RANK = 12

### FUNCTIONS

In [None]:
def reduce_mem_usage(df, verbose=False):
    """Downcast the numeric columns of ``df`` in place to smaller dtypes.

    Columns that ``select_dtypes`` reports as "int" are converted with
    ``downcast="integer"`` and those reported as "float" with
    ``downcast="float"``; every other column is left untouched.

    Args:
        df: frame to shrink; its columns are reassigned on the object itself.
        verbose: when True, print the final size and the percentage saved.

    Returns:
        The same ``df`` object that was passed in.
    """
    bytes_per_mb = 1024 ** 2
    mem_before = df.memory_usage().sum() / bytes_per_mb

    # Resolve both column lists up front, before any column is rewritten.
    downcast_plan = [
        (df.select_dtypes(include=["int"]).columns, "integer"),
        (df.select_dtypes(include=["float"]).columns, "float"),
    ]
    for names, target in downcast_plan:
        for name in names:
            df[name] = pd.to_numeric(df[name], downcast=target)

    mem_after = df.memory_usage().sum() / bytes_per_mb
    if not verbose:
        return df

    saved_pct = 100 * (mem_before - mem_after) / mem_before
    print(
        "Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)".format(
            mem_after, saved_pct
        )
    )
    return df

In [None]:
def encode_categorical(df, cols):
    """Label-encode the given columns of ``df`` in place.

    A fresh ``LabelEncoder`` is fitted per column on the non-null values
    only; rows that were null receive no code and stay missing.

    Args:
        df: frame whose columns are overwritten with their codes.
        cols: iterable of column names to encode.

    Returns:
        The same ``df`` object.
    """
    for name in cols:
        column = df[name]
        present = column[column.notnull()]
        encoder = LabelEncoder()
        codes = encoder.fit_transform(present)
        # Assigning by the non-null index leaves the null rows as missing.
        df[name] = pd.Series(codes, index=present.index)

    return df

In [None]:
def extract_num(ser):
    """Pull the first run of digits out of each string and cast it to int16.

    Args:
        ser: string Series (e.g. day labels such as ``"d_12"``).

    Returns:
        The result of ``Series.str.extract`` with one capture group,
        converted to ``np.int16``.
    """
    digits = ser.str.extract(r"(\d+)")
    return digits.astype(np.int16)

In [None]:
def preprocess_svd(matrix, rank):
    """Return a rank-reduced approximation of ``matrix`` via truncated SVD.

    This is used for noise reduction: structure shared across many columns
    is captured by the leading singular components and treated as signal,
    while variation specific to a single column is mostly discarded.

    Args:
        matrix: 2-D array-like (rows x columns). NaN entries are treated
            as 0 before the decomposition.
        rank: number of leading singular values/vectors to keep. A value
            larger than ``min(matrix.shape)`` simply keeps every component.

    Returns:
        ndarray with the same shape as ``matrix`` equal to
        ``U[:, :rank] @ diag(S[:rank]) @ VT[:rank, :]``.
    """
    matrix = np.nan_to_num(matrix, nan=0)
    # Thin SVD: the extra columns of U / rows of VT produced by the full
    # decomposition never contribute to the truncated product, and for a
    # wide matrix they cost an (n x n) VT. The thin form also keeps the
    # factor shapes compatible when rank exceeds min(matrix.shape).
    U, S, VT = svd(matrix, full_matrices=False)

    # Scaling the kept columns of U by S is the same as multiplying by
    # diag(S) without materialising the diagonal matrix.
    return (U[:, :rank] * S[:rank]) @ VT[:rank, :]

In [None]:
def extract_num(ser):
    """Return the first digit run of each string in ``ser`` as int16.

    NOTE(review): this cell redefines ``extract_num`` with the same body as
    the definition earlier in this file; one of the two is redundant.
    """
    first_digit_run = r"(\d+)"
    extracted = ser.str.extract(first_digit_run)
    return extracted.astype(np.int16)

In [None]:
def reshape_sales(sales, submission, d_thresh=0, verbose=True):
    """Melt the wide sales table to long format and append submission rows.

    Args:
        sales: wide frame holding the id columns listed in ``id_columns``
            plus one column per day; day column names must contain the day
            number (it is parsed with ``extract_num``).
        submission: submission template whose ``id`` values end in
            ``validation`` or ``evaluation`` and which has exactly
            ``DAYS_PRED`` further columns (they are renamed positionally).
        d_thresh: rows whose day number is below this value are dropped.
        verbose: when True, show the intermediate frames with ``display``
            (provided by the notebook environment).

    Returns:
        Long-format frame with the id columns, a numeric ``d`` column,
        ``demand`` and ``part`` ("train" or "validation"). Evaluation rows
        are built but removed before returning.
    """
    id_columns = ["id", "item_id", "dept_id", "cat_id", "store_id", "state_id"]

    # Product lookup table used to attach item/store attributes to the
    # submission rows.
    product = sales[id_columns]

    sales = sales.melt(id_vars=id_columns, var_name="d", value_name="demand")
    sales = reduce_mem_usage(sales)

    # Explicit copies: the frames are renamed and assigned to below, and doing
    # that on a filtered slice of `submission` triggers pandas'
    # SettingWithCopy warning (silenced globally in this notebook).
    vals = submission[submission["id"].str.endswith("validation")].copy()
    evals = submission[submission["id"].str.endswith("evaluation")].copy()

    # Rename the forecast columns to the day labels they stand for.
    vals.columns = ["id"] + [f"d_{d}" for d in range(1914, 1914 + DAYS_PRED)]
    evals.columns = ["id"] + [f"d_{d}" for d in range(1942, 1942 + DAYS_PRED)]

    # Evaluation ids are temporarily renamed to the "_validation" form for the
    # join (presumably because `sales` ids use that suffix -- confirm), then
    # restored. regex=False pins literal replacement regardless of the pandas
    # version's default.
    evals["id"] = evals["id"].str.replace("_evaluation", "_validation", regex=False)
    vals = vals.merge(product, how="left", on="id")
    evals = evals.merge(product, how="left", on="id")
    evals["id"] = evals["id"].str.replace("_validation", "_evaluation", regex=False)

    if verbose:
        print("validation")
        display(vals)

        print("evaluation")
        display(evals)

    vals = vals.melt(id_vars=id_columns, var_name="d", value_name="demand")
    evals = evals.melt(id_vars=id_columns, var_name="d", value_name="demand")

    sales["part"] = "train"
    vals["part"] = "validation"
    evals["part"] = "evaluation"

    # Index labels are not reset, so the combined frame can contain repeated
    # index values.
    data = pd.concat([sales, vals, evals], axis=0)

    del sales, vals, evals

    # Turn "d_<n>" labels into numbers so they can be thresholded.
    data["d"] = extract_num(data["d"])
    data = data[data["d"] >= d_thresh]

    # delete evaluation for now.
    data = data[data["part"] != "evaluation"]

    gc.collect()

    if verbose:
        print("data")
        display(data)

    return data

In [None]:
def merge_calendar(data, calendar):
    """Left-join calendar attributes onto ``data`` using the ``d`` key.

    The ``weekday``, ``wday``, ``month`` and ``year`` calendar columns are
    dropped before joining; the caller's ``calendar`` frame is not modified.

    Returns:
        A new frame with every row of ``data`` plus the remaining calendar
        columns.
    """
    unused = ["weekday", "wday", "month", "year"]
    slim_calendar = calendar.drop(columns=unused)
    return pd.merge(data, slim_calendar, how="left", on="d")

In [None]:
def merge_prices(data, prices):
    """Left-join ``prices`` onto ``data`` by store, item and ``wm_yr_wk``.

    Returns:
        A new frame with every row of ``data``; rows without a matching
        price row get missing values in the price columns.
    """
    join_keys = ["store_id", "item_id", "wm_yr_wk"]
    return pd.merge(data, prices, how="left", on=join_keys)

### LOAD DATASET

In [None]:
# Load the four raw CSV inputs from INPUT_PATH.
print("Reading files...")
calendar = pd.read_csv(f'{INPUT_PATH}/calendar.csv')
prices = pd.read_csv(f'{INPUT_PATH}/sell_prices.csv')
submission = pd.read_csv(f'{INPUT_PATH}/sample_submission.csv')
sales = pd.read_csv(f'{INPUT_PATH}/sales_train_validation.csv')

In [None]:
# Row count of the wide sales table; not referenced again in the code shown here.
NUM_ITEMS = sales.shape[0]  # 30490

In [None]:
# Quick sanity check of what was loaded.
print("sales shape:", sales.shape)
print("prices shape:", prices.shape)
print("calendar shape:", calendar.shape)
print("submission shape:", submission.shape)

In [None]:
sales.head()

### RESHAPE AND MERGE DATASETS

In [None]:
# Melt sales to long format, append the submission rows and keep only
# days >= D_THRESH (the author's note: roughly the last 2 years of history).
data = reshape_sales(sales, submission, D_THRESH)
# The wide table is no longer needed once it has been melted.
del sales
gc.collect()

In [None]:
data = reduce_mem_usage(data)

In [None]:
# Rows that came from the sales table; these are the ones smoothed below.
train = data[data.part=='train']

In [None]:
# Everything else (the submission rows that reshape_sales kept).
validation = data[data.part!='train']

In [None]:
# Both parts are held separately now, so the combined frame can be released.
del data

In [None]:
train.head()

In [None]:
# Distinct item ids present in the training rows.
item_ids = train.item_id.unique()

In [None]:
# Distinct store ids; the smoothing step iterates over these.
store_ids = train.store_id.unique()

In [None]:
# Distinct day numbers present in train, in order of first appearance.
days = train.d.unique().tolist()

In [None]:
# For every store, build the (day x item) demand matrix, replace it with its
# rank-RANK SVD approximation and collect the result in long format.
# The per-store frames are gathered in a list and concatenated once:
# DataFrame.append was removed in pandas 2.0 and re-copies the accumulated
# frame on every iteration.
smoothed_parts = []
for storeid in store_ids:
    M = train[train.store_id == storeid].pivot(index='d', columns='item_id', values='demand')
    # Label rows/columns from the pivot itself, so the labels always match
    # the matrix that was decomposed.
    M_s = pd.DataFrame(preprocess_svd(M, RANK), index=M.index, columns=M.columns)
    del M
    M_s = M_s.reset_index().melt(id_vars='d', var_name="item_id", value_name="demand")
    M_s['store_id'] = storeid
    smoothed_parts.append(M_s)
    del M_s
    gc.collect()

if smoothed_parts:
    sales_smoothed = pd.concat(smoothed_parts)
else:
    # No stores: keep the expected four-column layout for the steps below.
    sales_smoothed = pd.DataFrame(columns=['d', 'item_id', 'demand', 'store_id'])
del smoothed_parts
gc.collect()
    
    

In [None]:
# Rename the melted "demand" column so it does not clash with the raw
# demand column of train in the merge below.
sales_smoothed.columns = ['d', 'item_id', 'demand_smoothed', 'store_id']

In [None]:
# Attach the smoothed value to the training rows; the inner join keeps only
# (d, item_id, store_id) keys present on both sides.
train = pd.merge(train, sales_smoothed, how='inner', on=['d', 'item_id', 'store_id'])

In [None]:
# The non-train rows were not smoothed; the feature is filled with a constant 0.
validation['demand_smoothed'] = 0

In [None]:
# Recombine both parts; index labels are not reset here.
data = pd.concat([train, validation])

In [None]:
del train, validation

### ENCODE CATEGORICAL VARIABLES

In [None]:
# Label-encode the event columns of the calendar, then downcast.
calendar = encode_categorical(
    calendar, ["event_name_1", "event_type_1", "event_name_2", "event_type_2"]
).pipe(reduce_mem_usage)

# Label-encode the identifier columns of the main dataset, then downcast.
data = encode_categorical(
    data, ["item_id", "dept_id", "cat_id", "store_id", "state_id"],
).pipe(reduce_mem_usage)

# NOTE(review): `data` and `prices` are encoded by independent LabelEncoder
# fits, so their item_id/store_id codes only line up for the later price
# merge if both frames contain the same set of values -- confirm.
prices = encode_categorical(prices, ["item_id", "store_id"]).pipe(reduce_mem_usage)

### MERGE DATASETS

In [None]:
# Convert the calendar's "d" labels to numbers so they match data["d"],
# then join the calendar columns onto the dataset.
calendar["d"] = extract_num(calendar["d"])
data = merge_calendar(data, calendar)
del calendar
gc.collect()

# Join prices on (store_id, item_id, wm_yr_wk).
data = merge_prices(data, prices)
del prices
gc.collect()

# Downcast again: the merges added new columns.
data = reduce_mem_usage(data)

In [None]:
# Keep only the evaluation rows of the submission template.
submission = submission[submission["id"].str.endswith("evaluation")]

In [None]:
print("submission shape:", submission.shape)

### SAVE DATASET

In [None]:
# Persist the processed dataset under OUTPUT_PATH.
data.to_pickle(f'{OUTPUT_PATH}/{OUTPUT_FILE_NAME}.pkl')

In [None]:
# Persist the evaluation-only submission template next to it.
submission.to_pickle(f'{OUTPUT_PATH}/submission.pkl')