## Import Libraries and Load Data

In [1]:
# Import libraries
import warnings

import os

# Import Pandas, ignoring SettingWithCopyWarning notifications
with warnings.catch_warnings():
    warnings.simplefilter("ignore")
    import pandas as pd

import numpy as np
import statsmodels.api as sm
import patsy

# Set seed
SEED = 4031
np.random.seed(SEED)

In [2]:
# Data file locations and names

project_root_dir = "Data"
project_subdir_prefix = "fold_"
train_data_filename = "train.csv"
test_data_filename = "test.csv"


# The number of train/test data folders and the target RMSE for each
# train/test split in each folder

n_datasets = 10

In [3]:
# Get list of data subfolders, each with a separate training and test set.

os_walk = os.walk(project_root_dir)
data_subdir_list = [subdirs for root, subdirs, files in os_walk][0]
n_subdirs = len(data_subdir_list)

assert(n_subdirs == n_datasets)

In [4]:
# Lists for training and test datasets

train_datasets = []
test_datasets = []


# Loop over subfolders and read in training/test datasets and test weekly sales.
# Use a loop instead of using os.walk directly to avoid "fold10" immediately following "fold1".

for subdir_num in np.arange(n_subdirs) + 1:
    subdir_num_str = str(subdir_num)
    train_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   train_data_filename)))
    test_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   test_data_filename)))

## Define Scoring function

In [5]:
# Define a WMAE function for scoring

def wmae():
    file_path = 'Data/test_with_label.csv'
    test = pd.read_csv(file_path)
    num_folds = 10
    wae = []

    for i in range(num_folds):
        file_path = f'Data/fold_{i+1}/mypred.csv'
        test_pred = pd.read_csv(file_path)

        # Left join with the test data
        new_test = test_pred.merge(test, on=['Date', 'Store', 'Dept'], how='left')

        # Compute the Weighted Absolute Error
        actuals = new_test['Weekly_Sales']
        preds = new_test['Weekly_Pred']
        weights = new_test['IsHoliday'].apply(lambda x: 5 if x else 1)
        wae.append(sum(weights * abs(actuals - preds)) / sum(weights))

    return wae

## Edit original OLS: group stores by department and add SVD/PCA

Preprocessing steps
1. Group training data by department
2. Pivot each department's data so that stores are rows and dates are columns, with values = weekly sales
3. Fill in missing stores and dates, setting their sales to zero
4. Center store values
5. Perform SVD
6. Re-add store means
7. Use the SVD output as y_train for x_train = \[Year, Week, Store\]

In [25]:
# Components to return from SVD. This is from the example in Campuswire post #364:
# https://campuswire.com/c/G06C55090/feed/364
n_components = 8

In [33]:
def train_fill_and_pivot(train_data):
    """
    Prep training data for smoothing of the weekly sales figures.
    Given a training set of stores, departments, dates and sales figures,
    fill in any stores and dates missing for any given department.
    Fill in zeroes for the missing sales figures.
    This gives each department's data an identical shape.
    Finally, return a pivoted dataset with weeks as columns and sales figures as values.
    Also return dictionaries of stores and dates in the training set.
    """
    
    # Get unique stores and dates. This is to help fill in zeroes for any departments
    # that are missing from certain stores and dates.
    train_store_list = train_data["Store"].unique().tolist()
    train_date_list = train_data["Date"].unique().tolist()
    
    # Get columns needed for further processing
    train_sel_col = train_data[["Store", "Dept", "Date", "Weekly_Sales"]]
    
    # Fill in missing stores and sales dates dates for any given department.
    
    # Set Date to categorical to help fill in sales dates missing for any stores/depts
    train_sel_col.loc[:, ["Date"]] = pd.Categorical(train_sel_col["Date"].values, categories=train_date_list)
    train_sel_col = train_sel_col.groupby(["Store", "Dept", "Date"], as_index=False).first()
    # Set Date column back to datetime from categorical
    train_sel_col.loc[:, ["Date"]] = pd.to_datetime(train_sel_col["Date"].values)
    
    # Set Store to categorical. This helps fill in stores missing for any dates/depts.
    train_sel_col.loc[:, ["Store"]] = pd.Categorical(train_sel_col["Store"].values, categories=train_store_list)
    train_sel_col = train_sel_col.groupby(["Store", "Dept", "Date"], as_index=False).first()
    # Set store back to numeric
    train_sel_col.loc[:, ["Store"]] = train_sel_col["Store"].values.astype(int)
    
    # Pivot dataframe so that stores are rows and sales dates columns, with values = Weekly_Sales.
    # Fill in missing values with zeroes.
    train_pivot = train_sel_col.pivot(index=["Dept", "Store"], columns="Date", values="Weekly_Sales").reset_index().fillna(0)

    return train_pivot, train_store_list, train_date_list

In [34]:
def smooth_weekly_sales(train_data, train_store_list, train_date_list, n_components=n_components):
    """
    Given a department's pivoted training dataset of weekly sales,
    use SVD to smooth the weekly sales across stores.
    Return the pivoted dataset with smoothed sales.
    """

    # Extract numpy array of sales figures
    train_sales = train_data.to_numpy()[:, 2:]
    
    # Get store means
    train_store_means = np.mean(train_sales, axis=1)[:, np.newaxis]
    
    # Center sales data
    train_centered_sales = train_sales - train_store_means
    
    # Perform SVD on centered sales data
    train_U, train_S, train_V = np.linalg.svd(train_centered_sales)
    
    # Reduce the number of components
    train_U_reduced = train_U[:, :n_components]
    train_D_reduced = np.diag(train_S[:n_components])
    train_Vt_reduced = train_V[:, :n_components].T
    
    # Regenerate smoothed sales
    train_sales_smooth = (train_U_reduced @ train_D_reduced @ train_Vt_reduced) + train_store_means
    
    # Convert smoothed sales from array to dataframe
    train_sales_smooth_df = pd.DataFrame(train_sales_smooth, columns=train_date_list)
    # Re-add Store column
    train_sales_smooth_df["Store"] = train_store_list

    # Return results
    return train_sales_smooth_df

In [9]:
def train_unpivot(train_pivot):
    """
    Unpivot a department's smoothed weekly sales figures back to a form
    of one figure per store and date.
    """
    
    # Unpivot the dataframe
    train_unpivot = train_pivot.melt(id_vars=["Store"], var_name="Date", value_name="Weekly_Sales")
    
    return train_unpivot

In [10]:
def dates_to_years_and_weeks(data):
    """
    Convert sales dates in data to numeric years and categorical weeks.
    Separate the date from the years and weeks.
    """

    tmp = pd.to_datetime(data["Date"])
    data["Wk"] = tmp.dt.isocalendar().week
    data["Yr"] = tmp.dt.year
    data["Wk"] = pd.Categorical(data["Wk"], categories=[i for i in range(1, 53)])  # 52 weeks
    
    date_vals = data["Date"]
    
    # Drop date column
    data.drop(["Date"], axis=1, inplace=True)
    
    return data, date_vals

In [36]:
# Loop over folds
for j in range(n_datasets):

    # Get a pair of training and test sets
    train = train_datasets[j]
    test = test_datasets[j]

    test_pred = pd.DataFrame()

    # Identify the distinct store/dept pairs shared by the training and test set.
    # Will only process these.

    train_pairs = train[['Store', 'Dept']].drop_duplicates(ignore_index=True)
    test_pairs = test[['Store', 'Dept']].drop_duplicates(ignore_index=True)
    unique_pairs = pd.merge(train_pairs, test_pairs, how = 'inner', on =['Store', 'Dept'])
    
    # Join the distinct store/dept pairs to the training set.
    # Why left join? When would training data not be available?
    train_split = unique_pairs.merge(train, on=['Store', 'Dept'], how='left')
    
    # Now join the distinct store/dept pairs to the test set.
    # Same question: why left join? When would training data not be available?
    test_split = unique_pairs.merge(test, on=['Store', 'Dept'], how='left')

    # Pivot training sales by week, and fill in missing stores and dates for each department.
    # Get lists of stores and dates present in training.
    train_pivot, train_store_list, train_date_list = train_fill_and_pivot(train_split)
        
    # Group by department to help build separate model for each department.
    # Create dict where key = Dept and value = dataframe of Store/Date/Weekly_Sales.
    #train_split = dict(tuple(train_split.groupby(["Dept"])))
    train_split = dict(tuple(train_pivot.groupby(["Dept"])))
    test_split = dict(tuple(test_split.groupby(["Dept"])))
    
    # Get the training departments
    depts = list(train_split)

    # Loop over (store, dept) tuples
    for dept in depts:
        
        #print("Dept:", dept)

        # Get training and test design matrices corresponding to (store, dept)
        X_train = train_split[dept]
        #if (dept == 1):
        #    print("X_train columns before smoothing", X_train.columns)
        X_test = test_split[dept]
    
        # Use SVD to smooth weekly sales in training data
        X_train_smooth = smooth_weekly_sales(train_data=X_train,
                                             train_store_list=train_store_list,
                                             train_date_list=train_date_list,
                                             n_components=n_components)
        

        # Unpivot the training data
        X_train = train_unpivot(X_train_smooth)
        
        #if (dept == 1):
        #    smoothed_train_file_path = f'Data/fold_{j+1}/smooth_train_dept_1.csv'
        #    X_train.to_csv(smoothed_train_file_path, index=False)
            
        # Convert sales dates to years and weeks
        X_train, train_dates = dates_to_years_and_weeks(X_train)
        X_test, test_dates = dates_to_years_and_weeks(X_test)
        
        # Create final design matrix for training.
        # This one-hot encodes Wk and creates an Intercept field.
        
        y_train, X_train = patsy.dmatrices("Weekly_Sales ~ Store + Yr + Wk",
                                          data = X_train,
                                          return_type="dataframe")
        
        # Create equivalent design matrix for test.
        
        y_tmp, X_test = patsy.dmatrices("Yr ~ Store + Yr + Wk",
                                          data = X_test,
                                          return_type="dataframe")
        
        # Split weekly sales from training predictors
        #y_train = X_train["Weekly_Sales"]
        #X_train = X_train.drop(["Weekly_Sales"], axis=1)
        
        #print("y_train NANs:", y_train.isna().sum())
        
        # Standardize column  order
        #X_train = X_train[["Store", "Yr", "Wk"]]
        #X_test = X_test[["Store", "Yr", "Wk"]]
        
        # Exclude store number from model fitting and prediction
        X_train.drop(["Store"], axis=1, inplace=True)
        test_store = X_test["Store"]
        X_test.drop(["Store"], axis=1, inplace=True)
        
        # Drop columns that are all zero in X_train
        cols_to_drop = X_train.columns[(X_train == 0).all()]
        X_train = X_train.drop(columns=cols_to_drop)
        X_test = X_test.drop(columns=cols_to_drop)
        
        # Identify and remove columns that are highly collinear in X_train
        # with columns to their left.
        # Note that this doesn't check the Intercept column.
        cols_to_drop = []
        for i in range(len(X_train.columns) - 1, 1, -1):  # Start from the last column and move backward
            col_name = X_train.columns[i]
            # Extract the current column and all previous columns
            tmp_Y = X_train.iloc[:, i].values
            tmp_X = X_train.iloc[:, :i].values

            coefficients, residuals, rank, s = np.linalg.lstsq(tmp_X, tmp_Y, rcond=None)
            if np.sum(residuals) < 1e-10:
                    cols_to_drop.append(col_name)
                
        # Drop those collinear columns from both training and test X.
        X_train = X_train.drop(columns=cols_to_drop)
        X_test = X_test.drop(columns=cols_to_drop)
        
        #if (dept == 1):
        #    print("X_train columns after collinearity handling", X_train.columns)
        #    print("X_test columns after collinearity handling", X_test.columns)

        # Fit OLS model
        model = sm.OLS(y_train, X_train).fit()
        mycoef = model.params.fillna(0)
        
        # Initialize dataframe to store predictors + predicted sales
        tmp_pred = X_test
        
        # Exclude store number from predictions
        
        # Predict and save test sales
        tmp_pred["Weekly_Pred"] = np.dot(X_test, mycoef)

        # Re-add Store, Department and Date fields
        tmp_pred["Store"] = test_store
        tmp_pred["Dept"] = dept
        tmp_pred["Date"] = test_dates

        # Append this fold's predictions to the list
        test_pred = pd.concat([test_pred, tmp_pred], ignore_index=True)
        
    # Fill in any missing predictions with zero
    test_pred['Weekly_Pred'].fillna(0, inplace=True)
    # Save the output to CSV
    file_path = f'Data/fold_{j+1}/mypred.csv'
    print(f'fold_{j+1} processed')
    test_pred.to_csv(file_path, index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sel_col.loc[:, ["Date"]] = pd.Categorical(train_sel_col["Date"].values, categories=train_date_list)


fold_1 processed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sel_col.loc[:, ["Date"]] = pd.Categorical(train_sel_col["Date"].values, categories=train_date_list)


fold_2 processed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sel_col.loc[:, ["Date"]] = pd.Categorical(train_sel_col["Date"].values, categories=train_date_list)


fold_3 processed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sel_col.loc[:, ["Date"]] = pd.Categorical(train_sel_col["Date"].values, categories=train_date_list)


fold_4 processed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sel_col.loc[:, ["Date"]] = pd.Categorical(train_sel_col["Date"].values, categories=train_date_list)


fold_5 processed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sel_col.loc[:, ["Date"]] = pd.Categorical(train_sel_col["Date"].values, categories=train_date_list)


fold_6 processed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sel_col.loc[:, ["Date"]] = pd.Categorical(train_sel_col["Date"].values, categories=train_date_list)


fold_7 processed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sel_col.loc[:, ["Date"]] = pd.Categorical(train_sel_col["Date"].values, categories=train_date_list)


fold_8 processed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sel_col.loc[:, ["Date"]] = pd.Categorical(train_sel_col["Date"].values, categories=train_date_list)


fold_9 processed


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sel_col.loc[:, ["Date"]] = pd.Categorical(train_sel_col["Date"].values, categories=train_date_list)


fold_10 processed


## Evaluate predictions

In [37]:
wae = wmae()
for value in wae:
    print(f"\t{value:.3f}")
print(f"{sum(wae) / len(wae):.3f}")

	8990.059
	9187.492
	9193.391
	8862.238
	11264.484
	8541.695
	8563.644
	8821.667
	8667.612
	8477.072
9056.936
