## Import Libraries and Load Data

In [233]:
# Import libraries

import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.decomposition import TruncatedSVD
import patsy

# Set seed
np.random.seed(4031)

In [234]:
# Data file locations and names.
# Each split is expected at <project_root_dir>/<project_subdir_prefix><k>/,
# holding one training file and one test file with the names below.

project_root_dir = "Data"
project_subdir_prefix = "fold_"
train_data_filename = "train.csv"
test_data_filename = "test.csv"


# The number of train/test data folders (one train/test split per folder).
# Used both as a sanity check on the folder count and as the fold-loop bound.

n_datasets = 10

In [235]:
# Collect the sub-folders of the data directory that each hold one train/test split.
# Only the top level is listed (no recursive walk), and only folders whose name
# starts with project_subdir_prefix are counted, so unrelated directories such as
# ".ipynb_checkpoints" cannot break the sanity check below.
# os.scandir raises FileNotFoundError if project_root_dir does not exist.

with os.scandir(project_root_dir) as dir_entries:
    data_subdir_list = sorted(
        entry.name
        for entry in dir_entries
        if entry.is_dir() and entry.name.startswith(project_subdir_prefix)
    )
n_subdirs = len(data_subdir_list)

# Fail early, with a readable message, if the folder layout is not the expected one.
assert n_subdirs == n_datasets, (
    f"Expected {n_datasets} '{project_subdir_prefix}*' folders in "
    f"'{project_root_dir}', found {n_subdirs}"
)

In [236]:
# Per-fold training and test frames; position k-1 holds the data of fold k.

train_datasets, test_datasets = [], []


# Read the folds by their number (1, 2, ..., n_subdirs) rather than in directory
# listing order, so that "fold_10" is not loaded right after "fold_1".

for fold_idx in range(1, n_subdirs + 1):
    fold_dir = os.path.join(project_root_dir, f"{project_subdir_prefix}{fold_idx}")
    train_datasets.append(pd.read_csv(os.path.join(fold_dir, train_data_filename)))
    test_datasets.append(pd.read_csv(os.path.join(fold_dir, test_data_filename)))

## Define Scoring Function

In [237]:
# Define a WMAE function for scoring

def wmae(data_dir='Data', num_folds=10,
         label_filename='test_with_label.csv', pred_filename='mypred.csv'):
    """Compute the weighted mean absolute error (WMAE) of every fold's predictions.

    For fold k the predictions are read from
    ``<data_dir>/fold_<k>/<pred_filename>`` and joined to the labelled test data
    in ``<data_dir>/<label_filename>`` on (Date, Store, Dept). Rows where
    ``IsHoliday`` is truthy get weight 5, all other rows weight 1.

    Parameters
    ----------
    data_dir : str
        Root directory holding the label file and the ``fold_<k>`` folders.
    num_folds : int
        Number of folds to score (folders ``fold_1`` .. ``fold_<num_folds>``).
    label_filename : str
        CSV with columns Date, Store, Dept, Weekly_Sales and IsHoliday.
    pred_filename : str
        Per-fold CSV with columns Date, Store, Dept and Weekly_Pred.

    Returns
    -------
    list
        One WMAE value per fold, in fold order.

    Notes
    -----
    The join is a left join *from the predictions*: labelled rows that have no
    prediction are not scored, and a prediction row without a matching label
    makes that fold's score NaN.
    """
    # The labels are the same for every fold, so read them once.
    labels = pd.read_csv(os.path.join(data_dir, label_filename))
    wae = []

    for fold in range(1, num_folds + 1):
        pred_path = os.path.join(data_dir, f'fold_{fold}', pred_filename)
        test_pred = pd.read_csv(pred_path)

        # Left join with the labelled test data
        new_test = test_pred.merge(labels, on=['Date', 'Store', 'Dept'], how='left')

        # Weighted absolute error: holiday weeks count five times as much.
        abs_err = (new_test['Weekly_Sales'] - new_test['Weekly_Pred']).abs().to_numpy()
        weights = np.where(new_test['IsHoliday'].astype(bool).to_numpy(), 5, 1)
        # np.average == sum(w * err) / sum(w); raises ZeroDivisionError when the
        # fold has no rows, as the plain division did.
        wae.append(np.average(abs_err, weights=weights))

    return wae

## Preprocess Data

In [240]:
#Derive year and one-hot week-of-year features from the Date column
def preprocess(data):
    """Return a copy of ``data`` with ``Yr`` and one-hot week-of-year columns added.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Date`` column parseable by ``pd.to_datetime``.
        It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        The input columns plus ``Yr`` (calendar year) and 52 indicator columns
        ``Week _1`` .. ``Week _52`` (ISO week number). ISO week 53 is not in the
        category list, so such rows get all-zero week indicators.
    """
    #Work on a copy so the caller's frame (e.g. an entry of train_datasets) is
    #not silently extended with Wk/Yr columns
    data = data.copy()

    #Split date into useful features
    dates = pd.to_datetime(data['Date'])
    data['Yr'] = dates.dt.year
    data['Wk'] = pd.Categorical(dates.dt.isocalendar().week,
                                categories=list(range(1, 53)))  # 52 weeks

    #One hot encode Wk (fixed category list => same columns for every frame)
    return pd.get_dummies(data, columns=['Wk'], prefix='Week ')

## SVD Implementation

In [None]:
# For every fold: fit one "TruncatedSVD -> OLS" model per (Store, Dept) pair and
# write that fold's predictions to Data/fold_<k>/mypred.csv.
for j in range(n_datasets):
    # This cell needs the preprocess() variant that one-hot encodes the week.
    fold_train = preprocess(train_datasets[j])
    fold_test = preprocess(test_datasets[j])

    # Split each frame once by (Store, Dept) instead of re-filtering the whole
    # frame with a boolean mask for every pair.
    train_groups = dict(tuple(fold_train.groupby(["Store", "Dept"])))
    test_groups = dict(tuple(fold_test.groupby(["Store", "Dept"])))

    stores = fold_train["Store"].unique()
    depts = fold_train["Dept"].unique()

    # Per-pair prediction frames; concatenated once after the loops.
    fold_preds = []

    for store in stores:
        for dept in depts:
            key = (store, dept)

            # Skip pairs with nothing to fit on (combo absent from the training
            # data) or nothing to predict (combo absent from the test data).
            if key not in train_groups or key not in test_groups:
                continue

            train_sd = train_groups[key]
            test_sd = test_groups[key]

            Y_train = train_sd["Weekly_Sales"]
            X_train = train_sd.drop(["Weekly_Sales", "Date"], axis=1)
            X_test = test_sd.drop(["Date"], axis=1)

            # Project the features onto the leading SVD components
            # (TruncatedSVD is used with its default number of components).
            svd = TruncatedSVD()
            # Reuse Y_train's index so the regression target and the component
            # frame are index-aligned when handed to statsmodels.
            svd_df = pd.DataFrame(svd.fit_transform(X_train), index=Y_train.index)

            # Regress weekly sales on the SVD components (no intercept is added)
            model = sm.OLS(Y_train, svd_df).fit()
            mycoef = model.params.fillna(0)

            # Keep Store, Dept and Date for the later merge with the labels.
            # .copy() makes this an independent frame before a column is added.
            tmp_pred = test_sd[["Store", "Dept", "Date"]].copy()
            # Apply the projection fitted on the training data to the test rows
            tmp_pred["Weekly_Pred"] = np.dot(svd.transform(X_test), mycoef)
            fold_preds.append(tmp_pred)

    if fold_preds:
        test_pred = pd.concat(fold_preds, ignore_index=True)
    else:
        test_pred = pd.DataFrame(columns=["Store", "Dept", "Date", "Weekly_Pred"])

    # Plain assignment instead of a chained inplace fillna, which acts on a
    # temporary object under pandas Copy-on-Write.
    test_pred["Weekly_Pred"] = test_pred["Weekly_Pred"].fillna(0)

    # Save the output to CSV
    file_path = f'Data/fold_{j+1}/mypred.csv'
    test_pred.to_csv(file_path, index=False)
    print(f'fold_{j+1} processed')


## Original Code (OLS only)

In [223]:
#OLS method requires a different preprocess function

def preprocess(data):
    """Return a copy of ``data`` with categorical ``Wk`` and integer ``Yr`` added.

    NOTE: this redefinition replaces the earlier ``preprocess`` in this notebook
    (the one that one-hot encodes the week). After this cell has run, any cell
    calling ``preprocess`` gets this version.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``Date`` column parseable by ``pd.to_datetime``.
        It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        The input columns plus ``Wk`` (ISO week number as a categorical with
        the fixed categories 1..52; week 53 becomes NaN) and ``Yr`` (calendar
        year).
    """
    #Work on a copy so the caller's frame is left untouched
    data = data.copy()

    #Split date into useful features
    dates = pd.to_datetime(data['Date'])
    data['Wk'] = pd.Categorical(dates.dt.isocalendar().week,
                                categories=list(range(1, 53)))  # 52 weeks
    data['Yr'] = dates.dt.year

    return data


In [224]:
#Original OLS model

# For every fold: fit an independent OLS model per (Store, Dept) pair on the
# patsy design matrix built from Yr and the categorical week, then write the
# fold's predictions to Data/fold_<k>/mypred.csv.
for j in range(n_datasets):
    train = train_datasets[j]
    test = test_datasets[j]

    # Only (Store, Dept) pairs present in BOTH train and test are modelled.
    train_pairs = train[['Store', 'Dept']].drop_duplicates(ignore_index=True)
    test_pairs = test[['Store', 'Dept']].drop_duplicates(ignore_index=True)
    unique_pairs = pd.merge(train_pairs, test_pairs, how='inner', on=['Store', 'Dept'])

    train_split = unique_pairs.merge(train, on=['Store', 'Dept'], how='left')
    train_split = preprocess(train_split)
    # Weekly_Sales, Store and Dept are listed on the right-hand side so they
    # stay available as columns of X for the groupby and the per-pair fit.
    y, X = patsy.dmatrices('Weekly_Sales ~ Weekly_Sales + Store + Dept + Yr  + Wk',
                           data=train_split,
                           return_type='dataframe')
    train_split = dict(tuple(X.groupby(['Store', 'Dept'])))

    test_split = unique_pairs.merge(test, on=['Store', 'Dept'], how='left')
    test_split = preprocess(test_split)
    # The left-hand side ('Yr') is a placeholder; only X is used for the test set.
    y, X = patsy.dmatrices('Yr ~ Store + Dept + Yr  + Wk',
                           data=test_split,
                           return_type='dataframe')
    X['Date'] = test_split['Date']
    test_split = dict(tuple(X.groupby(['Store', 'Dept'])))

    # Per-pair prediction frames; concatenated once after the loop.
    fold_preds = []

    for key in train_split:
        X_train = train_split[key]
        X_test = test_split[key]

        Y = X_train['Weekly_Sales']
        X_train = X_train.drop(['Weekly_Sales', 'Store', 'Dept'], axis=1)

        # Drop design columns that are all zero for this pair (e.g. weeks that
        # never occur in its training rows).
        cols_to_drop = X_train.columns[(X_train == 0).all()]
        X_train = X_train.drop(columns=cols_to_drop)
        X_test = X_test.drop(columns=cols_to_drop)

        # Drop columns that are linear combinations of the columns before them.
        # Scans from the last column down to index 2; columns 0 and 1 are never
        # tested.
        cols_to_drop = []
        for i in range(len(X_train.columns) - 1, 1, -1):
            col_name = X_train.columns[i]
            # Regress the current column on all previous columns
            tmp_Y = X_train.iloc[:, i].values
            tmp_X = X_train.iloc[:, :i].values

            coefficients, residuals, rank, s = np.linalg.lstsq(tmp_X, tmp_Y, rcond=None)
            # lstsq returns an EMPTY residual array when the system is
            # rank-deficient or not over-determined; np.sum of it is 0, so the
            # column is dropped in those cases as well.
            if np.sum(residuals) < 1e-10:
                cols_to_drop.append(col_name)

        X_train = X_train.drop(columns=cols_to_drop)
        X_test = X_test.drop(columns=cols_to_drop)

        model = sm.OLS(Y, X_train).fit()
        mycoef = model.params.fillna(0)

        # .copy() makes this an independent frame before a column is added.
        tmp_pred = X_test[['Store', 'Dept', 'Date']].copy()
        X_test = X_test.drop(['Store', 'Dept', 'Date'], axis=1)

        tmp_pred['Weekly_Pred'] = np.dot(X_test, mycoef)
        fold_preds.append(tmp_pred)

    if fold_preds:
        test_pred = pd.concat(fold_preds, ignore_index=True)
    else:
        test_pred = pd.DataFrame(columns=['Store', 'Dept', 'Date', 'Weekly_Pred'])

    # Plain assignment instead of a chained inplace fillna, which acts on a
    # temporary object under pandas Copy-on-Write.
    test_pred['Weekly_Pred'] = test_pred['Weekly_Pred'].fillna(0)

    # Save the output to CSV
    file_path = f'Data/fold_{j+1}/mypred.csv'
    test_pred.to_csv(file_path, index=False)
    print(f'fold_{j+1} processed')


fold_1 processed
fold_2 processed
fold_3 processed
fold_4 processed
fold_5 processed
fold_6 processed
fold_7 processed
fold_8 processed
fold_9 processed
fold_10 processed


## Evaluate Predictions

In [245]:
# Score the mypred.csv files currently on disk: one WMAE per fold, then the mean.
wae = wmae()

for fold_score in wae:
    print(f"\t{fold_score:.3f}")

mean_wae = sum(wae) / len(wae)
print(f"{mean_wae:.3f}")

	7170.060
	7392.972
	7604.202
	6603.801
	7567.474
	9535.512
	7857.230
	6024.099
	6104.525
	5025.340
7088.522


Linear (OLS-only) model errors: WMAE for each fold, followed by the mean across folds

	2049.347
	1467.113
	1446.882
	1595.628
	2334.678
	1675.221
	1720.828
	1427.286
	1443.787
	1444.677
1660.545