## Import Libraries and Load Data

In [1]:
# Import libraries

import os
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.decomposition import TruncatedSVD
import patsy

# Set seed
np.random.seed(4031)

In [2]:
# Data file locations and names

project_root_dir = "Data"
project_subdir_prefix = "fold_"
train_data_filename = "train.csv"
test_data_filename = "test.csv"


# The number of train/test data folders and the target RMSE for each
# train/test split in each folder

n_datasets = 10

In [3]:
# Get list of data subfolders, each with a separate training and test set.
# fold1 - fold5 have target RMSE 0.125, and fold6 - fold10 have target RMSE 0.135.

os_walk = os.walk(project_root_dir)
data_subdir_list = [subdirs for root, subdirs, files in os_walk][0]
n_subdirs = len(data_subdir_list)

assert(n_subdirs == n_datasets)

In [4]:
# Lists for training and test datasets

train_datasets = []
test_datasets = []


# Loop over subfolders and read in training/test datasets and test weekly sales.
# Use a loop instead of using os.walk directly to avoid "fold10" immediately following "fold1".

for subdir_num in np.arange(n_subdirs) + 1:
    subdir_num_str = str(subdir_num)
    train_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   train_data_filename)))
    test_datasets.append(pd.read_csv(os.path.join(project_root_dir,
                                                   project_subdir_prefix + subdir_num_str,
                                                   test_data_filename)))

## Define Scoring function

In [22]:
# Define a WMAE function for scoring

def wmae():
    file_path = 'Data/test_with_label.csv'
    test = pd.read_csv(file_path)
    num_folds = 10
    wae = []

    for i in range(num_folds):
        file_path = f'Data/fold_{i+1}/mypred.csv'
        test_pred = pd.read_csv(file_path)

        # Left join with the test data
        new_test = test_pred.merge(test, on=['Date', 'Store', 'Dept'], how='left')

        # Compute the Weighted Absolute Error
        actuals = new_test['Weekly_Sales']
        preds = new_test['Weekly_Pred']
        weights = new_test['IsHoliday'].apply(lambda x: 5 if x else 1)
        wae.append(sum(weights * abs(actuals - preds)) / sum(weights))

    return wae

## Preprocess Data

In [None]:
#Loop through data and pull apart date into week and year
def preprocess(data):
    #Split date into useful features
    tmp = pd.to_datetime(data['Date'])
    data['Wk'] = tmp.dt.isocalendar().week
    data['Yr'] = tmp.dt.year
    data['Wk'] = pd.Categorical(data['Wk'], categories=[i for i in range(1, 53)])  # 52 weeks 

    #One hot encode Wk
    data = pd.get_dummies(data, columns=['Wk'], prefix='Week')

    return data

## SVD Implementation (not working)

In [None]:
for j in range(n_datasets):
    train = train_datasets[j]
    test = test_datasets[j]

    # Initialize the DataFrame to store predictions
    test_pred = pd.DataFrame()

    fold_train = preprocess(train)
    fold_test = preprocess(test)

    stores = fold_train["Store"].unique()
    depts = fold_train["Dept"].unique()
    years = fold_train["Yr"].unique()

    for store in stores:
        for dept in depts:
            for year in years:
                #Find training and test data within same store and then same department
                train = fold_train[(fold_train["Store"] == store) & (fold_train["Dept"] == dept) & (fold_train["Yr"] == year)]
                test = fold_test[(fold_test["Store"] == store) & (fold_test["Dept"] == dept) & (fold_test["Yr"] == year)]

                #abort if the train data is non-existant (i.e., this combo of store/dept doesnt appear in data)
                if len(train) == 0:
                    continue

                Y_train = train["Weekly_Sales"]
                X_train = train.drop(["Weekly_Sales", "Date"], axis=1)

                X_test = test

                #Keep Store, dept, and date info for later merging
                tmp_pred = X_test[['Store', 'Dept', 'Date']]
                X_test = X_test.drop(["Date"], axis=1)


                #Implement SVD
                #n_components=5
                #svd_df = pd.DataFrame(svd_result, columns=[[f'SVD_{i}' for i in range (n_components)]]) 
                svd = TruncatedSVD() 
                svd_result = svd.fit_transform(X_train)
                svd_df = pd.DataFrame(svd_result) 

                #Train model on only the features SVD selected
                model = sm.OLS(Y_train, svd_df).fit()
                mycoef = model.params.fillna(0)
                
                #Fit SVD columns for test set
                X_test = svd.transform(X_test)
                
                #Predict Y
                tmp_pred['Weekly_Pred'] = np.dot(X_test, mycoef)

                #Readd context of store, dept, and date
                test_pred = pd.concat([test_pred, tmp_pred], ignore_index=True)
            
        
    test_pred['Weekly_Pred'].fillna(0, inplace=True)
    # Save the output to CSV
    file_path = f'Data/fold_{j+1}/mypred.csv'
    print(f'fold_{j+1} processed')
    test_pred.to_csv(file_path, index=False)


## Linear Refactor (not working)

In [None]:
for j in range(n_datasets):
    train = train_datasets[j]
    test = test_datasets[j]

    # Initialize the DataFrame to store predictions
    test_pred = pd.DataFrame()

    fold_train = preprocess(train)
    fold_test = preprocess(test)

    stores = fold_train["Store"].unique()
    depts = fold_train["Dept"].unique()
    years = fold_train["Yr"].unique()

    for store in stores:
        for dept in depts:
            for year in years:
                #Find training and test data within same store and then same department
                train = fold_train[(fold_train["Store"] == store) & (fold_train["Dept"] == dept) & (fold_train["Yr"] == year)]
                test = fold_test[(fold_test["Store"] == store) & (fold_test["Dept"] == dept) & (fold_test["Yr"] == year)]

                #abort if the train data is non-existant (i.e., this combo of store/dept doesnt appear in data)
                if len(train) == 0:
                    continue

                Y_train = train["Weekly_Sales"]
                X_train = train.drop(["Weekly_Sales", "Date"], axis=1)

                X_test = test

                #Keep Store, dept, and date info for later merging
                tmp_pred = X_test[['Store', 'Dept', 'Date']]
                X_test = X_test.drop(["Date"], axis=1)

                X_train = X_train.drop(["Store", "Dept", "IsHoliday"], axis=1)
                X_test = X_test.drop(["Store", "Dept", "IsHoliday"], axis=1)

                #Add intercept columns
                X_train["Intercept"] = 1
                X_test["Intercept"] = 1

                #Cast one hot bools to ints
                X_train = X_train.astype(int)
                X_test = X_test.astype(int)

                #Train model on only the features SVD selected
                model = sm.OLS(Y_train, X_train).fit()
                mycoef = model.params.fillna(0)
                
                #Predict Y
                tmp_pred['Weekly_Pred'] = np.dot(X_test, mycoef)

                #Readd context of store, dept, and date
                test_pred = pd.concat([test_pred, tmp_pred], ignore_index=True)
            
        
    test_pred['Weekly_Pred'].fillna(0, inplace=True)
    # Save the output to CSV
    file_path = f'Data/fold_{j+1}/mypred.csv'
    print(f'fold_{j+1} processed')
    test_pred.to_csv(file_path, index=False)

## Original Code (OLS only)

In [5]:
#OLS method requires a different preprocess function

def preprocess(data):
    #Split date into useful features
    tmp = pd.to_datetime(data['Date'])
    data['Wk'] = tmp.dt.isocalendar().week
    data['Yr'] = tmp.dt.year
    data['Wk'] = pd.Categorical(data['Wk'], categories=[i for i in range(1, 53)])  # 52 weeks 

    return data


In [None]:
#Original OLS model

# Loop over folds
for j in range(n_datasets):
    
    # Get a pair of training and test sets
    train = train_datasets[j]
    test = test_datasets[j]

    test_pred = pd.DataFrame()

    # Identify the distinct store/dept pairs shared by the training and test set.
    train_pairs = train[['Store', 'Dept']].drop_duplicates(ignore_index=True)
    test_pairs = test[['Store', 'Dept']].drop_duplicates(ignore_index=True)
    unique_pairs = pd.merge(train_pairs, test_pairs, how = 'inner', on =['Store', 'Dept'])

    # Join the distinct store/dept pairs to the training set.
    # Why left join? When would training data not be available?
    train_split = unique_pairs.merge(train, on=['Store', 'Dept'], how='left')
    
    # Add numeric column for the year and a categorical column for week # to the training set
    train_split = preprocess(train_split)
    # Get design matrices for training y and X.
    # y is just the target variable, Weekly_Sales.
    # X has pivoted weeks, where individual weeks are separate 0/1 columns.
    y, X = patsy.dmatrices('Weekly_Sales ~ Weekly_Sales + Store + Dept + Yr  + Wk', 
                        data = train_split, 
                        return_type='dataframe')
    # Get dictionary where keys are (Store, Dept) tuples, and values are the
    # \"Weekly_Sales + Store + Dept + Yr + Wk\" design matrices corresponding to each key.
    # The design matrices include an Intercept column with value 1.
    train_split = dict(tuple(X.groupby(['Store', 'Dept'])))

    
    # Now join the distinct store/dept pairs to the test set.
    # Same question: why left join? When would training data not be available?
    test_split = unique_pairs.merge(test, on=['Store', 'Dept'], how='left')
    # Add numeric column for the year and a categorical column for week # to the test set
    test_split = preprocess(test_split)
    # Get design matrices for text y and X.
    # y is the Year, and the design matrix is \"Store + Dept + Yr + Wk\".
    # Note that test sets don't have the Weekly_Sales target variable.
    # Why save Year as y?
    y, X = patsy.dmatrices('Yr ~ Store + Dept + Yr  + Wk', 
                        data = test_split, 
                        return_type='dataframe')
    # Re-add Date column to the design matrix X
    X['Date'] = test_split['Date']
    # Get dictionary where keys are (Store, Dept) tuples, and values are the
    # \"Yr  + Wk + Date\" design matrices corresponding to each key.
    test_split = dict(tuple(X.groupby(['Store', 'Dept'])))

    # Get the training (store, dept) tuples.
    # SHOULD be the same keys as in test, given the left joins above.
    keys = list(train_split)

    # Loop over (store, dept) tuples
    for key in keys:
        # Get training and test design matrices corresponding to (store, dept)
        X_train = train_split[key]
        X_test = test_split[key]
    
        # Target variable for (store, dept)
        Y = X_train['Weekly_Sales']
        # Drop ID and target to get just a table of predictors
        X_train = X_train.drop(['Weekly_Sales','Store', 'Dept'], axis=1)
        
        # Identify columns that are all zero in training predictors, and drop them
        # from both training and test X.
        # This should drop weeks that are not represented in the training data.
        # How does this affect test X? Are there cases where all test weeks would be dropped?
        cols_to_drop = X_train.columns[(X_train == 0).all()]
        X_train = X_train.drop(columns=cols_to_drop)
        X_test = X_test.drop(columns=cols_to_drop)

        
        # Identify X training columns that are highly collinear with the columns to the left.
        # Note that this doesn't check the Intercept column.
        cols_to_drop = []
        for i in range(len(X_train.columns) - 1, 1, -1):  # Start from the last column and move backward
            col_name = X_train.columns[i]
            # Extract the current column and all previous columns
            tmp_Y = X_train.iloc[:, i].values
            tmp_X = X_train.iloc[:, :i].values

            coefficients, residuals, rank, s = np.linalg.lstsq(tmp_X, tmp_Y, rcond=None)
            if np.sum(residuals) < 1e-10:
                    cols_to_drop.append(col_name)
                
        # Drop those collinear columns from both training and test X.
        X_train = X_train.drop(columns=cols_to_drop)
        X_test = X_test.drop(columns=cols_to_drop)
        print(X_train)
        # Fit a regular ordinary least squares model on training Weekly_Sales.
        model = sm.OLS(Y, X_train).fit()
        mycoef = model.params.fillna(0)
        
        tmp_pred = X_test[['Store', 'Dept', 'Date']]
        X_test = X_test.drop(['Store', 'Dept', 'Date'], axis=1)
        
        tmp_pred['Weekly_Pred'] = np.dot(X_test, mycoef)
        test_pred = pd.concat([test_pred, tmp_pred], ignore_index=True)
        
    test_pred['Weekly_Pred'].fillna(0, inplace=True)
    # Save the output to CSV
    file_path = f'Data/fold_{j+1}/mypred.csv'
    print(f'fold_{j+1} processed')
    test_pred.to_csv(file_path, index=False)

## Evaluate Predictions

In [None]:
wae = wmae()
for value in wae:
    print(f"\t{value:.3f}")
print(f"{sum(wae) / len(wae):.3f}")

Linear errors

	2049.347
	1467.113
	1446.882
	1595.628
	2334.678
	1675.221
	1720.828
	1427.286
	1443.787
	1444.677
1660.545

## Edit original OLS: group stores by department and add SVD/PCA

Preprocessing steps
1. Group training data by department
2. Pivot each department's data so that stores are rows and dates are columns, with values = weekly sales
3. Fill in missing stores and dates, setting their sales to zero
4. Center store values
5. Perform SVD
6. Re-add store means
7. Use the SVD output as y_train for x_train = \[Year, Week, Store\]

In [6]:
# Try on a single fold: j = 2
j = 2

# Components to return from SVD. This is from the example in Campuswire post #364:
# https://campuswire.com/c/G06C55090/feed/364
n_components = 8
temp_seed = 4031
    
# Get a pair of training and test sets
train = train_datasets[j]
test = test_datasets[j]

test_pred = pd.DataFrame()

# Identify the distinct store/dept pairs shared by the training and test set.
# Will only process these.

train_pairs = train[['Store', 'Dept']].drop_duplicates(ignore_index=True)
test_pairs = test[['Store', 'Dept']].drop_duplicates(ignore_index=True)
unique_pairs = pd.merge(train_pairs, test_pairs, how = 'inner', on =['Store', 'Dept'])

# Join the distinct store/dept pairs to the training set.
# Why left join? When would training data not be available?
train_split = unique_pairs.merge(train, on=['Store', 'Dept'], how='left')

In [7]:
# Get unique stores and dates. This is to help fill in zeroes for any departments
# that are missing from certain stores and dates.

train_store_list = train_split["Store"].unique().tolist()
train_date_list = train_split["Date"].unique().tolist()

In [8]:
# Prep train data for SVD/PCA.
# For each department, construct a dataframe consisting of:
# rows = dates; columns = store numbers; and values = weekly sales.

# Get columns needed for SVD
train_sel_col = train_split[["Store", "Dept", "Date", "Weekly_Sales"]]

In [9]:
# Set Date to categorical to help fill in sales dates missing for any stores/depts
train_sel_col["Date"] = pd.Categorical(train_sel_col["Date"], categories=train_date_list)
train_sel_col = train_sel_col.groupby(["Store", "Dept", "Date"], as_index=False).first()
train_sel_col


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_sel_col["Date"] = pd.Categorical(train_sel_col["Date"], categories=train_date_list)


Unnamed: 0,Store,Dept,Date,Weekly_Sales
0,1,1,2010-02-05,24924.50
1,1,1,2010-02-12,46039.49
2,1,1,2010-02-19,41595.55
3,1,1,2010-02-26,19403.54
4,1,1,2010-03-05,21827.90
...,...,...,...,...
259510,45,99,2011-05-27,
259511,45,99,2011-06-03,
259512,45,99,2011-06-10,
259513,45,99,2011-06-17,


In [10]:
# Set Date column back to datetime from categorical
train_sel_col["Date"] = pd.to_datetime(train_sel_col["Date"])
train_sel_col

Unnamed: 0,Store,Dept,Date,Weekly_Sales
0,1,1,2010-02-05,24924.50
1,1,1,2010-02-12,46039.49
2,1,1,2010-02-19,41595.55
3,1,1,2010-02-26,19403.54
4,1,1,2010-03-05,21827.90
...,...,...,...,...
259510,45,99,2011-05-27,
259511,45,99,2011-06-03,
259512,45,99,2011-06-10,
259513,45,99,2011-06-17,


In [11]:
# Set Store to categorical to help fill in stores missing for any dates/depts
train_sel_col["Store"] = pd.Categorical(train_sel_col["Store"], categories=train_store_list)
train_sel_col = train_sel_col.groupby(["Store", "Dept", "Date"], as_index=False).first()
train_sel_col

Unnamed: 0,Store,Dept,Date,Weekly_Sales
0,1,1,2010-02-05,24924.50
1,1,1,2010-02-12,46039.49
2,1,1,2010-02-19,41595.55
3,1,1,2010-02-26,19403.54
4,1,1,2010-03-05,21827.90
...,...,...,...,...
259510,45,99,2011-05-27,
259511,45,99,2011-06-03,
259512,45,99,2011-06-10,
259513,45,99,2011-06-17,


In [12]:
# Set store back to numeric
train_sel_col["Store"] = train_sel_col["Store"].astype(int)
train_sel_col

Unnamed: 0,Store,Dept,Date,Weekly_Sales
0,1,1,2010-02-05,24924.50
1,1,1,2010-02-12,46039.49
2,1,1,2010-02-19,41595.55
3,1,1,2010-02-26,19403.54
4,1,1,2010-03-05,21827.90
...,...,...,...,...
259510,45,99,2011-05-27,
259511,45,99,2011-06-03,
259512,45,99,2011-06-10,
259513,45,99,2011-06-17,


In [13]:

# Pivot dataframe so that stores are rows and sales dates columns, with values = Weekly_Sales.
# Fill in missing values with zeroes.
train_pivot = train_sel_col.pivot(index=["Dept", "Store"], columns="Date", values="Weekly_Sales").reset_index().fillna(0)
train_pivot

# Fill in any missing dates for any stores and departments.
#train_pivot["Date"] = pd.Categorical(train_pivot["Date"], categories=train_pivot["Date"].unique())
#train_pivot = train_pivot.groupby(["Dept", "Date"], as_index=False).first()

Date,Dept,Store,2010-02-05 00:00:00,2010-02-12 00:00:00,2010-02-19 00:00:00,2010-02-26 00:00:00,2010-03-05 00:00:00,2010-03-12 00:00:00,2010-03-19 00:00:00,2010-03-26 00:00:00,...,2011-04-22 00:00:00,2011-04-29 00:00:00,2011-05-06 00:00:00,2011-05-13 00:00:00,2011-05-20 00:00:00,2011-05-27 00:00:00,2011-06-03 00:00:00,2011-06-10 00:00:00,2011-06-17 00:00:00,2011-06-24 00:00:00
0,1,1,24924.50,46039.49,41595.55,19403.54,21827.90,21043.39,22136.64,26229.21,...,50510.31,41512.39,20138.19,17235.15,15136.78,15741.60,16434.15,15883.52,14978.09,15682.81
1,1,2,35034.06,60483.70,58221.52,25962.32,27372.05,28660.87,28446.92,32213.99,...,72065.08,57905.55,24658.53,22484.33,21590.53,21334.34,22174.12,21554.22,22431.56,21426.72
2,1,3,6453.58,12748.72,8918.31,4992.00,5172.73,5540.28,5118.12,6620.12,...,15862.38,9410.55,5941.28,4660.39,5142.22,5194.21,4906.27,4790.03,5738.30,4484.47
3,1,4,38724.42,69872.44,49937.09,30107.54,31580.69,29452.49,30853.27,33401.19,...,83445.89,50759.95,25494.77,27189.70,25198.40,25035.61,25401.86,24726.22,27152.27,26859.67
4,1,5,9323.89,16861.10,11417.67,7168.41,8344.13,7531.45,8719.27,9945.67,...,18238.83,12830.22,8751.74,7871.57,8169.97,7901.75,8175.25,8170.13,8157.72,8392.34
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3550,99,41,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,45.00,25.00,0.00,0.00,0.00,0.00,100.00,0.00
3551,99,42,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3552,99,43,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00
3553,99,44,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,...,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00,0.00


In [14]:
# Group by Dept. Create dict where key = Dept and value = dataframe of Store/Date/Weekly_Sales.
train_split = dict(tuple(train_pivot.groupby(["Dept"])))
train_split

{1: Date  Dept  Store  2010-02-05 00:00:00  2010-02-12 00:00:00  \
 0        1      1             24924.50             46039.49   
 1        1      2             35034.06             60483.70   
 2        1      3              6453.58             12748.72   
 3        1      4             38724.42             69872.44   
 4        1      5              9323.89             16861.10   
 5        1      6             25619.00             43749.81   
 6        1      7              8970.97             14026.65   
 7        1      8             16181.89             34262.09   
 8        1      9             12861.40             20273.94   
 9        1     10             40212.84             67699.32   
 10       1     11             19611.13             39996.04   
 11       1     12             17426.75             37734.82   
 12       1     13             46761.90             78415.94   
 13       1     14             32842.31             39972.64   
 14       1     15             12239.

In [15]:
# Try out SVD on a single department's data.
train_1_sales = train_split[1].to_numpy()[:, 2:]
train_1_sales

array([[24924.5 , 46039.49, 41595.55, ..., 15883.52, 14978.09, 15682.81],
       [35034.06, 60483.7 , 58221.52, ..., 21554.22, 22431.56, 21426.72],
       [ 6453.58, 12748.72,  8918.31, ...,  4790.03,  5738.3 ,  4484.47],
       ...,
       [ 6476.76, 18597.64,  9939.45, ...,  5450.76,  5580.68,  5696.96],
       [ 6871.2 , 12315.65,  7751.11, ...,  7632.33,  7580.63,  7377.88],
       [18628.11, 22416.94, 28756.53, ..., 10312.84, 11742.  , 11570.03]])

In [16]:
# Get store means
train_1_store_means = np.mean(train_1_sales, axis=1)
train_1_store_means = train_1_store_means[:, np.newaxis]
train_1_store_means

array([[23292.71958904],
       [31835.44041096],
       [ 6576.02342466],
       [36351.0490411 ],
       [ 9403.93410959],
       [24362.51780822],
       [ 9181.15972603],
       [15059.11520548],
       [11614.67273973],
       [41353.18561644],
       [18490.01986301],
       [17474.41794521],
       [46937.28972603],
       [31719.0960274 ],
       [14102.94876712],
       [11799.6390411 ],
       [21827.37726027],
       [23729.38054795],
       [21799.16972603],
       [43259.01493151],
       [15318.60438356],
       [22267.34958904],
       [32764.28684932],
       [19396.14534247],
       [20784.72219178],
       [20080.48109589],
       [30960.07315068],
       [20516.37438356],
       [15889.66493151],
       [10341.54589041],
       [17448.42506849],
       [22026.30945205],
       [ 1899.6990411 ],
       [19785.75      ],
       [18506.21      ],
       [ 2106.34424658],
       [10700.5660274 ],
       [ 6314.25369863],
       [20466.10452055],
       [19132.82410959],


In [17]:
np.sum(train_1_sales[0, :]) / 73

23292.719589041095

In [18]:
train_1_sales.shape, train_1_store_means.shape

((45, 73), (45, 1))

In [19]:
# Center sales data
train_1_centered_sales = train_1_sales - train_1_store_means
train_1_centered_sales

array([[ 1.63178041e+03,  2.27467704e+04,  1.83028304e+04, ...,
        -7.40919959e+03, -8.31462959e+03, -7.60990959e+03],
       [ 3.19861959e+03,  2.86482596e+04,  2.63860796e+04, ...,
        -1.02812204e+04, -9.40388041e+03, -1.04087204e+04],
       [-1.22443425e+02,  6.17269658e+03,  2.34228658e+03, ...,
        -1.78599342e+03, -8.37723425e+02, -2.09155342e+03],
       ...,
       [-1.08795589e+03,  1.10329241e+04,  2.37473411e+03, ...,
        -2.11395589e+03, -1.98403589e+03, -1.86775589e+03],
       [-7.21060959e+02,  4.72338904e+03,  1.58849041e+02, ...,
         4.00690411e+01, -1.16309589e+01, -2.14380959e+02],
       [ 1.77472329e+02,  3.96630233e+03,  1.03058923e+04, ...,
        -8.13779767e+03, -6.70863767e+03, -6.88060767e+03]])

In [20]:
# Perform SVD on centered sales data
train_1_U, train_1_S, train_1_V = np.linalg.svd(train_1_centered_sales)

In [21]:
train_1_U.shape, train_1_S.shape, train_1_V.shape

((45, 45), (45,), (73, 73))

In [22]:
# Reduce the number of components
train_1_U_reduced = train_1_U[:, :n_components]
train_1_D_reduced = np.diag(train_1_S[:n_components])
train_1_Vt_reduced = train_1_V[:, :n_components].T

In [23]:
train_1_U_reduced.shape, train_1_D_reduced.shape, train_1_Vt_reduced.shape

((45, 8), (8, 8), (8, 73))

In [24]:
# Regenerate smoothed sales
train_1_sales_smooth = (train_1_U_reduced @ train_1_D_reduced @ train_1_Vt_reduced) + train_1_store_means
train_1_sales_smooth

array([[27087.93052381, 35945.45823503, 25868.24000487, ...,
        32391.71845372, 16496.56645842, 17914.95695844],
       [37319.08835908, 51286.64996211, 34051.27335302, ...,
        45312.35064211, 23005.0320505 , 23650.40154193],
       [ 7358.90339863,  8767.23063961,  8241.43570697, ...,
         8960.55611647,  4864.8764816 ,  5577.85533269],
       ...,
       [ 8943.23246437, 12029.17031054,  8312.07322133, ...,
         9872.0692972 ,  6709.5125888 ,  8392.49439072],
       [ 6943.79232781,  7088.24929425,  7340.88303104, ...,
         9236.6848497 ,  7492.84689946,  6648.14575364],
       [19273.19784604, 24652.58996819, 25447.01735677, ...,
        26339.00913538, 12267.58177506,  8897.68855028]])

In [25]:
train_1_sales_smooth.shape

(45, 73)

In [26]:
# Convert smoothed sales from array to dataframe
train_1_sales_smooth_df = pd.DataFrame(train_1_sales_smooth, columns=train_date_list)
# Re-add Store as a column
train_1_sales_smooth_df["Store"] = train_store_list
train_1_sales_smooth_df

Unnamed: 0,2010-02-05,2010-02-12,2010-02-19,2010-02-26,2010-03-05,2010-03-12,2010-03-19,2010-03-26,2010-04-02,2010-04-09,...,2011-04-29,2011-05-06,2011-05-13,2011-05-20,2011-05-27,2011-06-03,2011-06-10,2011-06-17,2011-06-24,Store
0,27087.930524,35945.458235,25868.240005,26815.591272,7020.224163,37617.33266,11234.392742,25176.597699,25707.506725,35127.525386,...,25098.176517,10138.733302,26390.706177,27156.50989,9541.4784,8989.998822,32391.718454,16496.566458,17914.956958,1
1,37319.088359,51286.649962,34051.273353,38137.759861,10159.596651,51546.185431,15219.78986,34788.186072,32754.781812,48395.781187,...,32887.380972,13530.33877,37093.597836,37410.285318,11919.860027,12219.53,45312.350642,23005.032051,23650.401542,2
2,7358.903399,8767.23064,8241.435707,6329.4587,1564.568301,10742.322733,3258.803851,7085.737867,8501.743328,11358.782303,...,7638.279102,2534.294467,7003.879946,5770.839737,1478.041904,2324.944323,8960.556116,4864.876482,5577.855333,3
3,38000.365456,47388.293752,35437.558925,38690.8994,22353.184194,51525.706643,25281.364631,46058.903414,35464.310662,58181.216356,...,41732.04605,20904.092063,35971.012919,39139.166589,14201.221366,14929.298824,52415.060085,30658.028946,28496.193052,4
4,10227.3165,11588.036811,11434.642183,9498.796189,4323.580693,12874.640061,5824.085202,8392.865623,11502.557148,12139.383962,...,9719.035174,5829.509301,9979.829781,9815.09188,5656.537371,5011.762683,11376.245033,7425.962881,8114.331008,5
5,27073.097401,31147.701382,26738.412761,27647.442676,14004.183244,33205.521051,12684.784559,22738.806284,26659.397331,32653.689309,...,22654.3945,11637.306612,28261.220261,27509.016939,10646.237046,11766.958818,33089.728283,19960.41699,17959.373728,6
6,9662.465963,10900.806141,9128.812313,10158.312486,7161.952428,11218.060818,6519.686904,9239.963333,9096.279459,11505.237951,...,8739.125417,5818.351998,10237.70825,10431.17823,5157.109078,6144.445432,11831.605209,8260.26404,7112.531776,7
7,17509.060439,22934.955828,17417.436753,16357.410978,3822.815638,25513.206627,7121.457476,17241.804345,17626.410998,26068.163749,...,17981.986979,6683.342796,16148.965332,14075.726925,4693.184959,4685.354313,21962.695922,11208.947153,13437.565007,8
8,12304.020488,13839.318009,12963.399207,11597.310575,6740.475607,15924.68621,7768.233505,12598.205677,13093.669927,17524.184585,...,12595.258749,6861.147491,11911.107746,10840.451126,5148.15052,5860.971523,15245.604183,9971.278887,9990.222288,9
9,44907.608919,50530.949581,44732.296196,46131.27924,21370.335061,58670.357226,16888.645054,39275.916776,45940.335726,61521.484853,...,35389.544695,6713.072206,52238.162242,51908.683232,1395.472037,17009.62705,60606.687733,31870.77583,20058.15736,10


In [27]:
# Unpivot the dataframe
train_1_sales_smooth_unpivoted_df = train_1_sales_smooth_df.melt(id_vars=["Store"], var_name="Date", value_name="Weekly_Sales")
train_1_sales_smooth_unpivoted_df

Unnamed: 0,Store,Date,Weekly_Sales
0,1,2010-02-05,27087.930524
1,2,2010-02-05,37319.088359
2,3,2010-02-05,7358.903399
3,4,2010-02-05,38000.365456
4,5,2010-02-05,10227.316500
...,...,...,...
3280,41,2011-06-24,15002.058441
3281,42,2011-06-24,9889.570102
3282,43,2011-06-24,8392.494391
3283,44,2011-06-24,6648.145754


In [28]:
# Split train_1_X from train_1_Y
train_1_y = train_1_sales_smooth_unpivoted_df["Weekly_Sales"]
train_1_X = train_1_sales_smooth_unpivoted_df.drop(["Weekly_Sales"], axis=1)

# Add year and week columns to train_1_X
train_1_X = preprocess(train_1_X)
# Drop original date column
train_1_X.drop(["Date"], axis=1, inplace=True)
train_1_X

Unnamed: 0,Store,Wk,Yr
0,1,5,2010
1,2,5,2010
2,3,5,2010
3,4,5,2010
4,5,5,2010
...,...,...,...
3280,41,25,2011
3281,42,25,2011
3282,43,25,2011
3283,44,25,2011


In [30]:
# Create dummy variables for Wk
y, X = patsy.dmatrices("Yr ~ Store + Yr + Wk", data=train_1_X, return_type="dataframe")
X

Unnamed: 0,Intercept,Wk[T.2],Wk[T.3],Wk[T.4],Wk[T.5],Wk[T.6],Wk[T.7],Wk[T.8],Wk[T.9],Wk[T.10],...,Wk[T.45],Wk[T.46],Wk[T.47],Wk[T.48],Wk[T.49],Wk[T.50],Wk[T.51],Wk[T.52],Store,Yr
0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2010.0
1,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,2010.0
2,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,2010.0
3,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2010.0
4,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2010.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3280,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,41.0,2011.0
3281,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,42.0,2011.0
3282,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,43.0,2011.0
3283,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,44.0,2011.0


In [31]:
# Make sure fitting the model at least doesn't throw an error
model = sm.OLS(train_1_y, X).fit()
mycoef = model.params.fillna(0)

In [32]:
mycoef

Intercept    5.670528e+06
Wk[T.2]      2.005652e+03
Wk[T.3]     -2.392168e+02
Wk[T.4]     -1.384653e+03
Wk[T.5]     -1.502016e+03
Wk[T.6]      4.291664e+03
Wk[T.7]     -1.815606e+03
Wk[T.8]      5.812320e+02
Wk[T.9]     -1.129349e+04
Wk[T.10]     2.143693e+03
Wk[T.11]    -9.960302e+03
Wk[T.12]    -7.053408e+03
Wk[T.13]    -7.025903e+02
Wk[T.14]     2.799607e+03
Wk[T.15]     5.017133e+03
Wk[T.16]     5.390847e+03
Wk[T.17]    -4.983336e+03
Wk[T.18]    -8.001707e+03
Wk[T.19]    -7.379913e+02
Wk[T.20]     2.815857e+03
Wk[T.21]    -1.214483e+04
Wk[T.22]    -1.530510e+04
Wk[T.23]     5.723521e+03
Wk[T.24]     1.494324e+03
Wk[T.25]    -2.503484e+03
Wk[T.26]     1.906887e+04
Wk[T.27]    -8.087700e+03
Wk[T.28]     4.227973e+03
Wk[T.29]     7.778773e+03
Wk[T.30]    -3.363742e+04
Wk[T.31]     6.950086e+03
Wk[T.32]    -1.276496e+04
Wk[T.33]     2.682611e+03
Wk[T.34]     5.337387e+03
Wk[T.35]     1.087041e+04
Wk[T.36]    -2.682012e+04
Wk[T.37]    -9.104652e+03
Wk[T.38]    -3.486441e+03
Wk[T.39]    

In [87]:
# QUESTIONS:
# 1) If Week is categorical, is it a good idea to treat Store as categorical too?

# NEXT STEPS:
# 1) Consolidate data-prep logic
# 2) Loop over folds and departments
# 3) Prep training and test data
# 4) Fit model and make predictions
# 5) Keep running tally of predictions for each fold
# 6) Save predictions
# 7) Output WMAE per fold