In [10]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import patsy
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tqdm import tqdm

def preprocess(data):
    tmp = pd.to_datetime(data['Date'])
    data['Wk'] = tmp.dt.isocalendar().week
    data['Yr'] = tmp.dt.year
    data['Wk'] = pd.Categorical(data['Wk'], categories=[i for i in range(1, 53)])  # 52 weeks 
    return data

def PCATransform(train, d=8):
    train_svd = pd.DataFrame()

    for dept in train["Dept"].unique():
        filtered_train = train[train['Dept'] == dept]
        selected_columns = filtered_train[['Store', 'Date', 'Weekly_Sales']]
        
        train_dept_ts = selected_columns.pivot(index='Date', columns='Store', values='Weekly_Sales').reset_index().T
        train_dept_ts = train_dept_ts.fillna(0)

        sales_data = train_dept_ts.iloc[1:].to_numpy()
        store_mean = np.mean(sales_data, axis=1)
        centered_data = (sales_data.T - store_mean).T
        centered_data = centered_data.astype(float)

        try:
            U, S, Vh = np.linalg.svd(centered_data)
            S = np.diag(S)
            smooth_sales = np.dot(U[:, :d], np.dot(S[:d, :d], Vh[:d, :]))
            smooth_sales = (smooth_sales.T + store_mean).T
        
            train_dept_ts.iloc[1:] = smooth_sales
            train_dept_ts = train_dept_ts.T
            
            nib = pd.melt(train_dept_ts, id_vars=['Date'])
            nib["Dept"] = dept
            
            train_svd = pd.concat([train_svd, nib], ignore_index=True)
        except ValueError:
            continue
            
    train_svd = train_svd.rename(columns={"value": "Weekly_Sales"})
    train_svd["Weekly_Sales"] = train_svd["Weekly_Sales"].astype(float)
    train_svd["Store"] = train_svd["Store"].astype(int)
    
    return train_svd

def post_prediction_adjustment(test_pred, shift=1/7):
    # Define the critical weeks
    critical_weeks = ['2011-12-16', '2011-12-23', '2011-12-30', '2012-01-06', '2012-01-13']
    test_pred['Date'] = pd.to_datetime(test_pred['Date'])
    test_pred['Wk'] = test_pred['Date'].dt.isocalendar().week

    # average sales for weeks 49, 50, and 51
    avg_sales_49_51 = test_pred[test_pred['Date'].isin(['2011-12-02', '2011-12-09', '2011-12-16'])].groupby(['Store', 'Dept'])['Weekly_Pred'].mean().reset_index()

    # average sales for weeks 48 and 52
    avg_sales_48_52 = test_pred[test_pred['Date'].isin(['2011-11-25', '2011-12-30'])].groupby(['Store', 'Dept'])['Weekly_Pred'].mean().reset_index()

    merged_avg = pd.merge(avg_sales_49_51, avg_sales_48_52, on=['Store', 'Dept'], how='inner', suffixes=('_49_51', '_48_52'))

    # departments with sales bulge
    bulge_depts = merged_avg[merged_avg['Weekly_Pred_49_51'] > 1.1 * merged_avg['Weekly_Pred_48_52']]

    
    for date in critical_weeks:
        for _, row in bulge_depts.iterrows():
            store, dept = row['Store'], row['Dept']
            current_week_sales = test_pred[(test_pred['Date'] == date) & (test_pred['Store'] == store) & (test_pred['Dept'] == dept)]['Weekly_Pred']
            
            if not current_week_sales.empty:
                test_pred.loc[(test_pred['Date'] == date) & (test_pred['Store'] == store) & (test_pred['Dept'] == dept), 'Weekly_Pred'] *= (1 - shift)
                
                next_week = (pd.to_datetime(date) + pd.Timedelta(weeks=1)).strftime('%Y-%m-%d')
                test_pred.loc[(test_pred['Date'] == next_week) & (test_pred['Store'] == store) & (test_pred['Dept'] == dept), 'Weekly_Pred'] += current_week_sales.values[0] * shift

    return test_pred

In [12]:
num_folds = 10
for fold in range(1, num_folds + 1):
    # pre-allocate a pd to store the predictions
    test_pred = pd.DataFrame()
    
    train = pd.read_csv(f'data/fold_{fold}/train.csv')
    # Smooth data (remove noise) using PCA       
    train = PCATransform(train, d=8)
    
    test = pd.read_csv(f'data/fold_{fold}/test.csv')

    # Get store/dept pairs that appear in both train and test
    train_pairs = train[['Store', 'Dept']].drop_duplicates(ignore_index=True)
    test_pairs = test[['Store', 'Dept']].drop_duplicates(ignore_index=True)
    unique_pairs = pd.merge(train_pairs, test_pairs, how = 'inner', on =['Store', 'Dept'])

    # Create design matrix for each store/dept pair
    train_split = unique_pairs.merge(train, on=['Store', 'Dept'], how='left')
    train_split = preprocess(train_split)
    y, X = patsy.dmatrices('Weekly_Sales ~ Weekly_Sales + Store + Dept + Yr  + Wk + I(Yr**2)', 
                           data = train_split, 
                           return_type='dataframe')
    train_split = dict(tuple(X.groupby(['Store', 'Dept'])))

    test_split = unique_pairs.merge(test, on=['Store', 'Dept'], how='left')
    test_split = preprocess(test_split)
    y, X = patsy.dmatrices('Yr ~ Store + Dept + Yr  + Wk + I(Yr**2)', 
                           data = test_split, 
                           return_type='dataframe')
    X['Date'] = test_split['Date']
    test_split = dict(tuple(X.groupby(['Store', 'Dept'])))

    keys = list(train_split)

    # Build model for each store/dept pair
    for key in tqdm(keys):
        X_train = train_split[key]
        X_test = test_split[key]

        Y = X_train['Weekly_Sales']
        X_train = X_train.drop(['Weekly_Sales','Store', 'Dept'], axis=1)

        # Drop const columns
        cols_to_drop = X_train.columns[(X_train == 0).all()]
        X_train = X_train.drop(columns=cols_to_drop)
        X_test = X_test.drop(columns=cols_to_drop)

        # Drop linearly dependent columns
        cols_to_drop = []
        for i in range(len(X_train.columns) - 1, 1, -1):  # Start from the last column and move backward
            col_name = X_train.columns[i]
            # Extract the current column and all previous columns
            tmp_Y = X_train.iloc[:, i].values
            tmp_X = X_train.iloc[:, :i].values
    
            coefficients, residuals, rank, s = np.linalg.lstsq(tmp_X, tmp_Y, rcond=None)
            if np.sum(residuals) < 1e-10:
                    cols_to_drop.append(col_name)
                
        X_train = X_train.drop(columns=cols_to_drop)
        X_test = X_test.drop(columns=cols_to_drop)

        tmp_pred = X_test[['Store', 'Dept', 'Date']]
        X_test = X_test.drop(['Store', 'Dept', 'Date'], axis=1)
        
        # Build model and predict
        model = sm.OLS(Y, X_train).fit()
        mycoef = model.params.fillna(0)
        
        tmp_pred['Weekly_Pred'] = np.dot(X_test, mycoef)
        test_pred = pd.concat([test_pred, tmp_pred], ignore_index=True)

    test_pred['Weekly_Pred'].fillna(0, inplace=True)
    test_pred = post_prediction_adjustment(test_pred, shift=1/7)
    file_path = f'data/fold_{fold}/mypred.csv'
    print(test_pred.shape, file_path)
    test_pred.to_csv(file_path, index=False)

100%|██████████████████████████████████████████████████████████████████████████████| 3069/3069 [01:11<00:00, 43.22it/s]


(26489, 5) data/fold_1/mypred.csv


100%|██████████████████████████████████████████████████████████████████████████████| 3059/3059 [01:11<00:00, 42.91it/s]


(23524, 5) data/fold_2/mypred.csv


100%|██████████████████████████████████████████████████████████████████████████████| 3073/3073 [01:14<00:00, 41.32it/s]


(26345, 5) data/fold_3/mypred.csv


100%|██████████████████████████████████████████████████████████████████████████████| 3088/3088 [01:18<00:00, 39.51it/s]


(26541, 5) data/fold_4/mypred.csv


100%|██████████████████████████████████████████████████████████████████████████████| 3096/3096 [01:20<00:00, 38.26it/s]


(26815, 5) data/fold_5/mypred.csv


100%|██████████████████████████████████████████████████████████████████████████████| 3112/3112 [01:22<00:00, 37.58it/s]


(23772, 5) data/fold_6/mypred.csv


100%|██████████████████████████████████████████████████████████████████████████████| 3100/3100 [01:23<00:00, 37.19it/s]


(26713, 5) data/fold_7/mypred.csv


100%|██████████████████████████████████████████████████████████████████████████████| 3089/3089 [01:26<00:00, 35.67it/s]


(26560, 5) data/fold_8/mypred.csv


100%|██████████████████████████████████████████████████████████████████████████████| 3078/3078 [01:27<00:00, 35.14it/s]


(26579, 5) data/fold_9/mypred.csv


100%|██████████████████████████████████████████████████████████████████████████████| 3062/3062 [01:26<00:00, 35.41it/s]


(23704, 5) data/fold_10/mypred.csv


In [13]:
def myeval():
    file_path = 'data/test_with_label.csv'
    test_with_label = pd.read_csv(file_path)
    wae = []

    num_folds = 10
    for fold in range(1, num_folds + 1):
        file_path = f'data/fold_{fold}/test.csv'
        test = pd.read_csv(file_path)
        test = test.drop(columns=['IsHoliday']).merge(test_with_label, on=['Date', 'Store', 'Dept'])

        file_path = f'data/fold_{fold}/mypred.csv'
        test_pred = pd.read_csv(file_path)

        # Left join with the test data
        new_test = test_pred.merge(test, on=['Date', 'Store', 'Dept'], how='left')

        # Compute the Weighted Absolute Error
        actuals = new_test['Weekly_Sales']
        preds = new_test['Weekly_Pred']
        weights = new_test['IsHoliday'].apply(lambda x: 5 if x else 1)
        wae.append(sum(weights * abs(actuals - preds)) / sum(weights))

    return wae

wae = myeval() 
print(wae)
print(np.mean(wae))

[1944.027077415397, 1362.0392914112012, 1379.8216756610193, 1525.6178760546272, 2145.015414702814, 1636.6146799855312, 1612.6082414810323, 1354.3464306177025, 1335.893209275752, 1332.118979529825]
1562.8102876134901
