In [21]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import patsy
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

def preprocess(data):
    """Add calendar features derived from the ``Date`` column.

    The frame is modified in place and also returned, so callers may use
    either the return value or the object they passed in.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``Date`` column parseable by ``pd.to_datetime``.

    Returns
    -------
    pandas.DataFrame
        The same object, with two columns added (or overwritten):
        ``Wk`` - ISO week number as a categorical with the fixed levels
        1..52, and ``Yr`` - the calendar year.
    """
    dates = pd.to_datetime(data['Date'])
    iso_week = dates.dt.isocalendar().week

    # A fixed category list gives every frame the same set of week levels,
    # whatever weeks actually occur in it. Values outside 1..52 (ISO week 53)
    # are not in the list and therefore become NaN.
    week_levels = list(range(1, 53))

    # 'Wk' is assigned before 'Yr' so the resulting column order is Wk, Yr.
    data['Wk'] = pd.Categorical(iso_week, categories=week_levels)
    data['Yr'] = dates.dt.year
    return data

def PCATransform(X_train, X_test, d=8):
    """Project train and test features onto the leading principal components.

    The PCA is fitted on ``X_train`` only and the fitted transform is then
    applied to both inputs, so the test data never influences the components.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix used to fit the PCA.
    X_test : array-like
        Test feature matrix with the same columns as ``X_train``.
    d : int, default 8
        Number of leading components to keep. If ``d`` exceeds the number of
        fitted components, the slice simply returns all of them.

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train_svd, X_test_svd)``, each holding at most ``d`` columns.
    """
    pipeline = Pipeline([('pca', PCA())])
    pipeline.fit(X_train)

    X_train_svd = pipeline.transform(X_train)[:, :d]

    # Slice the test projection with the same `d` as the training projection
    # so that both outputs live in the same reduced feature space.
    X_test_svd = pipeline.transform(X_test)[:, :d]

    return X_train_svd, X_test_svd

In [None]:
# Labelled test rows. They are merged into each fold's test set below and
# supply the columns used for scoring (`IsHoliday`, and presumably
# `Weekly_Sales` - confirm against the CSV schema).
true_sales = pd.read_csv("data/test_with_label.csv")
true_sales_start = 0
true_sales_end = 0

# One weighted absolute error per evaluated fold.
wae = []

num_folds = 11
# NOTE(review): this evaluates folds 2..num_folds, i.e. it reads
# data/fold_2 .. data/fold_11 and skips fold 1 - confirm that is intended.
for fold in range(2, num_folds + 1):
    # Per-pair prediction frames; concatenated once after the inner loop
    # instead of growing a DataFrame on every iteration.
    pair_preds = []

    train = pd.read_csv(f'data/fold_{fold}/train.csv')
    test = pd.read_csv(f'data/fold_{fold}/test.csv')
    # Replace the fold's own IsHoliday column with the labelled rows loaded
    # above (the frame is named `true_sales` in this notebook).
    test = test.drop(columns=['IsHoliday']).merge(true_sales, on=['Date', 'Store', 'Dept'])

    # Get store/dept pairs that appear in both train and test
    train_pairs = train[['Store', 'Dept']].drop_duplicates(ignore_index=True)
    test_pairs = test[['Store', 'Dept']].drop_duplicates(ignore_index=True)
    unique_pairs = pd.merge(train_pairs, test_pairs, how='inner', on=['Store', 'Dept'])

    # Create design matrix for each store/dept pair.
    # Weekly_Sales appears on the right-hand side too so that it is carried
    # along inside X and survives the groupby split; it is removed again
    # before the model is fitted.
    train_split = unique_pairs.merge(train, on=['Store', 'Dept'], how='left')
    train_split = preprocess(train_split)
    y, X = patsy.dmatrices('Weekly_Sales ~ Weekly_Sales + Store + Dept + Yr  + Wk',
                           data=train_split,
                           return_type='dataframe')
    train_split = dict(tuple(X.groupby(['Store', 'Dept'])))

    # The test formula needs some left-hand side; Yr is only a placeholder
    # and the resulting y is not used.
    test_split = unique_pairs.merge(test, on=['Store', 'Dept'], how='left')
    test_split = preprocess(test_split)
    y, X = patsy.dmatrices('Yr ~ Store + Dept + Yr  + Wk',
                           data=test_split,
                           return_type='dataframe')
    X['Date'] = test_split['Date']
    test_split = dict(tuple(X.groupby(['Store', 'Dept'])))

    # Build model for each store/dept pair
    for key in train_split:
        X_train = train_split[key]
        X_test = test_split[key]

        Y = X_train['Weekly_Sales']
        X_train = X_train.drop(['Weekly_Sales', 'Store', 'Dept'], axis=1)

        # Drop columns that are identically zero in the training data
        # (e.g. week indicators that never occur for this pair).
        zero_cols = X_train.columns[(X_train == 0).all()]
        X_train = X_train.drop(columns=zero_cols)
        X_test = X_test.drop(columns=zero_cols)

        # Drop linearly dependent columns: walk from the last column back to
        # index 2 and regress each column on all columns to its left.
        dependent_cols = []
        for col_idx in range(len(X_train.columns) - 1, 1, -1):
            col_name = X_train.columns[col_idx]
            target = X_train.iloc[:, col_idx].values
            earlier = X_train.iloc[:, :col_idx].values

            # lstsq returns an empty residual array when the system is
            # rank-deficient or not over-determined; np.sum of that is 0, so
            # such columns are treated as dependent and dropped as well.
            residuals = np.linalg.lstsq(earlier, target, rcond=None)[1]
            if np.sum(residuals) < 1e-10:
                dependent_cols.append(col_name)

        X_train = X_train.drop(columns=dependent_cols)
        X_test = X_test.drop(columns=dependent_cols)

        # X_train_svd, X_test_svd = PCATransform(X_train, X_test, )

        # Build model and predict
        model = sm.OLS(Y, X_train).fit()
        mycoef = model.params.fillna(0)

        # Copy the identifying columns so that adding the prediction column
        # does not write into a slice of X_test.
        tmp_pred = X_test[['Store', 'Dept', 'Date']].copy()
        X_test = X_test.drop(['Store', 'Dept', 'Date'], axis=1)

        # Select the test columns by coefficient name so the product does not
        # rely on train and test design matrices sharing a column order.
        tmp_pred['Weekly_Pred'] = np.dot(X_test[mycoef.index].to_numpy(), mycoef.to_numpy())
        pair_preds.append(tmp_pred)

    test_pred = pd.concat(pair_preds, ignore_index=True)

    # Attach actual sales and holiday flags to every predicted row.
    new_test = test_pred.merge(test, on=['Date', 'Store', 'Dept'], how='left')

    actuals = new_test['Weekly_Sales']
    preds = new_test['Weekly_Pred']
    # Holiday weeks count five times as much as ordinary weeks.
    weights = new_test['IsHoliday'].apply(lambda x: 5 if x else 1)

    # Weighted absolute error for this fold.
    nib = sum(weights * abs(actuals - preds)) / sum(weights)
    wae.append(nib)
    print(nib)

print(wae)

1467.1125685653747
1446.8820968342973
1595.6281946591039
2334.678115053473
1675.2208943240933
1720.8283693097724
