In [1]:
import pandas as pd
import numpy as np
from datetime import date
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import KFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler

In [2]:
# load both datasets
train = pd.read_csv('preprocessed_train_with_avg.csv')
test = pd.read_csv('preprocessed_test_with_avg.csv')

# put in one dataframe; ignore_index=True gives the combined frame a fresh
# 0..n-1 index so row labels coming from the two files do not collide
# (reset_index() returns a new frame, so calling it without assigning the
# result has no effect)
df = pd.concat([train, test], ignore_index=True)

# the separate frames are no longer needed; drop the references
del train
del test

# convert date to datetime (skipped only when the column already is datetime64,
# whatever dtype the raw strings were loaded with)
if not pd.api.types.is_datetime64_any_dtype(df['Date']):
    df['Date'] = pd.to_datetime(df['Date'], format='%Y-%m-%d')

# prepare date as number for the regression
print('Converting date to a number...')
df['DateNumeric'] = df['Date'].values.astype(float)

# get all distinct (year, month) pairs in chronological order; working on the
# monthly periods avoids adding and dropping temporary columns on df
month_periods = df['Date'].dt.to_period('M').dropna().unique()
l = sorted((int(p.year), int(p.month)) for p in month_periods)

# cycle on two months at a time
print('Number of months: {}'.format(len(l)))
# slicing (rather than indexing l[i+1]) keeps a trailing unpaired month as a
# one-element group instead of raising IndexError when the month count is odd
l = [tuple(l[i:i + 2]) for i in range(0, len(l), 2)]
print('Number of folds: {}'.format(len(l) - 1))

# declare new features
print('Declaring new features...')
# start from NaN: np.empty() returns uninitialised memory, so any row that no
# fold ever predicts would otherwise keep an arbitrary value
df['regression_whole'] = np.nan
df['regression_twomonths'] = np.nan

# first 2 months = NaN
print('First two months...')
# the cutoff is a pandas Timestamp: ordering a datetime64 column against a
# datetime.date object raises TypeError on recent pandas versions
# NOTE(review): the cutoff is hard-coded; presumably it is the end of the first
# two-month group in the data - confirm if the input date range changes
first_two_months = (df.Date <= pd.Timestamp('2016-04-30'))
df.loc[first_two_months, 'regression_whole'] = np.nan
df.loc[first_two_months, 'regression_twomonths'] = np.nan

# following months: linear regression

def _month_fold_masks(frame, month_groups, splitter):
    """Build one (train_rows, target_rows) pair of boolean row masks per fold.

    Parameters
    ----------
    frame : DataFrame
        Must contain a datetime64 'Date' column.
    month_groups : list
        Chronologically ordered groups of (year, month) pairs.
    splitter : object
        Cross-validation splitter; ``splitter.split(month_groups)`` yields
        (train positions, target positions) into ``month_groups``.

    Returns
    -------
    list of (Series, Series)
        Boolean Series aligned with ``frame``: rows whose month belongs to the
        fold's training groups, and rows whose month belongs to its target
        groups.
    """
    # one integer key per row (e.g. 201603) so month membership is a single
    # vectorised isin() instead of a Python-level test per row
    row_key = frame['Date'].dt.year * 100 + frame['Date'].dt.month

    def _keys(positions):
        # flatten the selected groups into their year-month keys
        return [year * 100 + month
                for pos in positions
                for (year, month) in month_groups[pos]]

    return [(row_key.isin(_keys(train_pos)), row_key.isin(_keys(target_pos)))
            for train_pos, target_pos in splitter.split(month_groups)]


def _add_regression_feature(frame, month_groups, column, max_train_size=None):
    """Fill ``frame[column]`` with per-fold linear-regression predictions.

    For every time-series fold, and for every StoreID / DayOfWeek (0-6)
    combination, a LinearRegression is fitted on the fold's training rows
    (predictors 'DateNumeric' and 'HasPromotions', target 'NumberOfSales') and
    its predictions are written to ``column`` for the fold's target rows.
    Target rows without any matching training rows are set to NaN.

    Parameters
    ----------
    frame : DataFrame
        Modified in place; ``column`` must already exist.
    month_groups : list
        Chronologically ordered groups of (year, month) pairs; each fold
        predicts one group from the groups preceding it.
    column : str
        Name of the feature column to fill.
    max_train_size : int or None
        Passed to TimeSeriesSplit; limits how many preceding groups are used
        for training (None = all of them).
    """
    splitter = TimeSeriesSplit(len(month_groups) - 1, max_train_size=max_train_size)

    print('Preparing indexes...')
    folds = _month_fold_masks(frame, month_groups, splitter)

    predictors = ['DateNumeric', 'HasPromotions']
    stores = frame.StoreID.unique()

    # iterate one month group at a time
    for fold, (train_rows, target_rows) in enumerate(folds, start=1):
        print('Fold {}'.format(fold))
        for store in stores:
            # the store mask does not depend on the weekday: compute it once
            store_rows = frame.StoreID == store
            for day in range(7):
                group_rows = store_rows & (frame.DayOfWeek == day)
                train_mask = train_rows & group_rows
                target_mask = target_rows & group_rows
                if not target_mask.any():
                    # nothing to predict for this store/day in this fold
                    continue
                if not train_mask.any():
                    # no history to fit on: leave empty
                    frame.loc[target_mask, column] = np.nan
                    continue
                # fit on the training rows, predict the target group
                model = LinearRegression()
                model.fit(frame.loc[train_mask, predictors],
                          frame.loc[train_mask, 'NumberOfSales'])
                frame.loc[target_mask, column] = model.predict(
                    frame.loc[target_mask, predictors])


# version trained on all preceding months
_add_regression_feature(df, l, 'regression_whole')

# version trained on the two preceding months (a single preceding group)
_add_regression_feature(df, l, 'regression_twomonths', max_train_size=1)


Converting date to a number...
Number of months: 26
Number of folds: 12
Declaring new features...
First two months...
Preparing indexes...
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Fold 10
Fold 11
Fold 12
Preparing indexes...
Fold 1
Fold 2
Fold 3
Fold 4
Fold 5
Fold 6
Fold 7
Fold 8
Fold 9
Fold 10
Fold 11
Fold 12


In [3]:
# drop temporary column; errors='ignore' keeps the cell re-runnable once the
# column has already been removed
df = df.drop('DateNumeric', axis=1, errors='ignore')

# add another useful feature: the day of the month, plus 30 for dates that fall
# in an even-numbered month; computed column-wise through the .dt accessor
# rather than with a per-row apply
date_parts = df.Date.dt
df['RegressionDistance'] = date_parts.day + ((date_parts.month + 1) % 2) * 30

In [4]:
# rows dated before the cutoff go to the train file, rows on or after it to the
# test file; the cutoff is a pandas Timestamp because ordering a datetime64
# column against a datetime.date object raises TypeError on recent pandas
split_date = pd.Timestamp('2018-03-01')
df[df.Date < split_date].to_csv('preprocessed_train_regression.csv', index=False)
df[df.Date >= split_date].to_csv('preprocessed_test_regression.csv', index=False)