In [1]:
#General imports
import pandas as pd
import numpy as np
import scipy
import re

import os, sys, gc, time, warnings, pickle, psutil, random

import time

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)

In [2]:
TARGET = 'sales'         # Our main target
END_TRAIN = 1913         # Last day in train set
MAIN_INDEX = ['id','d']  # We can identify item by these columns

In [3]:
train = pd.read_csv('sales_train_validation.csv')

In [4]:
train_ca = train[train['state_id']=='CA']#break up by state in order to compute faster
train_tx = train[train['state_id']=='TX']
train_wi = train[train['state_id']=='WI']

In [5]:
index_columns = ['id','item_id','dept_id','cat_id','store_id','state_id']
train_ca = pd.melt(train_ca, 
                  id_vars = index_columns, 
                  var_name = 'd', 
                  value_name = TARGET)
train_tx = pd.melt(train_tx, 
                  id_vars = index_columns, 
                  var_name = 'd', 
                  value_name = TARGET)
train_wi = pd.melt(train_wi, 
                  id_vars = index_columns, 
                  var_name = 'd', 
                  value_name = TARGET)

Let's create some temporary df's for the purposes of engineering lag features.

In [6]:
'''Given our size of our data, it might be wise to record how long each function takes so we know where
our bottlenecks are.'''

from functools import wraps

def timer(func):
    """A decorator that prints how long a function took to run."""

    # Define the wrapper function to return.
    @wraps #preserve the metadata of our function.
    def wrapper(*args, **kwargs):
        # When wrapper() is called, get the current time.
        t_start = time.time()

        # Call the decorated function and store the result.
        result = func(*args, **kwargs)

        # Get the total time it took to run, and print it.
        t_total = time.time() - t_start

        print('{} took {}s'.format(func.__name__, t_total))        
        return result
    return wrapper

In [8]:
'''Since our data is already sorted by 'd' values, we can easily shift() values
as we as aggregate values.'''

lags = [col for col in range(15,36,2)]

start_time = time.time()
ca_shifts = train_ca[['id','d',TARGET]]
ca_shifts = ca_shifts.assign(**{
        '{}_lag_{}'.format(col, l): ca_shifts.groupby(['id'])[col].transform(lambda x: x.shift(l))
        for l in lags
        for col in [TARGET]
    })

print('%0.2f min: Time for bulk shift' % ((time.time() - start_time) / 60))

2.62 min: Time for bulk shift


In [9]:
tx_shifts = train_tx[['id','d',TARGET]]
tx_shifts = tx_shifts.assign(**{
        '{}_lag_{}'.format(col, l): tx_shifts.groupby(['id'])[col].transform(lambda x: x.shift(l))
        for l in lags
        for col in [TARGET]
    })

In [7]:
wi_shifts = train_wi[['id','d',TARGET]]
wi_shifts = wi_shifts.assign(**{
        '{}_lag_{}'.format(col, l): wi_shifts.groupby(['id'])[col].transform(lambda x: x.shift(l))
        for l in lags
        for col in [TARGET]
    })

Now that we generated lag features, we can focus on creating smoothing/rolling features. Afterwards, we can focus on dealing with the NaNs created from these features.

In [10]:
'''Rolling averages with different time frames'''

ca_rolls = train_ca[['id','d','sales']]

for i in [14,30,60, 90,180]:
    print('Rolling period:', i)
    ca_rolls['rolling_mean_'+str(i)] = ca_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).mean())
    ca_rolls['rolling_std_'+str(i)]  = ca_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).std())
    ca_rolls['rolling_max_'+str(i)]  = ca_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).max())
    ca_rolls['rolling_min_'+str(i)]  = ca_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).min())

Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 90
Rolling period: 180


In [6]:
'''Rolling averages with different time frames'''

tx_rolls = train_tx[['id','d','sales']]

for i in [14,30,60, 90,180]:
    print('Rolling period:', i)
    tx_rolls['rolling_mean_'+str(i)] = tx_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).mean())
    tx_rolls['rolling_std_'+str(i)]  = tx_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).std())
    tx_rolls['rolling_max_'+str(i)]  = tx_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).max())
    tx_rolls['rolling_min_'+str(i)]  = tx_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).min())

Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 90
Rolling period: 180


In [6]:
'''Rolling averages with different time frames'''

wi_rolls = train_wi[['id','d','sales']]

for i in [14,30,60, 90,180]:
    print('Rolling period:', i)
    wi_rolls['rolling_mean_'+str(i)] = wi_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).mean())
    wi_rolls['rolling_std_'+str(i)]  = wi_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).std())
    wi_rolls['rolling_max_'+str(i)]  = wi_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).max())
    wi_rolls['rolling_min_'+str(i)]  = wi_rolls.groupby(['id'])[TARGET].transform(lambda x: x.shift(1).rolling(i).min())

Rolling period: 14
Rolling period: 30
Rolling period: 60
Rolling period: 90
Rolling period: 180


In [13]:
'''we can reduce memory by transforming our df to a sparse matrix and returning it back to a dense matrix. 
LGB is able to handle ths type of matrix.'''

'we can reduce memory by transforming our df to a sparse matrix and returning it back to a dense matrix. \nLGB is able to handle ths type of matrix.'