In [1]:
#data manipulation packages
import pandas as pd
import numpy as np
import scipy
import re

#visualization tools
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from itertools import cycle

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)

In [2]:
sales = pd.read_csv('sales_train_validation.csv')
cal = pd.read_csv('calendar.csv', parse_dates = ['date'])
sales.shape

(30490, 1919)

In this notebook, we will address the large gaps within each item's time series by using Gausian Process Regression. After handling this, we will then address the intermittent demand we see in many of the series. I will explain that process after first handling the missing data.

Below, we are indexing on where we observe the large 0 gaps. if the occur in the beginning, we will reverse our data to train the missing data. If the gaps occur in the middle and end, then we will model the data in its regular order.

In [19]:
'''As observed above, some of the items, for one reason or another, have limited history throughout the time series.
The following script will find the items that have limited history, the amount of consecutive zeros within the time
series, and where exactly they occur in the series (i.e. 'beginning', 'middle', 'end').
'''

import itertools

limited_items = {}
d_cols = [col for col in sales.columns if 'd_' in col]

for item_id in sales['id']:
    df = sales.loc[sales['id'] == item_id][d_cols].T
    df = df.rename(columns={sales.index[sales['id']==item_id].to_list()[0]:item_id}) # Name it correctly
    df = df.reset_index().rename(columns={'index': 'd'}) # make the index "d"
    df = df.merge(cal, how='left', validate='1:1')
    
    rolled = np.asarray(df.iloc[:,1].astype(int))
    
    zero_consec, zero_count = [0], 0
    full_series = []
    
    condition = np.where(rolled==0,'true','false')
    zero_groups = [ sum( 1 for _ in group ) for key, group in itertools.groupby( condition ) if key ]
    zero_gap = (zero_groups.index(max(zero_groups))/len(zero_groups))*100

    if zero_gap<35:
        zero_location = 'beginning'
    elif zero_gap <= 35 or zero_gap<=75:
        zero_location = 'middle'
    else:
        zero_location = 'end'

    for val in range(len(rolled)):
        condition = val == 0
        if rolled[val]==0:
            zero_count+=1
        else:
            if zero_count>zero_consec[0]:
                zero_consec[0] = zero_count
                zero_ind = val-zero_count
            else:
                zero_count = 0
    if zero_consec[0]>100:
        limited_items[item_id] = zero_consec[0], zero_location, zero_ind
    else:
        continue

In [23]:
##first 10 items in limited_items

from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

first_ten = take(10, limited_items.items())
first_ten

[('HOBBIES_1_001_CA_1_validation', (916, 'beginning', 2)),
 ('HOBBIES_1_002_CA_1_validation', (150, 'beginning', 2)),
 ('HOBBIES_1_003_CA_1_validation', (1109, 'beginning', 3)),
 ('HOBBIES_1_005_CA_1_validation', (112, 'beginning', 0)),
 ('HOBBIES_1_006_CA_1_validation', (429, 'beginning', 0)),
 ('HOBBIES_1_007_CA_1_validation', (530, 'beginning', 0)),
 ('HOBBIES_1_008_CA_1_validation', (185, 'middle', 543)),
 ('HOBBIES_1_009_CA_1_validation', (143, 'end', 1644)),
 ('HOBBIES_1_010_CA_1_validation', (110, 'beginning', 4)),
 ('HOBBIES_1_011_CA_1_validation', (557, 'beginning', 7))]

We can see that we've grabbed all of the items that have more than 100 consecutive days without selling an item. Additionally, we've grabbed how large the zero gap is, where in the time series it occured, and at what index the gap begins. This way, we can make a train and test set for a GPR model.

I think the best way to optimize parameters for the model is to aggregate the mean values for each category in a rolling window and do a Bayesian Opt. Hopefully the avg will represent the overall noise of the items within the category and, from there, we will model each item to fill these zero gaps.