In [1]:
#data manipulation packages
import pandas as pd
import numpy as np
import scipy
import re
import os, sys, gc, time, warnings, pickle, psutil, random


#visualization tools
%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
from itertools import cycle

import warnings
warnings.filterwarnings('ignore')

pd.set_option('display.max_columns', 100)
pd.set_option('display.max_rows', 400)

In [2]:
sales = pd.read_csv('sales_train_validation.csv')
cal = pd.read_csv('calendar.csv', parse_dates = ['date'])
sales.shape

(30490, 1919)

In this notebook, we will address the large gaps within each item's time series by using Gausian Process Regression. After handling this, we will then address the intermittent demand we see in many of the series. I will explain that process after first handling the missing data.

Below, we are indexing on where we observe the large 0 gaps. if the occur in the beginning, we will reverse our data to train the missing data. If the gaps occur in the middle and end, then we will model the data in its regular order.

In [3]:
'''As observed above, some of the items, for one reason or another, have limited history throughout the time series.
The following script will find the items that have limited history, the amount of consecutive zeros within the time
series, and where exactly they occur in the series (i.e. 'beginning', 'middle', 'end').
'''

import itertools

limited_items = {}
d_cols = [col for col in sales.columns if 'd_' in col]

for item_id in sales['id']:
    df = sales.loc[sales['id'] == item_id][d_cols].T
    df = df.rename(columns={sales.index[sales['id']==item_id].to_list()[0]:item_id}) # Name it correctly
    df = df.reset_index().rename(columns={'index': 'd'}) # make the index "d"
    df = df.merge(cal, how='left', validate='1:1')
    
    rolled = np.asarray(df.iloc[:,1].astype(int))
    
    all_gaps, zero_consec, zero_count, one_per = [], [0], 0, 0
    full_series = []
    
    condition = np.where(rolled==0,'true','false')
    zero_groups = [ sum( 1 for _ in group ) for key, group in itertools.groupby( condition ) if key ]
    zero_gap = (zero_groups.index(max(zero_groups))/len(zero_groups))*100

    if zero_gap<35:
        zero_location = 'beginning'
    elif zero_gap <= 35 or zero_gap<=75:
        zero_location = 'middle'
    else:
        zero_location = 'end'

    for val in range(len(rolled)):
        condition = val == 0
        if rolled[val]==0:
            zero_count+=1
        else:
            if zero_count>zero_consec[0]:
                zero_consec[0] = zero_count
                zero_ind = val-zero_count
            elif (zero_count>=70) and (one_per==0):
                all_gaps.append(zero_count)
                one_per = 1
            else:
                zero_count = 0
                one_per = 0
    if zero_consec[0]>100:
        limited_items[item_id] = zero_consec[0], zero_location, zero_ind, len(all_gaps)
    else:
        continue

In [28]:
##first 10 items in limited_items
from itertools import islice

def take(n, iterable):
    "Return first n items of the iterable as a list"
    return list(islice(iterable, n))

first_ten = take(10, limited_items.items())
first_ten

[('HOBBIES_1_001_CA_1_validation', (921, 'beginning', 4, 1)),
 ('HOBBIES_1_002_CA_1_validation', (186, 'beginning', 10, 1)),
 ('HOBBIES_1_003_CA_1_validation', (1109, 'beginning', 3, 2)),
 ('HOBBIES_1_005_CA_1_validation', (113, 'beginning', 2, 1)),
 ('HOBBIES_1_006_CA_1_validation', (432, 'beginning', 2, 2)),
 ('HOBBIES_1_007_CA_1_validation', (657, 'beginning', 23, 1)),
 ('HOBBIES_1_008_CA_1_validation', (185, 'middle', 543, 1)),
 ('HOBBIES_1_009_CA_1_validation', (237, 'end', 1651, 1)),
 ('HOBBIES_1_010_CA_1_validation', (110, 'beginning', 4, 1)),
 ('HOBBIES_1_011_CA_1_validation', (557, 'beginning', 7, 2))]

In [55]:
from collections import Counter 
gap_counter = [i[3] for i in limited_items.values()]
print(Counter(gap_counter))

Counter({1: 12008, 2: 6382, 3: 3012, 4: 1820, 5: 788, 6: 334, 7: 88, 0: 77, 8: 30, 9: 14})


In [None]:
#why would items have there gap in the beginning but not on day zero?

#consult prices release date with zero-gap location.

#create categorical of zero gaps placement.

#create numerical of  number of zero gaps per item.

We can see that we've grabbed all of the items that have more than 100 consecutive days without selling an item. Additionally, we've grabbed how large the zero gap is, where in the time series it occured, and at what index the gap begins. Let's input these as features and have CV tell us if they are actually useful.

In [3]:
grid = pd.read_pickle('grid_df.pkl')
grid.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,d,sales,release,wm_yr_wk
0,HOBBIES_1_008_CA_1_validation,HOBBIES_1_008,HOBBIES_1,HOBBIES,CA_1,CA,d_1,12.0,0,11101
1,HOBBIES_1_009_CA_1_validation,HOBBIES_1_009,HOBBIES_1,HOBBIES,CA_1,CA,d_1,2.0,0,11101
2,HOBBIES_1_010_CA_1_validation,HOBBIES_1_010,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,0,11101
3,HOBBIES_1_012_CA_1_validation,HOBBIES_1_012,HOBBIES_1,HOBBIES,CA_1,CA,d_1,0.0,0,11101
4,HOBBIES_1_015_CA_1_validation,HOBBIES_1_015,HOBBIES_1,HOBBIES,CA_1,CA,d_1,4.0,0,11101


In [4]:
grid['day'] = grid['d'].str.replace('d_','').astype(int)
grid = grid.loc[grid['day']>1200]
grid.shape

(22116579, 11)

In [74]:
for key, value in limited_items.items():
    if key in grid['id'].values:
        grid['gap_size'] = value[0]
        grid['gap_cat'] = value[1]
        grid['gap_start'] = value[2]
        grid['number_of_gaps'] = value[3]

KeyboardInterrupt: 

In [78]:
grid.to_pickle('grid_gaps.pkl')

In [47]:
# from croston import croston
# a = np.zeros(50) 
# val = np.array(random.sample(range(100,200), 10)) 
# idxs = random.sample(range(50), 10)

# ts = np.insert(a, idxs, val)

# fit_pred = croston.fit_croston(ts, 10)
# # len(fit_pred)

# yhat = np.concatenate([fit_pred['croston_fittedvalues'], fit_pred['croston_forecast']])
# print(len(fit_pred['croston_fittedvalues']))
# print(len(fit_pred['croston_forecast']))
# # plt.plot(ts)
# # plt.plot(yhat)

60
10


In [21]:
item = 'FOODS_3_090_CA_3_validation'
print(len(grid.loc[grid['id']==item]['sales']))
print(len(grid.loc[grid['id']==item]))

741
741


In [5]:
from croston import croston
crost = {}
epic = True
while epic ==True:
    for item in grid['id'].unique().to_list():
        fit_pred = pd.Series(croston.fit_croston(np.asarray(grid.loc[grid['id']==item, 'sales'].fillna(0)), 0))
        yhat = pd.Series(fit_pred['croston_fittedvalues'])
        crost[item] = yhat
#         idx = list(grid.loc[grid['id']==item].index)
#         grid.loc[idx]['croston'] = yhat
    epic=False

index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0
index 0 is out of bounds for axis 0 with size 0


In [6]:
crosts = pd.DataFrame(crost)
crosts.head()

Unnamed: 0,HOBBIES_1_001_CA_1_validation,HOBBIES_1_002_CA_1_validation,HOBBIES_1_003_CA_1_validation,HOBBIES_1_004_CA_1_validation,HOBBIES_1_005_CA_1_validation,HOBBIES_1_006_CA_1_validation,HOBBIES_1_007_CA_1_validation,HOBBIES_1_008_CA_1_validation,HOBBIES_1_009_CA_1_validation,HOBBIES_1_010_CA_1_validation,HOBBIES_1_011_CA_1_validation,HOBBIES_1_012_CA_1_validation,HOBBIES_1_013_CA_1_validation,HOBBIES_1_014_CA_1_validation,HOBBIES_1_015_CA_1_validation,HOBBIES_1_016_CA_1_validation,HOBBIES_1_017_CA_1_validation,HOBBIES_1_018_CA_1_validation,HOBBIES_1_019_CA_1_validation,HOBBIES_1_020_CA_1_validation,HOBBIES_1_021_CA_1_validation,HOBBIES_1_022_CA_1_validation,HOBBIES_1_023_CA_1_validation,HOBBIES_1_024_CA_1_validation,HOBBIES_1_025_CA_1_validation,HOBBIES_1_026_CA_1_validation,HOBBIES_1_027_CA_1_validation,HOBBIES_1_028_CA_1_validation,HOBBIES_1_029_CA_1_validation,HOBBIES_1_030_CA_1_validation,HOBBIES_1_031_CA_1_validation,HOBBIES_1_032_CA_1_validation,HOBBIES_1_033_CA_1_validation,HOBBIES_1_034_CA_1_validation,HOBBIES_1_035_CA_1_validation,HOBBIES_1_036_CA_1_validation,HOBBIES_1_037_CA_1_validation,HOBBIES_1_038_CA_1_validation,HOBBIES_1_039_CA_1_validation,HOBBIES_1_040_CA_1_validation,HOBBIES_1_041_CA_1_validation,HOBBIES_1_042_CA_1_validation,HOBBIES_1_043_CA_1_validation,HOBBIES_1_044_CA_1_validation,HOBBIES_1_047_CA_1_validation,HOBBIES_1_048_CA_1_validation,HOBBIES_1_049_CA_1_validation,HOBBIES_1_050_CA_1_validation,HOBBIES_1_051_CA_1_validation,HOBBIES_1_052_CA_1_validation,...,FOODS_2_029_CA_2_validation,FOODS_2_140_CA_2_validation,FOODS_2_206_CA_2_validation,FOODS_2_302_CA_2_validation,FOODS_2_334_CA_2_validation,FOODS_2_379_CA_2_validation,HOUSEHOLD_2_342_CA_4_validation,FOODS_2_379_CA_4_validation,FOODS_2_379_TX_1_validation,FOODS_2_379_TX_3_validation,HOBBIES_1_170_CA_4_validation,FOODS_2_147_TX_3_validation,FOODS_2_045_CA_2_validation,FOODS_2_088_CA_2_validation,FOODS_2_102_CA_2_validation,FOODS_2_112_CA_2_validation,FOODS_2_184_CA_2_validation,FOODS_2_250_CA_2_validation,FOODS_2_379_TX_2_validation,FOODS_2_259_TX_3_validation,FOODS_2_310_TX_3_validation,FOODS_2_193_CA_2_validation,FOODS_2_248_CA_4_validation,FOODS_2_117_TX_1_validation,FOODS_2_117_TX_2_validation,FOODS_2_209_TX_3_validation,FOODS_2_117_CA_1_validation,FOODS_2_209_CA_1_validation,FOODS_2_117_CA_2_validation,FOODS_2_209_CA_2_validation,FOODS_2_256_CA_2_validation,FOODS_2_209_CA_3_validation,FOODS_2_117_CA_4_validation,FOODS_2_256_CA_4_validation,HOUSEHOLD_1_183_CA_4_validation,FOODS_3_296_CA_1_validation,FOODS_3_296_CA_2_validation,FOODS_3_296_TX_2_validation,FOODS_3_296_WI_2_validation,HOUSEHOLD_1_512_CA_3_validation,FOODS_3_296_CA_4_validation,HOUSEHOLD_1_400_WI_2_validation,FOODS_3_595_CA_1_validation,HOUSEHOLD_1_311_CA_2_validation,HOUSEHOLD_1_405_CA_2_validation,HOUSEHOLD_1_278_CA_3_validation,FOODS_3_595_CA_3_validation,HOUSEHOLD_1_400_CA_4_validation,HOUSEHOLD_1_386_WI_1_validation,HOUSEHOLD_1_020_WI_2_validation
0,0.0,0.0,0.0,1.505618,1.191011,1.119718,0.251404,13.061797,0.0,0.453652,0.0,0.22426,0.0,2.119382,2.283708,0.0,0.0,0.0,10.283707,0.0,0.0,0.35543,0.0,0.0,0.0,0.0,0.677918,0.0,0.0,3.883427,0.0,0.0,0.0,0.40085,0.0,0.0,0.0,0.629213,0.0,0.0,0.0,1.588983,0.0,0.0,0.0,0.926966,0.0,0.0,0.196051,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.160494,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.48503,0.150602,0.962963,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.787879,0.0,0.0,0.0,1.119048,0.0,0.0,0.0,0.0,0.478873,0.0,0.0,2.408451,0.0
1,0.0,0.265537,0.0,1.866149,1.23455,1.119718,0.251404,6.643637,0.295615,0.488291,0.0,0.22426,0.0,2.519777,14.247412,0.0,0.0,0.0,4.42923,0.0,0.0,0.35543,0.0,0.0,0.0,0.0,0.701652,0.0,0.0,1.906386,0.0,0.0,0.0,0.40085,0.0,0.0,0.0,0.629213,0.0,0.0,0.211268,1.588983,0.0,0.0,0.0,2.587634,0.0,0.59691,0.196051,0.0,...,0.0,0.0,0.0,0.0,0.0,0.262295,0.0,0.0,0.0,0.430303,0.160494,0.0,0.0,0.319527,0.0,0.0,0.0,0.0,0.0,0.578824,0.150602,0.965872,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.428571,1.806871,1.217391,0.0,0.0,1.119048,0.0,0.0,2.323943,0.0,0.478873,0.0,0.0,2.380739,0.0
2,0.0,0.265537,0.0,1.866149,1.23455,1.119718,0.252208,6.643637,0.295615,0.488291,0.0,0.230496,0.0,2.261026,6.510153,6.707865,0.0,0.0,4.42923,0.0,1.004219,0.35543,0.0,0.0,0.0,0.0,0.701652,0.0,0.0,20.774349,0.247126,5.08146,0.274538,0.40085,0.0,1.612359,0.0,0.629213,0.0,0.0,0.211268,1.588983,4.947443,0.0,0.0,6.164538,0.0,1.46085,0.196051,0.0,...,0.0,0.0,0.0,0.0,0.0,0.262295,0.0,0.0,0.0,0.456514,0.160494,0.0,0.0,0.370776,0.0,0.0,0.0,0.0,0.290909,0.70609,0.150602,0.968759,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.454739,1.806871,1.217391,0.0,0.0,0.527676,0.0,0.0,2.433297,0.0,0.478873,0.0,0.0,2.285607,0.0
3,0.0,0.265537,0.0,1.982565,1.157069,0.934959,0.252208,6.027234,0.320109,0.553375,0.0,0.230496,0.0,2.125172,9.944031,6.707865,0.0,0.0,1.057253,0.0,1.004219,0.35543,0.0,0.0,0.0,0.0,0.701652,0.808989,0.744382,11.403045,0.247126,4.604245,0.274538,0.40085,0.0,1.453187,0.0,0.615518,0.0,0.0,0.211268,1.35245,4.947443,0.0,0.0,13.343096,0.0,1.46085,0.196051,0.0,...,0.0,0.0,0.0,0.0,0.0,0.297223,0.342697,0.0,0.0,0.482967,0.160494,0.0,0.0,0.425109,0.0,0.0,1.585799,0.0,0.290909,0.70609,0.150602,0.971589,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.529614,1.806871,1.217391,0.0,0.0,0.527676,0.421687,0.0,2.404526,2.197183,0.478873,0.647887,0.225352,2.195752,0.0
4,0.441011,0.270795,0.0,4.656746,1.14849,0.940861,0.253011,6.860279,0.320109,0.583793,0.0,0.236757,0.276685,2.125172,19.192322,5.647889,1.092697,0.0,3.610192,0.0,0.844261,0.35543,0.0,0.21831,0.0,0.0,0.69415,0.808989,1.478855,16.261142,0.247126,4.293534,0.274538,0.319011,0.0,1.453187,0.0,0.615518,0.0,0.0,0.211268,1.326925,2.229566,0.0,0.0,17.941095,0.0,1.083225,0.196051,0.0,...,0.0,0.0,0.0,0.0,0.0,0.297223,0.342697,0.0,0.0,0.482967,0.160494,0.0,0.0,0.425109,0.0,0.0,3.662752,0.0,0.290909,0.70609,0.150602,0.971589,0.0,0.0,0.594203,1.914893,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.207921,0.529614,1.754376,1.217391,1.406593,0.964286,0.50137,0.421687,1.901408,2.404526,1.0,0.478873,0.647887,0.225352,2.195752,1.257143


In [8]:
# cols = True
# while cols == True:
#     for item in crosts.columns:
#         grid.loc[grid['id']==item, 'crost'] = crosts[item]
#     cols = False

In [22]:
#creating Croston series through 'crost' df
crost_s = pd.concat([crosts[i] for i in crosts.columns], axis=0)
grid['croston'] = crost_s.values

In [50]:
crost_df = grid[['id', 'd', 'croston']]
crost_df.head()

Unnamed: 0,id,d,croston
24765098,HOBBIES_1_001_CA_1_validation,d_1201,0.0
24765099,HOBBIES_1_002_CA_1_validation,d_1201,0.0
24765100,HOBBIES_1_003_CA_1_validation,d_1201,0.0
24765101,HOBBIES_1_004_CA_1_validation,d_1201,0.0
24765102,HOBBIES_1_005_CA_1_validation,d_1201,0.441011


In [51]:
#exporting crost with only a few columns to merge with minimal memory
crost_df.to_pickle('crost.pkl')