In [1]:
# Unpack the competition archive so its files are available under files/
import zipfile

with zipfile.ZipFile(file="store-sales-time-series-forecasting.zip", mode="r") as zip_loaded:
    zip_loaded.extractall(path="files/")

print("Extraction Complete.")

Extraction Complete.


In [2]:
# Importing and loading relevant libraries and packages
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from itertools import *

import warnings

# Warnings are deliberately left visible so that deprecation notices from
# pandas are not missed. To silence them, call
# warnings.filterwarnings("ignore") here and update the message below.

# The status message reports only what this cell actually did: no warning
# filter is installed, so it no longer claims that warnings are hidden.
print("Loading complete.")



**Previewing & exploring the files**

**Train data and complementary data**

In [3]:
# Load the training set from files/train.csv and display it
train_data = pd.read_csv("files/train.csv")
train_data

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [4]:
# Summarise the columns, dtypes and memory usage of the train data
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   date         object 
 2   store_nbr    int64  
 3   family       object 
 4   sales        float64
 5   onpromotion  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 137.4+ MB


In [5]:
# Count the distinct values in each column of the train data
train_data.nunique()

id             3000888
date              1684
store_nbr           54
family              33
sales           379610
onpromotion        362
dtype: int64

In [33]:
# Setting all floats to display with 2 decimal places.
# "%.2f" is a complete printf-style specifier (precision 2, conversion "f").
# The earlier "%.27" had no conversion character, so formatting any float
# raised "ValueError: incomplete format" whenever a frame was rendered.
pd.set_option("display.float_format", lambda x: "%.2f" % x)

In [6]:
## Getting the distinct date strings actually present in the train data
actual_days = train_data["date"].unique()
actual_days

array(['2013-01-01', '2013-01-02', '2013-01-03', ..., '2017-08-13',
       '2017-08-14', '2017-08-15'], dtype=object)

In [7]:
# Deriving a "sales_date" column of Python date objects from the "date" strings.
# .dt.date yields datetime.date values, so the new column has object dtype
# rather than datetime64; the original "date" column is left unchanged.
train_data["sales_date"] = pd.to_datetime(train_data["date"]).dt.date
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   date         object 
 2   store_nbr    int64  
 3   family       object 
 4   sales        float64
 5   onpromotion  int64  
 6   sales_date   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 160.3+ MB


In [8]:
# First and last sales dates covered by the train data
date_range = (train_data["sales_date"].min(), train_data["sales_date"].max())
date_range

(datetime.date(2013, 1, 1), datetime.date(2017, 8, 15))

In [9]:
# Completeness check, step 1: build every calendar day between the first
# and the last sales date, so it can be compared with the dates present.
expected_days = pd.date_range(
    start=train_data["sales_date"].min(),
    end=train_data["sales_date"].max(),
)
expected_days

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10',
               ...
               '2017-08-06', '2017-08-07', '2017-08-08', '2017-08-09',
               '2017-08-10', '2017-08-11', '2017-08-12', '2017-08-13',
               '2017-08-14', '2017-08-15'],
              dtype='datetime64[ns]', length=1688, freq='D')

We note a difference of 4 days between the actual dates (1,684) and expected dates (1,688) within the range. As such we have to find the missing dates and add them to ensure completeness of the dates.

In [10]:
## Expected calendar days that never occur in the train data
missing_dates = set(expected_days.date).difference(train_data["sales_date"].unique())
missing_dates

{datetime.date(2013, 12, 25),
 datetime.date(2014, 12, 25),
 datetime.date(2015, 12, 25),
 datetime.date(2016, 12, 25)}

In [11]:
# Getting the distinct store numbers in the train data.
# unique() returns values in order of first appearance, not sorted.
unique_stores = train_data["store_nbr"].unique()
unique_stores

array([ 1, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,  2, 20, 21, 22, 23, 24,
       25, 26, 27, 28, 29,  3, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,  4,
       40, 41, 42, 43, 44, 45, 46, 47, 48, 49,  5, 50, 51, 52, 53, 54,  6,
        7,  8,  9], dtype=int64)

In [12]:
# Getting the distinct product families in the train data
unique_families = train_data["family"].unique()
unique_families

array(['AUTOMOTIVE', 'BABY CARE', 'BEAUTY', 'BEVERAGES', 'BOOKS',
       'BREAD/BAKERY', 'CELEBRATION', 'CLEANING', 'DAIRY', 'DELI', 'EGGS',
       'FROZEN FOODS', 'GROCERY I', 'GROCERY II', 'HARDWARE',
       'HOME AND KITCHEN I', 'HOME AND KITCHEN II', 'HOME APPLIANCES',
       'HOME CARE', 'LADIESWEAR', 'LAWN AND GARDEN', 'LINGERIE',
       'LIQUOR,WINE,BEER', 'MAGAZINES', 'MEATS', 'PERSONAL CARE',
       'PET SUPPLIES', 'PLAYERS AND ELECTRONICS', 'POULTRY',
       'PREPARED FOODS', 'PRODUCE', 'SCHOOL AND OFFICE SUPPLIES',
       'SEAFOOD'], dtype=object)

Since we're predicting the sales of each product family in each store, we have to fill in the missing dates for every store–family combination. We will do this with the _product_ function from the _itertools_ module.

In [13]:
# One placeholder row for every (missing date, store, family) combination
missing_data = [
    combo for combo in product(missing_dates, unique_stores, unique_families)
]
train_addon = pd.DataFrame(
    data=missing_data,
    columns=["sales_date", "store_nbr", "family"],
)
train_addon

Unnamed: 0,sales_date,store_nbr,family
0,2016-12-25,1,AUTOMOTIVE
1,2016-12-25,1,BABY CARE
2,2016-12-25,1,BEAUTY
3,2016-12-25,1,BEVERAGES
4,2016-12-25,1,BOOKS
...,...,...,...
7123,2015-12-25,9,POULTRY
7124,2015-12-25,9,PREPARED FOODS
7125,2015-12-25,9,PRODUCE
7126,2015-12-25,9,SCHOOL AND OFFICE SUPPLIES


In [14]:
# Append the placeholder rows to the train data with a fresh 0..n-1 index.
# The appended rows have no id, date, sales or onpromotion values, so those
# columns contain NaN afterwards.
# NOTE: this cell rebinds train_data to the concatenated frame, so running it
# again appends the placeholder rows a second time.
train_data = pd.concat([train_data, train_addon], ignore_index=True)
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3008016 entries, 0 to 3008015
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           float64
 1   date         object 
 2   store_nbr    int64  
 3   family       object 
 4   sales        float64
 5   onpromotion  float64
 6   sales_date   object 
dtypes: float64(3), int64(1), object(3)
memory usage: 160.6+ MB


In [15]:
# Display the combined frame to inspect the newly appended rows
train_data

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,sales_date
0,0.0,2013-01-01,1,AUTOMOTIVE,0.0,0.0,2013-01-01
1,1.0,2013-01-01,1,BABY CARE,0.0,0.0,2013-01-01
2,2.0,2013-01-01,1,BEAUTY,0.0,0.0,2013-01-01
3,3.0,2013-01-01,1,BEVERAGES,0.0,0.0,2013-01-01
4,4.0,2013-01-01,1,BOOKS,0.0,0.0,2013-01-01
...,...,...,...,...,...,...,...
3008011,,,9,POULTRY,,,2015-12-25
3008012,,,9,PREPARED FOODS,,,2015-12-25
3008013,,,9,PRODUCE,,,2015-12-25
3008014,,,9,SCHOOL AND OFFICE SUPPLIES,,,2015-12-25


- With December 25 omitted from each of the years, I assume that it was deliberate - most likely because all shops are closed on December 25 each year. In effect, no items would have been on promotion and no sales would have been made; that is to say that it is safe to fill the null "sales" and "onpromotion" column data with 0.

- I am also dropping the "id" column, as it will not be relevant to subsequent analyses and modelling.

- I will be filling the missing values in the original "date" column with the corresponding "sales_date" values, for aesthetic purposes only.

In [16]:
# Filling missing rows in the train data and dropping "id" column.
#
# Each column is filled by assigning the result back to the frame. The earlier
# form, train_data[col].fillna(..., inplace=True), is chained in-place
# assignment on a selected column: pandas 2.x flags it with a FutureWarning,
# and under Copy-on-Write it leaves train_data unchanged.
train_data = train_data.drop(columns="id")

# The values written into "date" come from "sales_date" and are therefore date
# objects, while the pre-existing "date" entries are strings.
train_data["date"] = train_data["date"].fillna(train_data["sales_date"])

# Placeholder rows get 0 for both sales and onpromotion.
train_data["sales"] = train_data["sales"].fillna(0)
train_data["onpromotion"] = train_data["onpromotion"].fillna(0)
train_data

Unnamed: 0,date,store_nbr,family,sales,onpromotion,sales_date
0,2013-01-01,1,AUTOMOTIVE,0.0,0.0,2013-01-01
1,2013-01-01,1,BABY CARE,0.0,0.0,2013-01-01
2,2013-01-01,1,BEAUTY,0.0,0.0,2013-01-01
3,2013-01-01,1,BEVERAGES,0.0,0.0,2013-01-01
4,2013-01-01,1,BOOKS,0.0,0.0,2013-01-01
...,...,...,...,...,...,...
3008011,2015-12-25,9,POULTRY,0.0,0.0,2015-12-25
3008012,2015-12-25,9,PREPARED FOODS,0.0,0.0,2015-12-25
3008013,2015-12-25,9,PRODUCE,0.0,0.0,2015-12-25
3008014,2015-12-25,9,SCHOOL AND OFFICE SUPPLIES,0.0,0.0,2015-12-25


**Transactions data**

In [17]:
# Load the transactions data from files/transactions.csv and display it
transactions = pd.read_csv("files/transactions.csv")
transactions

Unnamed: 0,date,store_nbr,transactions
0,2013-01-01,25,770
1,2013-01-02,1,2111
2,2013-01-02,2,2358
3,2013-01-02,3,3487
4,2013-01-02,4,1922
...,...,...,...
83483,2017-08-15,50,2804
83484,2017-08-15,51,1573
83485,2017-08-15,52,2255
83486,2017-08-15,53,932


In [18]:
# Viewing the columns, non-null counts, dtypes and memory usage of the transactions data
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83488 entries, 0 to 83487
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   date          83488 non-null  object
 1   store_nbr     83488 non-null  int64 
 2   transactions  83488 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.9+ MB


In [19]:
# Count the distinct values in each column of the transactions data
transactions.nunique()

date            1682
store_nbr         54
transactions    4993
dtype: int64

- Since the train data has the same number of unique stores as the transactions data, we can use the unique stores variable defined earlier to fill in the missing dates.
- Also, given that the transactions and train data cover the same period, it is concerning that the transactions data has even fewer unique dates than the train data. As such, we have to find and impute the missing dates, as was done for the train data.

In [20]:
# Derive a "sales_date" column of Python date objects from the "date" strings,
# so it can be compared with the dates in expected_days.date below
transactions["sales_date"] = pd.to_datetime(transactions["date"]).dt.date

In [21]:
# Expected calendar days that never occur in the transactions data
missing_txn_dates = set(expected_days.date).difference(transactions["sales_date"].unique())
missing_txn_dates

{datetime.date(2013, 12, 25),
 datetime.date(2014, 12, 25),
 datetime.date(2015, 12, 25),
 datetime.date(2016, 1, 1),
 datetime.date(2016, 1, 3),
 datetime.date(2016, 12, 25)}

In [22]:
# One placeholder row for every (missing date, store) combination
missing_txn_data = [
    combo for combo in product(missing_txn_dates, unique_stores)
]
txn_data_addon = pd.DataFrame(
    data=missing_txn_data,
    columns=["sales_date", "store_nbr"],
)
txn_data_addon

Unnamed: 0,sales_date,store_nbr
0,2016-12-25,1
1,2016-12-25,10
2,2016-12-25,11
3,2016-12-25,12
4,2016-12-25,13
...,...,...
319,2016-01-03,54
320,2016-01-03,6
321,2016-01-03,7
322,2016-01-03,8


In [23]:
# Display the transactions data, now including the derived "sales_date" column
transactions

Unnamed: 0,date,store_nbr,transactions,sales_date
0,2013-01-01,25,770,2013-01-01
1,2013-01-02,1,2111,2013-01-02
2,2013-01-02,2,2358,2013-01-02
3,2013-01-02,3,3487,2013-01-02
4,2013-01-02,4,1922,2013-01-02
...,...,...,...,...
83483,2017-08-15,50,2804,2017-08-15
83484,2017-08-15,51,1573,2017-08-15
83485,2017-08-15,52,2255,2017-08-15
83486,2017-08-15,53,932,2017-08-15


In [24]:
# Adding the data for the missing transaction dates to the main transaction data and filling nulls with 0.
#
# The fill is assigned back to the column. The earlier form,
# transactions["transactions"].fillna(0, inplace=True), is chained in-place
# assignment on a selected column: pandas 2.x flags it with a FutureWarning,
# and under Copy-on-Write it leaves the frame unchanged.
transactions = pd.concat([transactions, txn_data_addon], ignore_index=True)

# "sales_date" carries the date for every row, including the appended ones,
# so the original string "date" column is dropped.
transactions = transactions.drop(columns="date")
transactions["transactions"] = transactions["transactions"].fillna(0)
transactions

Unnamed: 0,store_nbr,transactions,sales_date
0,25,770.0,2013-01-01
1,1,2111.0,2013-01-02
2,2,2358.0,2013-01-02
3,3,3487.0,2013-01-02
4,4,1922.0,2013-01-02
...,...,...,...
83807,54,0.0,2016-01-03
83808,6,0.0,2016-01-03
83809,7,0.0,2016-01-03
83810,8,0.0,2016-01-03


In [25]:
# Confirm that no nulls remain in the transactions data after the fill
transactions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83812 entries, 0 to 83811
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   store_nbr     83812 non-null  int64  
 1   transactions  83812 non-null  float64
 2   sales_date    83812 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 1.9+ MB


**Holidays and events data**

In [26]:
# Load the holidays and events data from files/holidays_events.csv and display it
holidays_events = pd.read_csv("files/holidays_events.csv")
holidays_events

Unnamed: 0,date,type,locale,locale_name,description,transferred
0,2012-03-02,Holiday,Local,Manta,Fundacion de Manta,False
1,2012-04-01,Holiday,Regional,Cotopaxi,Provincializacion de Cotopaxi,False
2,2012-04-12,Holiday,Local,Cuenca,Fundacion de Cuenca,False
3,2012-04-14,Holiday,Local,Libertad,Cantonizacion de Libertad,False
4,2012-04-21,Holiday,Local,Riobamba,Cantonizacion de Riobamba,False
...,...,...,...,...,...,...
345,2017-12-22,Additional,National,Ecuador,Navidad-3,False
346,2017-12-23,Additional,National,Ecuador,Navidad-2,False
347,2017-12-24,Additional,National,Ecuador,Navidad-1,False
348,2017-12-25,Holiday,National,Ecuador,Navidad,False


In [34]:
# Summarise the columns, non-null counts and dtypes of the holidays and events data
holidays_events.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 350 entries, 0 to 349
Data columns (total 6 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   date         350 non-null    object
 1   type         350 non-null    object
 2   locale       350 non-null    object
 3   locale_name  350 non-null    object
 4   description  350 non-null    object
 5   transferred  350 non-null    bool  
dtypes: bool(1), object(5)
memory usage: 14.1+ KB


In [27]:
# Count the distinct values in each column of the holidays and events data
holidays_events.nunique()

date           312
type             6
locale           3
locale_name     24
description    103
transferred      2
dtype: int64

**Oil data**

In [29]:
# Load the oil price data from files/oil.csv and display it
oil_data = pd.read_csv("files/oil.csv")
oil_data

Unnamed: 0,date,dcoilwtico
0,2013-01-01,
1,2013-01-02,93.14
2,2013-01-03,92.97
3,2013-01-04,93.12
4,2013-01-07,93.20
...,...,...
1213,2017-08-25,47.65
1214,2017-08-28,46.40
1215,2017-08-29,46.46
1216,2017-08-30,45.96


**Stores data**

In [30]:
# Load the store metadata from files/stores.csv and display it
stores_data = pd.read_csv("files/stores.csv")
stores_data

Unnamed: 0,store_nbr,city,state,type,cluster
0,1,Quito,Pichincha,D,13
1,2,Quito,Pichincha,D,13
2,3,Quito,Pichincha,D,8
3,4,Quito,Pichincha,D,9
4,5,Santo Domingo,Santo Domingo de los Tsachilas,D,4
5,6,Quito,Pichincha,D,13
6,7,Quito,Pichincha,D,8
7,8,Quito,Pichincha,D,8
8,9,Quito,Pichincha,B,6
9,10,Quito,Pichincha,C,15


**Test data**

In [31]:
# Load the test set from files/test.csv and display it
test_data = pd.read_csv("files/test.csv")
test_data

Unnamed: 0,id,date,store_nbr,family,onpromotion
0,3000888,2017-08-16,1,AUTOMOTIVE,0
1,3000889,2017-08-16,1,BABY CARE,0
2,3000890,2017-08-16,1,BEAUTY,2
3,3000891,2017-08-16,1,BEVERAGES,20
4,3000892,2017-08-16,1,BOOKS,0
...,...,...,...,...,...
28507,3029395,2017-08-31,9,POULTRY,1
28508,3029396,2017-08-31,9,PREPARED FOODS,0
28509,3029397,2017-08-31,9,PRODUCE,1
28510,3029398,2017-08-31,9,SCHOOL AND OFFICE SUPPLIES,9


**Sample Submission**

In [32]:
# Load the sample submission from files/sample_submission.csv and display it
sample_submission = pd.read_csv("files/sample_submission.csv")
sample_submission

Unnamed: 0,id,sales
0,3000888,0.0
1,3000889,0.0
2,3000890,0.0
3,3000891,0.0
4,3000892,0.0
...,...,...
28507,3029395,0.0
28508,3029396,0.0
28509,3029397,0.0
28510,3029398,0.0
