In [1]:
# Extracting the Zip File to Get Access to the Data
import zipfile
with zipfile.ZipFile("store-sales-time-series-forecasting.zip","r") as zip_loaded:
    zip_loaded.extractall("files/")

print("Extraction Complete")

Extraction Complete


In [2]:
# Importing and loading relevant libraries and packages
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt

**Previewing & exploring the files**

**Train data and complementary data**

In [3]:
train_data = pd.read_csv("files/train.csv")
train_data

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion
0,0,2013-01-01,1,AUTOMOTIVE,0.000,0
1,1,2013-01-01,1,BABY CARE,0.000,0
2,2,2013-01-01,1,BEAUTY,0.000,0
3,3,2013-01-01,1,BEVERAGES,0.000,0
4,4,2013-01-01,1,BOOKS,0.000,0
...,...,...,...,...,...,...
3000883,3000883,2017-08-15,9,POULTRY,438.133,0
3000884,3000884,2017-08-15,9,PREPARED FOODS,154.553,1
3000885,3000885,2017-08-15,9,PRODUCE,2419.729,148
3000886,3000886,2017-08-15,9,SCHOOL AND OFFICE SUPPLIES,121.000,8


In [4]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 6 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   date         object 
 2   store_nbr    int64  
 3   family       object 
 4   sales        float64
 5   onpromotion  int64  
dtypes: float64(1), int64(3), object(2)
memory usage: 137.4+ MB


In [5]:
train_data.nunique()

id             3000888
date              1684
store_nbr           54
family              33
sales           379610
onpromotion        362
dtype: int64

In [6]:
## Getting the  actual dates
actual_days = train_data["date"].unique()
actual_days

array(['2013-01-01', '2013-01-02', '2013-01-03', ..., '2017-08-13',
       '2017-08-14', '2017-08-15'], dtype=object)

In [7]:
# Converting the date column to datetime format
train_data["sales_date"] = pd.to_datetime(train_data["date"]).dt.date
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3000888 entries, 0 to 3000887
Data columns (total 7 columns):
 #   Column       Dtype  
---  ------       -----  
 0   id           int64  
 1   date         object 
 2   store_nbr    int64  
 3   family       object 
 4   sales        float64
 5   onpromotion  int64  
 6   sales_date   object 
dtypes: float64(1), int64(3), object(3)
memory usage: 160.3+ MB


In [8]:
# Checking if there are any missing dates
date_range = train_data.sales_date.min(), train_data.sales_date.max()
date_range

(datetime.date(2013, 1, 1), datetime.date(2017, 8, 15))

In [9]:
# Check completeness of dates
## Number of expected dates
expected_days = pd.date_range(start = train_data["sales_date"].min(), end = train_data["sales_date"].max())
expected_days

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06', '2013-01-07', '2013-01-08',
               '2013-01-09', '2013-01-10',
               ...
               '2017-08-06', '2017-08-07', '2017-08-08', '2017-08-09',
               '2017-08-10', '2017-08-11', '2017-08-12', '2017-08-13',
               '2017-08-14', '2017-08-15'],
              dtype='datetime64[ns]', length=1688, freq='D')

We note a difference of 4 days between the actual dates (1,684) and expected dates (1,688) within the range. As such we have to find the missing dates and add them to ensure completeness of the dates.

In [10]:
## Get missing dates
missing_dates = set(expected_days.date) - set(train_data["sales_date"].unique())
missing_dates

{datetime.date(2013, 12, 25),
 datetime.date(2014, 12, 25),
 datetime.date(2015, 12, 25),
 datetime.date(2016, 12, 25)}

In [11]:
expected_days_df = pd.DataFrame(expected_days, columns = ["sales_date"])
expected_days_df

Unnamed: 0,sales_date
0,2013-01-01
1,2013-01-02
2,2013-01-03
3,2013-01-04
4,2013-01-05
...,...
1683,2017-08-11
1684,2017-08-12
1685,2017-08-13
1686,2017-08-14


In [12]:
train_data["sales_date"] = pd.to_datetime(train_data["sales_date"]).dt.date
expected_days_df["sales_date"] = pd.to_datetime(expected_days_df["sales_date"]).dt.date

full_train = pd.merge(train_data, expected_days_df, how = "outer", on = "sales_date")
full_train

Unnamed: 0,id,date,store_nbr,family,sales,onpromotion,sales_date
0,0.0,2013-01-01,1.0,AUTOMOTIVE,0.0,0.0,2013-01-01
1,1.0,2013-01-01,1.0,BABY CARE,0.0,0.0,2013-01-01
2,2.0,2013-01-01,1.0,BEAUTY,0.0,0.0,2013-01-01
3,3.0,2013-01-01,1.0,BEVERAGES,0.0,0.0,2013-01-01
4,4.0,2013-01-01,1.0,BOOKS,0.0,0.0,2013-01-01
...,...,...,...,...,...,...,...
3000887,3000887.0,2017-08-15,9.0,SEAFOOD,16.0,0.0,2017-08-15
3000888,,,,,,,2013-12-25
3000889,,,,,,,2014-12-25
3000890,,,,,,,2015-12-25


In [13]:

stores_df = pd.DataFrame({"store_nbr":list(train_data["store_nbr"].unique())})
stores_df

Unnamed: 0,store_nbr
0,1
1,10
2,11
3,12
4,13
5,14
6,15
7,16
8,17
9,18


In [14]:
missing_datex = pd.DataFrame(missing_dates, columns = ["sales_date"])
missing_datex

Unnamed: 0,sales_date
0,2015-12-25
1,2014-12-25
2,2013-12-25
3,2016-12-25


In [15]:
family_df = pd.DataFrame({"family":list(train_data["family"].unique())})
family_df

Unnamed: 0,family
0,AUTOMOTIVE
1,BABY CARE
2,BEAUTY
3,BEVERAGES
4,BOOKS
5,BREAD/BAKERY
6,CELEBRATION
7,CLEANING
8,DAIRY
9,DELI


Since we're predicting the sales for each store, it means we have to fill in the missing dates for each store.

In [18]:
# Putting the missing dates down for each store
comp_train = pd.merge(stores_df.assign(key = 1),
                     missing_datex.assign(key = 1),
                     on = "key").drop("key", axis = 1)
comp_train

Unnamed: 0,store_nbr,sales_date
0,1,2015-12-25
1,1,2014-12-25
2,1,2013-12-25
3,1,2016-12-25
4,10,2015-12-25
...,...,...
211,8,2016-12-25
212,9,2015-12-25
213,9,2014-12-25
214,9,2013-12-25


In [None]:
refilled_train = pd.merge(train_data, comp_train, how = "outer", on = "sales_date")
refilled_train

In [None]:
refilled_train["date"].fillna(refilled_train["sales_date"], inplace = True)
refilled_train["store_nbr_x"].fillna(refilled_train["store_nbr_y"], inplace = True)
refilled_train["sales"].fillna(0, inplace = True)
refilled_train = refilled_train.drop("store_nbr_y", axis = 1)
refilled_train.rename(columns = {"store_nbr_x":"store_nbr"}, inplace = True)

In [None]:
refilled_train.sort_values(by = ["store_nbr", "sales_date", "id"], ignore_index = True, inplace = True)

refilled_train

**Transactions data**

In [None]:
transactions = pd.read_csv("files/transactions.csv")
transactions

**Holidays and events data**

In [None]:
holidays_events = pd.read_csv("files/holidays_events.csv")
holidays_events

**Oil data**

In [None]:
oil_data = pd.read_csv("files/oil.csv")
oil_data

**Stores data**

In [None]:
stores_data = pd.read_csv("files/stores.csv")
stores_data

**Test data**

In [None]:
test_data = pd.read_csv("files/test.csv")
test_data

**Sample Submission**

In [None]:
sample_submission = pd.read_csv("files/sample_submission.csv")
sample_submission