In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [2]:
# Read the six CSV tables under ../data in one pass. The names on the left
# line up positionally with the paths listed below.
train, test, holidays, oil, stores, transactions = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/holidays_events.csv',
        '../data/oil.csv',
        '../data/stores.csv',
        '../data/transactions.csv',
    )
)

### Train

In [3]:
# Number of missing entries in each column of the training table.
train.isna().sum()

id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64

### Holidays

In [4]:
# Number of missing entries in each column of the holidays/events table.
holidays.isna().sum()

date           0
type           0
locale         0
locale_name    0
description    0
transferred    0
dtype: int64

### Oil

In [5]:
# Number of missing entries in each column of the oil price table.
oil.isna().sum()

date           0
dcoilwtico    43
dtype: int64

In [6]:
# Oil prices move little from one day to the next, so every missing quote is
# replaced by the next available one (backward fill).
# NOTE(review): a backward fill cannot fill gaps at the very end of the series;
# the count displayed below is what confirms that none remain.
oil['dcoilwtico'] = oil.loc[:, 'dcoilwtico'].bfill()
oil.isna().sum()

date          0
dcoilwtico    0
dtype: int64

### Stores

In [7]:
# Number of missing entries in each column of the stores table.
stores.isna().sum()

store_nbr    0
city         0
state        0
type         0
cluster      0
dtype: int64

### Transactions

In [8]:
# Number of missing entries in each column of the transactions table.
transactions.isna().sum()

date            0
store_nbr       0
transactions    0
dtype: int64

### Convert Dates to datetime

In [9]:
# Parse the 'date' column of every table that carries one (stores has no date
# column). Each frame is updated in place, so the names above keep pointing at
# the converted data.
for df in (train, test, holidays, oil, transactions):
    df['date'] = df['date'].pipe(pd.to_datetime)

### Check for nulls

In [10]:
# Print a per-column count of missing values for every table, each preceded by
# its banner line, in the same order the tables were loaded.
for header, frame in (
    ("=== Missing train values ===", train),
    ("=== Missing test values ===", test),
    ("=== Missing holidays values ===", holidays),
    ("=== Missing oil values ===", oil),
    ("=== Missing stores values ===", stores),
    ("=== Missing transaction values ===", transactions),
):
    print(header)
    print(frame.isna().sum())

=== Missing train values ===
id             0
date           0
store_nbr      0
family         0
sales          0
onpromotion    0
dtype: int64
=== Missing test values ===
id             0
date           0
store_nbr      0
family         0
onpromotion    0
dtype: int64
=== Missing holidays values ===
date           0
type           0
locale         0
locale_name    0
description    0
transferred    0
dtype: int64
=== Missing oil values ===
date          0
dcoilwtico    0
dtype: int64
=== Missing stores values ===
store_nbr    0
city         0
state        0
type         0
cluster      0
dtype: int64
=== Missing transaction values ===
date            0
store_nbr       0
transactions    0
dtype: int64


### Merge store data into train and test

In [11]:
# Attach the store attributes (city, state, type, cluster) to every train and
# test row via store_nbr.
# - how='left' keeps every train/test row, even if a store were missing from
#   the stores table.
# - validate='many_to_one' makes pandas raise a MergeError if store_nbr is
#   duplicated in stores; without it such duplicates would silently multiply
#   train/test rows.
# NOTE: this cell rebinds train/test, so run it once per fresh load. Running it
# again merges the store columns a second time and yields suffixed duplicates.
train = train.merge(stores, on='store_nbr', how='left', validate='many_to_one')
test = test.merge(stores, on='store_nbr', how='left', validate='many_to_one')