# Checking out the different datasets.

Import basic dependencies

In [57]:
import pandas as pd
from definitions import HOLIDAY_EVENTS_FILE, OIL_FILE, STORES_FILE, TEST_FILE, TRAIN_FILE, TRANSACTIONS_FILE

suppress warnings for cleaner output

In [58]:
import warnings
# With no category given, this ignores every warning for the rest of the
# session (deprecation notices included). Remove it when debugging.
warnings.filterwarnings('ignore')

Create some general code for looping through datasets.

In [59]:
from pandas.api.types import is_numeric_dtype, is_object_dtype

def describe_dataset(df: pd.DataFrame):
    """Print a quick exploratory summary of every column in ``df``.

    The column list and the row count are printed first. Then, for each
    column except one named ``'id'``:

    * whether the column contains NaNs;
    * for object-dtype columns: ``describe()`` of the parsed values when the
      whole column can be converted with ``pd.to_datetime``, otherwise the
      value counts followed by the number of distinct values;
    * for numeric columns: ``describe()``.

    Args:
        df: The frame to summarise.

    Returns:
        None. Everything is written to stdout.

    Raises:
        TypeError: If a column is neither object-typed nor numeric.
    """
    print(f'columns: {df.columns.tolist()}')
    print(f'length {len(df)}')
    for col in df.columns:
        if col == 'id':
            continue

        series = df[col]

        print()
        print(f" === Column: {col} === ")

        print(f'Has nans?: {series.hasnans}')

        if is_object_dtype(series):
            # Only the conversion itself is attempted best-effort. Catching
            # the specific parse errors (rather than a bare ``except``) lets
            # KeyboardInterrupt / SystemExit and genuine bugs propagate.
            try:
                date_col = pd.to_datetime(series)
            except (ValueError, TypeError, OverflowError):
                date_col = None

            if date_col is not None:
                print(f'{col} converted to datetime')
                print(date_col.describe())
                continue

            # Not a date column: treat it as categorical. Count once and
            # reuse the result for both the table and the cardinality.
            counts = series.value_counts()
            print(counts)
            print(f'{len(counts)} unique values')
            continue

        if is_numeric_dtype(series):
            print(series.describe())
            continue

        raise TypeError(f'Cannot analyse {col}.')

## Training data (`train.csv`)

In [60]:
# Load the training set and print the per-column summary.
train_df = pd.read_csv(TRAIN_FILE)
describe_dataset(train_df)

# Number of records per store, ordered by store number.
print(train_df.store_nbr.value_counts().sort_index())

columns: ['id', 'date', 'store_nbr', 'family', 'sales', 'onpromotion']
length 3000888

 === Column: date === 
Has nans?: False
date converted to datetime
count                 3000888
unique                   1684
top       2013-01-01 00:00:00
freq                     1782
first     2013-01-01 00:00:00
last      2017-08-15 00:00:00
Name: date, dtype: object

 === Column: store_nbr === 
Has nans?: False
count    3.000888e+06
mean     2.750000e+01
std      1.558579e+01
min      1.000000e+00
25%      1.400000e+01
50%      2.750000e+01
75%      4.100000e+01
max      5.400000e+01
Name: store_nbr, dtype: float64

 === Column: family === 
Has nans?: False
AUTOMOTIVE                    90936
HOME APPLIANCES               90936
SCHOOL AND OFFICE SUPPLIES    90936
PRODUCE                       90936
PREPARED FOODS                90936
POULTRY                       90936
PLAYERS AND ELECTRONICS       90936
PET SUPPLIES                  90936
PERSONAL CARE                 90936
MEATS              

The training data has the following columns.
id: ids
date:
 - Dates range from 2013-01-01 to 2017-08-15, so the recording stops in mid-August 2017.
 - There are over 3 million records but only 1684 unique dates (about 4.6 years). Every date occurs 1782 times.
store_nbr:
 - Numbers ranging from 1 to 54; every store number appears in 55572 records.
family
 - Families seem to be product categories.
 - 33 different families, each occurring 90936 times; some families have an I and a II variant.
sales
 - Daily sales per store and family, ranging from 0 to 124717 items (describing sales as item counts seems unreliable, since items differ in price).
onpromotion
 - 0 to 741 items on promotion (the average is 2.6).

## Testing data (`test.csv`)

In [61]:
# Load the test set and print the per-column summary.
test_df = pd.read_csv(TEST_FILE)
describe_dataset(test_df)

columns: ['id', 'date', 'store_nbr', 'family', 'onpromotion']
length 28512

 === Column: date === 
Has nans?: False
date converted to datetime
count                   28512
unique                     16
top       2017-08-16 00:00:00
freq                     1782
first     2017-08-16 00:00:00
last      2017-08-31 00:00:00
Name: date, dtype: object

 === Column: store_nbr === 
Has nans?: False
count    28512.000000
mean        27.500000
std         15.586057
min          1.000000
25%         14.000000
50%         27.500000
75%         41.000000
max         54.000000
Name: store_nbr, dtype: float64

 === Column: family === 
Has nans?: False
AUTOMOTIVE                    864
HOME APPLIANCES               864
SCHOOL AND OFFICE SUPPLIES    864
PRODUCE                       864
PREPARED FOODS                864
POULTRY                       864
PLAYERS AND ELECTRONICS       864
PET SUPPLIES                  864
PERSONAL CARE                 864
MEATS                         864
MAGAZINES     

Same structure as the training dataset (without the `sales` column), but covering the last 16 days of August 2017 (2017-08-16 to 2017-08-31).

## Holiday data (`holidays_events.csv`)

In [62]:
# Load the holidays/events table and print the per-column summary.
holiday_df = pd.read_csv(HOLIDAY_EVENTS_FILE)
describe_dataset(holiday_df)

# Number of distinct values in the description column.
print(f'{len(holiday_df.description.unique())} different descriptions')

columns: ['date', 'type', 'locale', 'locale_name', 'description', 'transferred']
length 350

 === Column: date === 
Has nans?: False
date converted to datetime
count                     350
unique                    312
top       2014-06-25 00:00:00
freq                        4
first     2012-03-02 00:00:00
last      2017-12-26 00:00:00
Name: date, dtype: object

 === Column: type === 
Has nans?: False
Holiday       221
Event          56
Additional     51
Transfer       12
Bridge          5
Work Day        5
Name: type, dtype: int64
6 unique values

 === Column: locale === 
Has nans?: False
National    174
Local       152
Regional     24
Name: locale, dtype: int64
3 unique values

 === Column: locale_name === 
Has nans?: False
Ecuador                           174
Quito                              13
Riobamba                           12
Guaranda                           12
Latacunga                          12
Ambato                             12
Guayaquil                         

date:
 - There are 350 holidays on record, on 312 different dates, so some holidays coincide; most likely they are local to different regions.
type:
 - The majority of the records are labelled as 'Holiday'.
 - 12 holidays were transferred to another date and were not celebrated on their original day.
 - 5 extra days off ('Bridge') were added, compensated by 5 extra work days ('Work Day').
locale:
 - Just under half of the holidays are national (174 of 350), 152 are local and only 24 are regional.
locale_name:
 - There are 24 locales.
description:
 - Carnaval is the most frequently occurring holiday (10 records); there are 103 different descriptions.
transferred:
 - 12 holidays have been transferred, this matches the data from the type column.

In [63]:
# Rows that were either moved away from their date (transferred) or mark the
# date a holiday was moved to (type 'Transfer'), reduced to the columns that
# show how the two kinds of rows pair up.
transferred_and_transfer_dates = holiday_df.loc[
    lambda frame: frame['transferred'].eq(True) | frame['type'].eq('Transfer'),
    ['transferred', 'type', 'description'],
]
print(transferred_and_transfer_dates)

     transferred      type                             description
19          True   Holiday              Independencia de Guayaquil
20         False  Transfer     Traslado Independencia de Guayaquil
72          True   Holiday              Independencia de Guayaquil
73         False  Transfer     Traslado Independencia de Guayaquil
135         True   Holiday              Independencia de Guayaquil
136        False  Transfer     Traslado Independencia de Guayaquil
255         True   Holiday                    Batalla de Pichincha
256        False  Transfer           Traslado Batalla de Pichincha
265        False  Transfer         Traslado Fundacion de Guayaquil
266         True   Holiday                  Fundacion de Guayaquil
268         True   Holiday           Primer Grito de Independencia
269        False  Transfer  Traslado Primer Grito de Independencia
297         True   Holiday                      Primer dia del ano
298        False  Transfer             Traslado Primer dia del

The transferred holidays also seem to have 'Traslado ' prepended to the description on the date they are transferred to.

Most dates in this dataset are days off, except:
 - when transferred is True.
 - when type is 'Work Day'.

In [64]:
# Parse the dates of the 'Work Day' rows and show which weekday each falls on
# (pandas numbers weekdays Monday=0 ... Sunday=6).
work_day_dates = pd.to_datetime(
    holiday_df.loc[holiday_df['type'].eq('Work Day'), 'date']
)
print(work_day_dates.dt.day_of_week)

42     5
43     5
149    5
161    5
283    5
Name: date, dtype: int64


Note that all 'Work Day' entries fall on a Saturday.

## Oil data (`oil.csv`)

In [65]:
# Load the oil price table and print the per-column summary.
oil_df = pd.read_csv(OIL_FILE)
describe_dataset(oil_df)

columns: ['date', 'dcoilwtico']
length 1218

 === Column: date === 
Has nans?: False
date converted to datetime
count                    1218
unique                   1218
top       2013-01-01 00:00:00
freq                        1
first     2013-01-01 00:00:00
last      2017-08-31 00:00:00
Name: date, dtype: object

 === Column: dcoilwtico === 
Has nans?: True
count    1175.000000
mean       67.714366
std        25.630476
min        26.190000
25%        46.405000
50%        53.190000
75%        95.660000
max       110.620000
Name: dcoilwtico, dtype: float64


date:
 - there are 1218 days in the oil dataset, the train and test set have 1684 + 16 = 1700 days
dcoilwtico:
 - The oil price ranges from 26.19 to 110.62.

In [66]:
# Count how many oil records fall on each weekday (Monday=0 ... Sunday=6).
print(pd.to_datetime(oil_df.date).dt.day_of_week.value_counts())

1    244
2    244
3    244
4    243
0    243
Name: date, dtype: int64


There are no weekend days in the dataset, which explains why there are fewer days than in the train/test data.

## Stores data (`stores.csv`)

In [67]:
# Load the store metadata table and print the per-column summary.
stores_df = pd.read_csv(STORES_FILE)
describe_dataset(stores_df)

columns: ['store_nbr', 'city', 'state', 'type', 'cluster']
length 54

 === Column: store_nbr === 
Has nans?: False
count    54.000000
mean     27.500000
std      15.732133
min       1.000000
25%      14.250000
50%      27.500000
75%      40.750000
max      54.000000
Name: store_nbr, dtype: float64

 === Column: city === 
Has nans?: False
Quito            18
Guayaquil         8
Cuenca            3
Santo Domingo     3
Manta             2
Latacunga         2
Machala           2
Ambato            2
Quevedo           1
Esmeraldas        1
Loja              1
Libertad          1
Playas            1
Daule             1
Babahoyo          1
Salinas           1
Puyo              1
Guaranda          1
Ibarra            1
Riobamba          1
Cayambe           1
El Carmen         1
Name: city, dtype: int64
22 unique values

 === Column: state === 
Has nans?: False
Pichincha                         19
Guayas                            11
Santo Domingo de los Tsachilas     3
Azuay                    

store_nbr:
 - The store numbers match those in the train/test data.
city:
 - Most cities have 1 store, some have more; Quito has 18, which is a lot.
state:
 - Pichincha and Guayas have a relatively high number of stores.
 - The other states have 1, 2 or 3.
type:
 - There are 5 types of stores, labelled with letters. It is unclear what they stand for.
cluster:
 - there are 17 clusters.

## Transaction data (`transactions.csv`)

In [68]:
# Load the transactions table and print the per-column summary.
transactions_df = pd.read_csv(TRANSACTIONS_FILE)
describe_dataset(transactions_df)

columns: ['date', 'store_nbr', 'transactions']
length 83437

 === Column: date === 
Has nans?: False
date converted to datetime
count                   83437
unique                   1681
top       2017-08-15 00:00:00
freq                       54
first     2013-01-01 00:00:00
last      2017-08-15 00:00:00
Name: date, dtype: object

 === Column: store_nbr === 
Has nans?: False
count    83437.000000
mean        26.939296
std         15.608269
min          1.000000
25%         13.000000
50%         27.000000
75%         40.000000
max         54.000000
Name: store_nbr, dtype: float64

 === Column: transactions === 
Has nans?: False
count    83437.000000
mean      1694.694536
std        963.380084
min          5.000000
25%       1046.000000
50%       1393.000000
75%       2079.000000
max       8359.000000
Name: transactions, dtype: float64


date:
 - 1681 dates, slightly fewer than the 1684 dates in the training data.
store_nbr:
 - transactions are mapped to store numbers and days, but nothing else.
transactions:
 - The number of transactions ranges from 5 to 8359, with a mean of about 1695.