# Importing and Loading

### Importing relevant libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import scipy.stats as st

### Reading dataset

In [148]:
# Load the three raw CSV tables; the train and test files get their
# 'Date' column parsed into datetimes while reading.
date_columns = ['Date']
data_store = pd.read_csv('./data/store.csv')
data_train = pd.read_csv('./data/train.csv', parse_dates=date_columns)
data_test = pd.read_csv('./data/test.csv', parse_dates=date_columns)

# Dataset Exploration

## Knowing the data

### Data exploration pipeline

In [149]:
from IPython.display import display

def print_info(df):
    '''Print a concise summary of a DataFrame under a short heading.

    Parameters:
        df (dataframe): a pandas dataframe object
    '''
    print('\nDataframe summary')
    # DataFrame.info() writes the summary itself and returns None, so it is
    # called directly; wrapping it in display() only rendered a stray "None".
    df.info()

def print_head(df):
    '''Show the first 5 rows of a dataframe under a short heading.

    Parameters:
        df (dataframe): a pandas dataframe object
    '''
    print('\nFirst 5 rows')
    first_rows = df.head()
    display(first_rows)
    
def print_shape(df):
    '''Show the shape of a dataframe under a short heading.

    Parameters:
        df (dataframe): a pandas dataframe object
    '''
    print('\nShape of the dataset')
    dimensions = df.shape
    display(dimensions)
    
def print_missing_values(df):
    '''Show the number of missing values in each column of a dataframe.

    Parameters:
        df (dataframe): a pandas dataframe object
    '''
    print('\nMissing values count')
    missing_mask = df.isna()
    display(missing_mask.sum())
    
def explore_df(df):
    '''Run every exploration step on a dataframe, in this order:
    shape, head, info, missing value count.

    Parameters:
        df (dataframe): a pandas dataframe object
    '''
    for report in (print_shape, print_head, print_info, print_missing_values):
        report(df)

In [150]:
# Shape, first rows, dtype summary and per-column missing counts of the store table.
explore_df(data_store)


Shape of the dataset


(1115, 10)


First 5 rows


Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,



Dataframe summary
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB


None


Missing values count


Store                          0
StoreType                      0
Assortment                     0
CompetitionDistance            3
CompetitionOpenSinceMonth    354
CompetitionOpenSinceYear     354
Promo2                         0
Promo2SinceWeek              544
Promo2SinceYear              544
PromoInterval                544
dtype: int64

### Training dataset exploration

In [151]:
# Shape, first rows, dtype summary and per-column missing counts of the training table.
explore_df(data_train)


Shape of the dataset


(1017209, 9)


First 5 rows


Unnamed: 0,Store,DayOfWeek,Date,Sales,Customers,Open,Promo,StateHoliday,SchoolHoliday
0,1,5,2015-07-31,5263,555,1,1,0,1
1,2,5,2015-07-31,6064,625,1,1,0,1
2,3,5,2015-07-31,8314,821,1,1,0,1
3,4,5,2015-07-31,13995,1498,1,1,0,1
4,5,5,2015-07-31,4822,559,1,1,0,1



Dataframe summary
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1017209 entries, 0 to 1017208
Data columns (total 9 columns):
 #   Column         Non-Null Count    Dtype         
---  ------         --------------    -----         
 0   Store          1017209 non-null  int64         
 1   DayOfWeek      1017209 non-null  int64         
 2   Date           1017209 non-null  datetime64[ns]
 3   Sales          1017209 non-null  int64         
 4   Customers      1017209 non-null  int64         
 5   Open           1017209 non-null  int64         
 6   Promo          1017209 non-null  int64         
 7   StateHoliday   1017209 non-null  object        
 8   SchoolHoliday  1017209 non-null  int64         
dtypes: datetime64[ns](1), int64(7), object(1)
memory usage: 69.8+ MB


None


Missing values count


Store            0
DayOfWeek        0
Date             0
Sales            0
Customers        0
Open             0
Promo            0
StateHoliday     0
SchoolHoliday    0
dtype: int64

### Testing dataset exploration

In [152]:
# Shape, first rows, dtype summary and per-column missing counts of the test table.
explore_df(data_test)


Shape of the dataset


(41088, 8)


First 5 rows


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0



Dataframe summary
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41088 entries, 0 to 41087
Data columns (total 8 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Id             41088 non-null  int64         
 1   Store          41088 non-null  int64         
 2   DayOfWeek      41088 non-null  int64         
 3   Date           41088 non-null  datetime64[ns]
 4   Open           41077 non-null  float64       
 5   Promo          41088 non-null  int64         
 6   StateHoliday   41088 non-null  object        
 7   SchoolHoliday  41088 non-null  int64         
dtypes: datetime64[ns](1), float64(1), int64(5), object(1)
memory usage: 2.5+ MB


None


Missing values count


Id                0
Store             0
DayOfWeek         0
Date              0
Open             11
Promo             0
StateHoliday      0
SchoolHoliday     0
dtype: int64

### Missing value treatment

In [153]:
# Series.mode() returns a Series (labelled 0, 1, ...), and fillna() given a
# Series aligns on index labels, so passing it directly can fill at most the
# rows whose label matches and leaves the other NaNs in place. Fill with the
# scalar most frequent value instead.
open_mode = data_test['Open'].mode()
if not open_mode.empty:  # mode() is empty when the column has no non-null values
    data_test['Open'] = data_test['Open'].fillna(open_mode.iloc[0])

### Outlier treatment pipeline

In [154]:
from scipy.stats import zscore

def z_score_outlier_treatment(df, unvarient=True):
    '''Build a boolean mask of entries whose absolute z-score is below 3.

    Parameters:
        df: numeric data passed to scipy.stats.zscore
        unvarient (bool): when True the element-wise mask is returned;
            when False the mask is reduced with all(axis=1), so a row is
            kept only if every one of its values is below the threshold
    Returns:
        boolean mask, True for entries (or rows) to keep
    '''
    within_limit = np.abs(zscore(df)) < 3
    if unvarient:
        return within_limit
    return within_limit.all(axis=1)

def iqr_outlier_treatment(df):
    '''Build a boolean mask of values lying strictly between the IQR whiskers.

    The whiskers are Q1 - 1.5*IQR and Q3 + 1.5*IQR, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of the input and IQR = Q3 - Q1.

    Parameters:
        df: a pandas object exposing .quantile() (e.g. a numeric Series)
    Returns:
        boolean mask, True for values to keep
    '''
    q1 = df.quantile(0.25)
    q3 = df.quantile(0.75)
    iqr = q3 - q1
    lower_whisker = q1 - 1.5 * iqr
    upper_whisker = q3 + 1.5 * iqr
    # The lower whisker used to be computed but never applied, so low
    # outliers slipped through; both sides are checked now.
    return (df > lower_whisker) & (df < upper_whisker)

def treat_outlier(df, dataset, method='z-score', unvarient=True):
    '''Return a copy of `dataset` restricted to the rows not flagged as outliers.

    Parameters:
        df: the values the outlier mask is computed from
        dataset (dataframe): the dataframe the mask is applied to
        method (str): 'z-score' or 'iqr'; only consulted when `unvarient`
            is True
        unvarient (bool): True builds the mask with the chosen `method`;
            False always uses the z-score mask reduced across columns
    Returns:
        a copy of the rows of `dataset` selected by the mask
    Raises:
        ValueError: if `unvarient` is True and `method` is not recognised
    '''
    if unvarient:
        if method == 'z-score':
            filtered_entries = z_score_outlier_treatment(df, unvarient)
        elif method == 'iqr':
            filtered_entries = iqr_outlier_treatment(df)
        else:
            # Previously fell through and failed later with an
            # UnboundLocalError on `filtered_entries`.
            raise ValueError(
                f"Unknown outlier method {method!r}; expected 'z-score' or 'iqr'"
            )
    else:
        filtered_entries = z_score_outlier_treatment(df, unvarient)

    return dataset[filtered_entries].copy()

Treating outliers on training data

In [155]:
# Two passes with the IQR rule: first drop rows with outlying Sales, then
# drop rows with outlying Customers among the rows that remain.
data_sales_cleaned = treat_outlier(data_train['Sales'], data_train, method='iqr')
data_train_clean = treat_outlier(
    data_sales_cleaned['Customers'], data_sales_cleaned, method='iqr'
)
data_train_clean.shape

(967874, 9)

### Seasonality analysis of training set

In [166]:
import datetime as dt

# Month numbers (1-12) grouped by season.
summer = [6,7,8]
autumn = [9,10,11]
winter = [12,1,2]
spring = [3,4,5]

# One independent copy of the cleaned training rows per calendar year of 'Date'.
data_2013 = data_train_clean[data_train_clean['Date'].dt.year == 2013].copy()
data_2014 = data_train_clean[data_train_clean['Date'].dt.year == 2014].copy()
data_2015 = data_train_clean[data_train_clean['Date'].dt.year == 2015].copy()

data_2013['season'] = 