In [4]:
# import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import datetime as dt
import warnings
warnings.filterwarnings("ignore")

%matplotlib inline 
%config InlineBackend.figure_format = 'retina'
plt.style.use("ggplot")


# time series analysis
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [5]:
# upload the data sets
store_data = pd.read_csv("store.csv")
sample_submission_data = pd.read_csv("test.csv")
test_data = pd.read_csv("test.csv")
train_data =pd.read_csv("train.csv",parse_dates = True)


# Task 1

#### Glance the data

In [3]:
#glance the data
print("shape of the  store data: ",store_data.shape)
print("\n")
print(store_data.info())
store_data.head()

shape of the  store data:  (1115, 10)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1115 entries, 0 to 1114
Data columns (total 10 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Store                      1115 non-null   int64  
 1   StoreType                  1115 non-null   object 
 2   Assortment                 1115 non-null   object 
 3   CompetitionDistance        1112 non-null   float64
 4   CompetitionOpenSinceMonth  761 non-null    float64
 5   CompetitionOpenSinceYear   761 non-null    float64
 6   Promo2                     1115 non-null   int64  
 7   Promo2SinceWeek            571 non-null    float64
 8   Promo2SinceYear            571 non-null    float64
 9   PromoInterval              571 non-null    object 
dtypes: float64(5), int64(2), object(3)
memory usage: 87.2+ KB
None


Unnamed: 0,Store,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval
0,1,c,a,1270.0,9.0,2008.0,0,,,
1,2,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct"
2,3,a,a,14130.0,12.0,2006.0,1,14.0,2011.0,"Jan,Apr,Jul,Oct"
3,4,c,c,620.0,9.0,2009.0,0,,,
4,5,a,a,29910.0,4.0,2015.0,0,,,


In [19]:
print("shape of the  train data: ",train_data.shape)
print("\n")
print(train_data.nunique())
train_data.head()
train_data['Date']
train_data.dtypes

shape of the  train data:  (1017209, 9)


Store             1115
DayOfWeek            7
Date               942
Sales            21734
Customers         4086
Open                 2
Promo                2
StateHoliday         5
SchoolHoliday        2
dtype: int64


Store             int64
DayOfWeek         int64
Date             object
Sales             int64
Customers         int64
Open              int64
Promo             int64
StateHoliday     object
SchoolHoliday     int64
dtype: object

In [5]:
print("shape of the  test data: ",test_data.shape)
print("\n")
print(test_data.nunique())
test_data.head()

shape of the  test data:  (41088, 8)


Id               41088
Store              856
DayOfWeek            7
Date                48
Open                 2
Promo                2
StateHoliday         2
SchoolHoliday        2
dtype: int64


Unnamed: 0,Id,Store,DayOfWeek,Date,Open,Promo,StateHoliday,SchoolHoliday
0,1,1,4,2015-09-17,1.0,1,0,0
1,2,3,4,2015-09-17,1.0,1,0,0
2,3,7,4,2015-09-17,1.0,1,0,0
3,4,8,4,2015-09-17,1.0,1,0,0
4,5,9,4,2015-09-17,1.0,1,0,0


Short description:
- **Sales**: the turnover for any given day (target variable).
- **Customers**: the number of customers on a given day.
- Open: an indicator for whether the store was open: 0 = closed, 1 = open.
- Promo: indicates whether a store is running a promo on that day.
- StateHoliday: indicates a state holiday. Normally all stores, with few exceptions, are closed on state holidays.
- SchoolHoliday: indicates if the (Store, Date) was affected by the closure of public schools.

- Store: a unique Id for each store
- StoreType: differentiates between 4 different store models: a, b, c, d
- Assortment: describes an assortment level: a = basic, b = extra, c = extended
- CompetitionDistance: distance in meters to the nearest competitor store
- CompetitionOpenSince[Month/Year]: gives the approximate year and month of the time the nearest competitor was opened
- Promo2: Promo2 is a continuing a promotion for some -stores: 0 = store is not participating, 1 = store is participating
- Promo2Since[Year/Week]: describes the year and calendar week when the store started participating in Promo2
- PromoInterval: describes the consecutive intervals Promo2 is started, naming the months the promotion is started. E.g. "Feb,May,Aug,Nov" means each round starts in February, May, August, November of any given year for that store

### Data cleaning

In [6]:
#data extraction
train_data['Year'] = train_data['Date'].dt.year
train_data['Month'] = train_data['Date'].dt.month
train_data['Day'] = train_data['Date'].dt.day
# train_data['WeekOfYear'] = train_data.index.weekofyear

# adding new variable
train_data['SalePerCustomer'] = train_data['Sales']/train_data['Customers']
train_data['SalePerCustomer'].describe()

AttributeError: Can only use .dt accessor with datetimelike values

In [5]:
#dealing with cardinality
train_data['StateHoliday'].replace({0:'0','0':'o'}, inplace = True)
test_data['StateHoliday'].replace({0:'0','0':'o'}, inplace = True)

In [7]:
missing_data = pd.DataFrame(train_data.isna().sum(),columns=(['Missing Train']))
missing_data["missing_test"] = test_data.isnull().sum() # before deling missing values
missing_data

Unnamed: 0,Missing Train,missing_test
Store,0,0.0
DayOfWeek,0,0.0
Sales,0,
Customers,0,
Open,0,11.0
Promo,0,0.0
StateHoliday,0,0.0
SchoolHoliday,0,0.0
Year,0,
Month,0,


- as shown in the table the train data has no a missing value and the test data has a missing value for open variable , we impute it.

In [8]:
# impute the missing value in test data
test_data['Open'].fillna(test_data['Open'].mode, inplace = True)
test_data.isnull().values.any()


False

In [9]:
# closed stores
train_data[(train_data.Open == 0) & (train_data.Sales == 0)].count()

Store              172817
DayOfWeek          172817
Sales              172817
Customers          172817
Open               172817
Promo              172817
StateHoliday       172817
SchoolHoliday      172817
Year               172817
Month              172817
Day                172817
WeekOfYear         172817
SalePerCustomer         0
dtype: int64

- There're 172817 closed stores in the data. It is about 10% of the total amount of observations. To avoid any biased forecasts we will drop these values.

In [10]:
train_d = train_data #for later 
print(train_data.shape)
closed_stores = train_data[(train_data['Open'] == 0) & (train_data['Sales'] == 0)].index
train_data.drop(closed_stores,inplace = True)
print(train_data.shape)


(1017209, 13)


KeyboardInterrupt: 

In [None]:
open_no_sale = train_data[(train_data["Open"] != 0) & (train_data['Sales'] == 0)].index
train_data[(train_data["Open"] != 0) & (train_data['Sales'] == 0)]['Date'].nunique()
  


- There are opened store with no sales on working days. There're only 49 days in the data, so we can assume that there were external factors involved,

In [None]:
train_data.drop(open_no_sale,inplace = True)
train_data.shape

In [None]:
#exceptional stores that has open during holidays

store_exc = train_data[(train_data["StateHoliday"] != "o") 
        & (train_data["Sales"] != 0)]["Store"].nunique()
print("exceptional stores that has opened during state holidays:",store_exc)


In [None]:
store_data.isnull().sum()

- We have few variables with missing values that we need to deal with. Let's start with the CompetitionDistance.

In [None]:
# missing values in CompetitionDistance
store_data[pd.isnull(store_data.CompetitionDistance)]

- No particular pattern observed. In this case, it makes a complete sense to replace NaN with the median values (which is twice less that the average)

In [None]:
# fill NaN with a median value (skewed distribuion)
store_data['CompetitionDistance'].fillna(store_data['CompetitionDistance'].median(), inplace = True)

In [None]:
# replace NA's by 0
store_data.fillna(0, inplace = True)


In [None]:
print("Joining train set with an additional store information.")

train_store = pd.merge(train_data, store_data, how = 'inner', on = 'Store')

print("In total: ", train_store.shape)
train_store.head()

In [None]:
#Dealing with the outlier
def deal_outliers(data,columns):
    data[columns].plot(kind = box)
    print("before dealing with the ouliers the shape of the data",
          data.shape)
    for col in columns:
        q1 = data[col].quantile(0.25)
        q3 = data[col].quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 -(1.5 * iqr) 
        upper_bound = q3 +(1.5 * iqr)
        data = data[data[col]>lower_bound]
        data = data[data[col]<upper_bound]
    print("after removing the outliers",data.shape)

- In this section we will closely look at different levels of StoreType and how the main metric Sales is distributed among them.

In [None]:
train_store.groupby('StoreType')['Sales'].describe()


- StoreType B has the highest average of Sales among all others, however we have much less data for it. So let's print an overall sum of Sales and Customers to see which StoreType is the most selling and crowded one:

In [None]:
train_store.groupby('StoreType')['Customers', 'Sales'].sum()


- Clearly stores of type A. StoreType D goes on the second place in both Sales and Customers. What about date periods? Seaborn's facet grid is the best tool for this task:

In [None]:
train_store.head()

In [None]:
# sales trends
c = '#386B7F'
sns.factorplot(data = train_store, x = 'Month', y = "Sales", 
               col = 'StoreType', # per store type in cols
               palette = 'plasma',
               hue = 'StoreType',
               row = 'Promo', # per promo in the store in rows
               color = c) 