# Exploratory Data Analysis Notebook

In [40]:
# basic imports
import pandas as pd
import numpy as np
import datetime as dt

# visualizations
import matplotlib.pyplot as plt
import seaborn as sns

# time series
# NOTE(review): statsmodels.tsa.arima_model is the legacy ARIMA location; it was
# deprecated in statsmodels 0.12 in favour of statsmodels.tsa.arima.model.ARIMA
# and is reported as unusable from 0.13 on — confirm against the installed version.
from statsmodels.tsa.arima_model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from pandas.plotting import autocorrelation_plot
from statsmodels.tsa.stattools import adfuller, acf, pacf,arma_order_select_ic
import statsmodels.formula.api as smf
import statsmodels.tsa.api as smt
import statsmodels.api as sm
import scipy.stats as scs

# settings
import warnings
# Silences every warning for the rest of the session (no category or module
# filter is given), so deprecation and convergence warnings are hidden as well.
warnings.filterwarnings('ignore')

### Step 1: Loading the Data (orders & items csv files)

In [14]:
# loading in orders & items tables
# The data directory is defined once and can be overridden with the
# DEMAND_DATA_DIR environment variable, so the notebook is runnable on another
# machine; when the variable is unset the original local path is used.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get(
    'DEMAND_DATA_DIR',
    r'C:\Users\Jagvir Dhesi\lighthouselabs\projects\demand-prediction-project\data',
))

# both files are read as semicolon-delimited
orders_df = pd.read_csv(DATA_DIR / 'orders.csv', sep=';')
items_df = pd.read_csv(DATA_DIR / 'items.csv', sep=';')

In [17]:
# preview the first five rows of the orders table
orders_df.head(n=5)

Unnamed: 0,order_id,date,payment,item_code,quantity,gift_quantity,unit_price_vat_excl,unit_cogs,unit_rrp_vat_excl,department,channel,owner,site,CreatedAt
0,2000093387,2020-04-24 00:00:00,GOPAY_CARD,S101,1,,3.506048,0.0,3.496395,E-COMMER,RUN.huC,RUN.huO,top4running.hu,2020-04-24 21:50:00
1,2000093387,2020-04-24 00:00:00,GOPAY_CARD,S113,1,,-0.705913,0.0,-0.717209,E-COMMER,RUN.huC,RUN.huO,top4running.hu,2020-04-24 21:50:00
2,2000093388,2020-04-24 00:00:00,COD,ZB00089178,1,,5.829845,2.209302,6.472868,E-COMMER,FTB.czC,FTB.czO,11teamsports.cz,2020-04-24 21:45:00
3,2000093388,2020-04-24 00:00:00,COD,ZB00138060,1,,5.829845,2.209302,6.472868,E-COMMER,FTB.czC,FTB.czO,11teamsports.cz,2020-04-24 21:45:00
4,2000093388,2020-04-24 00:00:00,COD,ZB00015664,1,,19.379845,18.731008,32.015504,E-COMMER,FTB.czC,FTB.czO,11teamsports.cz,2020-04-24 21:45:00


In [29]:
# Print a summary of the orders table: index range, column dtypes and memory usage.
# NOTE(review): the saved output has no "Non-Null Count" column, unlike the items
# summary further down — presumably the row count exceeds pandas'
# display.max_info_rows limit; confirm, and request the counts explicitly if
# missing values in this table need to be assessed.
orders_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2128524 entries, 0 to 2128523
Data columns (total 14 columns):
 #   Column               Dtype  
---  ------               -----  
 0   order_id             int64  
 1   date                 object 
 2   payment              object 
 3   item_code            object 
 4   quantity             int64  
 5   gift_quantity        float64
 6   unit_price_vat_excl  float64
 7   unit_cogs            float64
 8   unit_rrp_vat_excl    float64
 9   department           object 
 10  channel              object 
 11  owner                object 
 12  site                 object 
 13  CreatedAt            object 
dtypes: float64(4), int64(2), object(8)
memory usage: 227.4+ MB


In [33]:
# Dimensions of the orders table as a (rows, columns) tuple
orders_df.shape

(2128524, 14)

In [27]:
# preview the first five rows of the items table
items_df.head(n=5)

Unnamed: 0,item_code,item_name,style,brand_id,name,group0_id,group0,group1_id,group1,group2_id,group2,category,gender,age,color,size
0,ZB00210807,Studio Metallic LS Top,51951501,86,Puma,200.0,Apparel,230.0,T-Shirts,233.0,Longsleeved (LS) shirts,fitness,women,adults,Black,L
1,ZB00210813,TRAIN TECH EVOKNIT SS TEE,52011101,86,Puma,200.0,Apparel,230.0,T-Shirts,0.0,,fitness,men,adults,Black,S
2,ZB00210815,TRAIN TECH EVOKNIT SS TEE,52011123,86,Puma,200.0,Apparel,230.0,T-Shirts,0.0,,fitness,men,adults,Orange,L
3,ZB00210821,TRAIN TECH EVOKNIT SS TEE,52011130,86,Puma,200.0,Apparel,230.0,T-Shirts,0.0,,fitness,men,adults,Yellow,XL
4,ZB00261295,LIGA Baselayer Tee LS,655920-027,86,Puma,200.0,Apparel,230.0,T-Shirts,233.0,Longsleeved (LS) shirts,football,men,adults,Green,L


In [31]:
# Print a summary of the items table: non-null count and dtype per column,
# plus memory usage.
items_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 309128 entries, 0 to 309127
Data columns (total 16 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   item_code  309128 non-null  object 
 1   item_name  306002 non-null  object 
 2   style      302331 non-null  object 
 3   brand_id   309128 non-null  int64  
 4   name       309128 non-null  object 
 5   group0_id  309093 non-null  float64
 6   group0     308302 non-null  object 
 7   group1_id  301709 non-null  float64
 8   group1     265348 non-null  object 
 9   group2_id  301709 non-null  float64
 10  group2     71773 non-null   object 
 11  category   269282 non-null  object 
 12  gender     301644 non-null  object 
 13  age        301562 non-null  object 
 14  color      263532 non-null  object 
 15  size       301087 non-null  object 
dtypes: float64(3), int64(1), object(12)
memory usage: 37.7+ MB


In [35]:
# Dimensions of the items table as a (rows, columns) tuple
items_df.shape

(309128, 16)

#### Quick Summary
The orders table is large (2,128,524 rows × 14 columns), which is beneficial for our models because it gives us plenty of historical data to make predictions from. The items table has 309,128 rows × 16 columns, so both tables offer a sizeable set of candidate features. Note that several columns in the items table (for example `group2`, `color`, `group1` and `category`) contain a substantial number of missing values.

### Step 2: Performing Exploratory Data Analysis (EDA) on the data