In [74]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objs as go
from plotly.subplots import make_subplots

## Data

In [4]:
# Read the CSV at the relative path below into `data`, then preview the
# first five rows as the cell's displayed output.
train_csv_path = r'./big_mart_sales/train_v9rqX0R.csv'
data = pd.read_csv(train_csv_path)
data.head(5)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


### Variable Identification

In [5]:
# Column roles used by the analysis cells below.

# Continuous (numeric) columns.
num_cols = [
    'Item_Weight',
    'Item_Visibility',
    'Item_MRP',
    'Item_Outlet_Sales',
]

# Categorical columns.
cat_cols = [
    'Item_Identifier',
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Size',
    'Outlet_Location_Type',
    'Outlet_Type',
]

# Categorical columns treated as ordinal (all of them also appear in cat_cols).
ordinal_cols = [
    'Item_Fat_Content',
    'Outlet_Size',
    'Outlet_Location_Type',
]

# Target column, kept as a one-element list (it also appears in num_cols).
target_var = ['Item_Outlet_Sales']

### Univariate analysis

**Continuous Variables**

In [75]:
# Univariate summary for every continuous column: printed statistics plus a
# one-row figure with a histogram, a box plot and an empirical CDF.
# NOTE: despite its plural name, `cols` holds a single column name per iteration.
for cols in num_cols:

    # Summary statistics: describe() (count, mean, std, quartiles, min/max),
    # then skewness and kurtosis of the column.
    print(data[cols].describe())
    print(data[cols].skew())
    print(data[cols].kurtosis())

    # One row, three panels: histogram | box plot | ECDF
    fig = make_subplots(rows=1, cols=3)

    # Hist plot
    fig.add_trace(go.Histogram(x=data[cols],
                               name='Histogram'), row=1, col=1)

    # Box plot (boxpoints='all' also draws every individual observation)
    fig.add_trace(go.Box(y=data[cols],
                         boxpoints='all',
                         name='Box Plot'), row=1, col=2)

    # ECDF plot
    # Drop missing values BEFORE sorting and counting. np.sort places NaNs at
    # the end and len() counts them, so with NaNs left in, the denominator is
    # too large and the curve never reaches 1.0 for a column that has missing
    # values (Item_Weight: 7060 non-null of 8523 rows).
    observed = data[cols].dropna()
    x = np.sort(observed.to_numpy())
    n = len(observed)
    y = np.arange(1, n+1)/n
    fig.add_trace(go.Scatter(x=x,
                             y=y,
                             name='ECDF'), row=1, col=3)

    # Layout update
    fig.update_layout(bargap=0.03,
                      title=dict(text='Distribution of {}'.format(cols)),
                      margin=dict(l=80, r=80, t=100, b=80))

    fig.show()

count    7060.000000
mean       12.857645
std         4.643456
min         4.555000
25%         8.773750
50%        12.600000
75%        16.850000
max        21.350000
Name: Item_Weight, dtype: float64
0.0824262091221237
-1.2277664144376634


count    8523.000000
mean        0.066132
std         0.051598
min         0.000000
25%         0.026989
50%         0.053931
75%         0.094585
max         0.328391
Name: Item_Visibility, dtype: float64
1.1670905496918407
1.6794454826024245


count    8523.000000
mean      140.992782
std        62.275067
min        31.290000
25%        93.826500
50%       143.012800
75%       185.643700
max       266.888400
Name: Item_MRP, dtype: float64
0.1272022683110526
-0.8897690936963571


count     8523.000000
mean      2181.288914
std       1706.499616
min         33.290000
25%        834.247400
50%       1794.331000
75%       3101.296400
max      13086.964800
Name: Item_Outlet_Sales, dtype: float64
1.1775306028542798
1.6158766814287264


**Bivariate analysis: Continuous - Continuous**

In [76]:
# Pairwise Pearson correlation matrix of the continuous columns
# (Pearson is DataFrame.corr's default method; spelled out here for the reader).
data.loc[:, num_cols].corr(method='pearson')

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Item_Outlet_Sales
Item_Weight,1.0,-0.014048,0.027141,0.014123
Item_Visibility,-0.014048,1.0,-0.001315,-0.128625
Item_MRP,0.027141,-0.001315,1.0,0.567574
Item_Outlet_Sales,0.014123,-0.128625,0.567574,1.0


In [88]:
# Derived column: outlet sales divided by item MRP, rounded to the nearest
# integer. Presumably an estimate of units sold - confirm against the dataset
# description. astype(int) raises if the ratio contains NaN or inf.
data['quantity'] = (data['Item_Outlet_Sales'] / data['Item_MRP']).round().astype(int)

# Scatter of the derived quantity against sales. A title and axis labels are
# set so the figure can be read on its own when the notebook is skimmed.
fig = go.Figure(go.Scatter(x=data['quantity'], y=data['Item_Outlet_Sales'], mode='markers'))
fig.update_layout(title=dict(text='Item_Outlet_Sales vs. quantity'),
                  xaxis=dict(title=dict(text='quantity (Item_Outlet_Sales / Item_MRP, rounded)')),
                  yaxis=dict(title=dict(text='Item_Outlet_Sales')))
fig