In [54]:
import pandas
import numpy
from scipy import stats
import plotly.graph_objects as go
import plotly.express as px

data = pandas.read_csv('BlackFriday.csv')

In [55]:
data.head(10)

Unnamed: 0,User_ID,Product_ID,Gender,Age,Occupation,City_Category,Stay_In_Current_City_Years,Marital_Status,Product_Category_1,Product_Category_2,Product_Category_3,Purchase
0,1000001,P00069042,F,0-17,10,A,2,0,3,,,8370
1,1000001,P00248942,F,0-17,10,A,2,0,1,6.0,14.0,15200
2,1000001,P00087842,F,0-17,10,A,2,0,12,,,1422
3,1000001,P00085442,F,0-17,10,A,2,0,12,14.0,,1057
4,1000002,P00285442,M,55+,16,C,4+,0,8,,,7969
5,1000003,P00193542,M,26-35,15,A,3,0,1,2.0,,15227
6,1000004,P00184942,M,46-50,7,B,2,1,1,8.0,17.0,19215
7,1000004,P00346142,M,46-50,7,B,2,1,1,15.0,,15854
8,1000004,P0097242,M,46-50,7,B,2,1,1,16.0,,15686
9,1000005,P00274942,M,26-35,20,A,1,1,8,,,7871


In [56]:
data = data[['User_ID','Product_ID','Age','Gender','Occupation','City_Category','Purchase']]
data.head(10)

Unnamed: 0,User_ID,Product_ID,Age,Gender,Occupation,City_Category,Purchase
0,1000001,P00069042,0-17,F,10,A,8370
1,1000001,P00248942,0-17,F,10,A,15200
2,1000001,P00087842,0-17,F,10,A,1422
3,1000001,P00085442,0-17,F,10,A,1057
4,1000002,P00285442,55+,M,16,C,7969
5,1000003,P00193542,26-35,M,15,A,15227
6,1000004,P00184942,46-50,M,7,B,19215
7,1000004,P00346142,46-50,M,7,B,15854
8,1000004,P0097242,46-50,M,7,B,15686
9,1000005,P00274942,26-35,M,20,A,7871


In [57]:
everyone_data = pandas.pivot_table(data, values='Purchase',
                                  index=['User_ID','Age','Gender', 'Occupation','City_Category'],
                                  aggfunc='sum').reset_index()
everyone_data

Unnamed: 0,User_ID,Age,Gender,Occupation,City_Category,Purchase
0,1000001,0-17,F,10,A,333481
1,1000002,55+,M,16,C,810353
2,1000003,26-35,M,15,A,341635
3,1000004,46-50,M,7,B,205987
4,1000005,26-35,M,20,A,821001
...,...,...,...,...,...,...
5886,1006036,26-35,F,15,B,3821666
5887,1006037,46-50,F,1,C,1075037
5888,1006038,55+,F,1,C,80859
5889,1006039,46-50,F,0,B,554504


In [58]:
px.histogram(everyone_data, x='Purchase', color='Gender').show()

In [59]:
stats.levene(everyone_data.Purchase[everyone_data.Gender == 'M'],
            everyone_data.Purchase[everyone_data.Gender == 'F'])

LeveneResult(statistic=40.12082333463167, pvalue=2.5639534782629217e-10)

In [60]:
everyone_data['Purchase'] = everyone_data.Purchase.apply(numpy.log2)
everyone_data.head(10)

Unnamed: 0,User_ID,Age,Gender,Occupation,City_Category,Purchase
0,1000001,0-17,F,10,A,18.347245
1,1000002,55+,M,16,C,19.628191
2,1000003,26-35,M,15,A,18.382096
3,1000004,46-50,M,7,B,17.652194
4,1000005,26-35,M,20,A,19.647024
5,1000006,51-55,F,9,A,18.53355
6,1000007,36-45,M,1,B,17.838779
7,1000008,26-35,M,12,C,19.603396
8,1000009,26-35,M,17,C,19.180006
9,1000010,36-45,F,1,B,21.048922


In [61]:
px.box(everyone_data, y='Purchase').show()

In [62]:
px.histogram(everyone_data, x='Purchase').show()

In [67]:
everyone_data['std_norm'] = (everyone_data.Purchase - everyone_data.Purchase.mean()) / everyone_data.Purchase.std()
everyone_data.head(10)

Unnamed: 0,User_ID,Age,Gender,Occupation,City_Category,Purchase,std_norm
0,1000001,0-17,F,10,A,18.347245,-0.455975
1,1000002,55+,M,16,C,19.628191,0.438407
2,1000003,26-35,M,15,A,18.382096,-0.431641
3,1000004,46-50,M,7,B,17.652194,-0.941274
4,1000005,26-35,M,20,A,19.647024,0.451557
5,1000006,51-55,F,9,A,18.53355,-0.325893
6,1000007,36-45,M,1,B,17.838779,-0.810996
7,1000008,26-35,M,12,C,19.603396,0.421095
8,1000009,26-35,M,17,C,19.180006,0.125476
9,1000010,36-45,F,1,B,21.048922,1.43039


In [68]:
px.histogram(everyone_data, x='std_norm').show()

In [65]:
from statsmodels.graphics.gofplots import qqplot

qqplot_data = qqplot(everyone_data.Purchase, line='s').gca().lines

In [66]:
fig = go.Figure()

fig.add_trace({
    'type': 'scatter',
    'x': qqplot_data[0].get_xdata(),
    'y': qqplot_data[0].get_ydata(),
    'mode': 'markers',
    'marker': {
        'color': '#19d3f3'
    }
})

fig.add_trace({
    'type': 'scatter',
    'x': qqplot_data[1].get_xdata(),
    'y': qqplot_data[1].get_ydata(),
    'mode': 'lines',
    'line': {
        'color': '#636efa'
    }

})

fig.show()

In [69]:
stats.kstest(everyone_data.Purchase, 'norm')

KstestResult(statistic=1.0, pvalue=0.0)

In [70]:
stats.shapiro(everyone_data.Purchase)


p-value may not be accurate for N > 5000.



(0.9871099591255188, 8.86643054322415e-23)

In [71]:
stats.normaltest(everyone_data.Purchase)

NormaltestResult(statistic=397.10914814909137, pvalue=5.872785283497359e-87)