# DoubleGlovo: Analysis v2

## Analysis

### Imports

In [1]:
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf

sys.path.append('c:\\Users\\Jordi Cremades\\Documents\\Repository')

from Utilities import query_engines

In [1]:
# TODO / open questions
# - Import the dataset.
# - Understand the logic of the dataset.
# - Get the incremental orders number as shown by David: a DG order has 78% extra orders in the following days...
# - Do we discard the zeros? Check what happens if we run both scenarios.
# - Do parts A and B at the same time --> different groups to compare the incrementality of DG vs. non-DG.

### Load Data

In [2]:

# Columns carried into the analysis, in the order they are displayed.
analysis_columns = [
    'p_creation_date',
    'order_id',
    'store_accessed_channel',
    'treatment',
    'user_is_GNC',
    'user_is_prime',
    'months_old',
    'country',
    'n_orders_groceries_next28d',
    'aov_groceries_past30d',
]

# Read the order-level extract and flag an order as treated when the store
# was accessed through the DoubleGlovo channel.
orders_raw = pd.read_csv('inputs/dg_orders_1_x_user_access.csv')
orders_raw['treatment'] = orders_raw['store_accessed_channel'] == 'DoubleGlovo'

# Snapshot taken before de-duplication; the "with duplicates" GLM uses it.
df_with_duplicates = orders_raw[analysis_columns].copy()
del orders_raw  # the full extract is not needed once the columns are selected

# keep=False removes EVERY row whose order_id occurs more than once (not just
# the extra copies), so df only contains orders that appear exactly once.
# The trailing .copy() makes df an independent frame that later cells can
# assign columns to.
df = df_with_duplicates.drop_duplicates(subset='order_id', keep=False).copy()

df

Unnamed: 0,p_creation_date,order_id,store_accessed_channel,treatment,user_is_GNC,user_is_prime,months_old,country,n_orders_groceries_next28d,aov_groceries_past30d
0,2023-09-30,706580245,NotDoubleGlovo,False,False,False,4,PT,2,24.02500
1,2023-09-04,690338612,NotDoubleGlovo,False,False,True,45,PT,4,18.54000
2,2023-08-24,682853212,NotDoubleGlovo,False,True,False,15,KE,0,0.00000
3,2023-09-04,690450489,NotDoubleGlovo,False,True,False,58,GE,0,0.00000
4,2023-09-30,706755438,NotDoubleGlovo,False,False,False,51,UA,0,0.00000
...,...,...,...,...,...,...,...,...,...,...
1812463,2023-08-20,680640457,NotDoubleGlovo,False,True,False,18,MA,0,0.00000
1812464,2023-08-26,684676536,NotDoubleGlovo,False,False,True,44,GE,0,10.88169
1812465,2023-08-20,680634979,NotDoubleGlovo,False,False,False,49,RO,0,0.00000
1812466,2023-08-20,680774904,NotDoubleGlovo,False,True,False,48,PL,0,0.00000


In [9]:
# Summary of df: row count, distinct orders and the covered date range.
#
# The date column is parsed BEFORE min/max are taken so that
#   * the comparison is chronological rather than lexicographic on strings, and
#   * re-running the cell prints exactly the same text (pd.to_datetime leaves an
#     already-parsed column unchanged; .date() drops the 00:00:00 time part).
# Side effect kept from the original cell: df['p_creation_date'] becomes datetime.
df['p_creation_date'] = pd.to_datetime(df['p_creation_date'])
first_day = df['p_creation_date'].min()
last_day = df['p_creation_date'].max()

print('There are {} rows in the dataframe'.format(len(df)))
print('There are {} distinct orders'.format(df['order_id'].nunique()))
print('Min Date = {}'.format(first_day.date()))
print('Max Date = {}'.format(last_day.date()))
difference_days = last_day - first_day
print('Difference days: {}'.format(difference_days.days))

There are 1567672 rows in the dataframe
There are 1567672 distinct orders
Min Date = 2023-07-13
Max Date = 2023-10-12
Difference days: 91


### Replicate GLM without duplicates

In [6]:
# Poisson GLM on the de-duplicated orders (df).
# Response: n_orders_groceries_next28d. Predictors: treatment interacted with
# user_is_GNC and with user_is_prime, square roots of past AOV and of
# months_old, plus country.
# NOTE(review): the stored summary reports Pearson chi2 = 5.26e+06 against
# 1,567,644 residual df (ratio ~3.4), which points to overdispersion; the
# nonrobust standard errors are probably too small. Consider a robust
# cov_type or a quasi-Poisson scale before reading the p-values.
poisson_spec = smf.glm(
    formula="n_orders_groceries_next28d~treatment * user_is_GNC + user_is_prime + treatment:user_is_prime+np.sqrt(aov_groceries_past30d)+np.sqrt(months_old) + country",
    data=df,
    family=sm.families.Poisson(),
)
mod = poisson_spec.fit()
print(mod.summary())

                     Generalized Linear Model Regression Results                      
Dep. Variable:     n_orders_groceries_next28d   No. Observations:              1567672
Model:                                    GLM   Df Residuals:                  1567644
Model Family:                         Poisson   Df Model:                           27
Link Function:                            Log   Scale:                          1.0000
Method:                                  IRLS   Log-Likelihood:            -2.2568e+06
Date:                        Tue, 05 Dec 2023   Deviance:                   3.1478e+06
Time:                                17:56:09   Pearson chi2:                 5.26e+06
No. Iterations:                             7   Pseudo R-squ. (CS):             0.5802
Covariance Type:                    nonrobust                                         
                                              coef    std err          z      P>|z|      [0.025      0.975]
----------------------

In [7]:
# Same Poisson specification, fitted on df_with_duplicates (the snapshot taken
# before drop_duplicates) to compare against the de-duplicated fit.
# NOTE(review): the stored output shows two warnings raised from a numpy ufunc
# call during this fit -- presumably np.sqrt receiving negative inputs; TODO
# confirm which column is responsible.
# NOTE(review): Pearson chi2 = 1.94e+07 on 1,812,428 residual df (ratio ~10.7)
# indicates even stronger overdispersion than in the de-duplicated fit.
poisson_spec_dup = smf.glm(
    formula="n_orders_groceries_next28d~treatment * user_is_GNC + user_is_prime + treatment:user_is_prime+np.sqrt(aov_groceries_past30d)+np.sqrt(months_old) + country",
    data=df_with_duplicates,
    family=sm.families.Poisson(),
)
mod = poisson_spec_dup.fit()
print(mod.summary())

  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


                     Generalized Linear Model Regression Results                      
Dep. Variable:     n_orders_groceries_next28d   No. Observations:              1812456
Model:                                    GLM   Df Residuals:                  1812428
Model Family:                         Poisson   Df Model:                           27
Link Function:                            Log   Scale:                          1.0000
Method:                                  IRLS   Log-Likelihood:            -3.8409e+06
Date:                        Tue, 05 Dec 2023   Deviance:                   5.7363e+06
Time:                                17:57:10   Pearson chi2:                 1.94e+07
No. Iterations:                             7   Pseudo R-squ. (CS):             0.7803
Covariance Type:                    nonrobust                                         
                                              coef    std err          z      P>|z|      [0.025      0.975]
----------------------

In [55]:
# Reduced Poisson GLM restricted to rows where user_is_GNC is True
# (de-duplicated data): only treatment and country enter the model.
# NOTE(review): Pearson chi2 = 1.65e+06 on 575,075 residual df (ratio ~2.9) in
# the stored summary, so the overdispersion caveat applies here as well.
gnc_orders = df[df['user_is_GNC'] == True]
gnc_spec = smf.glm(
    formula="n_orders_groceries_next28d~treatment + country",
    data=gnc_orders,
    family=sm.families.Poisson(),
)
mod = gnc_spec.fit()
print(mod.summary())

                     Generalized Linear Model Regression Results                      
Dep. Variable:     n_orders_groceries_next28d   No. Observations:               575097
Model:                                    GLM   Df Residuals:                   575075
Model Family:                         Poisson   Df Model:                           21
Link Function:                            Log   Scale:                          1.0000
Method:                                  IRLS   Log-Likelihood:            -3.8245e+05
Date:                        Tue, 05 Dec 2023   Deviance:                   5.8886e+05
Time:                                19:00:57   Pearson chi2:                 1.65e+06
No. Iterations:                             6   Pseudo R-squ. (CS):           0.005715
Covariance Type:                    nonrobust                                         
                        coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------------------

### Segmentation

In [30]:
# Preview the first rows of df before building the per-country segment tables.
df.head()

Unnamed: 0,p_creation_date,order_id,store_accessed_channel,treatment,user_is_GNC,user_is_prime,months_old,country,n_orders_groceries_next28d,aov_groceries_past30d
0,2023-09-30,706580245,NotDoubleGlovo,False,False,False,4,PT,2,24.025
1,2023-09-04,690338612,NotDoubleGlovo,False,False,True,45,PT,4,18.54
2,2023-08-24,682853212,NotDoubleGlovo,False,True,False,15,KE,0,0.0
3,2023-09-04,690450489,NotDoubleGlovo,False,True,False,58,GE,0,0.0
4,2023-09-30,706755438,NotDoubleGlovo,False,False,False,51,UA,0,0.0


In [46]:
# NC per Country: rows with user_is_GNC == True, compared by treatment
# (DoubleGlovo vs. not) within each country.
cond = df['user_is_GNC'] == True

# Named aggregation keeps the group key under its real name. The False/True
# pivot columns are the TREATMENT flag -- the GNC filter is already applied by
# cond -- so the key must not be relabelled 'user_is_GNC'.
dfg = (
    df[cond]
    .groupby(['country', 'treatment'])
    .agg(
        mean_n_orders=('n_orders_groceries_next28d', 'mean'),
        count_order_id=('order_id', 'count'),
    )
    .reset_index()
)
pivot_table = dfg.pivot(index='country', columns='treatment', values=['mean_n_orders', 'count_order_id'])

# Relative uplift of treated over untreated mean orders. Read it together with
# count_order_id: the treated (True) group has only a few dozen orders in
# several countries.
mean_treated = pivot_table[('mean_n_orders', True)]
mean_untreated = pivot_table[('mean_n_orders', False)]
pivot_table['relative_difference'] = (mean_treated - mean_untreated) / mean_untreated
pivot_table

Unnamed: 0_level_0,mean_n_orders,mean_n_orders,count_order_id,count_order_id,relative_difference
user_is_GNC,False,True,False,True,Unnamed: 5_level_1
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AM,0.251106,0.55,2035.0,20.0,1.190313
BA,0.146226,0.137931,2120.0,29.0,-0.05673
CI,0.328434,0.829268,6872.0,41.0,1.524914
ES,0.202758,0.540835,130752.0,551.0,1.667392
GE,0.216538,0.722772,21188.0,101.0,2.337859
GH,0.2481,0.230769,3289.0,65.0,-0.069853
HR,0.155037,0.286486,11236.0,185.0,0.847854
IT,0.216663,0.5625,61404.0,192.0,1.596193
KE,0.301628,0.652174,12164.0,46.0,1.162181
KG,0.18562,0.236559,7149.0,186.0,0.274424


In [50]:
# Distribution across countries of the most recently built pivot_table.
# NOTE(review): pivot_table is overwritten by every segment cell; the stored
# output here matches the "NC per Country" table (same min/max mean_n_orders).
pivot_table.describe()

Unnamed: 0_level_0,mean_n_orders,mean_n_orders,count_order_id,count_order_id,relative_difference
user_is_GNC,False,True,False,True,Unnamed: 5_level_1
count,21.0,20.0,21.0,20.0,20.0
mean,0.230819,0.477056,27212.238095,182.0,1.032399
std,0.051468,0.194245,33342.418655,230.612685,0.612011
min,0.146226,0.137931,868.0,11.0,-0.069853
25%,0.202758,0.283047,3289.0,38.75,0.657158
50%,0.217826,0.524236,11494.0,139.5,1.152049
75%,0.263347,0.608356,42534.0,187.5,1.453081
max,0.328434,0.829268,130752.0,988.0,2.337859


In [44]:
# Inspect the two-level column index: (metric, False/True) pairs plus the
# single ('relative_difference', '') column added after the pivot.
pivot_table.columns

MultiIndex([(      'mean_n_orders', False),
            (      'mean_n_orders',  True),
            (     'count_order_id', False),
            (     'count_order_id',  True),
            ('relative_difference',    '')],
           names=[None, 'user_is_GNC'])

In [35]:
# RC per Country: rows with user_is_GNC == False, compared by treatment
# (DoubleGlovo vs. not) within each country.
cond = df['user_is_GNC'] == False

# The group key is the treatment flag; keep that name so the pivot header is
# not mislabelled 'user_is_GNC' (that column is only used for the filter).
dfg = (
    df[cond]
    .groupby(['country', 'treatment'])
    .agg(
        mean_n_orders=('n_orders_groceries_next28d', 'mean'),
        count_order_id=('order_id', 'count'),
    )
    .reset_index()
)
pivot_table = dfg.pivot(index='country', columns='treatment', values=['mean_n_orders', 'count_order_id'])

# Relative uplift of treated over untreated mean orders, per country.
mean_treated = pivot_table[('mean_n_orders', True)]
mean_untreated = pivot_table[('mean_n_orders', False)]
pivot_table['relative_difference'] = (mean_treated - mean_untreated) / mean_untreated
pivot_table

Unnamed: 0_level_0,mean_n_orders,mean_n_orders,count_order_id,count_order_id,relative_difference
user_is_GNC,False,True,False,True,Unnamed: 5_level_1
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AM,1.118318,2.4,1403.0,15.0,1.14608
BA,0.773151,1.166667,1609.0,12.0,0.508976
CI,1.607305,3.383333,15223.0,120.0,1.104973
ES,1.266479,3.516611,261364.0,1806.0,1.776684
GE,1.483864,3.625899,39538.0,139.0,1.443553
GH,1.005378,2.0,3719.0,37.0,0.989302
HR,0.838572,1.745946,14477.0,185.0,1.082048
IT,1.024554,2.948187,89516.0,386.0,1.877531
KE,1.691369,4.940171,25166.0,117.0,1.920812
KG,1.252462,2.785714,5787.0,56.0,1.22419


In [36]:
# Prime per Country: rows with user_is_prime == True, compared by treatment
# (DoubleGlovo vs. not) within each country.
cond = df['user_is_prime'] == True

# The group key is the treatment flag; keep that name so the pivot header is
# not mislabelled 'user_is_GNC' (GNC status plays no role in this segment).
dfg = (
    df[cond]
    .groupby(['country', 'treatment'])
    .agg(
        mean_n_orders=('n_orders_groceries_next28d', 'mean'),
        count_order_id=('order_id', 'count'),
    )
    .reset_index()
)
pivot_table = dfg.pivot(index='country', columns='treatment', values=['mean_n_orders', 'count_order_id'])

# Relative uplift of treated over untreated mean orders. Countries with no
# treated Prime orders yield NaN, and cells with one or two treated orders
# yield extreme values -- check count_order_id before interpreting.
mean_treated = pivot_table[('mean_n_orders', True)]
mean_untreated = pivot_table[('mean_n_orders', False)]
pivot_table['relative_difference'] = (mean_treated - mean_untreated) / mean_untreated
pivot_table

Unnamed: 0_level_0,mean_n_orders,mean_n_orders,count_order_id,count_order_id,relative_difference
user_is_GNC,False,True,False,True,Unnamed: 5_level_1
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AM,1.218143,2.583333,463.0,12.0,1.120715
BA,0.333333,0.0,6.0,1.0,-1.0
CI,2.794237,4.388889,1978.0,54.0,0.570693
ES,1.829496,3.494186,74186.0,1376.0,0.909917
GE,2.404633,3.243697,12043.0,119.0,0.348936
GH,0.7,,10.0,,
HR,0.315789,7.0,19.0,2.0,21.166667
IT,1.391506,2.694915,16858.0,236.0,0.93669
KE,3.296838,5.654545,2119.0,55.0,0.715142
KG,1.974593,2.911111,1535.0,45.0,0.474284


In [37]:
# No Prime per Country: rows with user_is_prime == False, compared by
# treatment (DoubleGlovo vs. not) within each country.
cond = df['user_is_prime'] == False

# The group key is the treatment flag; keep that name so the pivot header is
# not mislabelled 'user_is_GNC' (GNC status plays no role in this segment).
dfg = (
    df[cond]
    .groupby(['country', 'treatment'])
    .agg(
        mean_n_orders=('n_orders_groceries_next28d', 'mean'),
        count_order_id=('order_id', 'count'),
    )
    .reset_index()
)
pivot_table = dfg.pivot(index='country', columns='treatment', values=['mean_n_orders', 'count_order_id'])

# Relative uplift of treated over untreated mean orders, per country.
mean_treated = pivot_table[('mean_n_orders', True)]
mean_untreated = pivot_table[('mean_n_orders', False)]
pivot_table['relative_difference'] = (mean_treated - mean_untreated) / mean_untreated
pivot_table

Unnamed: 0_level_0,mean_n_orders,mean_n_orders,count_order_id,count_order_id,relative_difference
user_is_GNC,False,True,False,True,Unnamed: 5_level_1
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AM,0.50958,0.695652,2975.0,23.0,0.365149
BA,0.416868,0.45,3723.0,40.0,0.079478
CI,1.053736,1.897196,20117.0,107.0,0.800448
ES,0.697638,1.876656,317930.0,981.0,1.690015
GE,0.704517,1.578512,48683.0,121.0,1.24056
GH,0.6499,0.872549,6998.0,102.0,0.34259
HR,0.540048,0.983696,25694.0,368.0,0.821496
IT,0.608375,1.783626,134062.0,342.0,1.931786
KE,1.114652,2.75,35211.0,108.0,1.467138
KG,0.486273,0.350254,11401.0,197.0,-0.279718


In [39]:
# Prime and NC per Country: rows with user_is_GNC == True AND
# user_is_prime == True, compared by treatment within each country.
cond = df['user_is_GNC'] == True
cond2 = df['user_is_prime'] == True

# The group key is the treatment flag; keep that name so the pivot header is
# not mislabelled 'user_is_GNC' (every row in this segment is GNC already).
dfg = (
    df[cond & cond2]
    .groupby(['country', 'treatment'])
    .agg(
        mean_n_orders=('n_orders_groceries_next28d', 'mean'),
        count_order_id=('order_id', 'count'),
    )
    .reset_index()
)
pivot_table = dfg.pivot(index='country', columns='treatment', values=['mean_n_orders', 'count_order_id'])

# Relative uplift of treated over untreated mean orders. This segment is thin:
# several countries have no treated orders (NaN) or only a handful, so check
# count_order_id before interpreting.
mean_treated = pivot_table[('mean_n_orders', True)]
mean_untreated = pivot_table[('mean_n_orders', False)]
pivot_table['relative_difference'] = (mean_treated - mean_untreated) / mean_untreated
pivot_table

Unnamed: 0_level_0,mean_n_orders,mean_n_orders,count_order_id,count_order_id,relative_difference
user_is_GNC,False,True,False,True,Unnamed: 5_level_1
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AM,0.523529,0.75,170.0,4.0,0.432584
BA,0.0,0.0,1.0,1.0,
CI,0.770186,1.166667,161.0,6.0,0.514785
ES,0.335638,0.696335,9546.0,191.0,1.074661
GE,0.487027,0.827586,1349.0,29.0,0.69926
GH,0.0,,3.0,,
HR,0.0,,2.0,,
IT,0.339089,0.977778,3533.0,45.0,1.883547
KE,0.938547,0.333333,179.0,6.0,-0.644841
KG,0.491857,0.818182,307.0,11.0,0.663456


In [43]:
# No Prime and NC per Country: rows with user_is_GNC == True AND
# user_is_prime == False, compared by treatment within each country.
cond = df['user_is_GNC'] == True
cond2 = df['user_is_prime'] == False

# The group key is the treatment flag; keep that name so the pivot header is
# not mislabelled 'user_is_GNC' (every row in this segment is GNC already).
dfg = (
    df[cond & cond2]
    .groupby(['country', 'treatment'])
    .agg(
        mean_n_orders=('n_orders_groceries_next28d', 'mean'),
        count_order_id=('order_id', 'count'),
    )
    .reset_index()
)
pivot_table = dfg.pivot(index='country', columns='treatment', values=['mean_n_orders', 'count_order_id'])

# Relative uplift of treated over untreated mean orders, per country.
mean_treated = pivot_table[('mean_n_orders', True)]
mean_untreated = pivot_table[('mean_n_orders', False)]
pivot_table['relative_difference'] = (mean_treated - mean_untreated) / mean_untreated
pivot_table

Unnamed: 0_level_0,mean_n_orders,mean_n_orders,count_order_id,count_order_id,relative_difference
user_is_GNC,False,True,False,True,Unnamed: 5_level_1
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AM,0.226273,0.5,1865.0,16.0,1.209716
BA,0.146295,0.142857,2119.0,28.0,-0.023502
CI,0.317836,0.771429,6711.0,35.0,1.427125
ES,0.192292,0.458333,121206.0,360.0,1.383522
GE,0.198145,0.680556,19839.0,72.0,2.434633
GH,0.248326,0.230769,3286.0,65.0,-0.070701
HR,0.155065,0.286486,11234.0,185.0,0.847525
IT,0.209189,0.435374,57871.0,147.0,1.081244
KE,0.292115,0.7,11985.0,40.0,1.396315
KG,0.17188,0.2,6842.0,175.0,0.163605


In [41]:
# Prime and RC per Country: rows with user_is_GNC == False AND
# user_is_prime == True, compared by treatment within each country.
# (The filter selects user_is_GNC == False, i.e. the RC segment, not NC.)
cond = df['user_is_GNC'] == False
cond2 = df['user_is_prime'] == True

# The group key is the treatment flag; keep that name so the pivot header is
# not mislabelled 'user_is_GNC' (no row in this segment is GNC).
dfg = (
    df[cond & cond2]
    .groupby(['country', 'treatment'])
    .agg(
        mean_n_orders=('n_orders_groceries_next28d', 'mean'),
        count_order_id=('order_id', 'count'),
    )
    .reset_index()
)
pivot_table = dfg.pivot(index='country', columns='treatment', values=['mean_n_orders', 'count_order_id'])

# Relative uplift of treated over untreated mean orders. Countries with no
# treated orders yield NaN, and cells with one or two treated orders yield
# extreme values -- check count_order_id before interpreting.
mean_treated = pivot_table[('mean_n_orders', True)]
mean_untreated = pivot_table[('mean_n_orders', False)]
pivot_table['relative_difference'] = (mean_treated - mean_untreated) / mean_untreated
pivot_table

Unnamed: 0_level_0,mean_n_orders,mean_n_orders,count_order_id,count_order_id,relative_difference
user_is_GNC,False,True,False,True,Unnamed: 5_level_1
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AM,1.62116,3.5,293.0,8.0,1.158947
BA,0.4,,5.0,,
CI,2.973583,4.791667,1817.0,48.0,0.611412
ES,2.050108,3.945148,64640.0,1185.0,0.924361
GE,2.646531,4.022222,10694.0,90.0,0.519809
GH,1.0,,7.0,,
HR,0.352941,7.0,17.0,2.0,18.833333
IT,1.670544,3.099476,13325.0,191.0,0.855369
KE,3.514433,6.306122,1940.0,49.0,0.79435
KG,2.345277,3.588235,1228.0,34.0,0.529984


In [42]:
# No Prime and RC per Country: rows with user_is_GNC == False AND
# user_is_prime == False, compared by treatment within each country.
# (The filter selects user_is_GNC == False, i.e. the RC segment, not NC.)
cond = df['user_is_GNC'] == False
cond2 = df['user_is_prime'] == False

# The group key is the treatment flag; keep that name so the pivot header is
# not mislabelled 'user_is_GNC' (no row in this segment is GNC).
dfg = (
    df[cond & cond2]
    .groupby(['country', 'treatment'])
    .agg(
        mean_n_orders=('n_orders_groceries_next28d', 'mean'),
        count_order_id=('order_id', 'count'),
    )
    .reset_index()
)
pivot_table = dfg.pivot(index='country', columns='treatment', values=['mean_n_orders', 'count_order_id'])

# Relative uplift of treated over untreated mean orders, per country.
mean_treated = pivot_table[('mean_n_orders', True)]
mean_untreated = pivot_table[('mean_n_orders', False)]
pivot_table['relative_difference'] = (mean_treated - mean_untreated) / mean_untreated
pivot_table

Unnamed: 0_level_0,mean_n_orders,mean_n_orders,count_order_id,count_order_id,relative_difference
user_is_GNC,False,True,False,True,Unnamed: 5_level_1
country,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
AM,0.985586,1.142857,1110.0,7.0,0.159572
BA,0.774314,1.166667,1604.0,12.0,0.50671
CI,1.422124,2.444444,13406.0,72.0,0.718868
ES,1.008992,2.698873,196724.0,621.0,1.67482
GE,1.052801,2.897959,28844.0,49.0,1.752617
GH,1.005388,2.0,3712.0,37.0,0.989282
HR,0.839142,1.688525,14460.0,183.0,1.012203
IT,0.911577,2.8,76191.0,195.0,2.071598
KE,1.539094,3.955882,23226.0,68.0,1.570267
KG,0.958105,1.545455,4559.0,22.0,0.613033
