# Instore Prices

This is a followup analysis to simplify v1. It is a complement, not a correction

## Config

In [1]:
import matplotlib.pyplot as  plt
# import numpy as np
import pandas as pd
import seaborn as sns
import sys
from scipy.stats import ttest_ind

sys.path.append('c:\\Users\\Jordi Cremades\\Documents\\Repos\\central-node')

# from utils import dataset_meta_stats
# from utils import dataset_stats
from utils import query_engines

# dms = dataset_meta_stats.DatasetMetaStats() 
# ds = dataset_stats.DatasetStats()

## [X] Identify top Groceries cities

In [2]:
START_DATE = "'2023-12-01'"
END_DATE = "'2024-06-01'"
TOP = 100

params = [
    # dates
    {'name':'start_date', 'value': str(START_DATE)},
    {'name':'end_date', 'value': str(END_DATE)},
    {'name':'top', 'value': str(TOP)}
    
]

q = query_engines.QueryEngines(
    query='top_cities.sql',
    params=params,
    printq=None,
    output_file='top_cities',
    load_from_output_file='top_cities'
    
)

df = q.query_run_starbust()
df

Unnamed: 0,order_city_code,order_country_code,n_orders
0,MAD,ES,866839
1,BUC,RO,834007
2,TBI,GE,709015
3,CAS,MA,687670
4,WAW,PL,606077
...,...,...,...
95,MBS,KE,27803
96,CST,ES,27369
97,FES,MA,27215
98,QYY,PL,26891


## [X] Compute raw metrics

We need to cut the raw_metric_v2 query, into two separate queries and later we will join. The first part are the metrics related to orders.

In [85]:
START_DATE = "'2024-05-01'"
END_DATE = "'2024-06-01'"

START_DATE_GROUPS = "'2023-12-01'"
END_DATE_GROUPS = "'2024-06-01'"
TOP = 100

params = [
    # dates
    {'name':'start_date', 'value': str(START_DATE)},
    {'name':'end_date', 'value': str(END_DATE)},
    {'name':'start_date_groups', 'value': str(START_DATE_GROUPS)},
    {'name':'end_date_groups', 'value': str(END_DATE_GROUPS)},
    {'name':'top', 'value': str(TOP)}  
]

q = query_engines.QueryEngines(
    query='raw_metrics_v2_part1.sql',
    params=params,
    printq=None,
    output_file='raw_metrics_v2_part1',
    load_from_output_file='raw_metrics_v2_part1'
    
)

df1 = q.query_run_starbust()
df1

Unnamed: 0,country,city,p_creation_date,ipg_variant,groceries_orders_count,groceries_ret_orders_count,stores_ret_orders_count,groceries_gmv_sum
0,BG,SOF,2024-05-02,Control Group,474,365,261,10942.453529
1,RO,IAS,2024-05-18,Control Group,525,411,274,13751.519324
2,NG,LOS,2024-05-19,Control Group,562,430,327,2156.549931
3,ES,QSA,2024-05-19,Control Group,108,75,46,2186.770000
4,MA,RBT,2024-05-22,,401,303,232,5003.265220
...,...,...,...,...,...,...,...,...
9592,PT,AMD,2024-05-21,InStorePrices,54,38,32,1155.590000
9593,ES,PAL,2024-05-20,,60,42,34,1349.980000
9594,RO,TIM,2024-05-18,,128,76,45,3551.102737
9595,PT,AMD,2024-05-18,InStorePrices,70,45,36,1451.180000


The second part is related to sessions metrics

In [86]:
START_DATE = "'2024-05-01'"
END_DATE = "'2024-06-01'"

START_DATE_GROUPS = "'2023-12-01'"
END_DATE_GROUPS = "'2024-06-01'"
TOP = 100

params = [
    # dates
    {'name':'start_date', 'value': str(START_DATE)},
    {'name':'end_date', 'value': str(END_DATE)},
    {'name':'start_date_groups', 'value': str(START_DATE_GROUPS)},
    {'name':'end_date_groups', 'value': str(END_DATE_GROUPS)},
    {'name':'top', 'value': str(TOP)}  
]

q = query_engines.QueryEngines(
    query='raw_metrics_v2_part2.sql',
    params=params,
    printq=None,
    output_file='raw_metrics_v2_part2',
    load_from_output_file='raw_metrics_v2_part2'
    
)

df2 = q.query_run_starbust()
df2

Unnamed: 0,country,city,p_creation_date,ipg_variant,total_session_count,total_customers,groceries_sw_session_count
0,GE,TBI,2024-05-31,Control Group,24647,17637,3247.0
1,RO,BUC,2024-05-09,Control Group,27390,19990,5697.0
2,GE,BAT,2024-05-13,Control Group,3347,2382,576.0
3,KG,BSK,2024-05-19,InStorePrices,1578,1202,300.0
4,MA,RBT,2024-05-13,,12126,9869,
...,...,...,...,...,...,...,...
9675,ES,TBC,2024-06-01,InStorePrices,325,254,83.0
9676,UA,KRR,2024-05-26,Control Group,1590,1249,475.0
9677,ES,VIG,2024-05-28,Control Group,1083,857,202.0
9678,RO,GLT,2024-05-04,Control Group,1259,938,347.0


In [87]:
df_merged = pd.merge(df1,df2, on=['country','city','p_creation_date','ipg_variant'], how='left')

#fillna
df_merged['ipg_variant'].fillna('not_found', inplace=True)

#metrics
df_merged['perc_cvr'] = df_merged['groceries_orders_count']/df_merged['groceries_sw_session_count']
df_merged['perc_ret'] = df_merged['groceries_ret_orders_count']/df_merged['groceries_orders_count']
df_merged['aov'] = df_merged['groceries_gmv_sum']/df_merged['groceries_orders_count']
df_merged['groceries_session_per_customer']  = df_merged['groceries_sw_session_count']/df_merged['total_customers']

df_final = df_merged[[
    'country',
    'city',
    'p_creation_date',
    'ipg_variant',
    'groceries_orders_count',
    'groceries_sw_session_count',
    'groceries_gmv_sum',
    'groceries_ret_orders_count',
    'groceries_session_per_customer',
    'perc_cvr',
    'perc_ret',
    'aov']]

df_final.to_csv('outputs/raw_results_v2.csv')

## [X] Compute P-Values and aggregates

In [88]:
df_merged = pd.merge(df1,df2, on=['country','city','p_creation_date','ipg_variant'], how='left')


#fillna
df_merged['ipg_variant'].fillna('not_found', inplace=True)

#metrics
df_merged['perc_cvr'] = df_merged['groceries_orders_count']/df_merged['groceries_sw_session_count']
df_merged['perc_ret'] = df_merged['groceries_ret_orders_count']/df_merged['groceries_orders_count']
df_merged['aov'] = df_merged['groceries_gmv_sum']/df_merged['groceries_orders_count']
df_merged['groceries_session_per_customer']  = df_merged['groceries_sw_session_count']/df_merged['total_customers']

df_final = df_merged[[
    'country',
    'city',
    'p_creation_date',
    'ipg_variant',
    'groceries_orders_count',
    'groceries_sw_session_count',
    'groceries_gmv_sum',
    'groceries_ret_orders_count',
    'groceries_session_per_customer',
    'perc_cvr',
    'perc_ret',
    'aov']]

df_final.to_csv('outputs/raw_results_v2.csv')

In [89]:
tmp_df = df_final.copy()

In [90]:
aggregated = tmp_df.groupby(['country', 'city', 'ipg_variant']).agg({
    'groceries_orders_count': 'mean',
    'groceries_sw_session_count': 'mean',
    'groceries_gmv_sum': 'mean',
    'groceries_ret_orders_count': 'mean',
    'groceries_session_per_customer':'mean',
    'perc_cvr': 'mean',
    'perc_ret': 'mean',
    'aov': 'mean'
}).reset_index()

control_data = aggregated[aggregated['ipg_variant'] == 'Control Group']
variant_data = aggregated[aggregated['ipg_variant'] == 'InStorePrices']

merged_data = pd.merge(
    control_data, variant_data, 
    on=['country', 'city'], 
    suffixes=('_control', '_variant')
)

merged_data

Unnamed: 0,country,city,ipg_variant_control,groceries_orders_count_control,groceries_sw_session_count_control,groceries_gmv_sum_control,groceries_ret_orders_count_control,groceries_session_per_customer_control,perc_cvr_control,perc_ret_control,aov_control,ipg_variant_variant,groceries_orders_count_variant,groceries_sw_session_count_variant,groceries_gmv_sum_variant,groceries_ret_orders_count_variant,groceries_session_per_customer_variant,perc_cvr_variant,perc_ret_variant,aov_variant
0,AM,YRV,Control Group,194.62500,328.75000,3921.981943,147.84375,0.158762,0.599137,0.762081,20.174054,InStorePrices,46.12500,82.87500,911.354363,35.50000,0.162939,0.566607,0.772948,19.802855
1,BG,SOF,Control Group,530.28125,1025.68750,13322.214539,378.46875,0.237341,0.520863,0.713902,25.111651,InStorePrices,134.40625,268.37500,3553.648452,95.62500,0.249208,0.504652,0.714546,26.409881
2,CI,ABJ,Control Group,388.96875,732.00000,5308.719650,310.21875,0.302941,0.532206,0.797564,13.629600,InStorePrices,99.37500,182.18750,1430.740582,80.90625,0.308727,0.549140,0.815970,14.370212
3,CI,ABN,Control Group,899.18750,1835.50000,11390.584121,715.15625,0.365762,0.499105,0.795625,12.630626,InStorePrices,237.56250,477.50000,3019.288554,189.03125,0.376165,0.508936,0.796447,12.777822
4,ES,ALC,Control Group,194.06250,383.25000,4888.063750,144.56250,0.182577,0.523750,0.734863,24.414127,InStorePrices,48.56250,95.12500,1196.154688,35.90625,0.184133,0.530105,0.731724,23.908795
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,UA,ODE,Control Group,104.21875,154.15625,1184.385330,80.25000,0.272181,0.677703,0.772813,11.381121,InStorePrices,32.12500,41.56250,349.889019,25.81250,0.276956,0.779434,0.809803,11.058374
96,UA,ODS,Control Group,557.28125,802.93750,7080.688902,432.62500,0.254451,0.696045,0.778628,12.711906,InStorePrices,135.12500,200.15625,1633.325790,101.53125,0.250693,0.678450,0.753752,12.066986
97,UA,VNT,Control Group,239.25000,428.03125,2613.563100,179.40625,0.273599,0.561945,0.755308,10.933019,InStorePrices,60.59375,108.31250,667.861171,45.00000,0.275895,0.563873,0.748619,11.029991
98,UA,ZPR,Control Group,184.65625,349.34375,2093.317103,138.18750,0.364190,0.533811,0.751178,11.312650,InStorePrices,40.12500,87.03125,453.525353,28.75000,0.384171,0.463704,0.715973,11.333084


In [91]:
results = []

metrics = [
    'groceries_session_per_customer',
    'perc_cvr', 
    'perc_ret', 
    'aov'
]


for index, row in merged_data.iterrows():
    country = row['country']
    city = row['city']
    result_row = {'country': country, 'city': city}
    
    for metric in metrics:
        control_value = row[f'{metric}_control']
        variant_value = row[f'{metric}_variant']
        
        # Calculate increment
        increment = (variant_value - control_value) / control_value * 100 if control_value != 0 else None
        
        # Perform t-test
        control_sample = tmp_df[(tmp_df['country'] == country) & (tmp_df['city'] == city) & (tmp_df['ipg_variant'] == 'Control Group')][metric]
        variant_sample = tmp_df[(tmp_df['country'] == country) & (tmp_df['city'] == city) & (tmp_df['ipg_variant'] == 'InStorePrices')][metric]
        
        t_stat, p_value = ttest_ind(control_sample, variant_sample, equal_var=False)
        
        # Append to results
        result_row.update({
            f'{metric}_control': control_value,
            f'{metric}_variant': variant_value,
            f'increment_{metric}': increment,
            f'p_value_{metric}': p_value
        })
    
    results.append(result_row)

pd.DataFrame(results).to_csv('outputs/t_test_v2.csv')

## [X] Compute P-Values and aggregates at a country level

In [94]:
df_merged = pd.merge(df1,df2, on=['country','city','p_creation_date','ipg_variant'], how='left')

#fillna
df_merged['ipg_variant'].fillna('not_found', inplace=True)

# Group by by countries
df_merged = df_merged.groupby(['country','ipg_variant','p_creation_date']).sum().reset_index()

#metrics
df_merged['perc_cvr'] = df_merged['groceries_orders_count']/df_merged['groceries_sw_session_count']
df_merged['perc_ret'] = df_merged['groceries_ret_orders_count']/df_merged['groceries_orders_count']
df_merged['aov'] = df_merged['groceries_gmv_sum']/df_merged['groceries_orders_count']
df_merged['groceries_session_per_customer']  = df_merged['groceries_sw_session_count']/df_merged['total_customers']

df_final = df_merged[[
    'country',
    'city',
    'p_creation_date',
    'ipg_variant',
    'groceries_orders_count',
    'groceries_sw_session_count',
    'groceries_gmv_sum',
    'groceries_ret_orders_count',
    'groceries_session_per_customer',
    'perc_cvr',
    'perc_ret',
    'aov']]

df_final.drop(columns='city', inplace= True)

df_final.to_csv('outputs/raw_results_v3.csv')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_final.drop(columns='city', inplace= True)


In [96]:
tmp_df = df_final.copy()

In [97]:
aggregated = tmp_df.groupby(['country', 'ipg_variant']).agg({
    'groceries_orders_count': 'mean',
    'groceries_sw_session_count': 'mean',
    'groceries_gmv_sum': 'mean',
    'groceries_ret_orders_count': 'mean',
    'groceries_session_per_customer':'mean',
    'perc_cvr': 'mean',
    'perc_ret': 'mean',
    'aov': 'mean'
}).reset_index()

control_data = aggregated[aggregated['ipg_variant'] == 'Control Group']
variant_data = aggregated[aggregated['ipg_variant'] == 'InStorePrices']

merged_data = pd.merge(
    control_data, variant_data, 
    on=['country'], 
    suffixes=('_control', '_variant')
)

merged_data

Unnamed: 0,country,ipg_variant_control,groceries_orders_count_control,groceries_sw_session_count_control,groceries_gmv_sum_control,groceries_ret_orders_count_control,groceries_session_per_customer_control,perc_cvr_control,perc_ret_control,aov_control,ipg_variant_variant,groceries_orders_count_variant,groceries_sw_session_count_variant,groceries_gmv_sum_variant,groceries_ret_orders_count_variant,groceries_session_per_customer_variant,perc_cvr_variant,perc_ret_variant,aov_variant
0,AM,Control Group,194.625,328.75,3921.981943,147.84375,0.158762,0.599137,0.762081,20.174054,InStorePrices,46.125,82.875,911.354363,35.5,0.162939,0.566607,0.772948,19.802855
1,BG,Control Group,530.28125,1025.6875,13322.214539,378.46875,0.237341,0.520863,0.713902,25.111651,InStorePrices,134.40625,268.375,3553.648452,95.625,0.249208,0.504652,0.714546,26.409881
2,CI,Control Group,1288.15625,2567.5,16699.303771,1025.375,0.345356,0.507158,0.797563,12.94526,InStorePrices,336.9375,659.6875,4450.029136,269.9375,0.354757,0.517804,0.803405,13.245574
3,ES,Control Group,11832.71875,17883.25,304277.142812,9073.03125,0.206049,0.669436,0.769303,25.716007,InStorePrices,2994.625,4515.09375,77241.954063,2305.71875,0.207068,0.6713,0.772497,25.791671
4,GE,Control Group,2852.3125,4324.8125,52151.079051,2233.78125,0.224811,0.672457,0.784015,18.218066,InStorePrices,717.0,1069.34375,13028.434122,562.9375,0.219301,0.686518,0.7859,18.111683
5,HR,Control Group,337.40625,907.34375,5437.329375,194.4375,0.148528,0.393071,0.58394,16.168142,InStorePrices,81.0625,220.96875,1286.705312,46.34375,0.146532,0.388317,0.578896,15.883437
6,IT,Control Group,4376.625,8340.5625,111188.1975,2974.53125,0.210693,0.527771,0.680819,25.405472,InStorePrices,1061.0,2068.40625,27011.725625,719.21875,0.209246,0.516233,0.679016,25.472822
7,KE,Control Group,2179.03125,3666.78125,27953.47867,1769.625,0.460765,0.593966,0.813196,12.795717,InStorePrices,563.6875,917.5625,7106.758892,462.34375,0.462794,0.61554,0.821949,12.588011
8,KG,Control Group,477.71875,939.3125,4715.434207,325.875,0.211181,0.51517,0.684636,9.875703,InStorePrices,110.0625,226.90625,1086.824026,72.65625,0.206206,0.491933,0.66351,9.8908
9,KZ,Control Group,1068.4375,2185.40625,12277.175289,781.28125,0.194782,0.491385,0.73304,11.491944,InStorePrices,248.25,520.78125,2735.961462,179.15625,0.186976,0.47955,0.723111,11.010354


In [98]:
results = []

metrics = [
    'groceries_session_per_customer',
    'perc_cvr', 
    'perc_ret', 
    'aov'
]


for index, row in merged_data.iterrows():
    country = row['country']
    result_row = {'country': country}
    
    for metric in metrics:
        control_value = row[f'{metric}_control']
        variant_value = row[f'{metric}_variant']
        
        # Calculate increment
        increment = (variant_value - control_value) / control_value * 100 if control_value != 0 else None
        
        # Perform t-test
        control_sample = tmp_df[(tmp_df['country'] == country) & (tmp_df['ipg_variant'] == 'Control Group')][metric]
        variant_sample = tmp_df[(tmp_df['country'] == country) & (tmp_df['ipg_variant'] == 'InStorePrices')][metric]
        
        t_stat, p_value = ttest_ind(control_sample, variant_sample, equal_var=False)
        
        # Append to results
        result_row.update({
            f'{metric}_control': control_value,
            f'{metric}_variant': variant_value,
            f'increment_{metric}': increment,
            f'p_value_{metric}': p_value
        })
    
    results.append(result_row)

pd.DataFrame(results).to_csv('outputs/t_test_v3.csv')

In [70]:
merged_data

Unnamed: 0,country,ipg_variant_control,groceries_orders_count_control,groceries_sw_session_count_control,groceries_gmv_sum_control,groceries_ret_orders_count_control,groceries_session_per_customer_control,perc_cvr_control,perc_ret_control,aov_control,ipg_variant_variant,groceries_orders_count_variant,groceries_sw_session_count_variant,groceries_gmv_sum_variant,groceries_ret_orders_count_variant,groceries_session_per_customer_variant,perc_cvr_variant,perc_ret_variant,aov_variant
0,AM,Control Group,194.625,328.75,3921.981943,147.84375,0.158762,0.599137,0.762081,20.174054,InStorePrices,46.125,82.875,911.354363,35.5,0.162939,0.566607,0.772948,19.802855
1,BG,Control Group,530.28125,1025.6875,13322.214539,378.46875,0.237341,0.520863,0.713902,25.111651,InStorePrices,134.40625,268.375,3553.648452,95.625,0.249208,0.504652,0.714546,26.409881
2,CI,Control Group,1288.15625,2567.5,16699.303771,1025.375,0.668703,1.031311,1.593189,26.260227,InStorePrices,336.9375,659.6875,4450.029136,269.9375,0.684892,1.058076,1.612417,27.148034
3,ES,Control Group,11832.71875,17883.25,304277.142812,9073.03125,5.637178,16.175928,20.80894,700.390235,InStorePrices,2994.625,4515.09375,77241.954063,2305.71875,5.661718,16.29701,20.948626,704.294268
4,GE,Control Group,2852.3125,4324.8125,52151.079051,2233.78125,0.475755,1.284092,1.571456,37.052392,InStorePrices,717.0,1069.34375,13028.434122,562.9375,0.479115,1.317361,1.591745,37.439174
5,HR,Control Group,337.40625,907.34375,5437.329375,194.4375,0.148528,0.393071,0.58394,16.168142,InStorePrices,81.0625,220.96875,1286.705312,46.34375,0.146532,0.388317,0.578896,15.883437
6,IT,Control Group,4376.625,8340.5625,111188.1975,2974.53125,1.416101,3.44692,4.604279,170.991046,InStorePrices,1061.0,2068.40625,27011.725625,719.21875,1.396097,3.375928,4.579893,170.251094
7,KE,Control Group,2179.03125,3666.78125,27953.47867,1769.625,0.880794,1.00643,1.50645,25.504068,InStorePrices,563.6875,917.5625,7106.758892,462.34375,0.906333,1.024958,1.545649,25.58456
8,KG,Control Group,477.71875,939.3125,4715.434207,325.875,0.211181,0.51517,0.684636,9.875703,InStorePrices,110.0625,226.90625,1086.824026,72.65625,0.206206,0.491933,0.66351,9.8908
9,KZ,Control Group,1068.4375,2185.40625,12277.175289,781.28125,0.380083,0.881192,1.404218,22.811356,InStorePrices,248.25,520.78125,2735.961462,179.15625,0.36074,0.859544,1.378679,21.859506


In [82]:
cond1 = tmp_df['country'] == 'ES'
cond2 = tmp_df['ipg_variant'] == 'Control Group'
tmp_df[cond1 & cond2]['groceries_orders_count'].sum()

378647