# Instore Prices

This is a notebook to close Instore Prices Analysis

## Config

In [2]:
# import matplotlib.pyplot as  plt
# import numpy as np
import pandas as pd
# import seaborn as sns
import sys
from scipy.stats import ttest_ind

sys.path.append('c:\\Users\\Jordi Cremades\\Documents\\Repos\\central-node')

from utils import query_engines


## [X] Identify top Groceries cities

In [9]:
q = query_engines.QueryEngines()

START_DATE = "'2023-12-01'"
END_DATE = "'2024-06-01'"
TOP = 100

params = [
    # dates
    {'name':'start_date', 'value': str(START_DATE)},
    {'name':'end_date', 'value': str(END_DATE)},
    {'name':'top', 'value': str(TOP)}
]

q.prepare_query(
    query_file='top_cities.sql', 
    params=params
)

df_top_cities = q.query_run_starburst(
    output_file='top_cities',
    load_from_output_file='top_cities',
    print_query=True
)

df_top_cities.to_csv('outputs/top_cities.csv')

# [X] (i) More range of data

We now will try to run the whole historical dataset that Ankit wants with a parellelized query. With it, we could then fill up past data, and prove the effectiveness of the new notebook

In [10]:
q = query_engines.QueryEngines(reset_query_logs=True, reset_to_load=False)

params_file_name = 'raw_metrics_v3_part1'

# df_historical_p1 = q.multiple_queries(
#     query_file='raw_metrics_v3_part1.sql', 
#     params_file_name=params_file_name, 
#     parallelize=True,
#     store_steps=True,
#     output_file='historical_p1',
#     sleep = 5
# )

df_historical_p1 = q.query_run_starburst( #query_run_livedb otherwise
    load_from_output_file='historical_p1',
)

df_historical_p1

Unnamed: 0,country,city,p_creation_date,ipg_variant,groceries_orders_count,groceries_ret_orders_count,stores_ret_orders_count,groceries_gmv_sum,param_label
0,NG,LOS,2024-04-19,InStorePrices,113,88,71,531.206173,Apr
1,ES,HEN,2024-04-24,Control Group,84,75,60,2030.540000,Apr
2,IT,BOL,2024-04-28,,77,43,38,1754.710000,Apr
3,IT,PMO,2024-04-04,InStorePrices,34,28,22,1169.330000,Apr
4,ES,BIL,2024-04-14,InStorePrices,11,7,6,161.830000,Apr
...,...,...,...,...,...,...,...,...,...
27592,ES,HEN,2024-05-18,InStorePrices,23,18,15,633.090000,May
27593,RO,CRV,2024-05-04,Control Group,135,107,75,3073.615525,May
27594,ES,ZAR,2024-05-26,,141,69,51,2743.010000,May
27595,UA,ZPR,2024-05-28,InStorePrices,44,33,29,496.421422,May


In [11]:
q = query_engines.QueryEngines(reset_query_logs=True, reset_to_load=False)

params_file_name = 'raw_metrics_v3_part2'

# df_historical_p2 = q.multiple_queries(
#     query_file='raw_metrics_v3_part2.sql', 
#     params_file_name=params_file_name, 
#     parallelize=True,
#     store_steps=True,
#     output_file='historical_p2',
#     sleep = 5
# )

df_historical_p2 = q.query_run_starburst( #query_run_livedb otherwise
    load_from_output_file='historical_p2',
)

df_historical_p2

Unnamed: 0,country,city,p_creation_date,ipg_variant,total_session_count,total_customers,groceries_sw_session_count,param_label
0,IT,PMO,2024-04-24,Control Group,2925,2295,433.0,Apr
1,RO,GLT,2024-04-23,Control Group,1483,1149,299.0,Apr
2,MA,RBT,2024-04-15,InStorePrices,2240,1559,451.0,Apr
3,CI,ABN,2024-04-05,,7053,5701,,Apr
4,PL,QLU,2024-04-03,InStorePrices,222,177,57.0,Apr
...,...,...,...,...,...,...,...,...
27822,HR,ZAG,2024-03-18,Control Group,4710,3731,656.0,Mar
27823,IT,ROM,2024-03-24,InStorePrices,2623,1956,494.0,Mar
27824,ES,VAL,2024-03-30,,14412,11638,,Mar
27825,MA,MHD,2024-03-23,Control Group,894,641,386.0,Mar


## City-level aggregation

In [68]:
#merge the dataframes
df_merged = pd.merge(df_historical_p1,df_historical_p2, on=['country','city','p_creation_date','ipg_variant'], how='left')

#fillna
df_merged['ipg_variant'].fillna('not_found', inplace=True)

In [69]:
df_merged['ipg_variant'].value_counts()

ipg_variant
InStorePrices    9199
Control Group    9199
not_found        9199
Name: count, dtype: int64

In [56]:
#metrics
df_merged['perc_cvr'] = df_merged['groceries_orders_count']/df_merged['groceries_sw_session_count']
df_merged['perc_ret'] = df_merged['groceries_ret_orders_count']/df_merged['groceries_orders_count']
df_merged['aov'] = df_merged['groceries_gmv_sum']/df_merged['groceries_orders_count']
df_merged['groceries_session_per_customer']  = df_merged['groceries_sw_session_count']/df_merged['total_customers']

df_final = df_merged[[
    'country',
    'city',
    'p_creation_date',
    'ipg_variant',
    'groceries_orders_count',
    'total_session_count',
    'groceries_sw_session_count',
    'groceries_gmv_sum',
    'groceries_ret_orders_count',
    'groceries_session_per_customer',
    'perc_cvr',
    'perc_ret',
    'aov']]

df_final.to_csv('outputs/raw_results_v3_city.csv')

aggregated = df_final.groupby(['country', 'city', 'ipg_variant']).agg({
    'groceries_orders_count': 'sum',
    'total_session_count': 'sum',
    'groceries_orders_count': 'mean',
    'groceries_sw_session_count': 'mean',
    'groceries_gmv_sum': 'mean',
    'groceries_ret_orders_count': 'mean',
    'groceries_session_per_customer':'mean',
    'perc_cvr': 'mean',
    'perc_ret': 'mean',
    'aov': 'mean'
}).reset_index()

control_data = aggregated[aggregated['ipg_variant'] == 'Control Group']
variant_data = aggregated[aggregated['ipg_variant'] == 'InStorePrices']

merged_data = pd.merge(
    control_data, variant_data, 
    on=['country', 'city'], 
    suffixes=('_control', '_variant')
)

# Calculate total orders and total customers
merged_data['groceries_orders_count'] = merged_data['groceries_orders_count_control'] + merged_data['groceries_orders_count_variant']
merged_data['total_session_count'] = merged_data['total_session_count_control'] + merged_data['total_session_count_variant']
merged_data['perc_sessions_control'] = merged_data['total_session_count_control'] / merged_data['total_session_count']
merged_data['perc_sessions_variant'] = merged_data['total_session_count_variant'] / merged_data['total_session_count']

merged_data.to_csv('outputs/aggregated_results_v3_city.csv')

In [47]:
results = []

metrics = [
    'groceries_session_per_customer',
    'perc_cvr', 
    'perc_ret', 
    'aov'
]

for index, row in merged_data.iterrows():
    country = row['country']
    city = row['city']
    result_row = {'country': country, 'city': city}
    
    for metric in metrics:
        control_value = row[f'{metric}_control']
        variant_value = row[f'{metric}_variant']
        
        # Calculate increment
        increment = (variant_value - control_value) / control_value * 100 if control_value != 0 else None
        
        # Perform t-test
        control_sample = df_final[(df_final['country'] == country) & (df_final['city'] == city) & (df_final['ipg_variant'] == 'Control Group')][metric]
        variant_sample = df_final[(df_final['country'] == country) & (df_final['city'] == city) & (df_final['ipg_variant'] == 'InStorePrices')][metric]
        
        t_stat, p_value = ttest_ind(control_sample, variant_sample, equal_var=False)
        
        # Append to results
        result_row.update({
            f'{metric}_control': control_value,
            f'{metric}_variant': variant_value,
            f'increment_{metric}': increment,
            f'p_value_{metric}': p_value
        })
        
    # Add total orders and total customers to result_row
    result_row['groceries_orders_count'] = row['groceries_orders_count']
    result_row['total_session_count'] = row['total_session_count']
    
    results.append(result_row)

pd.DataFrame(results).to_csv('outputs/t_test_v3_city.csv')

## Country-level aggregation

In [58]:
#merge the dataframes
df_merged = pd.merge(df_historical_p1,df_historical_p2, on=['country','city','p_creation_date','ipg_variant'], how='left')

#fillna
df_merged['ipg_variant'].fillna('not_found', inplace=True)

In [59]:
df_merged['ipg_variant'].value_counts()

ipg_variant
InStorePrices    9199
Control Group    9199
not_found        9199
Name: count, dtype: int64

In [73]:
#metrics
df_merged_grouped = df_merged.groupby(['country', 'p_creation_date','ipg_variant']).agg({
    'groceries_orders_count': 'sum',
    'total_session_count': 'sum',
    'groceries_orders_count': 'sum',
    'groceries_sw_session_count': 'sum',
    'groceries_gmv_sum': 'sum',
    'groceries_ret_orders_count': 'sum',
    'total_customers': 'sum'
}).reset_index()

df_merged_grouped['perc_cvr'] = df_merged_grouped['groceries_orders_count']/df_merged_grouped['groceries_sw_session_count']
df_merged_grouped['perc_ret'] = df_merged_grouped['groceries_ret_orders_count']/df_merged_grouped['groceries_orders_count']
df_merged_grouped['aov'] = df_merged_grouped['groceries_gmv_sum']/df_merged_grouped['groceries_orders_count']
df_merged_grouped['groceries_session_per_customer']  = df_merged_grouped['groceries_sw_session_count']/df_merged_grouped['total_customers']

df_final = df_merged_grouped[[
    'country',
    'p_creation_date',
    'ipg_variant',
    'groceries_orders_count',
    'total_session_count',
    'groceries_sw_session_count',
    'groceries_gmv_sum',
    'groceries_ret_orders_count',
    'groceries_session_per_customer',
    'perc_cvr',
    'perc_ret',
    'aov']]

df_final.to_csv('outputs/raw_results_v3_country.csv')

aggregated = df_final.groupby(['country', 'ipg_variant']).agg({
    'groceries_orders_count': 'sum',
    'total_session_count': 'sum',
    'groceries_orders_count': 'mean',
    'groceries_sw_session_count': 'mean',
    'groceries_gmv_sum': 'mean',
    'groceries_ret_orders_count': 'mean',
    'groceries_session_per_customer':'mean',
    'perc_cvr': 'mean',
    'perc_ret': 'mean',
    'aov': 'mean'
}).reset_index()

control_data = aggregated[aggregated['ipg_variant'] == 'Control Group']
variant_data = aggregated[aggregated['ipg_variant'] == 'InStorePrices']

merged_data = pd.merge(
    control_data, variant_data, 
    on=['country'], 
    suffixes=('_control', '_variant')
)

# Calculate total orders and total customers
merged_data['groceries_orders_count'] = merged_data['groceries_orders_count_control'] + merged_data['groceries_orders_count_variant']
merged_data['total_session_count'] = merged_data['total_session_count_control'] + merged_data['total_session_count_variant']
merged_data['perc_sessions_control'] = merged_data['total_session_count_control'] / merged_data['total_session_count']
merged_data['perc_sessions_variant'] = merged_data['total_session_count_variant'] / merged_data['total_session_count']

merged_data.to_csv('outputs/aggregated_results_v3_country.csv')

In [79]:
results = []

metrics = [
    'groceries_session_per_customer',
    'perc_cvr', 
    'perc_ret', 
    'aov'
]

for index, row in merged_data.iterrows():
    country = row['country']
    result_row = {'country': country}
    
    for metric in metrics:
        control_value = row[f'{metric}_control']
        variant_value = row[f'{metric}_variant']
        
        # Calculate increment
        increment = (variant_value - control_value) / control_value * 100 if control_value != 0 else None
        
        # Perform t-test
        control_sample = df_final[(df_final['country'] == country) & (df_final['ipg_variant'] == 'Control Group')][metric]
        variant_sample = df_final[(df_final['country'] == country) & (df_final['ipg_variant'] == 'InStorePrices')][metric]
        
        t_stat, p_value = ttest_ind(control_sample, variant_sample, equal_var=False)
        
        # Append to results
        result_row.update({
            f'{metric}_control': control_value,
            f'{metric}_variant': variant_value,
            f'increment_{metric}': increment,
            f'p_value_{metric}': p_value
        })
        
    # Add total orders and total customers to result_row
    result_row['groceries_orders_count'] = row['groceries_orders_count']
    result_row['total_session_count'] = row['total_session_count']
    
    results.append(result_row)

pd.DataFrame(results).to_csv('outputs/t_test_v3_country.csv')

# [X] (ii) Segment impact based on the number of stores with the instore price tag

We will use Ankits spreadsheet to classify the cities depending on the percentage of store with instore prices --> https://docs.google.com/spreadsheets/d/1aFHfPqNy6ax2AxSFzk2rbYohHtPUEMRprWbe8Rr_1kc/edit?gid=249100045#gid=249100045

In [15]:
city_segment = pd.read_csv('outputs/ankits_spreadsheet.csv')

city_segment['perc_stores_with_yes_instore_prices'] = city_segment['yes_instore_prices'] / city_segment['total_stores']

city_segment.fillna(0, inplace=True)
city_segment

Unnamed: 0,city,not_instore_prices,yes_instore_prices,total_stores,perc_stores_with_yes_instore_prices
0,ABJ,4.0,2.0,6,0.333333
1,ABN,4.0,2.0,6,0.333333
2,AGD,1.0,2.0,3,0.666667
3,ALA,4.0,1.0,5,0.200000
4,ALC,3.0,1.0,4,0.250000
...,...,...,...,...,...
95,WRO,3.0,1.0,4,0.250000
96,YRV,2.0,2.0,4,0.500000
97,ZAG,1.0,2.0,3,0.666667
98,ZAR,2.0,2.0,4,0.500000


## Grouping by city tier

In [57]:
# Merge the dataframes
df_merged_tmp = pd.merge(df_historical_p1,df_historical_p2, on=['country','city','p_creation_date','ipg_variant'], how='left')
df_merged = pd.merge(df_merged_tmp,city_segment, on=['city'], how='left')

# Buckets
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
labels = ['0-20', '20-40', '40-60', '60-80', '80-100']# Use pd.cut to segment and sort data values into bins
df_merged['city_tier'] = pd.cut(df_merged['perc_stores_with_yes_instore_prices'], bins=bins, labels=labels, include_lowest=True)

# fillna
df_merged['ipg_variant'].fillna('not_found', inplace=True)
df_merged['city_tier'] = df_merged['city_tier'].cat.add_categories('no tier').fillna('no tier')
df_merged['yes_instore_prices'].fillna(0, inplace=True)

df_merged.head()

Unnamed: 0,country,city,p_creation_date,ipg_variant,groceries_orders_count,groceries_ret_orders_count,stores_ret_orders_count,groceries_gmv_sum,param_label_x,total_session_count,total_customers,groceries_sw_session_count,param_label_y,not_instore_prices,yes_instore_prices,total_stores,perc_stores_with_yes_instore_prices,city_tier
0,NG,LOS,2024-04-19,InStorePrices,113,88,71,531.206173,Apr,1387,967,374.0,Apr,0.0,2.0,2.0,1.0,80-100
1,ES,HEN,2024-04-24,Control Group,84,75,60,2030.54,Apr,1033,801,151.0,Apr,,0.0,,,no tier
2,IT,BOL,2024-04-28,not_found,77,43,38,1754.71,Apr,4400,3632,,Apr,3.0,1.0,4.0,0.25,20-40
3,IT,PMO,2024-04-04,InStorePrices,34,28,22,1169.33,Apr,542,422,87.0,Apr,0.0,2.0,2.0,1.0,80-100
4,ES,BIL,2024-04-14,InStorePrices,11,7,6,161.83,Apr,423,322,53.0,Apr,2.0,1.0,3.0,0.333333,20-40


In [45]:
df_merged['ipg_variant'].value_counts()

ipg_variant
InStorePrices    9199
Control Group    9199
not_found        9199
Name: count, dtype: int64

In [47]:
#metrics
df_merged_grouped = df_merged.groupby(['city_tier', 'p_creation_date','ipg_variant']).agg({
    'groceries_orders_count': 'sum',
    'total_session_count': 'sum',
    'groceries_orders_count': 'sum',
    'groceries_sw_session_count': 'sum',
    'groceries_gmv_sum': 'sum',
    'groceries_ret_orders_count': 'sum',
    'total_customers': 'sum'
}).reset_index()

df_merged_grouped['perc_cvr'] = df_merged_grouped['groceries_orders_count']/df_merged_grouped['groceries_sw_session_count']
df_merged_grouped['perc_ret'] = df_merged_grouped['groceries_ret_orders_count']/df_merged_grouped['groceries_orders_count']
df_merged_grouped['aov'] = df_merged_grouped['groceries_gmv_sum']/df_merged_grouped['groceries_orders_count']
df_merged_grouped['groceries_session_per_customer']  = df_merged_grouped['groceries_sw_session_count']/df_merged_grouped['total_customers']

df_final = df_merged_grouped[[
    'city_tier',
    'p_creation_date',
    'ipg_variant',
    'groceries_orders_count',
    'total_session_count',
    'groceries_sw_session_count',
    'groceries_gmv_sum',
    'groceries_ret_orders_count',
    'groceries_session_per_customer',
    'perc_cvr',
    'perc_ret',
    'aov']]

df_final.to_csv('outputs/raw_results_v3_city_tiers.csv')

aggregated = df_final.groupby(['city_tier', 'ipg_variant']).agg({
    'groceries_orders_count': 'sum',
    'total_session_count': 'sum',
    'groceries_orders_count': 'mean',
    'groceries_sw_session_count': 'mean',
    'groceries_gmv_sum': 'mean',
    'groceries_ret_orders_count': 'mean',
    'groceries_session_per_customer':'mean',
    'perc_cvr': 'mean',
    'perc_ret': 'mean',
    'aov': 'mean'
}).reset_index()

control_data = aggregated[aggregated['ipg_variant'] == 'Control Group']
variant_data = aggregated[aggregated['ipg_variant'] == 'InStorePrices']

merged_data = pd.merge(
    control_data, variant_data, 
    on=['city_tier'], 
    suffixes=('_control', '_variant')
)

# Calculate total orders and total customers
merged_data['groceries_orders_count'] = merged_data['groceries_orders_count_control'] + merged_data['groceries_orders_count_variant']
merged_data['total_session_count'] = merged_data['total_session_count_control'] + merged_data['total_session_count_variant']
merged_data['perc_sessions_control'] = merged_data['total_session_count_control'] / merged_data['total_session_count']
merged_data['perc_sessions_variant'] = merged_data['total_session_count_variant'] / merged_data['total_session_count']

merged_data.to_csv('outputs/aggregated_results_v3_city_tiers.csv')

  df_merged_grouped = df_merged.groupby(['city_tier', 'p_creation_date','ipg_variant']).agg({
  aggregated = df_final.groupby(['city_tier', 'ipg_variant']).agg({


In [48]:
results = []

metrics = [
    'groceries_session_per_customer',
    'perc_cvr', 
    'perc_ret', 
    'aov'
]

for index, row in merged_data.iterrows():
    city_tier = row['city_tier']
    result_row = {'city_tier': city_tier}
    
    for metric in metrics:
        control_value = row[f'{metric}_control']
        variant_value = row[f'{metric}_variant']
        
        # Calculate increment
        increment = (variant_value - control_value) / control_value * 100 if control_value != 0 else None
        
        # Perform t-test
        control_sample = df_final[(df_final['city_tier'] == city_tier) & (df_final['ipg_variant'] == 'Control Group')][metric]
        variant_sample = df_final[(df_final['city_tier'] == city_tier) & (df_final['ipg_variant'] == 'InStorePrices')][metric]
        
        t_stat, p_value = ttest_ind(control_sample, variant_sample, equal_var=False)
        
        # Append to results
        result_row.update({
            f'{metric}_control': control_value,
            f'{metric}_variant': variant_value,
            f'increment_{metric}': increment,
            f'p_value_{metric}': p_value
        })
        
    # Add total orders and total customers to result_row
    result_row['groceries_orders_count'] = row['groceries_orders_count']
    result_row['total_session_count'] = row['total_session_count']
    
    results.append(result_row)

pd.DataFrame(results).to_csv('outputs/t_test_v3_city_tiers.csv')

## Grouping by number of stores with instore enabled

In [59]:
# Merge the dataframes
df_merged_tmp = pd.merge(df_historical_p1,df_historical_p2, on=['country','city','p_creation_date','ipg_variant'], how='left')
df_merged = pd.merge(df_merged_tmp,city_segment, on=['city'], how='left')

# Buckets
bins = [0, 0.2, 0.4, 0.6, 0.8, 1.0]
labels = ['0-20', '20-40', '40-60', '60-80', '80-100']# Use pd.cut to segment and sort data values into bins
df_merged['city_tier'] = pd.cut(df_merged['perc_stores_with_yes_instore_prices'], bins=bins, labels=labels, include_lowest=True)

# fillna
df_merged['ipg_variant'].fillna('not_found', inplace=True)
df_merged['city_tier'] = df_merged['city_tier'].cat.add_categories('no tier').fillna('no tier')
df_merged['yes_instore_prices'].fillna(0, inplace=True)

df_merged.head()

Unnamed: 0,country,city,p_creation_date,ipg_variant,groceries_orders_count,groceries_ret_orders_count,stores_ret_orders_count,groceries_gmv_sum,param_label_x,total_session_count,total_customers,groceries_sw_session_count,param_label_y,not_instore_prices,yes_instore_prices,total_stores,perc_stores_with_yes_instore_prices,city_tier
0,NG,LOS,2024-04-19,InStorePrices,113,88,71,531.206173,Apr,1387,967,374.0,Apr,0.0,2.0,2.0,1.0,80-100
1,ES,HEN,2024-04-24,Control Group,84,75,60,2030.54,Apr,1033,801,151.0,Apr,,0.0,,,no tier
2,IT,BOL,2024-04-28,not_found,77,43,38,1754.71,Apr,4400,3632,,Apr,3.0,1.0,4.0,0.25,20-40
3,IT,PMO,2024-04-04,InStorePrices,34,28,22,1169.33,Apr,542,422,87.0,Apr,0.0,2.0,2.0,1.0,80-100
4,ES,BIL,2024-04-14,InStorePrices,11,7,6,161.83,Apr,423,322,53.0,Apr,2.0,1.0,3.0,0.333333,20-40


In [60]:
df_merged['ipg_variant'].value_counts()

ipg_variant
InStorePrices    9199
Control Group    9199
not_found        9199
Name: count, dtype: int64

In [61]:
#metrics
df_merged_grouped = df_merged.groupby(['yes_instore_prices', 'p_creation_date','ipg_variant']).agg({
    'groceries_orders_count': 'sum',
    'total_session_count': 'sum',
    'groceries_orders_count': 'sum',
    'groceries_sw_session_count': 'sum',
    'groceries_gmv_sum': 'sum',
    'groceries_ret_orders_count': 'sum',
    'total_customers': 'sum'
}).reset_index()

df_merged_grouped['perc_cvr'] = df_merged_grouped['groceries_orders_count']/df_merged_grouped['groceries_sw_session_count']
df_merged_grouped['perc_ret'] = df_merged_grouped['groceries_ret_orders_count']/df_merged_grouped['groceries_orders_count']
df_merged_grouped['aov'] = df_merged_grouped['groceries_gmv_sum']/df_merged_grouped['groceries_orders_count']
df_merged_grouped['groceries_session_per_customer']  = df_merged_grouped['groceries_sw_session_count']/df_merged_grouped['total_customers']

df_final = df_merged_grouped[[
    'yes_instore_prices',
    'p_creation_date',
    'ipg_variant',
    'groceries_orders_count',
    'total_session_count',
    'groceries_sw_session_count',
    'groceries_gmv_sum',
    'groceries_ret_orders_count',
    'groceries_session_per_customer',
    'perc_cvr',
    'perc_ret',
    'aov']]

df_final.to_csv('outputs/raw_results_v3_n_partners_instore.csv')

aggregated = df_final.groupby(['yes_instore_prices', 'ipg_variant']).agg({
    'groceries_orders_count': 'sum',
    'total_session_count': 'sum',
    'groceries_orders_count': 'mean',
    'groceries_sw_session_count': 'mean',
    'groceries_gmv_sum': 'mean',
    'groceries_ret_orders_count': 'mean',
    'groceries_session_per_customer':'mean',
    'perc_cvr': 'mean',
    'perc_ret': 'mean',
    'aov': 'mean'
}).reset_index()

control_data = aggregated[aggregated['ipg_variant'] == 'Control Group']
variant_data = aggregated[aggregated['ipg_variant'] == 'InStorePrices']

merged_data = pd.merge(
    control_data, variant_data, 
    on=['yes_instore_prices'], 
    suffixes=('_control', '_variant')
)

# Calculate total orders and total customers
merged_data['groceries_orders_count'] = merged_data['groceries_orders_count_control'] + merged_data['groceries_orders_count_variant']
merged_data['total_session_count'] = merged_data['total_session_count_control'] + merged_data['total_session_count_variant']
merged_data['perc_sessions_control'] = merged_data['total_session_count_control'] / merged_data['total_session_count']
merged_data['perc_sessions_variant'] = merged_data['total_session_count_variant'] / merged_data['total_session_count']

merged_data.to_csv('outputs/aggregated_results_v3_n_partners_insore.csv')

In [62]:
results = []

metrics = [
    'groceries_session_per_customer',
    'perc_cvr', 
    'perc_ret', 
    'aov'
]

for index, row in merged_data.iterrows():
    yes_instore_prices = row['yes_instore_prices']
    result_row = {'yes_instore_prices': yes_instore_prices}
    
    for metric in metrics:
        control_value = row[f'{metric}_control']
        variant_value = row[f'{metric}_variant']
        
        # Calculate increment
        increment = (variant_value - control_value) / control_value * 100 if control_value != 0 else None
        
        # Perform t-test
        control_sample = df_final[(df_final['yes_instore_prices'] == yes_instore_prices) & (df_final['ipg_variant'] == 'Control Group')][metric]
        variant_sample = df_final[(df_final['yes_instore_prices'] == yes_instore_prices) & (df_final['ipg_variant'] == 'InStorePrices')][metric]
        
        t_stat, p_value = ttest_ind(control_sample, variant_sample, equal_var=False)
        
        # Append to results
        result_row.update({
            f'{metric}_control': control_value,
            f'{metric}_variant': variant_value,
            f'increment_{metric}': increment,
            f'p_value_{metric}': p_value
        })
        
    # Add total orders and total customers to result_row
    result_row['groceries_orders_count'] = row['groceries_orders_count']
    result_row['total_session_count'] = row['total_session_count']
    
    results.append(result_row)

pd.DataFrame(results).to_csv('outputs/t_test_v3_n_partners_instore.csv')