# Traffic

In this notebook we want to understand if after applying the instore prices we saw an increse of traffic to those stores

## Config

In [163]:
import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.stats import ttest_ind
import seaborn as sns
import sys

sys.path.append('c:\\Users\\Jordi Cremades\\Documents\\Repos\\central-node')

from utils import query_engines

## Build query to obtain data

In [228]:
q = query_engines.QueryEngines(reset_query_logs=True, reset_to_load=False)

QUERY_FILE = 'traffic/share_per_partners.sql'
OUTPUT_FILE = 'share_per_partners'
LOAD_FROM_OUTPUT_FILE = 'share_per_partners'
PRINT_QUERY = True

START_DATE = "'2023-10-01'"
END_DATE = "'2024-07-01'"

params = [
    # dates
    {'name':'start_date', 'value': str(START_DATE)},
    {'name':'end_date', 'value': str(END_DATE)},
]

q.prepare_query(QUERY_FILE, params=params)

df_share  = q.query_run_starburst( #query_run_livedb otherwise
    output_file=OUTPUT_FILE,
    load_from_output_file=LOAD_FROM_OUTPUT_FILE,
    print_query=PRINT_QUERY
)

df_share

Unnamed: 0,calendar_date,store_id,city,store_name,first_date_instore_price_enabled,n_sessions_city_info,n_sessions_partner
0,2023-10-02,321258,NAP,Dodecà,2024-01-16,,122
1,2023-10-02,316240,ORD,Profi,2024-01-16,,88
2,2023-10-02,261877,PDG,City Market,2024-01-16,,14
3,2023-10-01,337011,ORD,Carrefour Hypermarket,2024-01-16,,148
4,2023-10-01,85551,SPU,Tommy,2024-01-16,,326
...,...,...,...,...,...,...,...
140744,2024-06-30,448014,QND,Univerexport,2024-06-27,537.0,151
140745,2024-06-30,448010,BEG,Univerexport,2024-06-27,3533.0,202
140746,2024-06-30,218415,GDN,Piwniczka Biedronki,2024-03-12,2545.0,41
140747,2024-06-30,450175,BCU,Carrefour Hypermarket,2024-04-02,448.0,67


In [229]:
df_share['calendar_date'] = pd.to_datetime(df_share['calendar_date'])
df_share['first_date_instore_price_enabled'] = pd.to_datetime(df_share['first_date_instore_price_enabled'])

In [230]:
df_share['share'] = df_share['n_sessions_partner']/df_share['n_sessions_city_info'] * 100
df_share

Unnamed: 0,calendar_date,store_id,city,store_name,first_date_instore_price_enabled,n_sessions_city_info,n_sessions_partner,share
0,2023-10-02,321258,NAP,Dodecà,2024-01-16,,122,
1,2023-10-02,316240,ORD,Profi,2024-01-16,,88,
2,2023-10-02,261877,PDG,City Market,2024-01-16,,14,
3,2023-10-01,337011,ORD,Carrefour Hypermarket,2024-01-16,,148,
4,2023-10-01,85551,SPU,Tommy,2024-01-16,,326,
...,...,...,...,...,...,...,...,...
140744,2024-06-30,448014,QND,Univerexport,2024-06-27,537.0,151,28.119181
140745,2024-06-30,448010,BEG,Univerexport,2024-06-27,3533.0,202,5.717521
140746,2024-06-30,218415,GDN,Piwniczka Biedronki,2024-03-12,2545.0,41,1.611002
140747,2024-06-30,450175,BCU,Carrefour Hypermarket,2024-04-02,448.0,67,14.955357


In [231]:
df_share['day_difference'] = (df_share['calendar_date'] - df_share['first_date_instore_price_enabled']).dt.days

lower_limit = df_share['day_difference'] >= -60
upper_limit = df_share['day_difference'] <= 60

df_share = df_share[(lower_limit & upper_limit)]
df_share

Unnamed: 0,calendar_date,store_id,city,store_name,first_date_instore_price_enabled,n_sessions_city_info,n_sessions_partner,share,day_difference
225,2023-11-29,316629,IAS,Carrefour Hypermarket,2024-01-16,1117.0,192,17.188899,-48
238,2023-11-29,431281,LOS,SPAR Electronics,2024-01-16,1132.0,11,0.971731,-48
242,2023-11-27,412943,LCG,Eroski,2024-01-16,485.0,190,39.175258,-50
243,2023-11-27,353926,SBU,Carrefour Hypermarket,2024-01-16,478.0,129,26.987448,-50
244,2023-11-27,319319,MAS,Condis,2024-01-16,119.0,39,32.773109,-50
...,...,...,...,...,...,...,...,...,...
140713,2024-06-26,459950,NBO,The Fat Italian,2024-06-20,6552.0,27,0.412088,6
140722,2024-06-27,415703,FIR,MohooMart,2024-05-01,438.0,11,2.511416,57
140724,2024-06-28,95249,BEG,IDEA Organic,2024-05-08,6309.0,102,1.616738,51
140744,2024-06-30,448014,QND,Univerexport,2024-06-27,537.0,151,28.119181,3


## T-test analysis

In [232]:
def perform_ab_test(
    df # Dataframe containing the data
    ,metric_is_a_ratio:False # Are the metrics already computed or after the grouping a ratio needs to be computed
    ,metric_name # Name of the final metric
    ,multiple_metrics # Names of metrics if ratios need to be computed
    ,groupby_metrics # List containing group metrics
    ,group_granularity # The granularity at which I want to perform the test
    ,variant_column # Name of column where control and variant are labelled
    ,control_label # control label in dataframe - Control Group
    ,variant_label # variant lable in dataframe - Variant Group
    ,alpha
    ):

    # Compute the grouping for minimum granularity
    minimum_granularity_group = groupby_metrics + [group_granularity]
        
    if not metric_is_a_ratio:
        print('------------------------------')
        print('--------UNIQUE METRIC---------')
        print('------------------------------')
        
        # Build dataset for control and variant
        control_df = df[df[variant_column] == control_label].groupby(minimum_granularity_group)[metric_name].sum().reset_index()
        variant_df = df[df[variant_column] == variant_label].groupby(minimum_granularity_group)[metric_name].sum().reset_index()
        sanity_check = pd.merge(control_df,variant_df,on=minimum_granularity_group, suffixes=('_control','_variant'), how='outer').copy()
        
        #Group by the specified metrics
        control_group_mean = control_df.groupby(groupby_metrics)[metric_name].mean().reset_index()
        variant_group_mean = variant_df.groupby(groupby_metrics)[metric_name].mean().reset_index()
        
        # Merge
        merged = pd.merge(control_group_mean,variant_group_mean,on=groupby_metrics, suffixes=('_control','_variant'), how='outer')
        
        control_with_sufix = metric_name + '_control'
        variant_with_sufix = metric_name + '_variant'
        merged['delta_perc_100'] = (merged[variant_with_sufix] - merged[control_with_sufix]) / merged[control_with_sufix] * 100
    else:
        print('------------------------------')
        print('--------RATIO METRIC----------')
        print('------------------------------')
        numerator = multiple_metrics[0]
        denominator = multiple_metrics[1]
    
        numerator_control_df = df[df[variant_column] == control_label].groupby(minimum_granularity_group)[numerator].sum().reset_index()
        numerator_variant_df = df[df[variant_column] == variant_label].groupby(minimum_granularity_group)[numerator].sum().reset_index()
        denominator_control_df = df[df[variant_column] == control_label].groupby(minimum_granularity_group)[denominator].sum().reset_index()
        denominator_variant_df = df[df[variant_column] == variant_label].groupby(minimum_granularity_group)[denominator].sum().reset_index()

        control_df = pd.merge(numerator_control_df,denominator_control_df,on=minimum_granularity_group, how='outer')
        variant_df = pd.merge(numerator_variant_df,denominator_variant_df,on=minimum_granularity_group, how='outer')
        
        control_df[metric_name] = control_df[numerator]/control_df[denominator]
        variant_df[metric_name] = variant_df[numerator]/variant_df[denominator]
        sanity_check = pd.merge(control_df,variant_df,on=minimum_granularity_group, suffixes=('_control','_variant'), how='outer')
        
        #Group by the specified metrics
        control_group_mean = control_df.groupby(groupby_metrics)[metric_name].mean().reset_index()
        variant_group_mean = variant_df.groupby(groupby_metrics)[metric_name].mean().reset_index()
        # Merge
        merged = pd.merge(control_group_mean,variant_group_mean,on=groupby_metrics, suffixes=('_control','_variant'), how='outer')
        
        control_with_sufix = metric_name + '_control'
        variant_with_sufix = metric_name + '_variant'
        merged['delta_perc_100'] = (merged[variant_with_sufix] - merged[control_with_sufix]) / merged[control_with_sufix] * 100
    
    # Calculate p-values
    p_values = []
    is_significant = []

    for _, row in merged.iterrows():

        test_granularity_group = minimum_granularity_group + [metric_name]
        
        control_values = control_df[(control_df[groupby_metrics] == row[groupby_metrics]).all(axis=1)][test_granularity_group]
        variant_values = variant_df[(variant_df[groupby_metrics] == row[groupby_metrics]).all(axis=1)][test_granularity_group]
        values = pd.merge(control_values,variant_values,on=minimum_granularity_group, suffixes=('_control','_variant'), how='outer')
        
        _, p_value = ttest_ind(values[control_with_sufix], values[variant_with_sufix], equal_var=False, nan_policy='omit')
 
        p_values.append(p_value)
        
        if p_value <= alpha:
            is_significant.append(True)
        elif p_value > alpha:
            is_significant.append(False)
        else:
            is_significant.append(None)

    merged['p_value'] = p_values
    merged['is_significant'] = is_significant
    
    return merged,sanity_check

## Analysis

In [233]:
pd.reset_option('display.max_rows')

In [236]:
# Do not keep the same day
df_share = df_share[df_share['day_difference']!=0]

In [235]:
df_share[(df_share['city'] == 'BCN') & (df_share['store_name'] == 'Ametller Origen')]

Unnamed: 0,calendar_date,store_id,city,store_name,first_date_instore_price_enabled,n_sessions_city_info,n_sessions_partner,share,day_difference
55636,2023-12-03,146534,BCN,Ametller Origen,2024-01-16,7331.0,106,1.445915,-44
55707,2023-12-09,146534,BCN,Ametller Origen,2024-01-16,6625.0,555,8.377358,-38
55913,2023-11-30,146534,BCN,Ametller Origen,2024-01-16,5872.0,479,8.157357,-47
56005,2024-01-15,146534,BCN,Ametller Origen,2024-01-16,6222.0,865,13.902282,-1
56401,2024-02-25,146534,BCN,Ametller Origen,2024-01-16,8647.0,469,5.423846,40
...,...,...,...,...,...,...,...,...,...
136785,2024-02-03,146534,BCN,Ametller Origen,2024-01-16,7794.0,656,8.416731,18
138473,2023-11-18,146534,BCN,Ametller Origen,2024-01-16,6159.0,533,8.654002,-59
138501,2023-11-27,146534,BCN,Ametller Origen,2024-01-16,5669.0,635,11.201270,-50
138977,2024-01-19,146534,BCN,Ametller Origen,2024-01-16,7500.0,845,11.266667,3


In [239]:
pre = df_share[df_share['day_difference']<0]
post = df_share[df_share['day_difference']>0]

In [253]:
pre_grouped = pre.groupby(['store_id','city','store_name'])['share'].mean().reset_index()
post_grouped = post.groupby(['store_id','city','store_name'])['share'].mean().reset_index()

output = pd.merge(pre_grouped,post_grouped, on=['store_id','city'], suffixes=('_pre','_post'))
output['diff'] = (output['share_post'] - output['share_pre'])/output['share_pre']*100
output

Unnamed: 0,store_id,city,store_name_pre,share_pre,store_name_post,share_post,diff
0,17566,MIL,Gelmarket Surgelati,2.307679,Gelmarket Surgelati,2.520700,9.230946
1,17569,TOR,Gelmarket Surgelati,3.165519,Gelmarket Surgelati,2.981088,-5.826247
2,17573,BOL,Gelmarket Surgelati,4.102813,Gelmarket Surgelati,4.493491,9.522187
3,17574,GEN,Gelmarket Surgelati,6.900910,Gelmarket Surgelati,4.641552,-32.740001
4,23211,TBI,Alcorium,2.938946,Alcorium,2.577604,-12.294956
...,...,...,...,...,...,...,...
566,456786,ODE,Turbo Market,0.332226,Fresh Market,2.318134,597.758269
567,456786,ODE,Turbo Market,0.332226,Turbo Market,2.360623,610.547405
568,457713,NBO,The Pink Forest Bakery,0.108506,The Pink Forest Bakery,0.493741,355.033921
569,459950,NBO,The Fat Italian,0.380638,The Fat Italian,0.308684,-18.903659


In [252]:
pd.reset_option('display.max_rows')

In [254]:
pd.set_option('display.max_rows', None)

output.sort_values(by='city')

Unnamed: 0,store_id,city,store_name_pre,share_pre,store_name_post,share_post,diff
324,327558,ABJ,Carrefour Playce Marcory,21.235615,Carrefour Playce Marcory,23.021361,8.4092
275,284804,ABJ,Carrefour Market,16.419579,Carrefour Market,16.415701,-0.023618
325,327562,ABN,Carrefour Playce Palmeraie,15.165136,Carrefour Playce Palmeraie,15.053369,-0.736999
148,161194,ABN,Carrefour Market,27.09964,Carrefour Market,26.8334,-0.982451
466,407462,ABV,Shoprite,94.180176,Shoprite,91.969494,-2.34729
470,409658,ABV,Shoprite Drinks,11.724624,Shoprite Drinks,15.610903,33.146304
524,431424,ABV,SPAR Market,10.043416,SPAR Market,14.851057,47.868578
453,406011,AGD,Carrefour Market,74.122835,Carrefour Market,47.749789,-35.580191
25,55415,AGD,Marjane,50.08184,Marjane,53.493047,6.811265
143,153478,AGD,Marjane,8.370169,Marjane,7.760217,-7.287208
