In [7]:
import sys
import pandas as pd
sys.path.append('c:\\Users\\Jordi Cremades\\Documents\\Repository')

from Utilities import query_engines
from Utilities import reader_writer
from Utilities import statistics
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

rw = reader_writer.Reader_Writer()
s = statistics.Statistics()

# TEST Switchback SPM

**Context**
<br>
From 27/06/2023 to 09/07/2023 we performed a switchback test. It consists
of randomly selecting which stores have SPM ON and witch not the previous day.
This selection is random, 50/50. It was performed in partners: KAUFL_BG and CAPRABO_es

If Control=1 then SPM is OFF, if Control=0 then SPM is ON

Variation is computed comparing (Config=0 to Config=1)/Config=1

# Data preparation

### Read configurations

In [8]:

list_dates = ['2023_07_01', '2023_07_03', '2023_07_04', '2023_07_05', '2023_07_06', '2023_07_07', '2023_07_08', '2023_06_27', '2023_06_28', '2023_06_29', '2023_06_30']
dataframes = [] 


for date in list_dates:
    # date_param to point which csv to read
    date_param = date
    
    # Read file
    df = rw.csv_read_file(f'test4_on_off/test_on_off_{date_param}.csv')

    # Modify the read csv
    df = df.rename(columns={'Unnamed: 1': 'Config'})
    date_object = datetime.strptime(date_param, "%Y_%m_%d") # Format to correct datetime
    df['Date'] = pd.Timestamp(date_object)
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')
    
    # Append result in a list
    dataframes.append(df)

# Concat all csv
df_final = pd.concat(dataframes, ignore_index=True)

df_final

Unnamed: 0,store_address_id,config,Date
0,337918,0,2023-07-01
1,337919,0,2023-07-01
2,337922,1,2023-07-01
3,337923,0,2023-07-01
4,338048,1,2023-07-01
...,...,...,...
1799,547375,0,2023-06-30
1800,496793,1,2023-06-30
1801,547349,1,2023-06-30
1802,566401,0,2023-06-30


### Read deactivations

In [9]:
list_dates = ['_CAPRABO_ES_20230701'
              ,'_CAPRABO_ES_20230703'
              ,'_CAPRABO_ES_20230704'
              ,'_CAPRABO_ES_20230705'
              ,'_CAPRABO_ES_20230706'
              ,'_CAPRABO_ES_20230707'
              ,'_CAPRABO_ES_20230708'
              ,'_CAPRABO_ES_20230627'
              ,'_CAPRABO_ES_20230628'
              ,'_CAPRABO_ES_20230629'
              ,'_CAPRABO_ES_20230630'
              ,'_KAUFL_BG_20230701'
              ,'_KAUFL_BG_20230703'
              ,'_KAUFL_BG_20230704'
              ,'_KAUFL_BG_20230705'
              ,'_KAUFL_BG_20230706'
              ,'_KAUFL_BG_20230707'
              ,'_KAUFL_BG_20230708'
              ,'_KAUFL_BG_20230627'
              ,'_KAUFL_BG_20230628'
              ,'_KAUFL_BG_20230629'
              ,'_KAUFL_BG_20230630']

dataframes = []

for date in list_dates:
    # date_param to point which csv to read
    date_param = date
    
    # Read file
    df = rw.csv_read_file(f'test4_on_off/deactivated_products{date_param}.csv')
    
    # Modify the read csv
    date_object = datetime.strptime(date_param.split("_")[-1], "%Y%m%d") # Format to correct datetime
    df['Date'] = pd.Timestamp(date_object)
    df['Date'] = df['Date'].dt.strftime('%Y-%m-%d')

    # Append result in a list
    dataframes.append(df)

# Concat all csv
df_deact_final = pd.concat(dataframes, ignore_index=True)

# Format the final dataframe
df_deact_final = df_deact_final.drop(['partner','n_deactivations'], axis=1)
df_deact_final = df_deact_final.rename(columns={'product_id_partner': 'product_id'})

# Group by to extract the number of products deactivated
df_grouped = df_deact_final.groupby(['Date', 'store_address_id'])['product_id'].nunique().reset_index()
df_grouped = df_grouped.rename(columns={'product_id': 'count_product_id'})
df_grouped
    

Unnamed: 0,Date,store_address_id,count_product_id
0,2023-06-27,337918,62
1,2023-06-27,337919,98
2,2023-06-27,338049,108
3,2023-06-27,338052,21
4,2023-06-27,338059,65
...,...,...,...
884,2023-07-08,547501,1
885,2023-07-08,547513,7
886,2023-07-08,547642,8
887,2023-07-08,547656,7


### Enrich data

Add the following fields for each store address and date:
- number of ordres
- number of bought products
- number of delivered orders
- number of orders with pna
- numbe of products with pna
- total purchase in eur

In [11]:
# Params definition
store_addresses = ",".join(str(val) for val in df_final['store_address_id'].unique().tolist())
max_date = '2023-07-08'
min_date = '2023-06-27'

# Params
params = [
    {'name':'min_date', 'value':min_date}
    ,{'name':'max_date', 'value':max_date}
    ,{'name':'list_stores', 'value':store_addresses}
]

query = query_engines.QueryEngines(
    query='backup/test_on_off_SPM.sql', 
    del1='--v1', 
    del2='--end_v1',
    params=params,
    output_dir=None,
    output_file=None,
    printq=None
)
d = query.query_run_starbust()
d.head()

SELECT
    order_descriptors.store_address_id  AS "order_descriptors.store_address_id",
    TRIM(order_descriptors.store_name)  AS "order_descriptors.store_name",
    (DATE_FORMAT(order_descriptors.p_creation_date , '%Y-%m-%d')) AS "order_descriptors.p_creation_date",
    COUNT(DISTINCT bought_products.bought_product_id ) AS "bought_products.count_id",
    COUNT(DISTINCT order_descriptors.order_id ) AS "order_descriptors.number_of_orders",
    COUNT(DISTINCT CASE WHEN (order_descriptors.order_final_status = 'DeliveredStatus') THEN order_descriptors.order_id  ELSE NULL END) AS "order_descriptors.number_of_delivered_orders",
    COUNT(DISTINCT CASE WHEN (order_descriptors.order_cancel_reason = 'PRODUCTS_NOT_AVAILABLE')
      OR ((order_feedback_facts.selected_option IN ('MISSING_PRODUCTS', 'WRONG_PRODUCTS')) = True)
      OR ((order_refund_incidents.reason IN ('MISSING_PRODUCTS', 'WRONG_PRODUCTS')) = True)
      OR (((((CASE WHEN order_descriptors.order_parent_relationship_type = 'SPLIT'

  df = pd.read_sql(self.tp__read_query, conn)


Unnamed: 0,order_descriptors.store_address_id,order_descriptors.store_name,order_descriptors.p_creation_date,bought_products.count_id,order_descriptors.number_of_orders,order_descriptors.number_of_delivered_orders,bought_products_additions_removals_replacements.number_of_orders_with_pna_1,bought_products_additions_removals_replacements.number_of_products_with_pna_1,order_descriptors.sum_total_purchase_eur
0,495632,Caprabo,2023-07-08,20,2,2,0,0,57.04
1,496267,Caprabo,2023-07-08,70,7,7,2,2,227.15
2,547367,Caprabo,2023-07-08,69,5,4,1,3,137.13
3,545297,Caprabo,2023-07-08,23,4,4,0,0,77.36
4,545485,Caprabo,2023-07-08,93,7,5,1,1,179.97


### Merge all information

In [25]:
# Merge with previous information
test = pd.merge(df_final,d, left_on=['store_address_id','Date'], right_on=['order_descriptors.store_address_id','order_descriptors.p_creation_date'], how='left')
test = pd.merge(test,df_grouped, left_on=['store_address_id','Date'], right_on=['store_address_id','Date'], how='left')
test.head()

# Format test dataset
test = test.drop(['order_descriptors.store_address_id','order_descriptors.p_creation_date'], axis=1)
test = test.rename(columns={'bought_products.count_id': 'n_products_bought'
                            ,'order_descriptors.number_of_orders': 'n_orders'
                            ,'order_descriptors.number_of_delivered_orders': 'do_orders'
                            ,'bought_products_additions_removals_replacements.number_of_orders_with_pna_1': 'n_orders_with_pna'
                            ,'bought_products_additions_removals_replacements.number_of_products_with_pna_1': 'n_products_with_pna'
                            ,'order_descriptors.sum_total_purchase_eur': 'total_gmv_purchased'
                            ,'count_product_id': 'n_products_deactivated'
                            ,'order_descriptors.store_name': 'store_name'})
test

Unnamed: 0,store_address_id,config,Date,store_name,n_products_bought,n_orders,do_orders,n_orders_with_pna,n_products_with_pna,total_gmv_purchased,n_products_deactivated
0,337918,0,2023-07-01,Kaufland,234.0,16.0,16.0,10.0,12.0,540.674842,58.0
1,337919,0,2023-07-01,Kaufland,352.0,30.0,30.0,13.0,17.0,775.815071,99.0
2,337922,1,2023-07-01,Kaufland,359.0,32.0,32.0,11.0,16.0,952.059095,
3,337923,0,2023-07-01,Kaufland,100.0,10.0,10.0,3.0,5.0,193.358416,55.0
4,338048,1,2023-07-01,Kaufland,179.0,13.0,13.0,6.0,8.0,444.284364,
...,...,...,...,...,...,...,...,...,...,...,...
1799,547375,0,2023-06-30,Caprabo,19.0,2.0,2.0,0.0,0.0,45.960000,
1800,496793,1,2023-06-30,Caprabo,3.0,1.0,1.0,0.0,0.0,6.950000,
1801,547349,1,2023-06-30,,,,,,,,
1802,566401,0,2023-06-30,Kaufland,34.0,4.0,4.0,0.0,0.0,92.771911,32.0


### Nulls, blanks,...

In [27]:
test = test.dropna(subset=['store_name'])

# Data exploration

### What was the distribution of Config 1 (SPM OFF) vs Config 0 (SPM 0)

In [31]:
# We add up all the config 1 and config 0 configurations at a partner level
config_count = test.groupby(['store_name', 'config']).size().reset_index(name='count')
config_count

Unnamed: 0,store_name,config,count
0,Caprabo,0,654
1,Caprabo,1,680
2,Kaufland,0,126
3,Kaufland,1,124


We assume that the distribution of config 1 and 0 is OK. There is no 50/50 distribution, but the variation is very low. Remember for each store address, we decided the daily config by throwing a 50/50 coin.

### At a partner level, is config 0 better than config 1 in PNA metrics? 

In [84]:
# For each Date, we look at each partner and divide by config 0 and 1
test_grouped = test.groupby(['store_name','config']).sum().reset_index()

# Delete store_address_id column, makes no sense anymore
test_grouped.drop('store_address_id', axis=1, inplace=True)

# Compute the % PNA metrics
test_grouped['%_o_pna'] = test_grouped['n_orders_with_pna']/test_grouped['n_orders']*100
test_grouped['%_p_pna'] = test_grouped['n_products_with_pna']/test_grouped['n_products_bought']*100
test_grouped['AOV'] = test_grouped['total_gmv_purchased']/test_grouped['n_orders']

test_grouped.head(5)

  test_grouped = test.groupby(['store_name','config']).sum().reset_index()


Unnamed: 0,store_name,config,n_products_bought,n_orders,do_orders,n_orders_with_pna,n_products_with_pna,total_gmv_purchased,n_products_deactivated,%_o_pna,%_p_pna,AOV
0,Caprabo,0,32669.0,3179.0,3076.0,1206.0,1912.0,85231.419964,3085.0,37.936458,5.852643,26.810764
1,Caprabo,1,32995.0,3156.0,3097.0,1177.0,1889.0,86256.019959,339.0,37.294043,5.72511,27.330805
2,Kaufland,0,15793.0,1273.0,1236.0,450.0,678.0,36920.365282,7722.0,35.349568,4.293041,29.002644
3,Kaufland,1,13890.0,1170.0,1145.0,371.0,601.0,33417.21542,477.0,31.709402,4.326854,28.561723


In [106]:
# Pivot the table to aggregating the metrics at a partner level
pivot_test_grouped = test_grouped.pivot(index='store_name', columns='config', values=['n_orders','AOV','%_o_pna','%_p_pna'])

# Flatten the columns
pivot_test_grouped.columns = [f"{col[0]}_config_{col[1]}" for col in pivot_test_grouped.columns]
pivot_test_grouped.reset_index(inplace=True)

# Compute % variation
pivot_test_grouped['%_n_orders'] = (((pivot_test_grouped['n_orders_config_0'] - pivot_test_grouped['n_orders_config_1'])/pivot_test_grouped['n_orders_config_1'])*100).round(3)
pivot_test_grouped['%_AOV'] = (((pivot_test_grouped['AOV_config_0'] - pivot_test_grouped['AOV_config_1'])/pivot_test_grouped['AOV_config_0'])*100).round(3)
pivot_test_grouped['var_%_o_pna'] = ((pivot_test_grouped['%_o_pna_config_0'] - pivot_test_grouped['%_o_pna_config_1'])).round(3)
pivot_test_grouped['var_%_p_pna'] = ((pivot_test_grouped['%_p_pna_config_0'] - pivot_test_grouped['%_p_pna_config_1'])).round(3)

pivot_test_grouped[['store_name','%_n_orders','%_AOV','var_%_o_pna','var_%_p_pna']]

Unnamed: 0,store_name,%_n_orders,%_AOV,var_%_o_pna,var_%_p_pna
0,Caprabo,0.729,-1.94,0.642,0.128
1,Kaufland,8.803,1.52,3.64,-0.034


The results of the test are inconclusive. The PNA metric is only improved at a product level in Kaufland, but not substantially.