# SPM tagging: Analysis

In [1]:
# We will use the SPM output dDP to check what would have happened if we flagged the product as low stock. To do so, we will
# use a field in the output that is the probability of the product to have pna. This ranges from 0 to 1. And we will consider
# different scenarios between 0 and 1 to see the evolution of some metrics. In specific, we want to understand how many real 
# PNA would have been tagged as low PNA, and how many orders would have been tagged as low PNA.

# To understand it even more, let's consider the extreme cases. (i) All products with probability >= 0, thus all products, 
# to be flagged as in low quantity. Then 100% of PNA and 100% of product ids would have been flagged. (ii) Only products
# with probability >1 (impossible), thus no products, to be flagged as in low quanitty. Then 0% of the PNA and 0% of 
# product ids would have been flagged. In a middle-point scenario, such as flagging only when probability >= 0.5, we 
# want to understand how much PNA do we capture while how much no PNA products do we tag.

## Analysis

### Imports

In [22]:
import sys
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
import numpy as np

sys.path.append('c:\\Users\\Jordi Cremades\\Documents\\Repository')

from Utilities import query_engines

### Queries

In [23]:
# Query Delta Time
query = query_engines.QueryEngines(
    query='spm_tagging.sql', 
    del1='--v2', 
    del2='--endv2',
    params=None,
    load_from_output_file='spm_tagging_v3',
    output_file='spm_tagging_v3',
    printq=None
)

df = query.query_run_starbust()

### Simulation

In [24]:
# Analysed time period
print(df['order_activated_at'].min())
print(df['order_activated_at'].max())

2023-11-17 00:00:30+00:00
2023-11-22 23:59:16+00:00


In [25]:
# Drop unnecessary columns
df.drop(
    axis=1
    ,columns=['start_event','end_event','order_activated_at','prediction_time']
    ,inplace=True
)

# If pna_probability is null then we consider there is 0% probability
# df['pna_probability'].fillna(-1, inplace=True) # Option 1: All products
df.dropna(subset=['pna_probability'], inplace=True) # Option 2: Only keep products processed by the SPM
df

Unnamed: 0,external_id_store_address_id,bought_product_id,bought_product_id_is_pna,bought_proudct_id_value_eur,pna_probability
0,664612_509439,104420905318,False,3.570000,0.056875
11,4860450208434_289722,104417467971,True,1.862767,0.342326
12,100317387_220813,104431529025,False,5.654040,0.030912
17,107033_614966,104383685064,False,0.420000,0.360734
18,74720_187773,104367315711,False,3.560000,0.139492
...,...,...,...,...,...
3598640,157120_614853,104403647881,False,1.990000,0.304925
3598642,727343_127535,104377459889,False,1.000000,0.128099
3598647,4607016240893_289722,104361905560,False,1.811969,0.126964
3598650,070594-000_574361,104437715410,False,4.525528,0.392417


In [17]:
# Simulation
perc_value_eur_flagged_list = [] # Related to how much eur value is flagged
perc_external_id_flagged_list = [] # Related to how many external ids are flagged
perc_pna_flagged_list = [] # Related to PNA% of flagged products
perc_pna_not_flagged_list = [] # Related to PNA% of not flagged product
num_pna_flagged_list = [] # Related to num of prodects with pna flagged
num_pna_not_flagged_list = [] # Related to num of products with pna not flagged
num_not_pna_flagged_list = [] # Related to num of products with no pna flagged
num_not_pna_not_flagged_list = [] # Related to num of products with no pna not flagged
perc_prods_never_pna_list = [] # Related to how many external ids that are flagged, never showed a PNA
perc_real_pna_flagged_list = [] # Related to % PNA detected by flag
perc_products_flagged_list = [] # Related to the % of Products flagged

# Simulation
step = 0.05
threshold_values = [round(i, 4) for i in list(np.arange(0, 1+step, step))]

for threshold in threshold_values:
    df['predicted_pna'] = df['pna_probability'].apply(lambda x: True if x >= threshold else False)
    is_flagged = df['predicted_pna'] == True # condition of being flagged
    # How much eur value is flagged
    perc_value_eur_flagged = df[is_flagged]['bought_proudct_id_value_eur'].sum()/df['bought_proudct_id_value_eur'].sum()
    # How many external ids are flagged
    perc_external_id_flagged = df[is_flagged]['bought_product_id'].nunique()/len(df['bought_product_id'])
    # How many external ids that are flagged, never showed a PNA
    sum_pna_df = df[is_flagged].groupby('external_id_store_address_id', as_index=False)['bought_product_id_is_pna'].sum()
    never_pna_df = sum_pna_df[sum_pna_df['bought_product_id_is_pna'] == 0]
    if sum_pna_df.shape[0] > 0:
        perc_prods_never_pna = never_pna_df.shape[0] / sum_pna_df.shape[0]
    else:
        perc_prods_never_pna = None
    # PNA% of flagged products
    if len(df[(is_flagged)]) > 0:
        perc_pna_flagged = len(df[(is_flagged) & (df['bought_product_id_is_pna'] == True)])/len(df[(is_flagged)])
    else:
        perc_pna_flagged = None
    # PNA% of not flagged products
    if len(df[(~is_flagged)]) > 0:
        perc_pna_not_flagged  = len(df[(~is_flagged) & (df['bought_product_id_is_pna'] == True)])/len(df[(~is_flagged)])
    else:
        perc_pna_not_flagged = None
    # Num of products with pna flagged
    num_pna_flagged = len(df[(is_flagged) & (df['bought_product_id_is_pna'] == True)])
    # Num of products with pna not flagged
    num_pna_not_flagged = len(df[(~is_flagged) & (df['bought_product_id_is_pna'] == True)])
    # Num of products with no pna flagged
    num_not_pna_flagged = len(df[(is_flagged) & (df['bought_product_id_is_pna'] == False)])
    # Num of products with no pna not flagged
    num_not_pna_not_flagged = len(df[(~is_flagged) & (df['bought_product_id_is_pna'] == False)])
    # Products flagged
    perc_products_flagged = len(df[is_flagged])/len(df)
    # % PNA detected by flag
    perc_real_pna_flagged = num_pna_flagged/(num_pna_flagged+num_pna_not_flagged)
    
    # Append
    perc_value_eur_flagged_list.append(perc_value_eur_flagged)
    perc_external_id_flagged_list.append(perc_external_id_flagged)
    perc_pna_flagged_list.append(perc_pna_flagged)
    perc_pna_not_flagged_list.append(perc_pna_not_flagged)
    num_pna_flagged_list.append(num_pna_flagged)
    num_pna_not_flagged_list.append(num_pna_not_flagged)
    num_not_pna_flagged_list.append(num_not_pna_flagged)
    num_not_pna_not_flagged_list.append(num_not_pna_not_flagged)
    perc_prods_never_pna_list.append(perc_prods_never_pna)
    perc_real_pna_flagged_list.append(perc_real_pna_flagged)
    perc_products_flagged_list.append(perc_products_flagged)

data = {
    'Threshold': threshold_values,
    'perc_value_eur_flagged': perc_value_eur_flagged_list,
    'perc_external_id_flagged': perc_external_id_flagged_list,
    'perc_pna_flagged': perc_pna_flagged_list,
    'perc_pna_flagged_never_showed_pna': perc_prods_never_pna_list,
    'perc_pna_not_flagged': perc_pna_not_flagged_list,
    'num_pna_flagged': num_pna_flagged_list,
    'num_pna_not_flagged': num_pna_not_flagged_list,
    'num_not_pna_flagged': num_not_pna_flagged_list,
    'num_not_pna_not_flagged': num_not_pna_not_flagged_list,
    'perc_real_pna_flagged': perc_real_pna_flagged_list,
    'perc_products_flagged':perc_products_flagged_list
}

result_table = pd.DataFrame(data)
result_table.to_csv('results.csv')
result_table

Unnamed: 0,Threshold,perc_value_eur_flagged,perc_external_id_flagged,perc_pna_flagged,perc_pna_flagged_never_showed_pna,perc_pna_not_flagged,num_pna_flagged,num_pna_not_flagged,num_not_pna_flagged,num_not_pna_not_flagged,perc_real_pna_flagged,perc_products_flagged
0,0.0,1.0,1.0,0.068991,0.878084,,59831,0,807400,0,1.0,1.0
1,0.05,0.898512,0.863128,0.077803,0.876473,0.01342,58238,1593,690293,117107,0.973375,0.863128
2,0.1,0.665938,0.610625,0.094415,0.871138,0.029119,49998,9833,479555,327845,0.835654,0.610625
3,0.15,0.50152,0.453948,0.103267,0.86528,0.040496,40654,19177,353024,454376,0.679481,0.453948
4,0.2,0.386685,0.347757,0.107883,0.859957,0.048255,32536,27295,269050,538350,0.543798,0.347757
5,0.25,0.297445,0.267858,0.111853,0.853609,0.053309,25983,33848,206312,601088,0.434273,0.267858
6,0.3,0.24744,0.21803,0.109762,0.856934,0.057623,20754,39077,168328,639072,0.346877,0.21803
7,0.35,0.184049,0.156246,0.112523,0.852716,0.06093,15247,44584,120254,687146,0.254834,0.156246
8,0.4,0.141168,0.113689,0.102845,0.859568,0.064648,10140,49691,88455,718945,0.169477,0.113689
9,0.45,0.115195,0.087813,0.093705,0.868446,0.066612,7136,52695,69018,738382,0.119269,0.087813
