# Fresh Products

In [7]:
import matplotlib.pyplot as plt
import os
import pandas as pd
import seaborn as sns
import sys

# Resolve the project root (two directories above the notebook's working
# directory); the `src` package is imported relative to this folder.
src_path = os.path.abspath(os.path.join(os.getcwd(), '..', '..'))

# Register the project root on sys.path only once, so that re-running this
# cell does not keep appending duplicate entries.
if src_path not in sys.path:
    sys.path.append(src_path)

from src import query_engines, dataframe_visualizer

# Shared query helper: the cells below use it to prepare and run the .sql files.
q = query_engines.QueryEngines()

## Q1: Identify all kinds of segmentation

In this first part I want to segment all the possible partners into Specialties, Top Partners (TP), Non Top Partners (NTP), MFCs and all Groceries partners. We also check the percentage of migrated Store Address IDs for each segment.

### What is the distribution of segment_lvl 1?

In [2]:
# Prepare the segmentation query. No parameters are passed; the two file-name
# arguments are forwarded unchanged (presumably they name a local cache of the
# result -- confirm against QueryEngines.prepare_query).
q.prepare_query(
    'possible_segmentations.sql'
    ,params=None
    ,to_load_file='possible_segmentations'
    ,load_from_to_load_file='possible_segmentations'
)

possible_segmentations = q.query_run_starburst()

# Per level-1 segment: number of distinct store address ids and the sum of the
# `is_migrated` flag, using pandas' built-in 'sum' aggregation.
grouped = possible_segmentations.groupby('segment_1').agg(
    distinct_sad_id = ('store_address_id','nunique'),
    migrated_sad_id = ('is_migrated', 'sum')
).reset_index()

# Share of migrated store addresses per segment.
# NOTE(review): the numerator sums rows while the denominator counts distinct
# ids, so this is a true share only with one row per store_address_id -- confirm.
grouped['perc_m'] = grouped['migrated_sad_id']/grouped['distinct_sad_id']

# Displayed sorted by segment size; `grouped` itself keeps its original order.
grouped.sort_values(by='distinct_sad_id', ascending=False)

Unnamed: 0,segment_1,distinct_sad_id,migrated_sad_id,perc_m
2,Specialties,15387,1442,0.093715
3,Top Partner,10546,8496,0.805614
1,Non Top Partner,5850,1682,0.287521
0,MFC,256,245,0.957031


### What is the distribution of segment_lvl 2?

In [3]:
# Per level-2 segment: number of distinct store address ids and the sum of the
# `is_migrated` flag, using pandas' built-in 'sum' aggregation.
grouped = possible_segmentations.groupby('segment_2').agg(
    distinct_sad_id = ('store_address_id','nunique'),
    migrated_sad_id = ('is_migrated', 'sum')
).reset_index()

# Share of migrated store addresses per segment.
# NOTE(review): the numerator sums rows while the denominator counts distinct
# ids, so this is a true share only with one row per store_address_id -- confirm.
grouped['perc_m'] = grouped['migrated_sad_id']/grouped['distinct_sad_id']

# Displayed sorted by segment size; `grouped` itself keeps its original order.
grouped.sort_values(by='distinct_sad_id', ascending=False)

Unnamed: 0,segment_2,distinct_sad_id,migrated_sad_id,perc_m
0,Groceries Partner,16396,10178,0.620761
2,Specialties,15387,1442,0.093715
1,MFC,256,245,0.957031


## Q2: Rest of metrics

In [5]:
# Date window for the metrics query. The values include SQL single quotes,
# presumably because they are substituted textually into the .sql file --
# confirm against QueryEngines.prepare_query.
START_DATE = "'2024-07-01'"
END_DATE = "'2024-09-15'"
END_DATE_RETENTION = "'2024-10-15'"

# One {'name', 'value'} entry per query parameter.
params = [
    {'name':'start_date','value':START_DATE},
    {'name':'end_date','value':END_DATE},
    {'name':'end_date_retention','value':END_DATE_RETENTION}
]

q.prepare_query(
    'metrics.sql'
    ,params=params
    ,to_load_file='metrics'
    ,load_from_to_load_file='metrics'
)

metrics  = q.query_run_starburst()

# Make sure the export folder exists, so the query result is not lost to a
# "non-existent directory" error when the notebook runs in a fresh checkout.
os.makedirs('outputs', exist_ok=True)
# Default to_csv settings: the DataFrame index is written as the first column.
metrics.to_csv('outputs/metrics.csv')
metrics.head()

Unnamed: 0,country,segment_1,segment_2,all_customers,f_customers,nf_customers,all_orders,f_orders,nf_orders,all_gmv,...,ratings_f_orders,ratings_nf_orders,all_orders_retained_Groceries,f_orders_retained_Groceries,nf_orders_retained_Groceries,all_feedback_orders_retained_Groceries,f_feedback_orders_retained_Groceries,nf_feedback_orders_retained_Groceries,n_rows,n_distinct_bought_product_ids
0,MD,Non Top Partner,Groceries Partner,641,395,274,1005,644,361,21315.26,...,0,0,580,390,190,0,0,0,4422,4422
1,RS,Top Partner,Groceries Partner,24475,19537,9168,73312,56709,16603,1635795.0,...,420,39,52628,41772,10856,305,280,25,687239,687239
2,UA,MFC,MFC,42694,21437,32034,150348,61264,89084,2242776.0,...,378,98,124416,52417,71999,367,292,75,715734,715734
3,PT,Specialties,Specialties,2521,1594,1234,5391,3760,1631,153633.0,...,23,1,4565,3336,1229,22,21,1,38256,38256
4,KZ,Top Partner,Groceries Partner,18927,13984,8527,46999,31504,15495,550830.4,...,312,37,33369,22685,10684,217,196,21,307524,307524


In [9]:
# Date window for the subsequent-orders query. The values include SQL single
# quotes, presumably because they are substituted textually into the .sql
# file -- confirm against QueryEngines.prepare_query.
START_DATE = "'2024-07-01'"
END_DATE = "'2024-09-15'"
END_DATE_RETENTION = "'2024-10-15'"
# Passed to the query as a string; its exact meaning is defined in
# subsequent_all.sql (presumably a percentile cut-off -- TODO confirm).
OUTLIERS_THRESHOLD = '0.98'

# One {'name', 'value'} entry per query parameter.
params = [
    {'name':'start_date','value':START_DATE},
    {'name':'end_date','value':END_DATE},
    {'name':'end_date_retention','value':END_DATE_RETENTION},
    {'name':'outliers_threshold','value':OUTLIERS_THRESHOLD}
]

q.prepare_query(
    'subsequent_all.sql'
    ,params=params
    ,to_load_file='subsequent_all'
    ,load_from_to_load_file='subsequent_all'
)

subsequent_all = q.query_run_starburst()

# Make sure the export folder exists, so the query result is not lost to a
# "non-existent directory" error when the notebook runs in a fresh checkout.
os.makedirs('outputs', exist_ok=True)
# Default to_csv settings: the DataFrame index is written as the first column.
subsequent_all.to_csv('outputs/subsequent_all.csv')
subsequent_all.head()

Unnamed: 0,country,segment_2,f_subsequent_orders_0,nf_subsequent_orders_0,f_avg_subsequent__all_orders,nf_avg_subsequent_all_orders,f_avg_subsequent__f_orders,nf_avg_subsequent_f_orders,ff_avg_subsequent_all_orders,ff_avg_subsequent_f_orders,n_rows,n_distinct_order_id
0,MD,Specialties,800,2493,0.8325,0.788608,0.46875,0.10349,0.166667,0.166667,3293,3293
1,GE,Groceries Partner,38421,28016,1.718722,1.511244,1.276672,0.616148,1.34767,1.071685,66437,66437
2,IT,Specialties,9648,5708,0.783893,0.71356,0.698072,0.152593,0.565217,0.543478,15356,15356
3,BG,Groceries Partner,45647,8114,2.257344,2.05965,2.036826,1.253512,1.860697,1.718905,53761,53761
4,IT,MFC,97397,60694,1.455568,1.283339,1.152828,0.493377,1.012428,0.821224,158091,158091


In [10]:
# Date window for the aggregated subsequent-orders query. The values include
# SQL single quotes, presumably because they are substituted textually into
# the .sql file -- confirm against QueryEngines.prepare_query.
START_DATE = "'2024-07-01'"
END_DATE = "'2024-09-15'"
END_DATE_RETENTION = "'2024-10-15'"
# Passed to the query as a string; its exact meaning is defined in
# subsequent_all_agg.sql (presumably a percentile cut-off -- TODO confirm).
OUTLIERS_THRESHOLD = '0.98'

# One {'name', 'value'} entry per query parameter.
params = [
    {'name':'start_date','value':START_DATE},
    {'name':'end_date','value':END_DATE},
    {'name':'end_date_retention','value':END_DATE_RETENTION},
    {'name':'outliers_threshold','value':OUTLIERS_THRESHOLD}
]

q.prepare_query(
    'subsequent_all_agg.sql'
    ,params=params
    ,to_load_file='subsequent_all_agg'
    ,load_from_to_load_file='subsequent_all_agg'
)

subsequent_all_agg = q.query_run_starburst()

# Make sure the export folder exists, so the query result is not lost to a
# "non-existent directory" error when the notebook runs in a fresh checkout.
os.makedirs('outputs', exist_ok=True)
# Default to_csv settings: the DataFrame index is written as the first column.
subsequent_all_agg.to_csv('outputs/subsequent_all_agg.csv')
subsequent_all_agg.head()

Unnamed: 0,segment_2,f_subsequent_orders_0,nf_subsequent_orders_0,f_avg_subsequent__all_orders,nf_avg_subsequent_all_orders,f_avg_subsequent__f_orders,nf_avg_subsequent_f_orders,ff_avg_subsequent_all_orders,ff_avg_subsequent_f_orders,n_rows,n_distinct_order_id
0,Groceries Partner,2587358,962156,2.565945,2.19768,2.155081,1.13597,2.076765,1.798951,3549514,3549514
1,MFC,530561,514598,2.131668,1.963469,1.5067,0.660347,1.581205,1.134219,1045159,1045159
2,Specialties,42367,92249,1.090849,1.346399,0.951354,0.063524,0.823077,0.734615,134616,134616


In [81]:
# Date window for the top-performers query. The values include SQL single
# quotes, presumably because they are substituted textually into the .sql
# file -- confirm against QueryEngines.prepare_query.
START_DATE = "'2024-07-01'"
END_DATE = "'2024-09-15'"
END_DATE_RETENTION = "'2024-10-15'"
# Both thresholds are passed to the query as strings; their exact meaning is
# defined in top_performers.sql (presumably quantile cut-offs on fresh-order
# share and order volume -- TODO confirm).
FO_THRESHOLD = '0.80'
ALL_ORDERS_THRESHOLD = '0.20'

# One {'name', 'value'} entry per query parameter.
params = [
    {'name':'start_date','value':START_DATE},
    {'name':'end_date','value':END_DATE},
    {'name':'end_date_retention','value':END_DATE_RETENTION},
    {'name':'fo_threshold','value':FO_THRESHOLD},
    {'name':'all_orders_threshold','value':ALL_ORDERS_THRESHOLD}
]

q.prepare_query(
    'top_performers.sql'
    ,params=params
    ,to_load_file='top_performers'
    ,load_from_to_load_file='top_performers'
)

top_performers = q.query_run_starburst()

# Make sure the export folder exists, so the query result is not lost to a
# "non-existent directory" error when the notebook runs in a fresh checkout.
os.makedirs('outputs', exist_ok=True)
# Default to_csv settings: the DataFrame index is written as the first column.
top_performers.to_csv('outputs/top_performers.csv')
top_performers.head()

  df = pd.read_sql(self.tp__read_query, conn)


Open the following URL in browser for the external authentication:
https://starburst.g8s-data-platform-prod.glovoint.com/oauth2/token/initiate/fde2ef090aa46c06396612e4eff539206e798116cdb2228a858d1ef80b2e1f7d


Unnamed: 0,country,segment_2,n_store_ids,n_orders,perc_fresh_orders
0,ME,Specialties,3,424,0.91
1,NG,Groceries Partner,1,144,0.53
2,MD,Specialties,3,287,0.85
3,CI,Specialties,2,1881,0.83
4,KG,Groceries Partner,1,14744,0.57
