In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import json
# import gspread
# from gspread_dataframe import set_with_dataframe
from datetime import date, timedelta
# from oauth2client.service_account import ServiceAccountCredentials
import setup_environment_2
import importlib
import import_ipynb
import warnings
from IPython.display import display
# Silence every warning category for the remainder of the session.
warnings.filterwarnings("ignore")
# Re-import the helper module so edits made to it after kernel start are picked up.
importlib.reload(setup_environment_2)
# NOTE(review): presumably populates the SNOWFLAKE_* environment variables that
# query_snowflake reads — confirm against setup_environment_2.
setup_environment_2.initialize_env()

/home/ec2-user/.Renviron
/home/ec2-user/service_account_key.json


In [2]:
def query_snowflake(query, columns=None):
    """Run ``query`` on Snowflake and return the rows as a DataFrame.

    Connection settings are read from the ``SNOWFLAKE_USERNAME``,
    ``SNOWFLAKE_ACCOUNT``, ``SNOWFLAKE_PASSWORD`` and ``SNOWFLAKE_DATABASE``
    environment variables; the ``COMPUTE_WH`` warehouse is selected before
    the query is executed.

    Parameters
    ----------
    query : str
        SQL text to execute.
    columns : sequence of str, optional
        Column labels for the result. When omitted (or empty) the frame gets
        default integer column labels.

    Returns
    -------
    pandas.DataFrame or None
        The fetched rows. An empty result set yields an empty frame that
        still carries ``columns``. ``None`` is returned when executing or
        fetching fails; the error is printed, not raised.

    Raises
    ------
    KeyError
        If one of the required environment variables is missing.
    Exception
        Whatever ``snowflake.connector.connect`` raises; connection errors
        are not caught here.
    """
    import os
    import snowflake.connector
    import numpy as np
    import pandas as pd

    con = snowflake.connector.connect(
        user=os.environ["SNOWFLAKE_USERNAME"],
        account=os.environ["SNOWFLAKE_ACCOUNT"],
        password=os.environ["SNOWFLAKE_PASSWORD"],
        database=os.environ["SNOWFLAKE_DATABASE"],
    )
    # Bound up front so the ``finally`` clause is safe even if cursor() fails.
    cur = None
    try:
        cur = con.cursor()
        cur.execute("USE WAREHOUSE COMPUTE_WH")
        cur.execute(query)
        rows = cur.fetchall()

        names = list(columns) if columns is not None and len(columns) > 0 else None
        if not rows:
            # np.array([]) is 1-D and cannot be labelled with several column
            # names, so build the empty frame directly.
            return pd.DataFrame(columns=names)

        data = np.array(rows)
        if names is None:
            return pd.DataFrame(data)
        return pd.DataFrame(data, columns=names)
    except Exception as e:
        # Best-effort contract kept from the original: report and return None.
        print("Error: ", e)
        return None
    finally:
        if cur is not None:
            cur.close()
        con.close()

In [3]:
# Marketplace offers per region.
#   seller_region: one row per (retailer, active seller) with the retailer's
#                  region; 'Greater Cairo' is broken down to city level.
#   recent_price : warehouse products with AVAILABLE > 0, total_stock > 0 and
#                  activation = 'true', plus the owning warehouse's seller and
#                  minimum ticket size.
# The final SELECT DISTINCT pairs each offer with every region its seller serves.
command_string = '''
with seller_region as
(
select seller_retailer.retailer_id,
case when regions.name_en = 'Greater Cairo' then cities.name_en else regions.name_en end as region,
seller_id,
seller_retailer.POLYGON_ID
from MATERIALIZED_VIEWS.SELLERS_RETAILERS_MAPPING seller_retailer
join retailers on retailers.id = seller_retailer.retailer_id
JOIN materialized_views.retailer_polygon on materialized_views.retailer_polygon.retailer_id=retailers.id
JOIN districts on districts.id=materialized_views.retailer_polygon.district_id
JOIN cities on cities.id=districts.city_id
join states on states.id=cities.state_id
join regions on regions.id=states.region_id
join egypt_marketplace.sellers on sellers.id = seller_retailer.seller_id and sellers.status = 'ACTIVE'
),

recent_price as (

select
wp.product_id as product_id,
wp.packing_unit_id as product_pu,
wp.price as price,
wp.max_per_order,
warehouses.seller_id as seller_id,
warehouses.MIN_TICKET_SIZE
from egypt_marketplace.warehouse_products wp
left join egypt_marketplace.warehouses on warehouses.id = wp.warehouse_id 
where wp.AVAILABLE > 0 and wp.total_stock > 0
and activation = 'true'
)

select
distinct
seller_region.region,
recent_price.*

from recent_price
join seller_region on seller_region.seller_id = recent_price.seller_id'''
mp = query_snowflake(
    command_string,
    columns=[
        'region',
        'product_id',
        'product_pu',
        'price',
        'max_per_order',
        'seller_id',
        'min_ticket_size',
    ],
)
# Every column except the region label is cast to a numeric dtype.
mp = mp.assign(**{
    name: pd.to_numeric(mp[name])
    for name in (
        'product_id',
        'product_pu',
        'price',
        'max_per_order',
        'seller_id',
        'min_ticket_size',
    )
})

In [4]:
#remove rows with region as Delta West or Upper Egypt
#mp = mp[~mp['region'].isin(['Delta West', 'Upper Egypt'])]


In [7]:
# Number of distinct values per column of the marketplace offers frame.
unique_values = {name: mp[name].nunique() for name in mp.columns}


In [8]:
# Work on an independent copy so the raw query result in `mp` stays untouched.
mp_pre = mp.copy(deep=True)

In [9]:
# Packing units per product together with their basic-unit count (buc).
command_string = '''
select
product_id,
PACKING_UNIT_ID as pu_id,
BASIC_UNIT_COUNT as buc

from PACKING_UNIT_PRODUCTS'''
pus = query_snowflake(
    command_string,
    columns=['product_id', 'pu_id', 'buc'],
)
# All three columns are cast to numeric dtypes.
pus = pus.assign(**{
    name: pd.to_numeric(pus[name])
    for name in ('product_id', 'pu_id', 'buc')
})

In [10]:
# Cost rows from finance.all_cogs whose [from_date, to_date] window contains
# the current timestamp.
command_string = '''
    select 
f.product_id,
f.wac1,f.wac4,f.wac_p
from finance.all_cogs f
where current_timestamp BETWEEN f.from_date AND f.to_date'''
wacs = query_snowflake(
    command_string,
    columns=['product_id', 'wac1', 'wac4', 'wac_p'],
)
# All four columns are cast to numeric dtypes.
wacs = wacs.assign(**{
    name: pd.to_numeric(wacs[name])
    for name in ('product_id', 'wac1', 'wac4', 'wac_p')
})

In [11]:
# Attach the product-level cost columns to every packing unit; packing units
# without a cost row are kept with NaN costs (left join).
pu_wacs = pus.merge(wacs, how='left', on='product_id')

In [12]:
# Scale the cost columns by the basic-unit count to get a per-packing-unit cost.
pu_wacs['pu_wac1'] = pu_wacs['buc'].mul(pu_wacs['wac1'])
pu_wacs['pu_wac4'] = pu_wacs['buc'].mul(pu_wacs['wac4'])

In [13]:
# Rename the offer's packing-unit column so it stays distinct from the
# `pu_id` column of the packing-unit table.
mp_pre = mp_pre.rename(columns={'product_pu': 'mp_pu_id'})

In [14]:
# Inner join on product_id only: every marketplace offer is paired with every
# packing unit (and its cost) of the same product.
# NOTE(review): mp_pu_id is not matched against pu_id here — confirm that
# pairing an offer with all packing units of the product is intended.
mp_pre_pre = mp_pre.merge(pu_wacs, how='inner', on=['product_id'])


In [15]:
# Percentage gap between the offer price and the packing-unit WAC4,
# rounded to two decimals: (price - pu_wac4) / pu_wac4 * 100.
mp_pre_pre['pu_wac4_diff'] = (
    mp_pre_pre['price'].astype(float)
    .sub(mp_pre_pre['pu_wac4'].astype(float))
    .div(mp_pre_pre['pu_wac4'].astype(float))
    .mul(100)
    .round(2)
)


In [16]:
# Flag rows whose price is within -40% .. +40% (inclusive) of the
# packing-unit WAC4; NaN differences are flagged False.
mp_pre_pre['pu_wac4_diff_flag'] = mp_pre_pre['pu_wac4_diff'].between(-40, 40)


In [20]:
# Drop every row that was not flagged as being within the WAC4 band.
mp_pre_pre = mp_pre_pre[mp_pre_pre['pu_wac4_diff_flag'].eq(True)]

In [22]:
# Keep only the columns used downstream; note that `pu_id` (from the
# packing-unit table) is kept, not the offer's own `mp_pu_id`.
mp_final = mp_pre_pre.loc[
    :,
    [
        'region',
        'product_id',
        'price',
        'max_per_order',
        'seller_id',
        'min_ticket_size',
        'pu_id',
    ],
]


In [23]:
# Quantile-range outlier filter (trims both tails, with a wider upper fence).
def remove_high_outliers_iqr(
    df,
    column='min_ticket_size',
    lower_quantile=0.25,
    upper_quantile=0.85,
    lower_whisker=1.5,
    upper_whisker=3,
):
    """Return the rows of ``df`` whose ``column`` value lies inside the fences.

    Despite the name, rows are dropped on BOTH sides, and the "IQR" is the
    spread between ``lower_quantile`` and ``upper_quantile`` (0.25 and 0.85 by
    default, not the classic 0.25/0.75):

        spread = Q_upper - Q_lower
        keep   Q_lower - lower_whisker * spread <= value <= Q_upper + upper_whisker * spread

    Rows whose value is NaN fail both comparisons and are therefore dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame (or one group of a groupby) to filter.
    column : str
        Numeric column the fences are computed on.
    lower_quantile, upper_quantile : float
        Quantiles that define the spread.
    lower_whisker, upper_whisker : float
        Multipliers of the spread for the lower and upper fence.

    Returns
    -------
    pandas.DataFrame
        The subset of ``df`` inside the fences (bounds inclusive).
    """
    values = df[column]
    q_low = values.quantile(lower_quantile)
    q_high = values.quantile(upper_quantile)
    spread = q_high - q_low
    upper_bound = q_high + upper_whisker * spread
    lower_bound = q_low - lower_whisker * spread
    return df[(values <= upper_bound) & (values >= lower_bound)]

# Trim min_ticket_size outliers separately inside each
# (region, product, packing unit) group.
filtered_df = mp_final.groupby(
    by=['region', 'product_id', 'pu_id'],
    group_keys=False,
).apply(remove_high_outliers_iqr)


In [24]:
def remove_outliers(filtered_df):
    """Drop rows with an unusually low ``max_per_order`` within each group.

    Groups are (region, product_id, pu_id). Inside a group the lower fence is
    ``Q(0.25) - 1.5 * (Q(0.85) - Q(0.25))`` of ``max_per_order``; rows below
    the fence are removed. No upper fence is applied, so high values are
    always kept. Rows whose ``max_per_order`` is NaN fail the comparison and
    are dropped.

    Parameters
    ----------
    filtered_df : pandas.DataFrame
        Must contain ``region``, ``product_id``, ``pu_id`` and
        ``max_per_order``.

    Returns
    -------
    pandas.DataFrame
        The rows that pass the per-group lower fence.
    """
    def iqr_filter(group):
        # Lower fence only; the spread uses the 0.25 and 0.85 quantiles.
        limits = group['max_per_order']
        q_low = limits.quantile(0.25)
        q_high = limits.quantile(0.85)
        lower_fence = q_low - 1.5 * (q_high - q_low)
        return group[limits >= lower_fence]

    grouped = filtered_df.groupby(['region', 'product_id', 'pu_id'], group_keys=False)
    return grouped.apply(iqr_filter)
# Apply the per-group max_per_order filter on top of the min_ticket_size filter.
filtered_df = remove_outliers(filtered_df=filtered_df)

In [26]:
# 10th / 90th percentile of the price inside one group.
def get_percentile_bounds(group):
    """Return the 10th and 90th price percentiles of ``group``.

    ``price`` is cast to float first so Decimal values can be mixed with
    float arithmetic. The result is a Series labelled ``min_price`` and
    ``max_price``.
    """
    prices = group['price'].astype(float)
    return pd.Series(
        [prices.quantile(0.10), prices.quantile(0.90)],
        index=['min_price', 'max_price'],
    )


# One row per (region, product, packing unit) with the 10th/90th
# percentile price bounds.
price_bounds = (
    filtered_df
    .groupby(by=['region', 'product_id', 'pu_id'])
    .apply(get_percentile_bounds)
    .reset_index()
)


In [29]:
# Select the minimum and maximum price of one group.
def get_min_max_price(group):
    """Return the smallest and largest ``price`` of ``group``.

    The result is a Series labelled ``true_min_price`` and ``true_max_price``.
    """
    prices = group['price']
    return pd.Series(
        [prices.min(), prices.max()],
        index=['true_min_price', 'true_max_price'],
    )
# One row per (region, product, packing unit) with the observed
# minimum and maximum price.
min_max = (
    filtered_df
    .groupby(by=['region', 'product_id', 'pu_id'])
    .apply(get_min_max_price)
    .reset_index()
)

In [30]:
# Most frequent price per product, packing unit and region.
def calculate_mod_price(group):
    """Return the mode of ``group['price']``.

    When several prices tie, the first entry of ``Series.mode()`` is returned
    (pandas sorts the modes, so this is the smallest). NaN is returned when
    there is no mode at all, e.g. for an empty group.
    """
    modes = group['price'].mode()
    if modes.empty:
        return np.nan
    return modes[0]
# One row per (region, product, packing unit) with its modal price.
mod_price = (
    filtered_df
    .groupby(by=['region', 'product_id', 'pu_id'])
    .apply(calculate_mod_price)
    .reset_index(name='mod_price')
)
# Add the modal price and the observed min/max price to the percentile
# bounds; both are left joins on the group keys.
price_bounds = (
    price_bounds
    .merge(mod_price, how='left', on=['region', 'product_id', 'pu_id'])
    .merge(min_max, how='left', on=['region', 'product_id', 'pu_id'])
)


In [31]:
# Spot check: bounds for product 1309 in Cairo.
price_bounds.loc[
    price_bounds['product_id'].eq(1309) & price_bounds['region'].eq('Cairo')
]

Unnamed: 0,region,product_id,pu_id,min_price,max_price,mod_price,true_min_price,true_max_price
2789,Cairo,1309,1,550.3,577.0,560.0,540.0,600.0
2790,Cairo,1309,2,48.2,49.8,48.0,48.0,50.0
2791,Cairo,1309,23,48.2,49.8,48.0,48.0,50.0


In [None]:
# Marketplace sales of the last 100 days for orders with status = 6,
# summed per seller / product / packing unit / item price.
# NOTE(review): status 6 presumably marks completed orders — confirm.
command_string = '''
select
seller_id,
product_id,
packing_unit_id as pu_id,
item_price,
sum (sop.total_price) as nmv
from egypt_marketplace.sales_orders so
join egypt_marketplace.sales_order_products sop on sop.order_id = so.id

where so.status = 6 
and so.created_at::date >= current_date - 100

group by all '''
mp_sales = query_snowflake(
    command_string,
    columns=['seller_id', 'product_id', 'pu_id', 'item_price', 'nmv'],
)
# All five columns are cast to numeric dtypes.
mp_sales = mp_sales.assign(**{
    name: pd.to_numeric(mp_sales[name])
    for name in ('seller_id', 'product_id', 'pu_id', 'item_price', 'nmv')
})

In [None]:
# Marketplace sales of the last 5 days, excluding orders with status 3, 7
# or 8, summed per seller / product / packing unit / item price.
# NOTE(review): statuses 3/7/8 presumably mark cancelled or rejected
# orders — confirm.
command_string = '''
select
seller_id,
product_id,
packing_unit_id as pu_id,
item_price,
sum (sop.total_price) as nmv
from egypt_marketplace.sales_orders so
join egypt_marketplace.sales_order_products sop on sop.order_id = so.id

where so.status not in (3,7,8)
and so.created_at::date >= current_date - 5

group by all'''
mp_sales_recent = query_snowflake(
    command_string,
    columns=['seller_id', 'product_id', 'pu_id', 'item_price', 'nmv'],
)
# All five columns are cast to numeric dtypes.
mp_sales_recent = mp_sales_recent.assign(**{
    name: pd.to_numeric(mp_sales_recent[name])
    for name in ('seller_id', 'product_id', 'pu_id', 'item_price', 'nmv')
})

In [None]:
# Keep only offers whose seller sold that product / packing unit in the
# 100-day window, attaching the sold value (nmv) to each offer row.
filtered_df_sales = filtered_df.merge(
    mp_sales,
    on=['seller_id', 'product_id', 'pu_id'],
    how='inner',
)


In [None]:
# NMV-weighted average offer price per (region, product, packing unit):
# sum(price * nmv) / sum(nmv).
filtered_df_sales = filtered_df_sales.assign(
    price=pd.to_numeric(filtered_df_sales['price']),
    nmv=pd.to_numeric(filtered_df_sales['nmv']),
)
weighted_avg_price = (
    filtered_df_sales
    .groupby(by=['region', 'product_id', 'pu_id'])
    .apply(lambda grp: grp['price'].mul(grp['nmv']).sum() / grp['nmv'].sum())
    .reset_index(name='weighted_avg_price')
)


In [None]:
# Same join as for the 100-day window, but against the 5-day sales.
filtered_df_sales_recent = filtered_df.merge(
    mp_sales_recent,
    on=['seller_id', 'product_id', 'pu_id'],
    how='inner',
)


In [None]:
# NMV-weighted average offer price per (region, product, packing unit)
# over the 5-day window: sum(price * nmv) / sum(nmv).
filtered_df_sales_recent = filtered_df_sales_recent.assign(
    price=pd.to_numeric(filtered_df_sales_recent['price']),
    nmv=pd.to_numeric(filtered_df_sales_recent['nmv']),
)
weighted_avg_price_recent = (
    filtered_df_sales_recent
    .groupby(by=['region', 'product_id', 'pu_id'])
    .apply(lambda grp: grp['price'].mul(grp['nmv']).sum() / grp['nmv'].sum())
    .reset_index(name='weighted_avg_price_recent')
)


In [None]:
# Put both weighted averages side by side; groups with no 5-day sales keep
# a NaN `weighted_avg_price_recent` (left join).
weighted_avg_price = weighted_avg_price.merge(
    weighted_avg_price_recent,
    how='left',
    on=['region', 'product_id', 'pu_id'],
)

In [None]:
# Inner join: groups that have no row in weighted_avg_price are dropped
# from the price bounds.
price_bounds = price_bounds.merge(
    weighted_avg_price,
    how='inner',
    on=['region', 'product_id', 'pu_id'],
)

In [None]:
# Remove fully identical rows, keeping the first occurrence.
price_bounds = price_bounds.drop_duplicates(keep='first')

In [None]:
# Round every numeric column to two decimals for presentation.
price_bounds = price_bounds.round(decimals=2)

In [None]:
# Spot check: final bounds for product 3421 across all regions.
price_bounds.loc[price_bounds['product_id'].eq(3421)]