# Data Extraction Module for Pricing & Offers System


## Imports


In [1]:
import os
import numpy as np
import pandas as pd
import snowflake.connector
import setup_environment_2
import slack
import pytz
from common_functions import upload_dataframe_to_snowflake,send_text_slack

# Cairo timezone for consistent timestamps
CAIRO_TZ = pytz.timezone('Africa/Cairo')


  warn_incompatible_dep(


## Variables


In [2]:
# Region
REGION = "Egypt"

# Snowflake Warehouse
WAREHOUSE = "COMPUTE_WH"

# Date Variables
from datetime import datetime, timedelta
TODAY = datetime.now(CAIRO_TZ).date()
YESTERDAY = TODAY - timedelta(days=1)


## Initialize Environment


In [3]:
setup_environment_2.initialize_env()


/home/ec2-user/.Renviron
/home/ec2-user/service_account_key.json


## Warehouse & Cohort Mapping


In [4]:
# Warehouse Mapping: (region, warehouse_name, warehouse_id, cohort_id)
WAREHOUSE_MAPPING = [
    ('Cairo', 'Mostorod', 1, 700),
    ('Giza', 'Barageel', 236, 701),
    ('Giza', 'Sakkarah', 962, 701),
    ('Delta West', 'El-Mahala', 337, 703),
    ('Delta West', 'Tanta', 8, 703),
    ('Delta East', 'Mansoura FC', 339, 704),
    ('Delta East', 'Sharqya', 170, 704),
    ('Upper Egypt', 'Assiut FC', 501, 1124),
    ('Upper Egypt', 'Bani sweif', 401, 1126),
    ('Upper Egypt', 'Menya Samalot', 703, 1123),
    ('Upper Egypt', 'Sohag', 632, 1125),
    ('Alexandria', 'Khorshed Alex', 797, 702),
]



# All Cohort IDs
COHORT_IDS = [700, 701, 702, 703, 704, 1123, 1124, 1125, 1126]


## Snowflake Query Function


In [5]:
def query_snowflake(query):
    """Execute a query on Snowflake and return results as DataFrame."""
    con = snowflake.connector.connect(
        user=os.environ["SNOWFLAKE_USERNAME"],
        account=os.environ["SNOWFLAKE_ACCOUNT"],
        password=os.environ["SNOWFLAKE_PASSWORD"],
        database=os.environ["SNOWFLAKE_DATABASE"]
    )
    try:
        cur = con.cursor()
        cur.execute("USE WAREHOUSE COMPUTE_WH")
        cur.execute(query)
        data = cur.fetchall()
        columns = [desc[0].lower() for desc in cur.description]  # Get column names from cursor
        return pd.DataFrame(data, columns=columns)
    except Exception as e:
        print(f"Snowflake Error: {e}")
        return pd.DataFrame()
    finally:
        cur.close()
        con.close()


In [6]:
def get_snowflake_timezone():
    """Get the current timezone from Snowflake."""
    query = "SHOW PARAMETERS LIKE 'TIMEZONE'"
    result = query_snowflake(query)
    return result.value[0] if len(result) > 0 else "UTC"


## Helper Functions


In [7]:
def get_warehouse_df():
    """Get warehouse mapping as DataFrame."""
    return pd.DataFrame(
        WAREHOUSE_MAPPING,
        columns=['region', 'warehouse', 'warehouse_id', 'cohort_id'])
    


def convert_to_numeric(df):
    """Convert DataFrame columns to numeric where possible."""
    df.columns = df.columns.str.lower()
    for col in df.columns:
        df[col] = pd.to_numeric(df[col], errors='ignore')
    return df


## Get Snowflake Timezone


In [8]:
TIMEZONE = get_snowflake_timezone()
print(f"Snowflake timezone: {TIMEZONE}")


Snowflake timezone: America/Los_Angeles


In [9]:
# =============================================================================
# 8. ALL-TIME HIGH MARGIN QUERY (P80 margin weighted by gross profit)
# This calculates the top 80% margin based on days with best gross profit
# Gross Profit = NMV × margin, so high-margin + high-sales days rank higher
# =============================================================================
ALL_TIME_HIGH_MARGIN_QUERY = f'''
WITH daily_margin_data AS (
    SELECT 
        pso.warehouse_id,
        pso.product_id,
        so.created_at::DATE AS sale_date,
        SUM(pso.total_price) AS daily_nmv,
        SUM(COALESCE(f.wac_p, 0) * pso.purchased_item_count * pso.basic_unit_count) AS daily_cogs,
        CASE 
            WHEN SUM(pso.total_price) > 0 
            THEN (SUM(pso.total_price) - SUM(COALESCE(f.wac_p, 0) * pso.purchased_item_count * pso.basic_unit_count)) / SUM(pso.total_price)
            ELSE 0 
        END AS daily_margin
    FROM product_sales_order pso
    JOIN sales_orders so ON so.id = pso.sales_order_id
    JOIN finance.all_cogs f ON f.product_id = pso.product_id
        AND f.from_date::DATE <= so.created_at::DATE 
        AND f.to_date::DATE > so.created_at::DATE
    WHERE so.created_at::DATE >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 240
        AND so.created_at::DATE < CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE
        AND so.sales_order_status_id NOT IN (7, 12)
        AND so.channel IN ('telesales', 'retailer')
        AND pso.purchased_item_count <> 0
    GROUP BY pso.warehouse_id, pso.product_id, so.created_at::DATE
),

-- Calculate gross profit and rank days by it
ranked_by_gross_profit AS (
    SELECT 
        warehouse_id,
        product_id,
        sale_date,
        daily_nmv,
        daily_margin,
        daily_nmv * daily_margin AS gross_profit,
        -- Rank by gross profit (1 = highest gross profit day)
        ROW_NUMBER() OVER (
            PARTITION BY warehouse_id, product_id 
            ORDER BY daily_nmv * daily_margin DESC
        ) AS gp_rank,
        COUNT(*) OVER (PARTITION BY warehouse_id, product_id) AS total_days
    FROM daily_margin_data
    WHERE daily_nmv > 0 AND daily_margin > 0
)

-- Take P80 of margins from TOP-ranked days by gross profit
-- P80 = take the 80th percentile of margins from the best-performing days
SELECT 
    warehouse_id,
    product_id,
    -- P80 margin from days ranked by gross profit
    PERCENTILE_CONT(0.8) WITHIN GROUP (ORDER BY daily_margin) AS all_time_high_margin,
    -- Also provide some context stats
    MAX(daily_margin) AS max_daily_margin,
    AVG(daily_margin) AS avg_daily_margin,
    COUNT(*) AS days_with_profit
FROM ranked_by_gross_profit
-- Include only top 80% of days by gross profit rank
WHERE gp_rank <= GREATEST(1, CEIL(total_days * 0.8))
GROUP BY warehouse_id, product_id
'''

print("All-time high margin query defined (P80 margin weighted by gross profit)")


All-time high margin query defined (P80 margin weighted by gross profit)


## Market Prices Extraction Queries
Queries for external market price data:
1. **Ben Soliman Prices** - Competitor reference prices
2. **Marketplace Prices** - Min, Max, Mod prices from marketplace
3. **Scrapped Data** - Competitor prices from scraping


In [10]:
# =============================================================================
# 1. BEN SOLIMAN PRICES QUERY
# =============================================================================
BEN_SOLIMAN_QUERY = f'''
WITH lower as (
    select distinct product_id, sku, new_d*bs_price as ben_soliman_price, INJECTION_DATE
    from (
        select maxab_product_id as product_id, maxab_sku as sku, INJECTION_DATE, wac1, wac_p,
            (bs_price/bs_unit_count) as bs_price, diff, cu_price,
            case when p1 > 1 then child_quantity else 0 end as scheck,
            round(p1/2)*2 as p1, p2,
            case when (ROUND(p1 / scheck) * scheck) = 0 then p1 else (ROUND(p1 / scheck) * scheck) end as new_d
        from (
            select sm.*, wac1, wac_p, 
                abs((bs_price/bs_unit_count)-(wac_p*maxab_basic_unit_count))/(wac_p*maxab_basic_unit_count) as diff,
                cpc.price as cu_price, pup.child_quantity,
                round((cu_price/(bs_price/bs_unit_count))) as p1, 
                round(((bs_price/bs_unit_count)/cu_price)) as p2
            from materialized_views.savvy_mapping sm 
            join finance.all_cogs f on f.product_id = sm.maxab_product_id 
                and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_Date and f.to_date
            join PACKING_UNIT_PRODUCTS pu on pu.product_id = sm.maxab_product_id and pu.IS_BASIC_UNIT = 1 
            join cohort_product_packing_units cpc on cpc.PRODUCT_PACKING_UNIT_ID = pu.id and cohort_id = 700 
            join packing_unit_products pup on pup.product_id = sm.maxab_product_id and pup.is_basic_unit = 1  
            where bs_price is not null 
                and INJECTION_DATE::date >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 5 
                and diff > 0.3 and p1 > 1
        )
    )
    qualify max(INJECTION_DATE) over(partition by product_id) = INJECTION_DATE
),

m_bs as (
    select z.* from (
        select maxab_product_id as product_id, maxab_sku as sku, avg(bs_final_price) as ben_soliman_price, INJECTION_DATE
        from (
            select *, row_number() over(partition by maxab_product_id order by diff) as rnk_2 
            from (
                select *, (bs_final_price-wac_p)/wac_p as diff_2 
                from (
                    select *, bs_price/maxab_basic_unit_count as bs_final_price 
                    from (
                        select *, row_number() over(partition by maxab_product_id, maxab_pu order by diff) as rnk 
                        from (
                            select *, max(INJECTION_DATE::date) over(partition by maxab_product_id, maxab_pu) as max_date
                            from (
                                select sm.*, wac1, wac_p, 
                                    abs(bs_price-(wac_p*maxab_basic_unit_count))/(wac_p*maxab_basic_unit_count) as diff 
                                from materialized_views.savvy_mapping sm 
                                join finance.all_cogs f on f.product_id = sm.maxab_product_id 
                                    and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_Date and f.to_date
                                where bs_price is not null 
                                    and INJECTION_DATE::date >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 5 
                                    and diff < 0.3
                            )
                            qualify max_date = INJECTION_DATE
                        ) qualify rnk = 1 
                    )
                ) where diff_2 between -0.5 and 0.5 
            ) qualify rnk_2 = 1 
        ) group by all
    ) z 
    join finance.all_cogs f on f.product_id = z.product_id 
        and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_Date and f.to_date
    where ben_soliman_price between f.wac_p*0.8 and f.wac_p*1.3
)

select product_id, avg(ben_soliman_price) as ben_soliman_price
from (
    select * from (
        select * from m_bs 
        union all
        select * from lower
    )
    qualify max(INJECTION_DATE) over(partition by product_id) = INJECTION_DATE
)
group by all
'''


In [11]:
# =============================================================================
# 2. MARKETPLACE PRICES QUERY (with region fallback)
# =============================================================================
MARKETPLACE_PRICES_QUERY = f'''
WITH MP as (
    select region, product_id,
        min(min_price) as min_price, min(max_price) as max_price,
        min(mod_price) as mod_price, min(true_min) as true_min, min(true_max) as true_max
    from (
        select mp.region, mp.product_id, mp.pu_id,
            min_price/BASIC_UNIT_COUNT as min_price,
            max_price/BASIC_UNIT_COUNT as max_price,
            mod_price/BASIC_UNIT_COUNT as mod_price,
            TRUE_MIN_PRICE/BASIC_UNIT_COUNT as true_min,
            TRUE_MAX_PRICE/BASIC_UNIT_COUNT as true_max
        from materialized_views.marketplace_prices mp 
        join packing_unit_products pup on pup.product_id = mp.product_id and pup.packing_unit_id = mp.pu_id
        join finance.all_cogs f on f.product_id = mp.product_id 
            and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_date and f.to_date
        where least(min_price, mod_price) between wac_p*0.9 and wac_p*1.3 
    )
    group by all 
),

region_mapping AS (
    SELECT * FROM (VALUES
        ('Delta East', 'Delta West'), ('Delta West', 'Delta East'),
        ('Alexandria', 'Cairo'), ('Alexandria', 'Giza'),
        ('Upper Egypt', 'Cairo'), ('Upper Egypt', 'Giza'),
        ('Cairo', 'Giza'), ('Giza', 'Cairo'),
        ('Delta West', 'Cairo'), ('Delta East', 'Cairo'),
        ('Delta West', 'Giza'), ('Delta East', 'Giza')
    ) AS region_mapping(region, fallback_region)
),

all_regions as (
    SELECT * FROM (VALUES
        ('Cairo'), ('Giza'), ('Delta West'), ('Delta East'), ('Upper Egypt'), ('Alexandria')
    ) AS x(region)
),

full_data as (
    select products.id as product_id, ar.region
    from products, all_regions ar
    where activation = 'true'
)

select region, product_id,
    min(final_min_price) as final_min_price, 
    min(final_max_price) as final_max_price,
    min(final_mod_price) as final_mod_price, 
    min(final_true_min) as final_true_min,
    min(final_true_max) as final_true_max
from (
    SELECT distinct w.region, w.product_id,
        COALESCE(m1.min_price, m2.min_price) AS final_min_price,
        COALESCE(m1.max_price, m2.max_price) AS final_max_price,
        COALESCE(m1.mod_price, m2.mod_price) AS final_mod_price,
        COALESCE(m1.true_min, m2.true_min) AS final_true_min,
        COALESCE(m1.true_max, m2.true_max) AS final_true_max
    FROM full_data w
    LEFT JOIN MP m1 ON w.region = m1.region and w.product_id = m1.product_id
    LEFT JOIN region_mapping rm ON w.region = rm.region
    LEFT JOIN MP m2 ON rm.fallback_region = m2.region AND w.product_id = m2.product_id
)
where final_min_price is not null 
group by all
'''


In [12]:
# =============================================================================
# 3. SCRAPPED DATA QUERY (Competitor prices from scraping)
# =============================================================================
SCRAPPED_DATA_QUERY = f'''
select product_id, region,
    MIN(market_price) AS min_scrapped,
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY market_price) AS scrapped25,
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY market_price) AS scrapped50,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY market_price) AS scrapped75,
    MAX(market_price) AS max_scrapped
from (
    select distinct cmp.*, max(date) over(partition by region, cmp.product_id, competitor) as max_date
    from MATERIALIZED_VIEWS.CLEANED_MARKET_PRICES cmp
    join finance.all_cogs f on f.product_id = cmp.product_id 
        and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_date and f.to_date 
    where date >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 7 
        and MARKET_PRICE between f.wac_p * 0.8 and wac_p * 1.3
    qualify date = max_date 
)
group by all
'''


In [13]:
## Additional Data Queries (Sales, Groups, WAC)


In [14]:
# =============================================================================
# 4. PRODUCT BASE DATA QUERY (product_id, sku, brand, cat, wac1, wac_p, current_price)
# =============================================================================
PRODUCT_BASE_QUERY = f'''
WITH skus_prices AS (
    WITH local_prices AS (
        SELECT  
            CASE 
                WHEN cpu.cohort_id IN (700, 695) THEN 'Cairo'
                WHEN cpu.cohort_id IN (701) THEN 'Giza'
                WHEN cpu.cohort_id IN (704, 698) THEN 'Delta East'
                WHEN cpu.cohort_id IN (703, 697) THEN 'Delta West'
                WHEN cpu.cohort_id IN (696, 1123, 1124, 1125, 1126) THEN 'Upper Egypt'
                WHEN cpu.cohort_id IN (702, 699) THEN 'Alexandria'
            END AS region,
            cohort_id,
            pu.product_id,
            pu.packing_unit_id,
            pu.basic_unit_count,
            AVG(cpu.price) AS price
        FROM cohort_product_packing_units cpu
        JOIN PACKING_UNIT_PRODUCTS pu ON pu.id = cpu.product_packing_unit_id
        WHERE cpu.cohort_id IN (700,701,702,703,704,695,696,697,698,699,1123,1124,1125,1126)
            AND cpu.created_at::date <> '2023-07-31'
            AND cpu.is_customized = TRUE
        GROUP BY ALL
    ),
    
    live_prices AS (
        SELECT 
            region, cohort_id, product_id, 
            pu_id AS packing_unit_id, 
            buc AS basic_unit_count, 
            NEW_PRICE AS price
        FROM materialized_views.DBDP_PRICES
        WHERE created_at = CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date
            AND DATE_PART('hour', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::time) 
                BETWEEN SPLIT_PART(time_slot, '-', 1)::int AND (SPLIT_PART(time_slot, '-', 1)::int) + 1
            AND cohort_id IN (700,701,702,703,704,695,696,697,698,699,1123,1124,1125,1126)
    ),
    
    prices AS (
        SELECT *
        FROM (
            SELECT *, 1 AS priority FROM live_prices
            UNION ALL
            SELECT *, 2 AS priority FROM local_prices
        )
        QUALIFY ROW_NUMBER() OVER (PARTITION BY region, cohort_id, product_id, packing_unit_id ORDER BY priority) = 1
    )
    
    SELECT region, cohort_id, product_id, price
    FROM prices
    WHERE basic_unit_count = 1
        AND ((product_id = 1309 AND packing_unit_id = 2) OR (product_id <> 1309))
)

SELECT distinct
    region, cohort_id, p.product_id,
    CONCAT(products.name_ar, ' ', products.size, ' ', product_units.name_ar) AS sku,
    b.name_ar AS brand,
    cat.name_ar AS cat,
    wac1, wac_p, p.price as current_price
FROM skus_prices p
JOIN finance.all_cogs c ON c.product_id = p.product_id 
    AND CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) BETWEEN c.from_date AND c.to_date
JOIN products ON products.id = p.product_id
JOIN categories cat ON cat.id = products.category_id
JOIN brands b ON b.id = products.brand_id
JOIN product_units ON product_units.id = products.unit_id
WHERE wac1 > 0 AND wac_p > 0
GROUP BY ALL
'''

# =============================================================================
# 5. SALES DATA QUERY (120-day NMV by cohort/product)
# =============================================================================
SALES_QUERY = f'''
SELECT DISTINCT cpc.cohort_id, pso.product_id,
    CONCAT(products.name_ar,' ',products.size,' ',product_units.name_ar) as sku,
    brands.name_ar as brand, categories.name_ar as cat,
    sum(pso.total_price) as nmv
FROM product_sales_order pso
JOIN sales_orders so ON so.id = pso.sales_order_id
JOIN COHORT_PRICING_CHANGES cpc ON cpc.id = pso.COHORT_PRICING_CHANGE_id
JOIN products ON products.id = pso.product_id
JOIN brands ON products.brand_id = brands.id 
JOIN categories ON products.category_id = categories.id
JOIN product_units ON product_units.id = products.unit_id 
WHERE so.created_at::date BETWEEN CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 120 
    AND CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 1 
    AND so.sales_order_status_id NOT IN (7, 12)
    AND so.channel IN ('telesales', 'retailer')
    AND pso.purchased_item_count <> 0
    AND cpc.cohort_id IN (700,701,702,703,704,1123,1124,1125,1126)
GROUP BY ALL
'''

# =============================================================================
# 6. MARGIN STATS QUERY (STD and average margins)  
# =============================================================================
MARGIN_STATS_QUERY = f'''
select product_id, cohort_id, 
    (0.6*product_std) + (0.3*brand_std) + (0.1*cat_std) as std, 
    avg_margin
from (
    select product_id, cohort_id, 
        stddev(product_margin) as product_std, 
        stddev(brand_margin) as brand_std,
        stddev(cat_margin) as cat_std, 
        avg(product_margin) as avg_margin
    from (
        select distinct product_id, order_date, cohort_id,
            (nmv-cogs_p)/nmv as product_margin, 
            (brand_nmv-brand_cogs)/brand_nmv as brand_margin,
            (cat_nmv-cat_cogs)/cat_nmv as cat_margin
        from (
            SELECT DISTINCT so.created_at::date as order_date, cpc.cohort_id, pso.product_id,
                brands.name_ar as brand, categories.name_ar as cat,
                sum(COALESCE(f.wac_p,0) * pso.purchased_item_count * pso.basic_unit_count) as cogs_p,
                sum(pso.total_price) as nmv,
                sum(nmv) over(partition by order_date, cat, brand) as brand_nmv,
                sum(cogs_p) over(partition by order_date, cat, brand) as brand_cogs,
                sum(nmv) over(partition by order_date, cat) as cat_nmv,
                sum(cogs_p) over(partition by order_date, cat) as cat_cogs
            FROM product_sales_order pso
            JOIN sales_orders so ON so.id = pso.sales_order_id   
            JOIN COHORT_PRICING_CHANGES cpc on cpc.id = pso.cohort_pricing_change_id
            JOIN products on products.id = pso.product_id
            JOIN brands on products.brand_id = brands.id 
            JOIN categories ON products.category_id = categories.id
            JOIN finance.all_cogs f ON f.product_id = pso.product_id
                AND f.from_date::date <= so.created_at::date AND f.to_date::date > so.created_at::date
            WHERE so.created_at::date between 
                date_trunc('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 120) 
                and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date
                AND so.sales_order_status_id not in (7,12)
                AND so.channel IN ('telesales','retailer')
                AND pso.purchased_item_count <> 0
            GROUP BY ALL
        )
    ) group by all 
)
'''

# =============================================================================
# 7. TARGET MARGINS QUERY
# =============================================================================
TARGET_MARGINS_QUERY = f'''
WITH cat_brand_target as (
    SELECT DISTINCT cat, brand, margin as target_bm
    FROM performance.commercial_targets cplan
    QUALIFY CASE 
        WHEN DATE_TRUNC('month', MAX(DATE) OVER()) = DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date) 
        THEN DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date)
        ELSE DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - INTERVAL '1 month') 
    END = DATE_TRUNC('month', date)
),
cat_target as (
    select cat, sum(target_bm * (target_nmv/cat_total)) as cat_target_margin
    from (
        select *, sum(target_nmv) over(partition by cat) as cat_total
        from (
            select cat, brand, avg(target_bm) as target_bm, sum(target_nmv) as target_nmv
            from (
                SELECT DISTINCT date, city as region, cat, brand, margin as target_bm, nmv as target_nmv
                FROM performance.commercial_targets cplan
                QUALIFY CASE 
                    WHEN DATE_TRUNC('month', MAX(DATE) OVER()) = DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date) 
                    THEN DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date)
                    ELSE DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - INTERVAL '1 month') 
                END = DATE_TRUNC('month', date)
            ) group by all
        )
    ) group by all 
)
SELECT DISTINCT cbt.cat, cbt.brand, cbt.target_bm, ct.cat_target_margin
FROM cat_brand_target cbt
LEFT JOIN cat_target ct ON ct.cat = cbt.cat
'''


In [15]:
## Execute All Queries


In [16]:
# =============================================================================
# Execute all queries
# =============================================================================
print("Loading data from Snowflake...")

# NOTE: Ben Soliman, Marketplace, and Scrapped prices are now fetched via
# market_data_module.ipynb get_market_data() function - no need to load here

# 1. Product Base Data (product_id, sku, brand, cat, wac1, wac_p, current_price)
print("  1. Loading product base data...")
df_product_base = query_snowflake(PRODUCT_BASE_QUERY)
df_product_base = convert_to_numeric(df_product_base)
print(f"     Loaded {len(df_product_base)} product base records")

# 2. Sales Data
print("  2. Loading sales data...")
df_sales = query_snowflake(SALES_QUERY)
df_sales = convert_to_numeric(df_sales)
print(f"     Loaded {len(df_sales)} sales records")

# 3. Margin Stats
print("  3. Loading margin stats...")
df_margin_stats = query_snowflake(MARGIN_STATS_QUERY)
df_margin_stats = convert_to_numeric(df_margin_stats)
print(f"     Loaded {len(df_margin_stats)} margin stat records")

# 4. Target Margins
print("  4. Loading target margins...")
df_targets = query_snowflake(TARGET_MARGINS_QUERY)
df_targets = convert_to_numeric(df_targets)
print(f"     Loaded {len(df_targets)} target margin records")

# 5. Product Groups (from PostgreSQL)
print("  5. Loading product groups...")
df_groups = query_snowflake(
    '''SELECT * FROM materialized_views.sku_commercial_groups'''
)
df_groups.columns = df_groups.columns.str.lower()
df_groups = convert_to_numeric(df_groups)
print(f"     Loaded {len(df_groups)} group records")

# 6. All-Time High Margin (P80 margin weighted by gross profit)
print("  6. Loading all-time high margin data...")
df_all_time_high_margin = query_snowflake(ALL_TIME_HIGH_MARGIN_QUERY)
df_all_time_high_margin = convert_to_numeric(df_all_time_high_margin)
print(f"     Loaded {len(df_all_time_high_margin)} all-time high margin records")

print("\nBase queries completed!")
print("NOTE: Market data (Ben Soliman, Marketplace, Scrapped) will be fetched via market_data_module")
print(f"\n{'='*60}")
print("df_product_base DataFrame available with columns:")
print("  - region, cohort_id, product_id, sku, brand, cat, wac1, wac_p, current_price")
print(f"{'='*60}")


Loading data from Snowflake...
  1. Loading product base data...


  df[col] = pd.to_numeric(df[col], errors='ignore')


     Loaded 102457 product base records
  2. Loading sales data...


  df[col] = pd.to_numeric(df[col], errors='ignore')


     Loaded 20979 sales records
  3. Loading margin stats...


  df[col] = pd.to_numeric(df[col], errors='ignore')


     Loaded 29575 margin stat records
  4. Loading target margins...


  df[col] = pd.to_numeric(df[col], errors='ignore')


     Loaded 478 target margin records
  5. Loading product groups...


  df[col] = pd.to_numeric(df[col], errors='ignore')


     Loaded 1610 group records
  6. Loading all-time high margin data...
     Loaded 37142 all-time high margin records

Base queries completed!
NOTE: Market data (Ben Soliman, Marketplace, Scrapped) will be fetched via market_data_module

df_product_base DataFrame available with columns:
  - region, cohort_id, product_id, sku, brand, cat, wac1, wac_p, current_price


  df[col] = pd.to_numeric(df[col], errors='ignore')


In [17]:
# =============================================================================
# PART A: Get market_data from market_data_module
# =============================================================================
# Instead of duplicating the market data processing logic here, 
# we use the centralized market_data_module which handles:
# - Ben Soliman prices
# - Marketplace prices  
# - Scrapped prices
# - Group-level price aggregation
# - Price coverage filtering
# - Price percentile calculation
# - Margin tier conversion
# =============================================================================

print("Loading market_data from market_data_module...")
print("(This fetches Ben Soliman, Marketplace, and Scrapped prices from Snowflake)")
print()

# Run market_data_module to get access to get_market_data() function
%run modules/market_data_module.ipynb

# Get fresh market data using the module (no input required)
market_data = get_market_data()

# The market_data now contains:
# - cohort_id, product_id, region
# - Raw prices: ben_soliman_price, final_min_price, final_max_price, etc.
# - Percentiles: minimum, percentile_25, percentile_50, percentile_75, maximum
# - Margins: below_market, market_min, market_25, market_50, market_75, market_max, above_market

print(f"\n{'='*60}")
print(f"MARKET DATA LOADED FROM MODULE")
print(f"{'='*60}")
print(f"Total records: {len(market_data)}")
print(f"  - With marketplace prices: {(~market_data['final_min_price'].isna()).sum()}")
print(f"  - With scrapped prices: {(~market_data['min_scrapped'].isna()).sum()}")
print(f"  - With Ben Soliman prices: {(~market_data['ben_soliman_price'].isna()).sum()}")


Loading market_data from market_data_module...
(This fetches Ben Soliman, Marketplace, and Scrapped prices from Snowflake)

/home/ec2-user/.Renviron
/home/ec2-user/service_account_key.json
Market Data Module loaded at 2026-01-26 14:50:51 Cairo time
Snowflake timezone: America/Los_Angeles
All queries defined ✓
Helper functions defined ✓
get_market_data() function defined ✓
get_margin_tiers() function defined ✓

MARKET DATA MODULE READY

Available functions (NO INPUT REQUIRED):
  - get_market_data()   : Fetch and process all market prices
  - get_margin_tiers()  : Fetch and calculate margin tiers

Usage:
  %run market_data_module.ipynb
  df_market = get_market_data()
  df_tiers = get_margin_tiers()

FETCHING MARKET DATA
Timestamp: 2026-01-26 14:50:52 Cairo time

Step 1: Fetching raw price data...
  1.1 Ben Soliman prices...
      Loaded 1562 records
  1.2 Marketplace prices...
      Loaded 11483 records
  1.3 Scrapped prices...
      Loaded 5165 records
  1.4 Product groups...
      Load

  .apply(lambda g: pd.Series({


    Records with WAC: 25313

Step 6: Filtering by price coverage...
    Records after price coverage filter: 13519

Step 7: Calculating price percentiles...
    Records after price analysis: 12822

Step 8: Converting prices to margins...

MARKET DATA COMPLETE
Total records: 12822
  - With marketplace prices: 12253
  - With scrapped prices: 6338
  - With Ben Soliman prices: 8773

MARKET DATA LOADED FROM MODULE
Total records: 12822
  - With marketplace prices: 12253
  - With scrapped prices: 6338
  - With Ben Soliman prices: 8773


## PART A: Build Main pricing_data DataFrame
Start with df_product_base (all our SKUs) and LEFT JOIN the processed market_data


In [18]:
# =============================================================================
# PART B: Build Main pricing_data DataFrame from df_product_base
# =============================================================================
print("Building main pricing_data DataFrame...")

# =============================================================================
# Step 1: Start with df_product_base as the MAIN dataframe (all our SKUs)
# =============================================================================
print("  Step 1: Starting with product base (all SKUs)...")
pricing_data = df_product_base.copy()
print(f"     Product base: {len(pricing_data)} records")

# =============================================================================
# Step 2: Add warehouse mapping (warehouse_id and warehouse name)
# =============================================================================
print("  Step 2: Adding warehouse mapping...")
warehouse_df = get_warehouse_df()
pricing_data = pricing_data.merge(
    warehouse_df[['cohort_id', 'warehouse_id', 'warehouse']], 
    on='cohort_id'
)
print(f"     After warehouse mapping: {len(pricing_data)} records")

# =============================================================================
# Step 3: LEFT JOIN processed market_data
# =============================================================================
print("  Step 3: Left joining processed market data...")
pricing_data = pricing_data.merge(
    market_data, 
    on=['cohort_id', 'product_id','region'], 
    how='left'
)
print(f"     After market data join: {len(pricing_data)} records")

# =============================================================================
# Step 4: LEFT JOIN supporting data (sales, margins, targets, groups)
# =============================================================================
print("  Step 4: Left joining supporting data...")

# Merge sales data (nmv only)
pricing_data = pricing_data.merge(
    df_sales[['cohort_id', 'product_id', 'nmv']], 
    on=['cohort_id', 'product_id'], 
    how='left'
)
pricing_data['nmv'] = pricing_data['nmv'].fillna(0)

# Merge margin statistics (by cohort_id + product_id)
pricing_data = pricing_data.merge(df_margin_stats, on=['cohort_id', 'product_id'], how='left')

# Merge target margins (by brand + cat)
pricing_data = pricing_data.merge(df_targets, on=['brand', 'cat'], how='left')
pricing_data['target_margin'] = pricing_data['target_bm'].fillna(pricing_data['cat_target_margin']).fillna(0)
pricing_data = pricing_data.drop(columns=['target_bm', 'cat_target_margin'], errors='ignore')

# Fill NaN values with defaults
pricing_data['std'] = pricing_data['std'].fillna(0.01)
pricing_data['avg_margin'] = pricing_data['avg_margin'].fillna(0)

# Merge product groups
pricing_data = pricing_data.merge(df_groups, on='product_id', how='left')

# =============================================================================
# Step 5: Calculate current margin
# =============================================================================
pricing_data['current_margin'] = (pricing_data['current_price'] - pricing_data['wac_p']) / pricing_data['current_price']

# Remove duplicates
pricing_data = pricing_data.drop_duplicates(subset=['cohort_id', 'product_id','warehouse_id'])

# =============================================================================
# Reorder columns
# =============================================================================
final_columns = [
    # Product Base Info
    'cohort_id', 'product_id', 'region', 'warehouse_id', 'warehouse', 'sku', 'brand', 'cat',
    # Cost & Price
    'wac1', 'wac_p', 'current_price', 'current_margin',
    # Sales
    'nmv',
    # Market Prices (raw)
    'ben_soliman_price', 
    'final_min_price', 'final_max_price', 'final_mod_price', 'final_true_min', 'final_true_max',
    'min_scrapped', 'scrapped25', 'scrapped50', 'scrapped75', 'max_scrapped',
    # Price Percentiles
    'minimum', 'percentile_25', 'percentile_50', 'percentile_75', 'maximum',
    # Margin Tiers
    'below_market', 'market_min', 'market_25', 'market_50', 'market_75', 'market_max', 'above_market',
    # Supporting Data
    'std', 'avg_margin', 'target_margin', 'group'
]
pricing_data = pricing_data[[c for c in final_columns if c in pricing_data.columns]]

print(f"\n{'='*60}")
print(f"PRICING DATA COMPLETE")
print(f"{'='*60}")
print(f"Total records: {len(pricing_data)}")
print(f"\nRecords with market data: {len(pricing_data[~pricing_data['minimum'].isna()])}")
print(f"Records without market data: {len(pricing_data[pricing_data['minimum'].isna()])}")
print(f"\nRecords with sales (nmv > 0): {len(pricing_data[pricing_data['nmv'] > 0])}")
print(f"Records without sales (nmv = 0): {len(pricing_data[pricing_data['nmv'] == 0])}")
print(f"\nSample data:")
pricing_data.head()


Building main pricing_data DataFrame...
  Step 1: Starting with product base (all SKUs)...
     Product base: 102457 records
  Step 2: Adding warehouse mapping...
     After warehouse mapping: 86538 records
  Step 3: Left joining processed market data...
     After market data join: 86538 records
  Step 4: Left joining supporting data...

PRICING DATA COMPLETE
Total records: 86538

Records with market data: 16637
Records without market data: 69901

Records with sales (nmv > 0): 29763
Records without sales (nmv = 0): 56775

Sample data:


Unnamed: 0,cohort_id,product_id,region,warehouse_id,warehouse,sku,brand,cat,wac1,wac_p,...,below_market,market_min,market_25,market_50,market_75,market_max,above_market,std,avg_margin,target_margin
0,1123,3576,Upper Egypt,703,Menya Samalot,حفاضات بيبى جوى مضغوطة جيب مانع للتسريب كبير م...,بيبى جوى,حفاضات أطفال,305.277763,274.938331,...,0.028449,0.028486,0.045021,0.090211,0.102477,0.118787,0.118818,0.009253,0.054681,0.055
1,1124,3577,Upper Egypt,501,Assiut FC,حفاضات بيبى جوى مضغوطة كبير جدا مقاس 5 - 58 حفاضة,بيبى جوى,حفاضات أطفال,351.945769,317.041029,...,0.042172,0.045057,0.047925,0.066593,0.094168,0.138475,0.14081,0.013611,0.056875,0.055
2,703,9070,Delta West,337,El-Mahala,زيت كريستال عباد الشمس - 5 لتر,كريستال,زيوت,872.891686,846.48343,...,0.026984,0.03259,0.054076,0.063883,0.069134,0.069798,0.074924,0.009833,0.037588,0.048514
3,703,9070,Delta West,8,Tanta,زيت كريستال عباد الشمس - 5 لتر,كريستال,زيوت,872.891686,846.48343,...,0.026984,0.03259,0.054076,0.063883,0.069134,0.069798,0.074924,0.009833,0.037588,0.048514
4,704,448,Delta East,339,Mansoura FC,زيت كريستال الممتاز خليط - 700 مل,كريستال الممتاز,زيوت,617.798191,598.881289,...,0.001865,0.001865,0.004354,0.018227,0.03172,0.034062,0.04179,0.013075,0.043425,0.03672


## Discount Analysis - Price & Margin After Discount


In [19]:
# =============================================================================
# Discount Query - Get discount percentage by warehouse and product
# =============================================================================
DISCOUNT_QUERY = f'''
SELECT warehouse_id, product_id, total_discount/total_nmv AS discount_perc
FROM (
    SELECT  
        pso.warehouse_id,
        pso.product_id,
        SUM(pso.total_price) AS total_nmv,
        SUM((ITEM_QUANTITY_DISCOUNT_VALUE * pso.purchased_item_count) + 
            (ITEM_DISCOUNT_VALUE * pso.purchased_item_count)) AS total_discount
    FROM product_sales_order pso 
    JOIN sales_orders so ON so.id = pso.sales_order_id
    WHERE so.created_at::DATE >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 1 
        AND so.sales_order_status_id NOT IN (7, 12)
        AND so.channel IN ('telesales', 'retailer')
        AND pso.purchased_item_count <> 0
    GROUP BY ALL
)
WHERE total_nmv > 0
'''

# Execute discount query
print("Loading discount data...")
df_discount = query_snowflake(DISCOUNT_QUERY)
df_discount = convert_to_numeric(df_discount)
print(f"Loaded {len(df_discount)} discount records")


Loading discount data...
Loaded 11616 discount records


  df[col] = pd.to_numeric(df[col], errors='ignore')


In [20]:
# =============================================================================
# Create pricing_with_discount DataFrame
# =============================================================================
print("Creating pricing_with_discount DataFrame...")

# Copy pricing_data
pricing_with_discount = pricing_data.copy()

# Merge discount data (by warehouse_id + product_id)
pricing_with_discount = pricing_with_discount.merge(
    df_discount[['warehouse_id', 'product_id', 'discount_perc']], 
    on=['warehouse_id', 'product_id'], 
    how='left'
)

# Fill missing discount_perc with 0 (no discount)
pricing_with_discount['discount_perc'] = pricing_with_discount['discount_perc'].fillna(0)

# Merge all-time high margin data (P80 margin weighted by gross profit)
pricing_with_discount = pricing_with_discount.merge(
    df_all_time_high_margin[['warehouse_id', 'product_id', 'all_time_high_margin']], 
    on=['warehouse_id', 'product_id'], 
    how='left'
)

# Fill missing all_time_high_margin with target_margin (fallback)
pricing_with_discount['all_time_high_margin'] = pricing_with_discount['all_time_high_margin'].fillna(
    pricing_with_discount['target_margin']
)

# =============================================================================
# Calculate price and margin after discount
# =============================================================================
# Price after discount = current_price * (1 - discount_perc)
pricing_with_discount['price_after_discount'] = (
    pricing_with_discount['current_price'] * (1 - pricing_with_discount['discount_perc'])
)

# Margin after discount = (price_after_discount - wac_p) / price_after_discount
pricing_with_discount['margin_after_discount'] = (
    (pricing_with_discount['price_after_discount'] - pricing_with_discount['wac_p']) / 
    pricing_with_discount['price_after_discount']
)

print(f"\n{'='*60}")
print(f"PRICING WITH DISCOUNT DATA COMPLETE")
print(f"{'='*60}")
print(f"Total records: {len(pricing_with_discount)}")
print(f"Records with discount (discount_perc > 0): {len(pricing_with_discount[pricing_with_discount['discount_perc'] > 0])}")
print(f"Records without discount: {len(pricing_with_discount[pricing_with_discount['discount_perc'] == 0])}")
print(f"\nNew columns added:")
print("  - discount_perc: discount percentage from sales")
print("  - price_after_discount: current_price * (1 - discount_perc)")
print("  - margin_after_discount: (price_after_discount - wac_p) / price_after_discount")
print(f"\nSample data with discounts:")
pricing_with_discount[pricing_with_discount['discount_perc'] > 0][
    ['product_id', 'warehouse_id', 'current_price', 'current_margin', 
     'discount_perc', 'price_after_discount', 'margin_after_discount']
].head(10)


Creating pricing_with_discount DataFrame...

PRICING WITH DISCOUNT DATA COMPLETE
Total records: 86538
Records with discount (discount_perc > 0): 4932
Records without discount: 81606

New columns added:
  - discount_perc: discount percentage from sales
  - price_after_discount: current_price * (1 - discount_perc)
  - margin_after_discount: (price_after_discount - wac_p) / price_after_discount

Sample data with discounts:


Unnamed: 0,product_id,warehouse_id,current_price,current_margin,discount_perc,price_after_discount,margin_after_discount
3,9070,8,915.0,0.074881,0.0448,874.008,0.031492
4,448,339,618.5,0.03172,0.0063,614.6034,0.025581
10,1414,1,24.0,0.030203,0.001361,23.967328,0.028881
22,9910,1,409.0,0.058147,0.013874,403.325675,0.044896
31,6496,236,730.5,0.059737,0.000351,730.243456,0.059407
32,6496,962,730.5,0.059737,0.014409,719.973869,0.045991
33,10461,1,49.0,0.059694,0.0081,48.6031,0.052015
40,10574,703,57.0,0.038555,0.00314,56.821004,0.035526
46,858,236,599.5,0.058909,0.000675,599.095562,0.058274
60,856,236,849.75,0.046466,0.0018,848.220034,0.044746


In [21]:
# =============================================================================
# Price Position - Determine where price_after_discount falls in market tiers
# =============================================================================

def get_price_position(row):
    """Determine the price position relative to market price tiers."""
    price = row['price_after_discount']
    wac = row['wac_p']
    
    # Check if we have market data (minimum price exists)
    if pd.isna(row['minimum']) or pd.isna(price):
        return "No Market Data"
    
    # Get price tiers
    min_price = row['minimum']
    p25 = row['percentile_25']
    p50 = row['percentile_50']
    p75 = row['percentile_75']
    max_price = row['maximum']
    
    # Calculate below_market and above_market prices from margins
    # margin = (price - wac) / price  =>  price = wac / (1 - margin)
    below_market_margin = row['below_market']
    above_market_margin = row['above_market']
    
    below_market_price = wac / (1 - below_market_margin) if below_market_margin < 1 else min_price
    above_market_price = wac / (1 - above_market_margin) if above_market_margin < 1 else max_price
    
    # Determine position based on price tiers
    if price < below_market_price:
        return "Below Market"
    elif price < min_price:
        return "Below Min"
    elif price < p25:
        return "At Min"
    elif price < p50:
        return "At 25th"
    elif price < p75:
        return "At 50th"
    elif price < max_price:
        return "At 75th"
    elif price < above_market_price:
        return "At Max"
    else:
        return "Above Market"

# Apply price position function
pricing_with_discount['price_position'] = pricing_with_discount.apply(get_price_position, axis=1)

# Summary of price positions
print(f"\n{'='*60}")
print(f"PRICE POSITION ANALYSIS")
print(f"{'='*60}")
print("\nPrice Position Distribution:")
print(pricing_with_discount['price_position'].value_counts().to_string())
print(f"\nPrice Position Percentages:")
print((pricing_with_discount['price_position'].value_counts(normalize=True) * 100).round(2).astype(str) + '%')

# Sample data showing price position
print(f"\nSample data with price position:")
pricing_with_discount[
    ['product_id', 'warehouse_id', 'sku', 'current_price', 'discount_perc', 
     'price_after_discount', 'minimum', 'maximum', 'price_position']
].head(15)



PRICE POSITION ANALYSIS

Price Position Distribution:
price_position
No Market Data    69901
At 25th            3325
At Min             3269
Below Market       3181
At 50th            3045
Above Market       1371
At 75th            1209
Below Min           799
At Max              438

Price Position Percentages:
price_position
No Market Data    80.77%
At 25th            3.84%
At Min             3.78%
Below Market       3.68%
At 50th            3.52%
Above Market       1.58%
At 75th             1.4%
Below Min          0.92%
At Max             0.51%
Name: proportion, dtype: object

Sample data with price position:


Unnamed: 0,product_id,warehouse_id,sku,current_price,discount_perc,price_after_discount,minimum,maximum,price_position
0,3576,703,حفاضات بيبى جوى مضغوطة جيب مانع للتسريب كبير م...,292.5,0.0,292.5,283.0,312.0,At 25th
1,3577,501,حفاضات بيبى جوى مضغوطة كبير جدا مقاس 5 - 58 حفاضة,326.5,0.0,326.5,332.0,368.0,Below Market
2,9070,337,زيت كريستال عباد الشمس - 5 لتر,915.0,0.0,915.0,875.0,910.0,At Max
3,9070,8,زيت كريستال عباد الشمس - 5 لتر,915.0,0.0448,874.008,875.0,910.0,Below Min
4,448,339,زيت كريستال الممتاز خليط - 700 مل,618.5,0.0063,614.6034,600.0,620.0,At 50th
5,448,170,زيت كريستال الممتاز خليط - 700 مل,618.5,0.0,618.5,600.0,620.0,At 75th
6,13045,339,تمور القصيم كيس شفاف - 10 كجم,759.25,0.0,759.25,,,No Market Data
7,13045,170,تمور القصيم كيس شفاف - 10 كجم,759.25,0.0,759.25,,,No Market Data
8,22327,339,بونكس اوتوماتيك برائحة الفل - 9 كجم,704.0,0.0,704.0,724.625064,806.0,Below Market
9,22327,170,بونكس اوتوماتيك برائحة الفل - 9 كجم,704.0,0.0,704.0,724.625064,806.0,Below Market


In [22]:
# =============================================================================
# Stock Query - Get available stock by warehouse and product
# =============================================================================
STOCK_QUERY = '''
SELECT 
    pw.warehouse_id,
    pw.product_id,
    pw.available_stock::INTEGER AS stocks
FROM product_warehouse pw
WHERE pw.warehouse_id NOT IN (6, 9, 10)
    AND pw.is_basic_unit = 1
'''

# Execute stock query
print("Loading stock data...")
df_stocks = query_snowflake(STOCK_QUERY)
df_stocks = convert_to_numeric(df_stocks)
print(f"Loaded {len(df_stocks)} stock records")

# Merge stock data with pricing_with_discount
pricing_with_discount = pricing_with_discount.merge(
    df_stocks[['warehouse_id', 'product_id', 'stocks']], 
    on=['warehouse_id', 'product_id'], 
    how='left'
)

# Fill missing stocks with 0
pricing_with_discount['stocks'] = pricing_with_discount['stocks'].fillna(0).astype(int)

print(f"\nStock data merged!")
print(f"Records with stock (stocks > 0): {len(pricing_with_discount[pricing_with_discount['stocks'] > 0])}")
print(f"Records without stock (stocks = 0): {len(pricing_with_discount[pricing_with_discount['stocks'] == 0])}")
print(f"\nSample data with stocks:")
pricing_with_discount[
    ['product_id', 'warehouse_id', 'sku', 'stocks', 'price_after_discount', 'price_position']
].head(10)


Loading stock data...


  df[col] = pd.to_numeric(df[col], errors='ignore')


Loaded 1848253 stock records

Stock data merged!
Records with stock (stocks > 0): 20898
Records without stock (stocks = 0): 65640

Sample data with stocks:


Unnamed: 0,product_id,warehouse_id,sku,stocks,price_after_discount,price_position
0,3576,703,حفاضات بيبى جوى مضغوطة جيب مانع للتسريب كبير م...,32,292.5,At 25th
1,3577,501,حفاضات بيبى جوى مضغوطة كبير جدا مقاس 5 - 58 حفاضة,30,326.5,Below Market
2,9070,337,زيت كريستال عباد الشمس - 5 لتر,0,915.0,At Max
3,9070,8,زيت كريستال عباد الشمس - 5 لتر,72,874.008,Below Min
4,448,339,زيت كريستال الممتاز خليط - 700 مل,37,614.6034,At 50th
5,448,170,زيت كريستال الممتاز خليط - 700 مل,12,618.5,At 75th
6,13045,339,تمور القصيم كيس شفاف - 10 كجم,60,759.25,No Market Data
7,13045,170,تمور القصيم كيس شفاف - 10 كجم,13,759.25,No Market Data
8,22327,339,بونكس اوتوماتيك برائحة الفل - 9 كجم,28,704.0,Below Market
9,22327,170,بونكس اوتوماتيك برائحة الفل - 9 كجم,21,704.0,Below Market


In [23]:
# =============================================================================
# Zero Demand Query - Identify SKUs with zero/low demand
# =============================================================================
ZERO_DEMAND_QUERY = f'''
WITH last_oss AS (
    SELECT product_id, warehouse_id, TIMESTAMP AS last_in_stock_day
    FROM (
        SELECT *, ROW_NUMBER() OVER(PARTITION BY product_id, warehouse_id ORDER BY TIMESTAMP DESC) AS rnk 
        FROM materialized_views.STOCK_DAY_CLOSE
        WHERE AVAILABLE_STOCK = 0 
            AND TIMESTAMP >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 120
        QUALIFY rnk = 1 
    )
),

current_stocks AS (
    SELECT product_id, warehouse_id, AVAILABLE_STOCK, activation
    FROM PRODUCT_WAREHOUSE
    WHERE IS_BASIC_UNIT = 1
        AND CASE WHEN product_id = 1309 THEN packing_unit_id <> 23 ELSE TRUE END
),

prs AS (
    SELECT DISTINCT 
        product_purchased_receipts.product_id,
        purchased_receipts.warehouse_id,
        purchased_receipts.date::DATE AS date,
        product_purchased_receipts.purchased_item_count * product_purchased_receipts.basic_unit_count AS purchase_min_count
    FROM product_purchased_receipts
    JOIN purchased_receipts ON purchased_receipts.id = product_purchased_receipts.purchased_receipt_id
    JOIN last_oss lo ON product_purchased_receipts.product_id = lo.product_id 
        AND lo.warehouse_id = purchased_receipts.warehouse_id 
        AND purchased_receipts.date > lo.last_in_stock_day 
    WHERE product_purchased_receipts.purchased_item_count <> 0
        AND purchased_receipts.purchased_receipt_status_id IN (4, 5, 7)
        AND purchased_receipts.date::DATE >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 120
),

main AS (
    SELECT 
        prs.product_id, 
        prs.warehouse_id, 
        MIN(date) AS first_order_date, 
        SUM(purchase_min_count) AS total_recieved, 
        cs.AVAILABLE_STOCK, 
        cs.activation
    FROM prs 
    JOIN current_stocks cs ON cs.product_id = prs.product_id AND prs.warehouse_id = cs.warehouse_id
    GROUP BY prs.product_id, prs.warehouse_id, cs.AVAILABLE_STOCK, cs.activation
),

sold_days AS (
    SELECT product_id, warehouse_id, COUNT(DISTINCT o_date) AS sales_days
    FROM (
        SELECT DISTINCT
            so.created_at::DATE AS o_date,
            pso.warehouse_id,
            pso.product_id,
            SUM(pso.purchased_item_count * basic_unit_count) AS daily_qty
        FROM product_sales_order pso
        JOIN sales_orders so ON so.id = pso.sales_order_id
        JOIN main m ON m.product_id = pso.product_id 
            AND m.warehouse_id = pso.warehouse_id 
            AND so.created_at::DATE >= m.first_order_date
        WHERE so.created_at::DATE BETWEEN CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 120 
            AND CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE
            AND so.sales_order_status_id NOT IN (7, 12)
            AND so.channel IN ('telesales', 'retailer')
            AND pso.purchased_item_count <> 0
        GROUP BY o_date, pso.warehouse_id, pso.product_id
    )
    GROUP BY product_id, warehouse_id
)

SELECT DISTINCT warehouse_id, product_id
FROM (
    SELECT m.product_id, m.warehouse_id, m.first_order_date, m.activation,
        COALESCE(sd.sales_days, 0) AS sales_days,
        COALESCE(sd.sales_days, 0)::FLOAT / NULLIF((CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 1) - m.first_order_date, 0) AS perc_days
    FROM main m 
    LEFT JOIN sold_days sd ON sd.product_id = m.product_id AND sd.warehouse_id = m.warehouse_id
    WHERE m.first_order_date < CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 10
)
WHERE perc_days <= 0.3
    AND activation = 'true'
'''

# Execute zero demand query
print("Loading zero demand SKUs...")
df_zero_demand = query_snowflake(ZERO_DEMAND_QUERY)
df_zero_demand = convert_to_numeric(df_zero_demand)
print(f"Loaded {len(df_zero_demand)} zero demand SKU records")


Loading zero demand SKUs...
Loaded 4385 zero demand SKU records


  df[col] = pd.to_numeric(df[col], errors='ignore')


In [24]:
# =============================================================================
# Add Zero Demand Flag to pricing_with_discount
# =============================================================================

# Add a marker column to identify zero demand SKUs
df_zero_demand['zero_demand'] = 1

# Merge with pricing_with_discount
pricing_with_discount = pricing_with_discount.merge(
    df_zero_demand[['warehouse_id', 'product_id', 'zero_demand']], 
    on=['warehouse_id', 'product_id'], 
    how='left'
)

# Fill missing values with 0 (not zero demand)
pricing_with_discount['zero_demand'] = pricing_with_discount['zero_demand'].fillna(0).astype(int)

print(f"Zero demand flag added!")
print(f"SKUs flagged as zero demand: {len(pricing_with_discount[pricing_with_discount['zero_demand'] == 1])}")
print(f"SKUs with normal demand: {len(pricing_with_discount[pricing_with_discount['zero_demand'] == 0])}")


Zero demand flag added!
SKUs flagged as zero demand: 4231
SKUs with normal demand: 82307


In [25]:
# =============================================================================
# OOS Yesterday Query - Identify SKUs out of stock yesterday
# =============================================================================
OOS_YESTERDAY_QUERY = f'''
SELECT DISTINCT product_id, warehouse_id,
    CASE WHEN opening_stocks = 0 AND closing_stocks = 0 THEN 1
         ELSE 0 
    END AS oos_yesterday
FROM (
    SELECT 
        timestamp,
        product_id,
        warehouse_id, 
        AVAILABLE_STOCK AS closing_stocks,
        LAG(AVAILABLE_STOCK) OVER (PARTITION BY product_id, warehouse_id ORDER BY TIMESTAMP) AS opening_stocks
    FROM materialized_views.stock_day_close
    WHERE timestamp::DATE >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 2
    QUALIFY opening_stocks IS NOT NULL
)
WHERE oos_yesterday = 1
'''

# Execute OOS yesterday query
print("Loading OOS yesterday data...")
df_oos_yesterday = query_snowflake(OOS_YESTERDAY_QUERY)
df_oos_yesterday = convert_to_numeric(df_oos_yesterday)
print(f"Loaded {len(df_oos_yesterday)} OOS yesterday records")


Loading OOS yesterday data...
Loaded 1896232 OOS yesterday records


  df[col] = pd.to_numeric(df[col], errors='ignore')


In [26]:
# =============================================================================
# Add OOS Yesterday Flag to pricing_with_discount
# =============================================================================

# Merge with pricing_with_discount
pricing_with_discount = pricing_with_discount.merge(
    df_oos_yesterday[['warehouse_id', 'product_id', 'oos_yesterday']], 
    on=['warehouse_id', 'product_id'], 
    how='left'
)

# Fill missing values with 0 (not OOS yesterday)
pricing_with_discount['oos_yesterday'] = pricing_with_discount['oos_yesterday'].fillna(0).astype(int)

print(f"OOS yesterday flag added!")
print(f"SKUs out of stock yesterday: {len(pricing_with_discount[pricing_with_discount['oos_yesterday'] == 1])}")
print(f"SKUs in stock yesterday: {len(pricing_with_discount[pricing_with_discount['oos_yesterday'] == 0])}")


OOS yesterday flag added!
SKUs out of stock yesterday: 65227
SKUs in stock yesterday: 21311


In [27]:
# =============================================================================
# Running Rate Query - Get in-stock running rate by warehouse and product
# =============================================================================
RUNNING_RATE_QUERY = f'''
WITH params AS (
    SELECT
        CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE AS run_date,
        DATEADD(month, -3, CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE) AS history_start
),

-- Daily sales aggregation
sales_base AS (
    SELECT
        pso.product_id,
        pso.warehouse_id,
        DATE_TRUNC('day', pso.created_at)::DATE AS date,
        SUM(pso.purchased_item_count * pso.basic_unit_count) AS sold_units,
        SUM(pso.purchased_item_count * pso.basic_unit_count * pso.item_price)
            / NULLIF(SUM(pso.purchased_item_count * pso.basic_unit_count), 0) AS avg_selling_price,
        COUNT(DISTINCT so.retailer_id) AS retailer_count
    FROM product_sales_order pso
    JOIN sales_orders so ON pso.sales_order_id = so.id
    WHERE DATE_TRUNC('day', pso.created_at)::DATE >= (SELECT history_start FROM params)
    GROUP BY 1, 2, 3
),

-- Stock daily metrics
stock_daily AS (
    SELECT
        product_id,
        warehouse_id,
        DATE_TRUNC('day', TIMESTAMP)::DATE AS date,
        MAX_BY(available_stock, TIMESTAMP) AS stock_closing,
        24 * SUM(CASE WHEN activation = FALSE OR available_stock = 0 THEN 1 ELSE 0 END)::FLOAT 
            / NULLIF(COUNT(*), 0) AS oos_hours,
        MAX(CASE WHEN activation = TRUE AND available_stock > 0 THEN 1 ELSE 0 END) AS in_stock_flag
    FROM materialized_views.STOCK_SNAP_SHOTS_RECENT
    WHERE product_id IS NOT NULL
    GROUP BY product_id, warehouse_id, date
),

-- Join sales + stock + WAC (only in-stock days)
base_data AS (
    SELECT
        sb.product_id,
        sb.warehouse_id,
        sb.date,
        sb.sold_units,
        sb.avg_selling_price,
        sb.retailer_count,
        sd.oos_hours,
        sd.in_stock_flag,
        ac.wac_p AS wac,
        CASE WHEN DAYOFWEEKISO(sb.date) IN (5, 6) THEN 1 ELSE 0 END AS is_weekend
    FROM sales_base sb
    LEFT JOIN stock_daily sd ON sb.product_id = sd.product_id 
        AND sb.warehouse_id = sd.warehouse_id AND sb.date = sd.date
    LEFT JOIN finance.ALL_COGS ac ON sb.product_id = ac.product_id 
        AND sb.date BETWEEN ac.from_date AND ac.to_date
    WHERE sd.in_stock_flag = 1
),

-- Stats per SKU x Warehouse
sku_wh_stats AS (
    SELECT
        product_id, warehouse_id,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY sold_units) AS med_units,
        PERCENTILE_CONT(0.95) WITHIN GROUP (ORDER BY sold_units) AS pct95_units,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY retailer_count) AS med_retailers,
        PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY 
            CASE WHEN avg_selling_price IS NULL OR avg_selling_price = 0 THEN 0 
            ELSE (avg_selling_price - COALESCE(wac, 0)) / NULLIF(avg_selling_price, 0) END
        ) AS med_margin
    FROM base_data
    GROUP BY product_id, warehouse_id
),

-- Cap outliers and adjust for retailer spikes
adjusted AS (
    SELECT
        b.product_id, b.warehouse_id, b.date, b.in_stock_flag, b.oos_hours, b.is_weekend,
        b.avg_selling_price, b.wac, s.med_margin,
        CASE 
            WHEN b.retailer_count > GREATEST(2, s.med_retailers * 2) 
                AND b.retailer_count > 0 AND s.med_retailers IS NOT NULL
            THEN ROUND(LEAST(b.sold_units, s.pct95_units) * (s.med_retailers::FLOAT / NULLIF(b.retailer_count::FLOAT, 0)), 0)
            ELSE LEAST(b.sold_units, s.pct95_units)
        END AS units_adjusted
    FROM base_data b
    LEFT JOIN sku_wh_stats s ON b.product_id = s.product_id AND b.warehouse_id = s.warehouse_id
),

-- Apply weights (recency, stock availability, weekend, margin)
weighted AS (
    SELECT
        product_id, warehouse_id, date, units_adjusted,
        (
            -- Recency weight
            CASE WHEN date >= DATEADD(day, -21, (SELECT run_date FROM params)) THEN 1.5
                 WHEN date >= DATEADD(day, -90, (SELECT run_date FROM params)) THEN 1.0
                 ELSE 0.5 END
            -- In-stock weight
            * CASE WHEN in_stock_flag = 1 AND COALESCE(oos_hours, 0) < 12 THEN 1.4
                   WHEN in_stock_flag = 1 AND COALESCE(oos_hours, 0) >= 12 THEN 0.9
                   ELSE 0.6 END
            -- Weekend weight
            * CASE WHEN is_weekend = 1 THEN 0.7 ELSE 1.0 END
            -- Margin weight
            * CASE WHEN avg_selling_price IS NULL OR avg_selling_price = 0 OR med_margin IS NULL THEN 1.0
                   WHEN ((avg_selling_price - COALESCE(wac, 0)) / NULLIF(avg_selling_price, 0)) < med_margin
                   THEN 1.0 + LEAST((med_margin - ((avg_selling_price - COALESCE(wac, 0)) / NULLIF(avg_selling_price, 0))) * 2.0, 0.6)
                   WHEN ((avg_selling_price - COALESCE(wac, 0)) / NULLIF(avg_selling_price, 0)) > med_margin
                   THEN 1.0 - LEAST((((avg_selling_price - COALESCE(wac, 0)) / NULLIF(avg_selling_price, 0)) - med_margin) * 2.0, 0.4)
                   ELSE 1.0 END
        ) AS final_weight
    FROM adjusted
    WHERE units_adjusted IS NOT NULL
),

-- Weighted average forecast
forecast_base AS (
    SELECT
        product_id, warehouse_id,
        SUM(units_adjusted * final_weight) / NULLIF(SUM(final_weight), 0) AS weighted_avg_units
    FROM weighted
    GROUP BY product_id, warehouse_id
),

-- Zero-sales last 4 days (with stock) exclusion flag
last4_flag AS (
    SELECT product_id, warehouse_id,
        CASE WHEN COUNT(*) = 4 
             AND SUM(CASE WHEN COALESCE(sold_units, 0) = 0 AND in_stock_flag = 1 THEN 1 ELSE 0 END) = 4
        THEN 1 ELSE 0 END AS exclude_flag
    FROM base_data
    WHERE date >= DATEADD(day, -4, (SELECT run_date FROM params)) 
        AND date < (SELECT run_date FROM params)
    GROUP BY product_id, warehouse_id
),

-- Zero sales excluded (in stock but no sales)
zero_sales_excluded AS (
    SELECT DISTINCT s.warehouse_id, s.product_id
    FROM (
        SELECT pw.warehouse_id, pw.product_id, SUM(pw.available_stock)::INT AS stocks
        FROM product_warehouse pw
        WHERE pw.warehouse_id NOT IN (6, 9, 10) AND pw.is_basic_unit = 1 AND pw.available_stock > 0
        GROUP BY pw.warehouse_id, pw.product_id
    ) s
    LEFT JOIN (
        SELECT pso.product_id, pso.warehouse_id, SUM(pso.total_price) AS nmv
        FROM product_sales_order pso
        JOIN sales_orders so ON so.id = pso.sales_order_id
        WHERE so.created_at::date BETWEEN CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 5 
            AND CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 1
            AND so.sales_order_status_id NOT IN (7, 12) AND so.channel IN ('telesales', 'retailer')
            AND pso.purchased_item_count <> 0
        GROUP BY pso.product_id, pso.warehouse_id
    ) md ON md.product_id = s.product_id AND md.warehouse_id = s.warehouse_id
    LEFT JOIN finance.all_cogs f ON f.product_id = s.product_id
        AND f.from_date::date <= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE
        AND f.to_date::date > CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE
    LEFT JOIN (
        SELECT pr.warehouse_id, ppr.product_id, SUM(ppr.final_price) AS total_prs
        FROM product_purchased_receipts ppr
        JOIN purchased_receipts pr ON pr.id = ppr.purchased_receipt_id
        WHERE pr.date::date >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 4
            AND pr.is_actual = 'true' AND pr.purchased_receipt_status_id IN (4, 5, 7)
            AND ppr.purchased_item_count <> 0
        GROUP BY pr.warehouse_id, ppr.product_id
    ) prs ON prs.product_id = s.product_id AND prs.warehouse_id = s.warehouse_id
    WHERE COALESCE(md.nmv, 0) = 0 
        AND COALESCE(prs.total_prs, 0) < 0.7 * (COALESCE(f.wac_p, 0) * s.stocks)
),

-- First sale date for new products
first_sale AS (
    SELECT product_id, warehouse_id, MIN(date) AS first_sale_date
    FROM base_data WHERE sold_units > 0
    GROUP BY product_id, warehouse_id
)

-- Final output: running rate per warehouse/product
SELECT
    fb.warehouse_id,
    fb.product_id,
    CASE
        WHEN l4.exclude_flag = 1 THEN 0
        WHEN fs.first_sale_date >= DATEADD(day, -2, (SELECT run_date FROM params))
        THEN GREATEST(CEIL(fb.weighted_avg_units), 1)
        ELSE CEIL(fb.weighted_avg_units)
    END AS In_stock_rr
FROM forecast_base fb
LEFT JOIN last4_flag l4 ON fb.product_id = l4.product_id AND fb.warehouse_id = l4.warehouse_id
LEFT JOIN first_sale fs ON fb.product_id = fs.product_id AND fb.warehouse_id = fs.warehouse_id
LEFT JOIN zero_sales_excluded zse ON fb.product_id = zse.product_id AND fb.warehouse_id = zse.warehouse_id
WHERE zse.product_id IS NULL
'''

# Execute running rate query
print("Loading running rate data (this may take a moment)...")
df_running_rate = query_snowflake(RUNNING_RATE_QUERY)
df_running_rate = convert_to_numeric(df_running_rate)
print(f"Loaded {len(df_running_rate)} running rate records")


Loading running rate data (this may take a moment)...
Loaded 22899 running rate records


  df[col] = pd.to_numeric(df[col], errors='ignore')


In [28]:
# =============================================================================
# Merge Running Rate and Calculate DOH (Days on Hand)
# =============================================================================

# Merge running rate data with pricing_with_discount
pricing_with_discount = pricing_with_discount.merge(
    df_running_rate[['warehouse_id', 'product_id', 'in_stock_rr']], 
    on=['warehouse_id', 'product_id'], 
    how='left'
)

# Fill missing running rate with 0
pricing_with_discount['in_stock_rr'] = pricing_with_discount['in_stock_rr'].fillna(0)

# Calculate DOH (Days on Hand) = stocks / in_stock_rr
# Handle division by zero - if running rate is 0, DOH is infinite (use 999)
pricing_with_discount['doh'] = np.select(
    [
        (pricing_with_discount['in_stock_rr'] > 0) & (pricing_with_discount['stocks'] > 0),
        pricing_with_discount['stocks'] == 0
    ],
    [
        pricing_with_discount['stocks'] / pricing_with_discount['in_stock_rr'],
        0
    ],
    default=999
)


In [29]:
# =============================================================================
# Product Classification Query - ABC Classification based on order contribution
# =============================================================================
PRODUCT_CLASSIFICATION_QUERY = f'''
WITH order_counts AS (
    SELECT 
        pso.warehouse_id,
        pso.product_id,
        COUNT(DISTINCT pso.sales_order_id) AS order_count
    FROM product_sales_order pso
    JOIN sales_orders so ON so.id = pso.sales_order_id
    WHERE so.created_at::DATE >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 90
        AND so.sales_order_status_id NOT IN (7, 12)
        AND so.channel IN ('telesales', 'retailer')
        AND pso.purchased_item_count <> 0
    GROUP BY pso.warehouse_id, pso.product_id
),

warehouse_totals AS (
    SELECT 
        warehouse_id,
        SUM(order_count) AS total_orders
    FROM order_counts
    GROUP BY warehouse_id
),

ranked_products AS (
    SELECT 
        oc.warehouse_id,
        oc.product_id,
        oc.order_count,
        wt.total_orders,
        oc.order_count::FLOAT / NULLIF(wt.total_orders, 0) AS contribution,
        SUM(oc.order_count::FLOAT / NULLIF(wt.total_orders, 0)) 
            OVER (PARTITION BY oc.warehouse_id ORDER BY oc.order_count DESC 
                  ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW) AS cumulative_contribution
    FROM order_counts oc
    JOIN warehouse_totals wt ON oc.warehouse_id = wt.warehouse_id
)

SELECT 
    warehouse_id,
    product_id,
    order_count,
    contribution,
    cumulative_contribution,
    CASE 
        WHEN cumulative_contribution <= 0.3 THEN 'A'
        WHEN cumulative_contribution <= 0.75 THEN 'B'
        ELSE 'C'
    END AS abc_class
FROM ranked_products
'''

# Execute product classification query
print("Loading product classification data...")
df_classification = query_snowflake(PRODUCT_CLASSIFICATION_QUERY)
df_classification = convert_to_numeric(df_classification)
print(f"Loaded {len(df_classification)} product classification records")
print(f"\nClassification distribution:")
print(df_classification['abc_class'].value_counts().to_string())


Loading product classification data...
Loaded 27814 product classification records

Classification distribution:
abc_class
C    22512
B     4605
A      697


  df[col] = pd.to_numeric(df[col], errors='ignore')


In [30]:
# =============================================================================
# Add ABC Classification to pricing_with_discount
# =============================================================================

# Merge classification data with pricing_with_discount
pricing_with_discount = pricing_with_discount.merge(
    df_classification[['warehouse_id', 'product_id', 'order_count', 'contribution', 'abc_class']], 
    on=['warehouse_id', 'product_id'], 
    how='left'
)

# Fill missing values - products without orders in last 3 months get class 'C'
pricing_with_discount['order_count'] = pricing_with_discount['order_count'].fillna(0).astype(int)
pricing_with_discount['contribution'] = pricing_with_discount['contribution'].fillna(0)
pricing_with_discount['abc_class'] = pricing_with_discount['abc_class'].fillna('C')

print(f"ABC Classification added!")
print(f"\nClassification in pricing_with_discount:")
print(pricing_with_discount['abc_class'].value_counts().to_string())
print(f"\nSample data with classification:")
pricing_with_discount[
    ['product_id', 'warehouse_id', 'sku', 'order_count', 'contribution', 'abc_class', 'stocks', 'doh']
].head(15)


ABC Classification added!

Classification in pricing_with_discount:
abc_class
C    81500
B     4381
A      657

Sample data with classification:


Unnamed: 0,product_id,warehouse_id,sku,order_count,contribution,abc_class,stocks,doh
0,3576,703,حفاضات بيبى جوى مضغوطة جيب مانع للتسريب كبير م...,94,0.000565,C,32,6.4
1,3577,501,حفاضات بيبى جوى مضغوطة كبير جدا مقاس 5 - 58 حفاضة,50,0.000393,C,30,10.0
2,9070,337,زيت كريستال عباد الشمس - 5 لتر,27,0.000127,C,0,0.0
3,9070,8,زيت كريستال عباد الشمس - 5 لتر,60,0.000205,C,72,24.0
4,448,339,زيت كريستال الممتاز خليط - 700 مل,159,0.000619,B,37,12.333333
5,448,170,زيت كريستال الممتاز خليط - 700 مل,52,0.000245,C,12,6.0
6,13045,339,تمور القصيم كيس شفاف - 10 كجم,11,4.3e-05,C,60,30.0
7,13045,170,تمور القصيم كيس شفاف - 10 كجم,5,2.4e-05,C,13,6.5
8,22327,339,بونكس اوتوماتيك برائحة الفل - 9 كجم,29,0.000113,C,28,9.333333
9,22327,170,بونكس اوتوماتيك برائحة الفل - 9 كجم,9,4.2e-05,C,21,10.5


In [31]:
# =============================================================================
# PO (Purchase Order) Data Query - Last PO status and rejection count
# =============================================================================
PO_DATA_QUERY = '''
WITH last_data AS (
    SELECT product_id, warehouse_id, confirmation_status, PO_date::DATE AS last_po_date, ordered_qty
    FROM (
        SELECT 
            product_id,
            Target_WAREHOUSE_ID AS warehouse_id,
            confirmation_status,
            created_at AS PO_date,
            MIN_QUANTITY AS ordered_qty,
            reason,
            MAX(created_at) OVER (PARTITION BY product_id, Target_WAREHOUSE_ID) AS last_po
        FROM retool.PO_INITIAL_PLAN
        WHERE created_at::DATE >= CURRENT_DATE - 15 
    ) x
    WHERE last_po = PO_date
),

last_15_data AS (
    SELECT 
        product_id,
        target_WAREHOUSE_ID AS warehouse_id,
        COUNT(DISTINCT CASE WHEN confirmation_status <> 'yes' THEN created_at END) AS no_last_15
    FROM retool.PO_INITIAL_PLAN
    WHERE created_at::DATE >= CURRENT_DATE - 15 
    GROUP BY 1, 2
)

SELECT 
    ld.product_id,
    ld.warehouse_id,
    ld.confirmation_status,
    ld.last_po_date,
    ld.ordered_qty,
    COALESCE(lfd.no_last_15, 0) AS no_last_15
FROM last_data ld 
LEFT JOIN last_15_data lfd 
    ON lfd.product_id = ld.product_id 
    AND lfd.warehouse_id = ld.warehouse_id
'''

# Execute PO data query using dwh_pg_query
print("Loading PO data...")
df_po_data = setup_environment_2.dwh_pg_query(
    PO_DATA_QUERY, 
    columns=['product_id', 'warehouse_id', 'confirmation_status', 'last_po_date', 'ordered_qty', 'no_last_15']
)
df_po_data.columns = df_po_data.columns.str.lower()
df_po_data = convert_to_numeric(df_po_data)
print(f"Loaded {len(df_po_data)} PO records")
print(f"\nConfirmation status distribution:")
print(df_po_data['confirmation_status'].value_counts().to_string())


Loading PO data...
Loaded 16410 PO records

Confirmation status distribution:
confirmation_status
yes    12536
no      3837


  df[col] = pd.to_numeric(df[col], errors='ignore')


In [32]:
# =============================================================================
# Add PO Data to pricing_with_discount
# =============================================================================

# Merge PO data with pricing_with_discount
pricing_with_discount = pricing_with_discount.merge(
    df_po_data[['warehouse_id', 'product_id', 'confirmation_status', 'last_po_date', 'ordered_qty', 'no_last_15']], 
    on=['warehouse_id', 'product_id'], 
    how='left'
)

# Fill missing values
pricing_with_discount['ordered_qty'] = pricing_with_discount['ordered_qty'].fillna(0)
pricing_with_discount['no_last_15'] = pricing_with_discount['no_last_15'].fillna(0).astype(int)

print(f"PO data added!")
print(f"\nRecords with PO data: {len(pricing_with_discount[~pricing_with_discount['confirmation_status'].isna()])}")
print(f"Records without PO data: {len(pricing_with_discount[pricing_with_discount['confirmation_status'].isna()])}")
print(f"\nSample data with PO info:")
pricing_with_discount[
    ['product_id', 'warehouse_id', 'sku', 'confirmation_status', 'last_po_date', 'ordered_qty', 'no_last_15']
].dropna(subset=['confirmation_status']).head(15)


PO data added!

Records with PO data: 15599
Records without PO data: 70939

Sample data with PO info:


Unnamed: 0,product_id,warehouse_id,sku,confirmation_status,last_po_date,ordered_qty,no_last_15
0,3576,703,حفاضات بيبى جوى مضغوطة جيب مانع للتسريب كبير م...,yes,2026-01-25,3.0,1
1,3577,501,حفاضات بيبى جوى مضغوطة كبير جدا مقاس 5 - 58 حفاضة,yes,2026-01-21,30.0,1
2,9070,337,زيت كريستال عباد الشمس - 5 لتر,yes,2026-01-22,11.0,0
3,9070,8,زيت كريستال عباد الشمس - 5 لتر,yes,2026-01-14,4.0,0
4,448,339,زيت كريستال الممتاز خليط - 700 مل,yes,2026-01-21,32.0,0
5,448,170,زيت كريستال الممتاز خليط - 700 مل,yes,2026-01-22,12.0,0
6,13045,339,تمور القصيم كيس شفاف - 10 كجم,yes,2026-01-17,64.0,0
7,13045,170,تمور القصيم كيس شفاف - 10 كجم,no,2026-01-25,3.0,2
10,1414,1,تونة دولفين مفتتة حار - 140 جم,yes,2026-01-24,1872.0,0
11,3575,501,حفاضات بيبى جوى مضغوطة وسط مقاس 3 - 58 حفاضة,yes,2026-01-25,3.0,2


In [33]:
# =============================================================================
# Leadtime Query - Supplier leadtime by brand, category, and warehouse
# =============================================================================
LEADTIME_QUERY = '''
SELECT brand, cat, warehouse_id, leadtime
FROM (
    SELECT a.*, b.name_ar AS brand, c.name_ar AS cat
    FROM (
        SELECT DISTINCT 
            sl.supplier_id, 
            warehouse_id, 
            category_id, 
            brand_id, 
            sl.updated_at, 
            leadtime,
            MAX(sl.updated_at) OVER (PARTITION BY sl.supplier_id, warehouse_id) AS last_update
        FROM retool.SUPPLIER_MOQ sl 
        JOIN retool.PO_SUPPLIER_MAPPING sm ON sl.supplier_id = sm.supplier_id 
    ) a
    JOIN brands b ON b.id = a.brand_id 
    JOIN categories c ON c.id = a.category_id
    WHERE a.updated_at = last_update
) d
'''

# Execute leadtime query using dwh_pg_query
print("Loading leadtime data...")
df_leadtime = setup_environment_2.dwh_pg_query(
    LEADTIME_QUERY, 
    columns=['brand', 'cat', 'warehouse_id', 'leadtime']
)
df_leadtime.columns = df_leadtime.columns.str.lower()
df_leadtime = convert_to_numeric(df_leadtime)
print(f"Loaded {len(df_leadtime)} leadtime records")


Loading leadtime data...
Loaded 14876 leadtime records


  df[col] = pd.to_numeric(df[col], errors='ignore')


In [34]:
# =============================================================================
# Add Leadtime to pricing_with_discount
# =============================================================================

# Merge leadtime data with pricing_with_discount (by brand, cat, warehouse_id)
pricing_with_discount = pricing_with_discount.merge(
    df_leadtime[['brand', 'cat', 'warehouse_id', 'leadtime']], 
    on=['brand', 'cat', 'warehouse_id'], 
    how='left'
)

# Fill missing leadtime with 0 or a default value
pricing_with_discount['leadtime'] = pricing_with_discount['leadtime'].fillna(72)


print(f"Leadtime data added!")
print(f"\nRecords with leadtime: {len(pricing_with_discount[pricing_with_discount['leadtime'] > 0])}")
print(f"Records without leadtime: {len(pricing_with_discount[pricing_with_discount['leadtime'] == 0])}")
print(f"\nLeadtime distribution:")
print(pricing_with_discount['leadtime'].describe())

# =============================================================================
# Calculate Expected Receiving Day
# If confirmation_status is 'no': add 2 extra days (48 hours) before adding leadtime
# expected_receiving_day = last_po_date + ((2 + leadtime) / 24) if not confirmed
# expected_receiving_day = last_po_date + (leadtime / 24) if confirmed
# =============================================================================

# Convert last_po_date to datetime if not already
pricing_with_discount['last_po_date'] = pd.to_datetime(pricing_with_discount['last_po_date'], errors='coerce')

# Calculate adjusted leadtime: add 48 hours (2 days) if confirmation_status is 'no'
pricing_with_discount['adjusted_leadtime'] = np.where(
    pricing_with_discount['confirmation_status'].str.lower() == 'no',
    pricing_with_discount['leadtime'] + 48,  # Add 2 days (48 hours) if not confirmed
    pricing_with_discount['leadtime']
)

# Calculate expected receiving day (leadtime is in hours, divide by 24 for days)
pricing_with_discount['expected_receiving_day'] = pricing_with_discount['last_po_date'] + pd.to_timedelta(
    pricing_with_discount['adjusted_leadtime'] / 24, unit='D'
)

# Set expected_receiving_day to empty if it's in the past (smaller than today)
pricing_with_discount['expected_receiving_day'] = np.where(
    pricing_with_discount['expected_receiving_day'] < pd.Timestamp(TODAY),
    pd.NaT,
    pricing_with_discount['expected_receiving_day']
)
# Convert back to datetime (np.where returns object type)
pricing_with_discount['expected_receiving_day'] = pd.to_datetime(pricing_with_discount['expected_receiving_day'])

print(f"\nExpected receiving day calculated!")
print(f"Records with expected receiving day (future dates only): {len(pricing_with_discount[~pricing_with_discount['expected_receiving_day'].isna()])}")
print(f"Records with past expected dates (set to empty): {len(pricing_with_discount[pricing_with_discount['expected_receiving_day'].isna() & pricing_with_discount['last_po_date'].notna()])}")
print(f"Records with confirmation_status='no' (added 2 extra days): {len(pricing_with_discount[pricing_with_discount['confirmation_status'].str.lower() == 'no'])}")
print(f"\nSample data with expected receiving day:")
pricing_with_discount[~pricing_with_discount['last_po_date'].isna()][
    ['product_id', 'warehouse_id', 'sku', 'confirmation_status', 'last_po_date', 'leadtime', 'adjusted_leadtime', 'expected_receiving_day', 'doh']
].head(15)


Leadtime data added!

Records with leadtime: 91668
Records without leadtime: 0

Leadtime distribution:
count    91668.000000
mean        55.193612
std         30.616254
min         24.000000
25%         48.000000
50%         48.000000
75%         72.000000
max        168.000000
Name: leadtime, dtype: float64

Expected receiving day calculated!
Records with expected receiving day (future dates only): 9766
Records with past expected dates (set to empty): 6710
Records with confirmation_status='no' (added 2 extra days): 3788

Sample data with expected receiving day:


Unnamed: 0,product_id,warehouse_id,sku,confirmation_status,last_po_date,leadtime,adjusted_leadtime,expected_receiving_day,doh
0,3576,703,حفاضات بيبى جوى مضغوطة جيب مانع للتسريب كبير م...,yes,2026-01-25,48.0,48.0,2026-01-27,6.4
1,3577,501,حفاضات بيبى جوى مضغوطة كبير جدا مقاس 5 - 58 حفاضة,yes,2026-01-21,48.0,48.0,NaT,10.0
2,9070,337,زيت كريستال عباد الشمس - 5 لتر,yes,2026-01-22,48.0,48.0,NaT,0.0
3,9070,8,زيت كريستال عباد الشمس - 5 لتر,yes,2026-01-14,48.0,48.0,NaT,24.0
4,448,339,زيت كريستال الممتاز خليط - 700 مل,yes,2026-01-21,48.0,48.0,NaT,12.333333
5,448,170,زيت كريستال الممتاز خليط - 700 مل,yes,2026-01-22,48.0,48.0,NaT,6.0
6,13045,339,تمور القصيم كيس شفاف - 10 كجم,yes,2026-01-17,48.0,48.0,NaT,30.0
7,13045,170,تمور القصيم كيس شفاف - 10 كجم,no,2026-01-25,48.0,96.0,2026-01-29,6.5
10,1414,1,تونة دولفين مفتتة حار - 140 جم,yes,2026-01-24,48.0,48.0,2026-01-26,5.758713
11,3575,501,حفاضات بيبى جوى مضغوطة وسط مقاس 3 - 58 حفاضة,yes,2026-01-25,48.0,48.0,2026-01-27,7.333333


In [35]:
# =============================================================================
# SKIP: Margin Boundaries Query - Now fetched via market_data_module.get_margin_tiers()
# =============================================================================
# The margin boundaries and tier calculation is now centralized in market_data_module.
# We'll use get_margin_tiers() to get pre-calculated margin tiers.

print("Loading margin tiers from market_data_module...")
df_margin_tiers = get_margin_tiers()
print(f"Loaded {len(df_margin_tiers)} margin tier records from module")


Loading margin tiers from market_data_module...

FETCHING MARGIN TIERS
Timestamp: 2026-01-26 14:54:23 Cairo time

Step 1: Fetching margin boundaries from PRODUCT_STATISTICS...
    Loaded 18274 records

Step 2: Adding cohort IDs...
    Records with cohorts: 25186

Step 3: Calculating margin tiers...

MARGIN TIERS COMPLETE
Total records: 25186

Margin Tier Structure:
  margin_tier_below:   effective_min - step (1 below)
  margin_tier_1:       effective_min_margin
  margin_tier_2:       effective_min + 1*step
  margin_tier_3:       effective_min + 2*step
  margin_tier_4:       effective_min + 3*step
  margin_tier_5:       max_boundary
  margin_tier_above_1: max_boundary + 1*step
  margin_tier_above_2: max_boundary + 2*step
Loaded 25186 margin tier records from module


In [36]:
# =============================================================================
# Add Margin Tiers from market_data_module (Pre-calculated)
# =============================================================================
# The margin tiers are now calculated in market_data_module.get_margin_tiers()
# We just need to merge them with pricing_with_discount

# Merge pre-calculated margin tiers
pricing_with_discount = pricing_with_discount.merge(
    df_margin_tiers[[
        'product_id', 'region', 'cohort_id',
        'optimal_bm', 'min_boundary', 'max_boundary', 'median_bm',
        'effective_min_margin', 'margin_step',
        'margin_tier_below', 'margin_tier_1', 'margin_tier_2', 'margin_tier_3',
        'margin_tier_4', 'margin_tier_5', 'margin_tier_above_1', 'margin_tier_above_2'
    ]], 
    on=['product_id', 'region','cohort_id'],
    how='left'
)

print(f"Margin tiers merged from module!")
print(f"\nRecords with margin tiers: {len(pricing_with_discount[~pricing_with_discount['max_boundary'].isna()])}")
print(f"Records without margin tiers: {len(pricing_with_discount[pricing_with_discount['max_boundary'].isna()])}")

print(f"\nMargin Tier Structure (from market_data_module):")
print(f"  margin_tier_below:   effective_min - step (1 below)")
print(f"  margin_tier_1:       effective_min_margin")
print(f"  margin_tier_2:       effective_min + 1*step")
print(f"  margin_tier_3:       effective_min + 2*step")
print(f"  margin_tier_4:       effective_min + 3*step")
print(f"  margin_tier_5:       max_boundary")
print(f"  margin_tier_above_1: max_boundary + 1*step")
print(f"  margin_tier_above_2: max_boundary + 2*step")

print(f"\nSample margin tiers:")
pricing_with_discount[~pricing_with_discount['max_boundary'].isna()][
    ['product_id', 'sku', 'effective_min_margin', 'max_boundary', 'margin_step',
     'margin_tier_below', 'margin_tier_1', 'margin_tier_3', 'margin_tier_5', 
     'margin_tier_above_1', 'margin_tier_above_2']
].head(10)


Margin tiers merged from module!

Records with margin tiers: 35088
Records without margin tiers: 56580

Margin Tier Structure (from market_data_module):
  margin_tier_below:   effective_min - step (1 below)
  margin_tier_1:       effective_min_margin
  margin_tier_2:       effective_min + 1*step
  margin_tier_3:       effective_min + 2*step
  margin_tier_4:       effective_min + 3*step
  margin_tier_5:       max_boundary
  margin_tier_above_1: max_boundary + 1*step
  margin_tier_above_2: max_boundary + 2*step

Sample margin tiers:


Unnamed: 0,product_id,sku,effective_min_margin,max_boundary,margin_step,margin_tier_below,margin_tier_1,margin_tier_3,margin_tier_5,margin_tier_above_1,margin_tier_above_2
0,3576,حفاضات بيبى جوى مضغوطة جيب مانع للتسريب كبير م...,0.046728,0.07,0.005818,0.04091,0.046728,0.058364,0.07,0.075818,0.081636
1,3577,حفاضات بيبى جوى مضغوطة كبير جدا مقاس 5 - 58 حفاضة,0.0385,0.076257,0.009439,0.029061,0.0385,0.057378,0.076257,0.085696,0.095135
2,9070,زيت كريستال عباد الشمس - 5 لتر,0.031155,0.072771,0.010404,0.020751,0.031155,0.051963,0.072771,0.083175,0.093579
3,9070,زيت كريستال عباد الشمس - 5 لتر,0.031155,0.072771,0.010404,0.020751,0.031155,0.051963,0.072771,0.083175,0.093579
4,448,زيت كريستال الممتاز خليط - 700 مل,0.025704,0.05172,0.006504,0.0192,0.025704,0.038712,0.05172,0.058224,0.064728
5,448,زيت كريستال الممتاز خليط - 700 مل,0.025704,0.05172,0.006504,0.0192,0.025704,0.038712,0.05172,0.058224,0.064728
8,22327,بونكس اوتوماتيك برائحة الفل - 9 كجم,0.052936,0.1059,0.013241,0.039694,0.052936,0.079418,0.1059,0.119141,0.132382
9,22327,بونكس اوتوماتيك برائحة الفل - 9 كجم,0.052936,0.1059,0.013241,0.039694,0.052936,0.079418,0.1059,0.119141,0.132382
10,1414,تونة دولفين مفتتة حار - 140 جم,0.034116,0.063737,0.007405,0.02671,0.034116,0.048926,0.063737,0.071142,0.078547
11,3575,حفاضات بيبى جوى مضغوطة وسط مقاس 3 - 58 حفاضة,0.042983,0.07,0.006754,0.036228,0.042983,0.056491,0.07,0.076754,0.083509


In [37]:
# =============================================================================
# Minimum Selling Quantity Query - Get min selling qty per product
# =============================================================================
MIN_SELLING_QTY_QUERY = f'''
SELECT product_id, min_selling_qty
FROM (
    SELECT *, MIN(basic_unit_count) OVER (PARTITION BY product_id) AS min_selling_qty
    FROM (
        SELECT DISTINCT
            pso.product_id,
            pso.PACKING_UNIT_ID,
            pup.basic_unit_count,
            SUM(pso.total_price) AS nmv,
            SUM(pso.total_price) / SUM(nmv) OVER (PARTITION BY pso.product_id) AS cntrb
        FROM product_sales_order pso
        JOIN PACKING_UNIT_PRODUCTS pup ON pup.product_id = pso.product_id 
            AND pup.PACKING_UNIT_ID = pso.PACKING_UNIT_ID
        JOIN sales_orders so ON so.id = pso.sales_order_id
        WHERE so.created_at::DATE >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 120
            AND so.sales_order_status_id NOT IN (7, 12)
            AND so.channel IN ('telesales', 'retailer')
            AND pso.purchased_item_count <> 0
        GROUP BY ALL
        QUALIFY cntrb > 0.05
    )
    QUALIFY basic_unit_count = min_selling_qty
)
'''

# Execute min selling qty query
print("Loading minimum selling quantity data...")
df_min_selling_qty = query_snowflake(MIN_SELLING_QTY_QUERY)
df_min_selling_qty = convert_to_numeric(df_min_selling_qty)
print(f"Loaded {len(df_min_selling_qty)} min selling qty records")


Loading minimum selling quantity data...
Loaded 3901 min selling qty records


  df[col] = pd.to_numeric(df[col], errors='ignore')


In [38]:
# =============================================================================
# Add Min Selling Qty and Below Min Stock Flag to pricing_with_discount
# =============================================================================

# Merge min selling qty with pricing_with_discount (by product_id)
pricing_with_discount = pricing_with_discount.merge(
    df_min_selling_qty[['product_id', 'min_selling_qty']], 
    on='product_id', 
    how='left'
)

# Fill missing min_selling_qty with 1 (default)
pricing_with_discount['min_selling_qty'] = pricing_with_discount['min_selling_qty'].fillna(1).astype(int)

# Create flag: below_min_stock_flag = 1 if (RR = 0 AND stocks > 0 AND stocks < min_selling_qty)
pricing_with_discount['below_min_stock_flag'] = np.where(
    (pricing_with_discount['in_stock_rr'] == 0) & 
    (pricing_with_discount['stocks'] > 0) &
    (pricing_with_discount['stocks'] < pricing_with_discount['min_selling_qty']),
    1, 0
)

print(f"Min selling qty and below_min_stock_flag added!")
print(f"\nSKUs flagged (zero RR & stocks < min_selling_qty): {len(pricing_with_discount[pricing_with_discount['below_min_stock_flag'] == 1])}")
print(f"SKUs not flagged: {len(pricing_with_discount[pricing_with_discount['below_min_stock_flag'] == 0])}")
print(f"\nSample flagged SKUs:")
pricing_with_discount[pricing_with_discount['below_min_stock_flag'] == 1][
    ['product_id', 'warehouse_id', 'sku', 'stocks', 'min_selling_qty', 'in_stock_rr', 'below_min_stock_flag']
].head(15)


Min selling qty and below_min_stock_flag added!

SKUs flagged (zero RR & stocks < min_selling_qty): 115
SKUs not flagged: 91553

Sample flagged SKUs:


Unnamed: 0,product_id,warehouse_id,sku,stocks,min_selling_qty,in_stock_rr,below_min_stock_flag
1567,9122,170,مولبد ماكسى مضغوطة حماية ضد البكتيريا طويل جدا...,1,4,0.0,1
1687,70,401,عسل البوادى اسود - 680 جم,5,6,0.0,1
7362,3429,797,تونة دولفين قطع 140 جم + 10 جم - 150 جم,2,12,0.0,1
8296,12456,339,صن شاين اكسبريس تونة مفتتة حار - 150 جم,4,12,0.0,1
8297,12456,170,صن شاين اكسبريس تونة مفتتة حار - 150 جم,3,12,0.0,1
8489,1053,962,حلاوة الرشيدى الميزان - 545 جم,2,3,0.0,1
9084,3557,797,تونة صن شاين اكسبريس قطع - 150 جم,3,12,0.0,1
9933,1053,632,حلاوة الرشيدى الميزان - 545 جم,1,3,0.0,1
10499,2480,339,توليدو تونة قطعه واحدة- 185 جم,7,12,0.0,1
11722,633,339,توليدو تونة مفتتة حار سهلة الفتح- 140 جم,7,12,0.0,1


In [39]:
# =============================================================================
# Yesterday's Discount Analysis Query
# Gets: SKU discount, Quantity discount, Tier 1/2/3 NMV breakdown and contributions
# =============================================================================
YESTERDAY_DISCOUNT_QUERY = f'''
WITH qd_det AS (
    -- Map dynamic tags to warehouse IDs using name matching
    SELECT DISTINCT 
        dt.id AS tag_id, 
        dt.name AS tag_name,
        REPLACE(w.name, ' ', '') AS warehouse_name,
        w.id AS warehouse_id,
        warehouse_name ILIKE '%' || CASE 
            WHEN SPLIT_PART(tag_name, '_', 1) = 'El' THEN SPLIT_PART(tag_name, '_', 2) 
            ELSE SPLIT_PART(tag_name, '_', 1) 
        END || '%' AS contains_flag
    FROM dynamic_tags dt
    JOIN dynamic_taggables dta ON dt.id = dta.dynamic_tag_id 
    CROSS JOIN warehouses w 
    WHERE dt.id > 3000
        AND dt.name LIKE '%QD_rets%'
        AND w.id IN (1, 236, 337, 8, 339, 170, 501, 401, 703, 632, 797, 962)
        AND contains_flag = 'true'
),

qd_config AS (
    SELECT * 
    FROM (
        SELECT 
            product_id,
            start_at,
            end_at,
            packing_unit_id,
            id AS qd_id,
            qd.warehouse_id,
            MAX(CASE WHEN tier = 1 THEN quantity END) AS tier_1_qty,
            MAX(CASE WHEN tier = 1 THEN discount_percentage END) AS tier_1_discount_pct,
            MAX(CASE WHEN tier = 2 THEN quantity END) AS tier_2_qty,
            MAX(CASE WHEN tier = 2 THEN discount_percentage END) AS tier_2_discount_pct,
            MAX(CASE WHEN tier = 3 THEN quantity END) AS tier_3_qty,
            MAX(CASE WHEN tier = 3 THEN discount_percentage END) AS tier_3_discount_pct
        FROM (
            SELECT 
                qd.id,
                qdv.product_id,
                qdv.packing_unit_id,
                qdv.quantity,
                qdv.discount_percentage,
                qd.dynamic_tag_id,
                qd.start_at,
                qd.end_at,
                ROW_NUMBER() OVER (
                    PARTITION BY qdv.product_id, qdv.packing_unit_id, qd.id 
                    ORDER BY qdv.quantity
                ) AS tier
            FROM quantity_discounts qd 
            JOIN quantity_discount_values qdv ON qd.id = qdv.quantity_discount_id 
            WHERE CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 1 
                  BETWEEN qd.start_at::DATE AND qd.end_at::DATE
                AND qd.start_at::DATE >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 5
        ) qd_tiers
        JOIN qd_det qd ON qd.tag_id = qd_tiers.dynamic_tag_id
        GROUP BY ALL
    )
    QUALIFY ROW_NUMBER() OVER (PARTITION BY product_id, packing_unit_id, warehouse_id ORDER BY start_at DESC) = 1
),

-- Get all sales from yesterday
yesterday_sales AS (
    SELECT 
        pso.warehouse_id,
        pso.product_id,
        so.retailer_id,
        pso.packing_unit_id,
        pso.purchased_item_count AS qty,
        pso.total_price AS nmv,
        pso.item_price / pso.basic_unit_count AS unit_price,
        pso.ITEM_DISCOUNT_VALUE AS sku_discount_per_unit,
        pso.ITEM_QUANTITY_DISCOUNT_VALUE AS qty_discount_per_unit,
        pso.ITEM_DISCOUNT_VALUE * pso.purchased_item_count AS sku_discount_total,
        pso.ITEM_QUANTITY_DISCOUNT_VALUE * pso.purchased_item_count AS qty_discount_total,
        qd.tier_1_qty,
        qd.tier_2_qty,
        qd.tier_3_qty,
        qd.tier_1_discount_pct,
        qd.tier_2_discount_pct,
        qd.tier_3_discount_pct,
        -- Determine tier used
        CASE 
            WHEN pso.ITEM_QUANTITY_DISCOUNT_VALUE = 0 OR qd.tier_1_qty IS NULL THEN 'Base'
            WHEN qd.tier_3_qty IS NOT NULL AND pso.purchased_item_count >= qd.tier_3_qty THEN 'Tier 3'
            WHEN qd.tier_2_qty IS NOT NULL AND pso.purchased_item_count >= qd.tier_2_qty THEN 'Tier 2'
            WHEN qd.tier_1_qty IS NOT NULL AND pso.purchased_item_count >= qd.tier_1_qty THEN 'Tier 1'
            ELSE 'Base'
        END AS tier_used
    FROM product_sales_order pso
    JOIN sales_orders so ON so.id = pso.sales_order_id
    LEFT JOIN qd_config qd 
        ON qd.product_id = pso.product_id 
        AND qd.packing_unit_id = pso.packing_unit_id
        AND qd.warehouse_id = so.warehouse_id
    WHERE so.created_at::DATE = CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 1
        AND so.sales_order_status_id NOT IN (7, 12)
        AND so.channel IN ('telesales', 'retailer')
        AND pso.purchased_item_count <> 0
)

SELECT 
    warehouse_id,
    product_id,
    SUM(nmv) AS total_nmv,
    SUM(CASE WHEN sku_discount_per_unit > 0 THEN nmv ELSE 0 END) AS sku_discount_nmv,
    SUM(CASE WHEN qty_discount_per_unit > 0 THEN nmv ELSE 0 END) AS qty_discount_nmv,
    SUM(CASE WHEN tier_used = 'Tier 1' THEN nmv ELSE 0 END) AS tier1_nmv,
    SUM(CASE WHEN tier_used = 'Tier 2' THEN nmv ELSE 0 END) AS tier2_nmv,
    SUM(CASE WHEN tier_used = 'Tier 3' THEN nmv ELSE 0 END) AS tier3_nmv,
    -- Tier quantities and discount percentages (from the active QD config)
    MAX(tier_1_qty) AS tier_1_qty,
    MAX(tier_1_discount_pct) AS tier_1_discount_pct,
    MAX(tier_2_qty) AS tier_2_qty,
    MAX(tier_2_discount_pct) AS tier_2_discount_pct,
    MAX(tier_3_qty) AS tier_3_qty,
    MAX(tier_3_discount_pct) AS tier_3_discount_pct
FROM yesterday_sales
GROUP BY warehouse_id, product_id
HAVING SUM(nmv) > 0
ORDER BY total_nmv DESC
'''

# Execute yesterday discount query
print("Loading yesterday's discount analysis data...")
df_yesterday_discount = query_snowflake(YESTERDAY_DISCOUNT_QUERY)
df_yesterday_discount = convert_to_numeric(df_yesterday_discount)
print(f"Loaded {len(df_yesterday_discount)} SKU discount records from yesterday")

# Calculate contributions in Python
df_yesterday_discount['sku_discount_nmv_cntrb'] = (
    df_yesterday_discount['sku_discount_nmv'] / df_yesterday_discount['total_nmv'] * 100
).round(2)
df_yesterday_discount['qty_discount_nmv_cntrb'] = (
    df_yesterday_discount['qty_discount_nmv'] / df_yesterday_discount['total_nmv'] * 100
).round(2)
df_yesterday_discount['tier1_nmv_cntrb'] = (
    df_yesterday_discount['tier1_nmv'] / df_yesterday_discount['total_nmv'] * 100
).round(2)
df_yesterday_discount['tier2_nmv_cntrb'] = (
    df_yesterday_discount['tier2_nmv'] / df_yesterday_discount['total_nmv'] * 100
).round(2)
df_yesterday_discount['tier3_nmv_cntrb'] = (
    df_yesterday_discount['tier3_nmv'] / df_yesterday_discount['total_nmv'] * 100
).round(2)

# Summary
print(f"\n{'='*60}")
print(f"YESTERDAY'S DISCOUNT ANALYSIS SUMMARY")
print(f"{'='*60}")
print(f"\nTotal NMV yesterday: {df_yesterday_discount['total_nmv'].sum():,.0f}")
print(f"SKU Discount NMV: {df_yesterday_discount['sku_discount_nmv'].sum():,.0f}")
print(f"Quantity Discount NMV: {df_yesterday_discount['qty_discount_nmv'].sum():,.0f}")
print(f"\nNMV by Tier:")
print(f"  Tier 1: {df_yesterday_discount['tier1_nmv'].sum():,.0f}")
print(f"  Tier 2: {df_yesterday_discount['tier2_nmv'].sum():,.0f}")
print(f"  Tier 3: {df_yesterday_discount['tier3_nmv'].sum():,.0f}")

df_yesterday_discount.head(10)


Loading yesterday's discount analysis data...
Loaded 9920 SKU discount records from yesterday

YESTERDAY'S DISCOUNT ANALYSIS SUMMARY

Total NMV yesterday: 18,485,779
SKU Discount NMV: 3,870,274
Quantity Discount NMV: 1,975,349

NMV by Tier:
  Tier 1: 709,927
  Tier 2: 1,140,094
  Tier 3: 66,233


  df[col] = pd.to_numeric(df[col], errors='ignore')


Unnamed: 0,warehouse_id,product_id,total_nmv,sku_discount_nmv,qty_discount_nmv,tier1_nmv,tier2_nmv,tier3_nmv,tier_1_qty,tier_1_discount_pct,tier_2_qty,tier_2_discount_pct,tier_3_qty,tier_3_discount_pct,sku_discount_nmv_cntrb,qty_discount_nmv_cntrb,tier1_nmv_cntrb,tier2_nmv_cntrb,tier3_nmv_cntrb
0,1,8089,111941.5,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0
1,962,6097,98509.015,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0
2,339,615,96738.77,78521.0,15633.0,1158.0,14475.0,0.0,4.0,1.32,9.0,3.27,,,81.17,16.16,1.2,14.96,0.0
3,962,8089,89148.5,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0
4,962,7630,87151.16,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0
5,632,2424,71036.5,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0
6,236,8089,70297.75,0.0,0.0,0.0,0.0,0.0,,,,,,,0.0,0.0,0.0,0.0,0.0
7,1,2912,68697.4503,0.0,12205.0,0.0,12205.0,0.0,4.0,1.31,7.0,2.52,,,0.0,17.77,0.0,17.77,0.0
8,1,151,62270.0,10127.5,6195.0,6195.0,0.0,0.0,4.0,0.57,7.0,1.67,,,16.26,9.95,9.95,0.0,0.0
9,1,130,61781.25,36574.5,0.0,0.0,0.0,0.0,,,,,,,59.2,0.0,0.0,0.0,0.0


In [40]:
# =============================================================================
# Add Yesterday's Discount Analysis to pricing_with_discount (Contributions Only)
# =============================================================================

# Merge yesterday discount data with pricing_with_discount - contributions + tier config
pricing_with_discount = pricing_with_discount.merge(
    df_yesterday_discount[[
        'warehouse_id', 'product_id', 
        'sku_discount_nmv_cntrb', 'qty_discount_nmv_cntrb',
        'tier1_nmv_cntrb', 'tier2_nmv_cntrb', 'tier3_nmv_cntrb',
        'tier_1_qty', 'tier_1_discount_pct',
        'tier_2_qty', 'tier_2_discount_pct',
        'tier_3_qty', 'tier_3_discount_pct'
    ]].rename(columns={
        'sku_discount_nmv_cntrb': 'yesterday_sku_disc_cntrb',
        'qty_discount_nmv_cntrb': 'yesterday_qty_disc_cntrb',
        'tier1_nmv_cntrb': 'yesterday_t1_cntrb',
        'tier2_nmv_cntrb': 'yesterday_t2_cntrb',
        'tier3_nmv_cntrb': 'yesterday_t3_cntrb',
        'tier_1_qty': 'qd_tier_1_qty',
        'tier_1_discount_pct': 'qd_tier_1_disc_pct',
        'tier_2_qty': 'qd_tier_2_qty',
        'tier_2_discount_pct': 'qd_tier_2_disc_pct',
        'tier_3_qty': 'qd_tier_3_qty',
        'tier_3_discount_pct': 'qd_tier_3_disc_pct'
    }), 
    on=['warehouse_id', 'product_id'], 
    how='left'
)

# Fill NaN for SKUs that had no sales yesterday
contrib_cols = [
    'yesterday_sku_disc_cntrb', 'yesterday_qty_disc_cntrb',
    'yesterday_t1_cntrb', 'yesterday_t2_cntrb', 'yesterday_t3_cntrb'
]
for col in contrib_cols:
    if col in pricing_with_discount.columns:
        pricing_with_discount[col] = pricing_with_discount[col].fillna(0)

# Fill NaN for QD tier config (0 means no tier configured)
qd_config_cols = [
    'qd_tier_1_qty', 'qd_tier_1_disc_pct',
    'qd_tier_2_qty', 'qd_tier_2_disc_pct',
    'qd_tier_3_qty', 'qd_tier_3_disc_pct'
]
for col in qd_config_cols:
    if col in pricing_with_discount.columns:
        pricing_with_discount[col] = pricing_with_discount[col].fillna(0)

print(f"Yesterday's discount contributions and QD tier config added!")
print(f"\nSKUs with discount data: {len(pricing_with_discount[pricing_with_discount['yesterday_sku_disc_cntrb'] > 0]) + len(pricing_with_discount[pricing_with_discount['yesterday_qty_disc_cntrb'] > 0])}")
print(f"SKUs with QD tier config: {len(pricing_with_discount[pricing_with_discount['qd_tier_1_qty'] > 0])}")
print(f"\nSample data with yesterday's discount contributions and QD tiers:")
pricing_with_discount[pricing_with_discount['qd_tier_1_qty'] > 0][
    ['product_id', 'warehouse_id', 'sku', 
     'yesterday_sku_disc_cntrb', 'yesterday_qty_disc_cntrb',
     'qd_tier_1_qty', 'qd_tier_1_disc_pct', 'qd_tier_2_qty', 'qd_tier_2_disc_pct', 'qd_tier_3_qty', 'qd_tier_3_disc_pct']
].head(15)


Yesterday's discount contributions and QD tier config added!

SKUs with discount data: 4597
SKUs with QD tier config: 2200

Sample data with yesterday's discount contributions and QD tiers:


Unnamed: 0,product_id,warehouse_id,sku,yesterday_sku_disc_cntrb,yesterday_qty_disc_cntrb,qd_tier_1_qty,qd_tier_1_disc_pct,qd_tier_2_qty,qd_tier_2_disc_pct,qd_tier_3_qty,qd_tier_3_disc_pct
35,6496,962,سندة زيت خليط - 900 مل,42.14,0.0,4.0,0.48,7.0,0.92,44.0,1.36
89,75,632,دقيق حبوبة - 1 كجم,0.0,0.0,4.0,0.88,11.0,2.74,0.0,0.0
90,13023,339,شوكولاتة تاوتاو كرانش بالكاكاو - 12 قطعه,0.0,0.0,5.0,0.45,9.0,0.88,687.0,1.6
95,5729,1,جبنة رودس شيدر - 250 جم,49.84,0.0,4.0,0.85,7.0,2.0,0.0,0.0
108,10993,339,لبن المراعى كامل الدسم - 1.5 لتر,0.0,0.0,5.0,0.87,11.0,2.52,0.0,0.0
109,10993,170,لبن المراعى كامل الدسم - 1.5 لتر,0.0,32.63,4.0,0.64,9.0,1.58,62.0,1.6
152,8943,401,طحينة البوادى ظرف - 22 جرام,0.0,0.0,4.0,0.85,7.0,1.89,0.0,0.0
161,12721,401,ماكسى كولا - 1 لتر,21.43,71.43,4.0,0.47,9.0,1.16,338.0,1.81
182,5209,236,لبان تشكلتس فراولة - 0.5 جنية,39.66,22.86,4.0,0.97,7.0,1.88,0.0,0.0
187,990,703,لبن المراعى كامل دسم بلاستيك - 1 لتر,0.0,54.29,4.0,0.58,7.0,1.46,61.0,1.81


In [41]:
# =============================================================================
# Performance Benchmark Query
# Gets: Yesterday qty, Recent 7d qty, MTD qty, and P80 benchmarks (240 days)
# Uses materialized_views.stock_day_close for in-stock determination
# =============================================================================
PERFORMANCE_BENCHMARK_QUERY = f'''
WITH params AS (
    SELECT
        CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE AS today,
        CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 1 AS yesterday,
        CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 240 AS history_start,
        DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE) AS current_month_start,
        DAY(CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE) AS current_day_of_month
),

-- Daily sales aggregation (240 days) - includes qty and retailer count
daily_sales AS (
    SELECT
        pso.warehouse_id,
        pso.product_id,
        so.created_at::DATE AS sale_date,
        SUM(pso.purchased_item_count * pso.basic_unit_count) AS daily_qty,
        COUNT(DISTINCT so.retailer_id) AS daily_retailers
    FROM product_sales_order pso
    JOIN sales_orders so ON so.id = pso.sales_order_id
    CROSS JOIN params p
    WHERE so.created_at::DATE >= p.history_start
        AND so.created_at::DATE < p.today
        AND so.sales_order_status_id NOT IN (7, 12)
        AND so.channel IN ('telesales', 'retailer')
        AND pso.purchased_item_count <> 0
    GROUP BY pso.warehouse_id, pso.product_id, so.created_at::DATE
),

-- Daily stock status using stock_day_close
-- In-stock = opening (prev day close) > 0 AND closing > 0
daily_stock AS (
    SELECT
        sdc.warehouse_id,
        sdc.product_id,
        sdc.TIMESTAMP::DATE AS stock_date,
        sdc.available_stock,
        LAG(sdc.available_stock, 1) OVER (
            PARTITION BY sdc.warehouse_id, sdc.product_id 
            ORDER BY sdc.TIMESTAMP::DATE
        ) AS opening_stock,
        CASE 
            WHEN LAG(sdc.available_stock, 1) OVER (
                    PARTITION BY sdc.warehouse_id, sdc.product_id ORDER BY sdc.TIMESTAMP::DATE
                 ) > 0 
                 AND sdc.available_stock > 0 
            THEN 1 
            ELSE 0 
        END AS in_stock_flag
    FROM materialized_views.stock_day_close sdc
    CROSS JOIN params p
    WHERE sdc.TIMESTAMP::DATE >= p.history_start - 1  -- Need one extra day for LAG
        AND sdc.TIMESTAMP::DATE < p.today
),

-- Combine sales with stock status
daily_with_stock AS (
    SELECT
        COALESCE(ds.warehouse_id, st.warehouse_id) AS warehouse_id,
        COALESCE(ds.product_id, st.product_id) AS product_id,
        COALESCE(ds.sale_date, st.stock_date) AS the_date,
        COALESCE(ds.daily_qty, 0) AS daily_qty,
        COALESCE(ds.daily_retailers, 0) AS daily_retailers,
        COALESCE(st.in_stock_flag, 0) AS in_stock_flag
    FROM daily_sales ds
    FULL OUTER JOIN daily_stock st 
        ON ds.warehouse_id = st.warehouse_id 
        AND ds.product_id = st.product_id 
        AND ds.sale_date = st.stock_date
    WHERE COALESCE(ds.sale_date, st.stock_date) >= (SELECT history_start FROM params)
),

-- Calculate P80 benchmark (in-stock days only, 240 days, EXCLUDING last 7 days)
p80_daily_benchmark AS (
    SELECT
        warehouse_id,
        product_id,
        PERCENTILE_CONT(0.8) WITHIN GROUP (ORDER BY daily_qty) AS p80_daily_240d,
        AVG(daily_qty) AS avg_daily_240d,
        STDDEV(daily_qty) AS std_daily_240d,
        COUNT(*) AS in_stock_days_240d
    FROM daily_with_stock
    CROSS JOIN params p
    WHERE in_stock_flag = 1
        AND the_date >= p.history_start
        AND the_date < p.today - 7  -- Exclude last 7 days from benchmark
    GROUP BY warehouse_id, product_id
),

-- Calculate P70 retailer benchmark (in-stock days only, 240 days, EXCLUDING last 7 days)
p70_retailer_benchmark AS (
    SELECT
        warehouse_id,
        product_id,
        PERCENTILE_CONT(0.7) WITHIN GROUP (ORDER BY daily_retailers) AS p70_daily_retailers_240d,
        AVG(daily_retailers) AS avg_daily_retailers_240d,
        STDDEV(daily_retailers) AS std_daily_retailers_240d
    FROM daily_with_stock
    CROSS JOIN params p
    WHERE in_stock_flag = 1
        AND the_date >= p.history_start
        AND the_date < p.today - 7  -- Exclude last 7 days from benchmark
    GROUP BY warehouse_id, product_id
),

-- Calculate 7-day rolling SUM for P80 recent benchmark
rolling_7d AS (
    SELECT
        warehouse_id,
        product_id,
        the_date,
        SUM(daily_qty) OVER (
            PARTITION BY warehouse_id, product_id 
            ORDER BY the_date 
            ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
        ) AS rolling_7d_sum,
        SUM(in_stock_flag) OVER (
            PARTITION BY warehouse_id, product_id 
            ORDER BY the_date 
            ROWS BETWEEN 6 PRECEDING AND CURRENT ROW
        ) AS in_stock_days_7d
    FROM daily_with_stock
),

p80_7d_benchmark AS (
    SELECT
        warehouse_id,
        product_id,
        PERCENTILE_CONT(0.8) WITHIN GROUP (ORDER BY rolling_7d_sum) AS p80_7d_rolling_240d
    FROM rolling_7d
    CROSS JOIN params p
    WHERE the_date >= p.history_start + 7  -- Need 7 days for rolling
        AND the_date < p.today - 7  -- Exclude last 7 days from benchmark
        AND in_stock_days_7d >= 4  -- At least 4 of 7 days in stock
    GROUP BY warehouse_id, product_id
),

-- MTD benchmark: P80 of same MTD period totals (last 12 months)
-- Sum all sales from day 1 to current day of month for each historical month
mtd_historical AS (
    SELECT
        dws.warehouse_id,
        dws.product_id,
        DATE_TRUNC('month', dws.the_date) AS period_month_start,
        SUM(dws.daily_qty) AS mtd_total_qty  -- Sum of all days from 1 to current_day_of_month
    FROM daily_with_stock dws
    CROSS JOIN params p
    WHERE DAY(dws.the_date) <= p.current_day_of_month  -- Only days up to current day of month
    GROUP BY dws.warehouse_id, dws.product_id, DATE_TRUNC('month', dws.the_date)
),

mtd_by_period AS (
    SELECT
        mh.warehouse_id,
        mh.product_id,
        mh.period_month_start,
        mh.mtd_total_qty AS mtd_qty_at_day  -- Total MTD qty for that month
    FROM mtd_historical mh
    CROSS JOIN params p
    WHERE mh.period_month_start >= DATEADD(month, -12, p.current_month_start)
        AND mh.period_month_start < p.current_month_start
),

p80_mtd_benchmark AS (
    SELECT
        warehouse_id,
        product_id,
        PERCENTILE_CONT(0.8) WITHIN GROUP (ORDER BY mtd_qty_at_day) AS p80_mtd_12mo,
        AVG(mtd_qty_at_day) AS avg_mtd_12mo
    FROM mtd_by_period
    GROUP BY warehouse_id, product_id
    HAVING COUNT(*) >= 3  -- At least 3 months of data
),

-- Current period quantities
current_metrics AS (
    SELECT
        warehouse_id,
        product_id,
        -- Yesterday
        SUM(CASE WHEN the_date = (SELECT yesterday FROM params) THEN daily_qty ELSE 0 END) AS yesterday_qty,
        SUM(CASE WHEN the_date = (SELECT yesterday FROM params) THEN daily_retailers ELSE 0 END) AS yesterday_retailers,
        -- Recent 7 days
        SUM(CASE WHEN the_date >= (SELECT today FROM params) - 7 AND the_date < (SELECT today FROM params) THEN daily_qty ELSE 0 END) AS recent_7d_qty,
        SUM(CASE WHEN the_date >= (SELECT today FROM params) - 7 AND the_date < (SELECT today FROM params) AND in_stock_flag = 1 THEN 1 ELSE 0 END) AS recent_7d_in_stock_days,
        -- MTD
        SUM(CASE WHEN the_date >= (SELECT current_month_start FROM params) AND the_date < (SELECT today FROM params) THEN daily_qty ELSE 0 END) AS mtd_qty,
        SUM(CASE WHEN the_date >= (SELECT current_month_start FROM params) AND the_date < (SELECT today FROM params) AND in_stock_flag = 1 THEN 1 ELSE 0 END) AS mtd_in_stock_days
    FROM daily_with_stock
    GROUP BY warehouse_id, product_id
)

-- Final output
SELECT
    cm.warehouse_id,
    cm.product_id,
    
    -- Current period quantities
    cm.yesterday_qty,
    cm.yesterday_retailers,
    cm.recent_7d_qty,
    cm.recent_7d_in_stock_days,
    cm.mtd_qty,
    cm.mtd_in_stock_days,
    
    -- Quantity Benchmarks (P80)
    COALESCE(pb.p80_daily_240d, 1) AS p80_daily_240d,
    COALESCE(pb.avg_daily_240d, 0) AS avg_daily_240d,
    COALESCE(pb.std_daily_240d, 0) AS std_daily_240d,
    COALESCE(pb.in_stock_days_240d, 0) AS in_stock_days_240d,
    COALESCE(p7.p80_7d_rolling_240d, pb.p80_daily_240d * 7, 1) AS p80_7d_sum_240d,
    COALESCE(pm.p80_mtd_12mo, pb.p80_daily_240d * (SELECT current_day_of_month FROM params), 1) AS p80_mtd_12mo,
    
    -- Retailer Benchmarks (P70)
    COALESCE(pr.p70_daily_retailers_240d, 1) AS p70_daily_retailers_240d,
    COALESCE(pr.avg_daily_retailers_240d, 0) AS avg_daily_retailers_240d,
    COALESCE(pr.std_daily_retailers_240d, 0) AS std_daily_retailers_240d,
    
    -- Performance ratios (all comparing sums to sums)
    -- Yesterday: daily qty vs P80 daily
    ROUND(cm.yesterday_qty / NULLIF(COALESCE(pb.p80_daily_240d, 1), 0), 2) AS yesterday_ratio,
    -- Recent 7d: 7-day sum vs P80 of 7-day sums
    ROUND(cm.recent_7d_qty / NULLIF(COALESCE(p7.p80_7d_rolling_240d, pb.p80_daily_240d * 7, 1), 0), 2) AS recent_ratio,
    -- MTD: MTD sum vs P80 of historical MTD sums
    ROUND(cm.mtd_qty / NULLIF(COALESCE(pm.p80_mtd_12mo, pb.p80_daily_240d * (SELECT current_day_of_month FROM params), 1), 0), 2) AS mtd_ratio,
    -- Retailer ratio: yesterday retailers vs P70 daily retailers
    ROUND(cm.yesterday_retailers / NULLIF(COALESCE(pr.p70_daily_retailers_240d, 1), 0), 2) AS yesterday_retailer_ratio

FROM current_metrics cm
LEFT JOIN p80_daily_benchmark pb ON cm.warehouse_id = pb.warehouse_id AND cm.product_id = pb.product_id
LEFT JOIN p80_7d_benchmark p7 ON cm.warehouse_id = p7.warehouse_id AND cm.product_id = p7.product_id
LEFT JOIN p80_mtd_benchmark pm ON cm.warehouse_id = pm.warehouse_id AND cm.product_id = pm.product_id
LEFT JOIN p70_retailer_benchmark pr ON cm.warehouse_id = pr.warehouse_id AND cm.product_id = pr.product_id
where cm.warehouse_id in (1, 236, 337, 8, 339, 170, 501, 401, 703, 632, 797, 962)
'''

# Execute benchmark query
print("Loading performance benchmark data (this may take a moment due to 240-day history)...")
df_benchmarks = query_snowflake(PERFORMANCE_BENCHMARK_QUERY)
df_benchmarks = convert_to_numeric(df_benchmarks)
print(f"Loaded {len(df_benchmarks)} benchmark records")

# =============================================================================
# Apply Minimum Thresholds to Benchmark Values
# - Daily quantity benchmarks should not be below 5
# - Daily retailers benchmarks should not be less than 2
# This ensures performance calculations don't use unrealistic benchmarks
# =============================================================================
MIN_DAILY_QTY_BENCHMARK = 5
MIN_DAILY_RETAILERS_BENCHMARK = 2

# Apply minimums to daily benchmarks
df_benchmarks['p80_daily_240d'] = df_benchmarks['p80_daily_240d'].clip(lower=MIN_DAILY_QTY_BENCHMARK)
df_benchmarks['p70_daily_retailers_240d'] = df_benchmarks['p70_daily_retailers_240d'].clip(lower=MIN_DAILY_RETAILERS_BENCHMARK)

# Apply proportional minimums to interval-based benchmarks
df_benchmarks['p80_7d_sum_240d'] = df_benchmarks['p80_7d_sum_240d'].clip(lower=MIN_DAILY_QTY_BENCHMARK * 7)  # 35

# For MTD, calculate dynamic minimum based on days in current period
# mtd_in_stock_days represents how many days of data we have in current month
df_benchmarks['p80_mtd_12mo'] = df_benchmarks.apply(
    lambda row: max(row['p80_mtd_12mo'], MIN_DAILY_QTY_BENCHMARK * max(row['mtd_in_stock_days'], 1)),
    axis=1
)

print(f"Applied minimum thresholds: qty >= {MIN_DAILY_QTY_BENCHMARK}/day, retailers >= {MIN_DAILY_RETAILERS_BENCHMARK}/day")

# Preview
df_benchmarks


Loading performance benchmark data (this may take a moment due to 240-day history)...


  df[col] = pd.to_numeric(df[col], errors='ignore')


Loaded 295777 benchmark records
Applied minimum thresholds: qty >= 5/day, retailers >= 2/day


Unnamed: 0,warehouse_id,product_id,yesterday_qty,yesterday_retailers,recent_7d_qty,recent_7d_in_stock_days,mtd_qty,mtd_in_stock_days,p80_daily_240d,avg_daily_240d,...,in_stock_days_240d,p80_7d_sum_240d,p80_mtd_12mo,p70_daily_retailers_240d,avg_daily_retailers_240d,std_daily_retailers_240d,yesterday_ratio,recent_ratio,mtd_ratio,yesterday_retailer_ratio
0,401,2380,0,0,0,0,0,0,5.0,0.000000,...,0,35.0,5.0,2.000,0.000000,0.000000,0.00,0.00,,0.00
1,170,17644,0,0,0,0,0,0,5.0,0.000000,...,0,35.0,5.0,2.000,0.000000,0.000000,0.00,0.00,,0.00
2,501,8522,0,0,0,0,0,0,5.0,0.000000,...,0,35.0,5.0,2.000,0.000000,0.000000,0.00,0.00,,0.00
3,632,19880,1,1,1,0,1,0,5.0,1.400000,...,5,35.0,5.0,2.000,1.000000,0.707107,0.45,0.11,,1.00
4,401,11734,0,0,0,0,0,0,5.0,0.696970,...,66,35.0,15.0,2.000,0.500000,0.749358,0.00,0.00,0.00,0.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
295772,962,12772,0,0,0,0,0,0,9.4,5.333333,...,3,65.8,5.0,2.199,1.666667,2.081666,0.00,0.00,0.00,0.00
295773,962,10597,0,0,7,7,26,11,5.0,1.686047,...,86,35.0,55.0,2.000,1.174419,1.382171,0.00,0.47,0.68,0.00
295774,962,130,78,51,609,7,2737,17,146.4,124.494949,...,99,931.0,3031.8,67.000,59.121212,15.659171,0.53,0.65,0.90,0.76
295775,962,3585,0,0,6,7,19,11,5.0,1.486486,...,111,35.0,55.0,2.000,1.081081,1.096566,0.00,0.43,0.49,0.00


In [42]:
# =============================================================================
# Add Performance Benchmarks and Tags to pricing_with_discount
# =============================================================================

# Merge benchmark data with pricing_with_discount
pricing_with_discount = pricing_with_discount.merge(
    df_benchmarks[[
        'warehouse_id', 'product_id',
        'yesterday_qty', 'yesterday_retailers', 'recent_7d_qty', 'recent_7d_in_stock_days', 'mtd_qty', 'mtd_in_stock_days',
        'p80_daily_240d', 'avg_daily_240d','std_daily_240d', 'in_stock_days_240d',
        'p80_7d_sum_240d', 'p80_mtd_12mo',
        'p70_daily_retailers_240d', 'avg_daily_retailers_240d', 'std_daily_retailers_240d',
        'yesterday_ratio', 'recent_ratio', 'mtd_ratio', 'yesterday_retailer_ratio'
    ]], 
    on=['warehouse_id', 'product_id'], 
    how='left'
)

# Fill NaN values
qty_cols = ['yesterday_qty', 'yesterday_retailers', 'recent_7d_qty', 'recent_7d_in_stock_days', 'mtd_qty', 'mtd_in_stock_days']
for col in qty_cols:
    pricing_with_discount[col] = pricing_with_discount[col].fillna(0)

benchmark_cols = ['p80_daily_240d', 'p80_7d_sum_240d', 'p80_mtd_12mo', 'p70_daily_retailers_240d']
for col in benchmark_cols:
    pricing_with_discount[col] = pricing_with_discount[col].fillna(1)  # Default to 1 to avoid division issues

ratio_cols = ['yesterday_ratio', 'recent_ratio', 'mtd_ratio', 'yesterday_retailer_ratio']
for col in ratio_cols:
    pricing_with_discount[col] = pricing_with_discount[col].fillna(0)

pricing_with_discount['avg_daily_240d'] = pricing_with_discount['avg_daily_240d'].fillna(0)
pricing_with_discount['in_stock_days_240d'] = pricing_with_discount['in_stock_days_240d'].fillna(0)
pricing_with_discount['avg_daily_retailers_240d'] = pricing_with_discount['avg_daily_retailers_240d'].fillna(0)
pricing_with_discount['std_daily_retailers_240d'] = pricing_with_discount['std_daily_retailers_240d'].fillna(0)

# =============================================================================
# Performance Tags - Classify each ratio
# =============================================================================
def get_performance_tag(ratio):
    """
    Classify performance based on ratio to benchmark
    On Track: 90% to 115% of benchmark
    Upper tiers: start from 115%
    Lower tiers: start from 90%
    """
    if pd.isna(ratio) or ratio == 0:
        return 'No Data'
    elif ratio >= 1.75:
        return 'Star Performer'      # 🌟 75%+ above benchmark
    elif ratio > 1.15:
        return 'Over Achiever'       # 🔥 15%+ above benchmark  
    elif ratio >= 0.90:
        return 'On Track'            # ✅ Meeting benchmark (90%-115%)
    elif ratio >= 0.70:
        return 'Underperforming'     # ⚠️ 10%-30% below benchmark
    elif ratio >= 0.40:
        return 'Struggling'          # 🔻 30%-60% below benchmark
    else:
        return 'Critical'            # 🚨 60%+ below benchmark

# Apply tags to each timeframe
pricing_with_discount['yesterday_status'] = pricing_with_discount['yesterday_ratio'].apply(get_performance_tag)
pricing_with_discount['recent_status'] = pricing_with_discount['recent_ratio'].apply(get_performance_tag)
pricing_with_discount['mtd_status'] = pricing_with_discount['mtd_ratio'].apply(get_performance_tag)

# =============================================================================
# Combined Performance Score (weighted average of ratios)
# Approach 2: Scale Weights by In-Stock Percentage
# =============================================================================

# Calculate days in month so far (excluding today)
days_in_month_so_far = max(TODAY.day - 1, 1)  # At least 1 to avoid division by zero

# Calculate in-stock percentages for each period
pricing_with_discount['yesterday_in_stock_pct'] = 1 - pricing_with_discount['oos_yesterday']
pricing_with_discount['recent_7d_in_stock_pct'] = pricing_with_discount['recent_7d_in_stock_days'] / 7
pricing_with_discount['mtd_in_stock_pct'] = pricing_with_discount['mtd_in_stock_days'] / days_in_month_so_far

# Base weights: Yesterday 20%, Recent 7d 40%, MTD 40%
# Scale by in-stock percentage
# NOTE: MTD weight = 0 for first 3 days of month (unreliable data)
MTD_RELIABLE_DAY = 3  # Only use MTD when day >= 3
pricing_with_discount['yesterday_raw_weight'] = 0.2 * pricing_with_discount['yesterday_in_stock_pct']
pricing_with_discount['recent_7d_raw_weight'] = 0.4 * pricing_with_discount['recent_7d_in_stock_pct']
pricing_with_discount['mtd_raw_weight'] = np.where(
    TODAY.day >= MTD_RELIABLE_DAY,
    0.4 * pricing_with_discount['mtd_in_stock_pct'],
    0  # Set MTD weight to 0 at start of month
)

# Total raw weight for normalization
pricing_with_discount['total_raw_weight'] = (
    pricing_with_discount['yesterday_raw_weight'] + 
    pricing_with_discount['recent_7d_raw_weight'] + 
    pricing_with_discount['mtd_raw_weight']
)

# Normalized weights (sum to 1)
pricing_with_discount['yesterday_norm_weight'] = np.where(
    pricing_with_discount['total_raw_weight'] > 0,
    pricing_with_discount['yesterday_raw_weight'] / pricing_with_discount['total_raw_weight'],
    0
)
pricing_with_discount['recent_7d_norm_weight'] = np.where(
    pricing_with_discount['total_raw_weight'] > 0,
    pricing_with_discount['recent_7d_raw_weight'] / pricing_with_discount['total_raw_weight'],
    0
)
pricing_with_discount['mtd_norm_weight'] = np.where(
    pricing_with_discount['total_raw_weight'] > 0,
    pricing_with_discount['mtd_raw_weight'] / pricing_with_discount['total_raw_weight'],
    0
)

# Combined performance ratio with dynamic weights based on in-stock days
pricing_with_discount['combined_perf_ratio'] = (
    pricing_with_discount['yesterday_norm_weight'] * pricing_with_discount['yesterday_ratio'].clip(upper=3) +
    pricing_with_discount['recent_7d_norm_weight'] * pricing_with_discount['recent_ratio'].clip(upper=3) +
    pricing_with_discount['mtd_norm_weight'] * pricing_with_discount['mtd_ratio'].clip(upper=3)
)

# Handle cases where total_raw_weight = 0 (completely OOS in all periods)
pricing_with_discount['combined_perf_ratio'] = pricing_with_discount['combined_perf_ratio'].fillna(0)

# Clean up intermediate columns (optional - keep for debugging)
weight_debug_cols = ['yesterday_in_stock_pct', 'recent_7d_in_stock_pct', 'mtd_in_stock_pct',
                     'yesterday_raw_weight', 'recent_7d_raw_weight', 'mtd_raw_weight', 'total_raw_weight',
                     'yesterday_norm_weight', 'recent_7d_norm_weight', 'mtd_norm_weight']
# Uncomment to drop: pricing_with_discount.drop(columns=weight_debug_cols, inplace=True)

print(f"\nDynamic weight calculation complete!")
print(f"Days in month so far: {days_in_month_so_far}")
print(f"\nSample of weight distributions:")
print(pricing_with_discount[pricing_with_discount['total_raw_weight'] > 0][
    ['product_id', 'warehouse_id', 'oos_yesterday', 'recent_7d_in_stock_days', 'mtd_in_stock_days',
     'yesterday_norm_weight', 'recent_7d_norm_weight', 'mtd_norm_weight', 'combined_perf_ratio']
].head(10))

pricing_with_discount['combined_status'] = pricing_with_discount['combined_perf_ratio'].apply(get_performance_tag)

# =============================================================================
# High Performer Flag (for immediate action consideration)
# =============================================================================
# Flag SKUs that are significantly over-achieving and may need action (price increase, etc.)
pricing_with_discount['high_performer_flag'] = np.where(
    (pricing_with_discount['yesterday_ratio'] >= 1.5) & 
    (pricing_with_discount['recent_ratio'] >= 1.3) &
    (pricing_with_discount['mtd_ratio'] >= 1.2),
    1, 0
)

# Star performer flag (exceptional - all metrics 2x+ benchmark)
pricing_with_discount['star_performer_flag'] = np.where(
    (pricing_with_discount['yesterday_ratio'] >= 2.0) & 
    (pricing_with_discount['recent_ratio'] >= 1.5) &
    (pricing_with_discount['mtd_ratio'] >= 1.5),
    1, 0
)

# =============================================================================
# Summary
# =============================================================================
print(f"Performance benchmarks added!")
print(f"\n{'='*60}")
print(f"PERFORMANCE STATUS DISTRIBUTION")
print(f"{'='*60}")

print(f"\nYesterday Status:")
print(pricing_with_discount['yesterday_status'].value_counts().to_string())

print(f"\nRecent 7d Status:")
print(pricing_with_discount['recent_status'].value_counts().to_string())

print(f"\nMTD Status:")
print(pricing_with_discount['mtd_status'].value_counts().to_string())

print(f"\nCombined Status:")
print(pricing_with_discount['combined_status'].value_counts().to_string())

print(f"\n{'='*60}")
print(f"HIGH PERFORMERS (Action Candidates)")
print(f"{'='*60}")
print(f"High Performers (flag=1): {len(pricing_with_discount[pricing_with_discount['high_performer_flag'] == 1])}")
print(f"Star Performers (flag=1): {len(pricing_with_discount[pricing_with_discount['star_performer_flag'] == 1])}")

# Show top performers
print(f"\nTop 15 Star Performers:")
pricing_with_discount[pricing_with_discount['star_performer_flag'] == 1].nlargest(15, 'combined_perf_ratio')[
    ['product_id', 'warehouse_id', 'sku', 
     'yesterday_ratio', 'recent_ratio', 'mtd_ratio', 'combined_perf_ratio',
     'yesterday_status', 'combined_status']
]



Dynamic weight calculation complete!
Days in month so far: 25

Sample of weight distributions:
    product_id  warehouse_id  oos_yesterday  recent_7d_in_stock_days  \
0         3576           703              0                        7   
1         3577           501              0                        1   
3         9070             8              0                        7   
4          448           339              0                        7   
5          448           170              0                        7   
6        13045           339              0                        7   
7        13045           170              0                        7   
8        22327           339              0                        7   
9        22327           170              0                        7   
10        1414             1              0                        7   

    mtd_in_stock_days  yesterday_norm_weight  recent_7d_norm_weight  \
0                  14               0.24

Unnamed: 0,product_id,warehouse_id,sku,yesterday_ratio,recent_ratio,mtd_ratio,combined_perf_ratio,yesterday_status,combined_status
60646,473,1,سمنة هانم ظرف بطعم الزبدة الصفرا- 350 جم,11.0,3.25,5.43,3.0,Star Performer,Star Performer
60647,473,1,سمنة هانم ظرف بطعم الزبدة الصفرا- 350 جم,11.0,3.25,5.43,3.0,Star Performer,Star Performer
81105,8688,797,عصير تانج برتقال بودر برطمان - 450 جم,7.5,4.59,3.61,3.0,Star Performer,Star Performer
4281,5849,1,مكرونة شعرية كايرو - 400 جم,4.67,3.63,3.6,3.0,Star Performer,Star Performer
4438,2310,962,دابو بسكوت شاي سادة - 2 جنية,8.89,4.4,21.11,3.0,Star Performer,Star Performer
5824,11492,1,سمنة حبوبة نباتى صفراء برطمان - 600 جم,10.0,9.3,3.62,3.0,Star Performer,Star Performer
5976,8089,1,مناديل فاميليا سحب تريبل 3 طبقة - 500 منديل,14.63,3.18,1.98,3.0,Star Performer,Star Performer
10116,24707,8,تورتا ديلايتس كـيك كاكاو محشو بكريمه الشكولاته...,41.0,42.0,42.0,3.0,Star Performer,Star Performer
10335,12925,501,زيت ثمرات خليط - 550 مل,12.0,5.6,4.87,3.0,Star Performer,Star Performer
11445,24711,236,بليزو وايت ويفر رول محـشو بكريمه الفانيليا مغط...,27.0,28.0,28.0,3.0,Star Performer,Star Performer


In [43]:
# =============================================================================
# No NMV in Last 4 Months Flag
# Identifies SKUs that have not generated any NMV in the past 4 months (120 days)
# =============================================================================
NO_NMV_4M_QUERY = f'''
WITH nmv_last_4m AS (
    SELECT 
        pso.warehouse_id,
        pso.product_id,
        SUM(pso.total_price) AS total_nmv_4m
    FROM product_sales_order pso
    JOIN sales_orders so ON so.id = pso.sales_order_id
    WHERE so.created_at::DATE >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 120
        AND so.created_at::DATE < CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE
        AND so.sales_order_status_id NOT IN (7, 12)
        AND so.channel IN ('telesales', 'retailer')
        AND pso.purchased_item_count <> 0
    GROUP BY pso.warehouse_id, pso.product_id
    HAVING SUM(pso.total_price) > 0
)
SELECT 
    warehouse_id,
    product_id,
    total_nmv_4m
FROM nmv_last_4m
'''

# Execute query
print("Loading SKUs with NMV in last 4 months...")
df_nmv_4m = query_snowflake(NO_NMV_4M_QUERY)
df_nmv_4m = convert_to_numeric(df_nmv_4m)
print(f"Found {len(df_nmv_4m)} SKU-warehouse combinations with NMV in last 4 months")

# Merge and create no_nmv_4m flag
pricing_with_discount = pricing_with_discount.merge(
    df_nmv_4m[['warehouse_id', 'product_id', 'total_nmv_4m']],
    on=['warehouse_id', 'product_id'],
    how='left'
)

# Flag SKUs with no NMV in last 4 months
# 1 = No NMV (should potentially be filtered), 0 = Has NMV
pricing_with_discount['no_nmv_4m'] = np.where(
    pricing_with_discount['total_nmv_4m'].isna() | (pricing_with_discount['total_nmv_4m'] == 0),
    1, 0
)

# Fill NaN for total_nmv_4m
pricing_with_discount['total_nmv_4m'] = pricing_with_discount['total_nmv_4m'].fillna(0)

print(f"\n{'='*60}")
print(f"NO NMV IN LAST 4 MONTHS ANALYSIS")
print(f"{'='*60}")
print(f"Total records: {len(pricing_with_discount)}")
print(f"SKUs with NO NMV in 4 months (no_nmv_4m=1): {len(pricing_with_discount[pricing_with_discount['no_nmv_4m'] == 1])}")
print(f"SKUs with NMV in 4 months (no_nmv_4m=0): {len(pricing_with_discount[pricing_with_discount['no_nmv_4m'] == 0])}")

# Show sample of SKUs with no NMV
print(f"\nSample SKUs with no NMV in last 4 months:")
pricing_with_discount[pricing_with_discount['no_nmv_4m'] == 1][
    ['product_id', 'warehouse_id', 'sku', 'stocks', 'in_stock_rr', 'zero_demand', 'no_nmv_4m']
].head(15)


Loading SKUs with NMV in last 4 months...
Found 29051 SKU-warehouse combinations with NMV in last 4 months

NO NMV IN LAST 4 MONTHS ANALYSIS
Total records: 91668
SKUs with NO NMV in 4 months (no_nmv_4m=1): 62424
SKUs with NMV in 4 months (no_nmv_4m=0): 29244

Sample SKUs with no NMV in last 4 months:


  df[col] = pd.to_numeric(df[col], errors='ignore')


Unnamed: 0,product_id,warehouse_id,sku,stocks,in_stock_rr,zero_demand,no_nmv_4m
14,13045,401,تمور القصيم كيس شفاف - 10 كجم,0,0.0,0,1
15,11503,632,مكرونة الملكة لازانيا - 400 جم,0,0.0,0,1
16,11503,632,مكرونة الملكة لازانيا - 400 جم,0,0.0,0,1
18,2712,337,ريحانة ثوم بودر- 20 جم,0,0.0,0,1
19,2712,8,ريحانة ثوم بودر- 20 جم,0,0.0,0,1
20,2537,501,زيت عافية عباد الشمس - 2.2 لتر,0,0.0,0,1
21,858,401,هنادى زيت خليط- 2.1 لتر,0,0.0,0,1
26,7513,337,شهرزاد ملوخية - 400 جم,0,0.0,0,1
27,7513,8,شهرزاد ملوخية - 400 جم,0,0.0,0,1
28,13893,401,سبيرو سباتس كولا - 330 جم,0,0.0,0,1


In [44]:
# =============================================================================
# Normal Refill Query - Avg qty & stddev for frequent retailers (last 120 days)
# Frequent retailer definition based on ABC classification (from existing dataframe):
#   - Class A: bought 4+ times
#   - Class B: bought 3+ times
#   - Class C: bought 2+ times
# =============================================================================
NORMAL_REFILL_QUERY = f'''
WITH params AS (
    SELECT 
        CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE AS today,
        CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 120 AS history_start
),

-- Get retailer order counts per product-warehouse (last 120 days)
retailer_orders AS (
    SELECT 
        pso.warehouse_id,
        pso.product_id,
        so.retailer_id,
        COUNT(DISTINCT so.id) AS order_count
    FROM product_sales_order pso
    JOIN sales_orders so ON so.id = pso.sales_order_id
    CROSS JOIN params p
    WHERE so.created_at::DATE >= p.history_start
        AND so.created_at::DATE < p.today
        AND so.sales_order_status_id NOT IN (7, 12)
        AND so.channel IN ('telesales', 'retailer')
        AND pso.purchased_item_count <> 0
    GROUP BY pso.warehouse_id, pso.product_id, so.retailer_id
),

-- Get individual order quantities per retailer
order_quantities AS (
    SELECT 
        pso.warehouse_id,
        pso.product_id,
        so.retailer_id,
        so.id AS order_id,
        SUM(pso.purchased_item_count * pso.basic_unit_count) AS order_qty
    FROM product_sales_order pso
    JOIN sales_orders so ON so.id = pso.sales_order_id
    CROSS JOIN params p
    WHERE so.created_at::DATE >= p.history_start
        AND so.created_at::DATE < p.today
        AND so.sales_order_status_id NOT IN (7, 12)
        AND so.channel IN ('telesales', 'retailer')
        AND pso.purchased_item_count <> 0
    GROUP BY pso.warehouse_id, pso.product_id, so.retailer_id, so.id
)

-- Return retailer-level data with order counts for Python filtering
SELECT 
    oq.warehouse_id,
    oq.product_id,
    oq.retailer_id,
    ro.order_count,
    oq.order_id,
    oq.order_qty
FROM order_quantities oq
JOIN retailer_orders ro 
    ON ro.warehouse_id = oq.warehouse_id 
    AND ro.product_id = oq.product_id 
    AND ro.retailer_id = oq.retailer_id
'''

# Execute normal refill query
print("Loading retailer order data for normal refill calculation (last 120 days)...")
df_retailer_orders = query_snowflake(NORMAL_REFILL_QUERY)
df_retailer_orders = convert_to_numeric(df_retailer_orders)
print(f"Loaded {len(df_retailer_orders)} retailer order records")

# Get ABC classification from existing dataframe
abc_mapping = pricing_with_discount[['warehouse_id', 'product_id', 'abc_class']].drop_duplicates()
print(f"ABC classification mapping: {len(abc_mapping)} product-warehouse combinations")

# Merge ABC classification into retailer orders
df_retailer_orders = df_retailer_orders.merge(
    abc_mapping,
    on=['warehouse_id', 'product_id'],
    how='inner'
)
print(f"Records after ABC merge: {len(df_retailer_orders)}")

# Filter frequent retailers based on ABC class thresholds
# Class A: 4+ orders, Class B: 3+ orders, Class C: 2+ orders
df_frequent = df_retailer_orders[
    ((df_retailer_orders['abc_class'] == 'A') & (df_retailer_orders['order_count'] >= 4)) |
    ((df_retailer_orders['abc_class'] == 'B') & (df_retailer_orders['order_count'] >= 3)) |
    ((df_retailer_orders['abc_class'] == 'C') & (df_retailer_orders['order_count'] >= 2))
].copy()
print(f"Records from frequent retailers: {len(df_frequent)}")

# Calculate normal_refill (avg qty) and refill_stddev per product-warehouse
df_normal_refill = df_frequent.groupby(['warehouse_id', 'product_id']).agg(
    frequent_retailer_count=('retailer_id', 'nunique'),
    frequent_order_count=('order_id', 'nunique'),
    normal_refill=('order_qty', 'mean'),
    refill_stddev=('order_qty', 'std')
).reset_index()

# Round values and fill NaN stddev (when only 1 order)
df_normal_refill['normal_refill'] = df_normal_refill['normal_refill'].round(2)
df_normal_refill['refill_stddev'] = df_normal_refill['refill_stddev'].fillna(0).round(2)

# Filter to products with at least 2 orders for meaningful stats
df_normal_refill = df_normal_refill[df_normal_refill['frequent_order_count'] >= 2]
print(f"Final normal refill records (min 2 orders): {len(df_normal_refill)}")

# Merge with pricing_with_discount
pricing_with_discount = pricing_with_discount.merge(
    df_normal_refill[['warehouse_id', 'product_id', 'frequent_retailer_count', 
                      'frequent_order_count', 'normal_refill', 'refill_stddev']],
    on=['warehouse_id', 'product_id'],
    how='left'
)

# Fill NaN values
pricing_with_discount['frequent_retailer_count'] = pricing_with_discount['frequent_retailer_count'].fillna(0)
pricing_with_discount['frequent_order_count'] = pricing_with_discount['frequent_order_count'].fillna(0)
pricing_with_discount['normal_refill'] = pricing_with_discount['normal_refill'].fillna(0)
pricing_with_discount['refill_stddev'] = pricing_with_discount['refill_stddev'].fillna(0)

print(f"\n{'='*60}")
print(f"NORMAL REFILL ANALYSIS (Frequent Retailers - 120 days)")
print(f"{'='*60}")
print(f"Records with normal_refill data: {len(pricing_with_discount[pricing_with_discount['normal_refill'] > 0])}")
print(f"Records without normal_refill data: {len(pricing_with_discount[pricing_with_discount['normal_refill'] == 0])}")
print(f"\nNormal refill distribution:")
print(pricing_with_discount[pricing_with_discount['normal_refill'] > 0]['normal_refill'].describe())
print(f"\nSample data:")
pricing_with_discount[pricing_with_discount['normal_refill'] > 0][
    ['product_id', 'warehouse_id', 'sku', 'abc_class', 'frequent_retailer_count', 
     'frequent_order_count', 'normal_refill', 'refill_stddev', 'in_stock_rr']
].head(15)


Loading retailer order data for normal refill calculation (last 120 days)...


  df[col] = pd.to_numeric(df[col], errors='ignore')


Loaded 4821351 retailer order records
ABC classification mapping: 86538 product-warehouse combinations
Records after ABC merge: 4596801
Records from frequent retailers: 1626132
Final normal refill records (min 2 orders): 21836

NORMAL REFILL ANALYSIS (Frequent Retailers - 120 days)
Records with normal_refill data: 22972
Records without normal_refill data: 68696

Normal refill distribution:
count    22972.000000
mean         3.134262
std         33.214172
min          1.000000
25%          1.220000
50%          1.710000
75%          2.850000
max       4534.000000
Name: normal_refill, dtype: float64

Sample data:


Unnamed: 0,product_id,warehouse_id,sku,abc_class,frequent_retailer_count,frequent_order_count,normal_refill,refill_stddev,in_stock_rr
0,3576,703,حفاضات بيبى جوى مضغوطة جيب مانع للتسريب كبير م...,C,32.0,93.0,2.98,2.22,5.0
1,3577,501,حفاضات بيبى جوى مضغوطة كبير جدا مقاس 5 - 58 حفاضة,C,15.0,45.0,2.22,3.86,3.0
2,9070,337,زيت كريستال عباد الشمس - 5 لتر,C,4.0,10.0,1.0,0.0,2.0
3,9070,8,زيت كريستال عباد الشمس - 5 لتر,C,18.0,47.0,1.36,0.64,3.0
4,448,339,زيت كريستال الممتاز خليط - 700 مل,B,10.0,36.0,1.11,0.32,3.0
5,448,170,زيت كريستال الممتاز خليط - 700 مل,C,11.0,22.0,1.18,0.5,2.0
6,13045,339,تمور القصيم كيس شفاف - 10 كجم,C,1.0,2.0,1.5,0.71,2.0
7,13045,170,تمور القصيم كيس شفاف - 10 كجم,C,1.0,2.0,1.0,0.0,2.0
8,22327,339,بونكس اوتوماتيك برائحة الفل - 9 كجم,C,3.0,6.0,2.17,1.6,3.0
10,1414,1,تونة دولفين مفتتة حار - 140 جم,B,169.0,676.0,21.14,26.52,373.0


In [45]:
# =============================================================================
# Live Cart Rules Query - Get current cart rules from the system
# Merges on product_id and cohort_id
# =============================================================================
LIVE_CART_RULES_QUERY = f'''
SELECT 
    cppu.cohort_id,
    pup.product_id,
    pup.packing_unit_id,
    pup.basic_unit_count,
    COALESCE(cppu.MAX_PER_SALES_ORDER, cppu2.MAX_PER_SALES_ORDER) AS current_cart_rule
FROM COHORT_PRODUCT_PACKING_UNITS cppu 
JOIN PACKING_UNIT_PRODUCTS pup ON cppu.PRODUCT_PACKING_UNIT_ID = pup.id 
JOIN cohorts c ON c.id = cppu.cohort_id
LEFT JOIN COHORT_PRODUCT_PACKING_UNITS cppu2 
    ON cppu.PRODUCT_PACKING_UNIT_ID = cppu2.PRODUCT_PACKING_UNIT_ID 
    AND cppu2.cohort_id = c.FALLBACK_COHORT_ID
WHERE cppu.cohort_id IN ({','.join(map(str, COHORT_IDS))})
'''

# Execute live cart rules query
print("Loading live cart rules...")
df_cart_rules = query_snowflake(LIVE_CART_RULES_QUERY)
df_cart_rules = convert_to_numeric(df_cart_rules)
print(f"Loaded {len(df_cart_rules)} cart rule records")

# Aggregate to product-cohort level (take the cart rule for basic unit, or min if multiple)
# Filter to basic unit (packing_unit_id where basic_unit_count = 1) for simpler merging
df_cart_rules_basic = df_cart_rules[df_cart_rules['basic_unit_count'] == 1].copy()
print(f"Basic unit cart rules: {len(df_cart_rules_basic)}")

# If no basic unit, take the minimum cart rule per product-cohort
df_cart_rules_agg = df_cart_rules.groupby(['cohort_id', 'product_id']).agg(
    current_cart_rule=('current_cart_rule', 'min')
).reset_index()

# Prefer basic unit cart rule, fallback to aggregated
df_cart_rules_final = df_cart_rules_basic[['cohort_id', 'product_id', 'current_cart_rule']].drop_duplicates()
df_cart_rules_final = df_cart_rules_final.merge(
    df_cart_rules_agg[['cohort_id', 'product_id', 'current_cart_rule']].rename(columns={'current_cart_rule': 'cart_rule_agg'}),
    on=['cohort_id', 'product_id'],
    how='outer'
)
df_cart_rules_final['current_cart_rule'] = df_cart_rules_final['current_cart_rule'].fillna(df_cart_rules_final['cart_rule_agg'])
df_cart_rules_final = df_cart_rules_final[['cohort_id', 'product_id', 'current_cart_rule']].drop_duplicates()
print(f"Final cart rules (product-cohort level): {len(df_cart_rules_final)}")

# Merge with pricing_with_discount
pricing_with_discount = pricing_with_discount.merge(
    df_cart_rules_final,
    on=['cohort_id', 'product_id'],
    how='left'
)

# Fill NaN cart rules with 0 (no cart rule set)
pricing_with_discount['current_cart_rule'] = pricing_with_discount['current_cart_rule'].fillna(0)

print(f"\n{'='*60}")
print(f"LIVE CART RULES ANALYSIS")
print(f"{'='*60}")
print(f"Records with cart rule > 0: {len(pricing_with_discount[pricing_with_discount['current_cart_rule'] > 0])}")
print(f"Records without cart rule: {len(pricing_with_discount[pricing_with_discount['current_cart_rule'] == 0])}")
print(f"\nCart rule distribution:")
print(pricing_with_discount[pricing_with_discount['current_cart_rule'] > 0]['current_cart_rule'].describe())
print(f"\nSample data with cart rules:")
pricing_with_discount[pricing_with_discount['current_cart_rule'] > 0][
    ['product_id', 'cohort_id', 'warehouse_id', 'sku', 'current_price', 'current_cart_rule', 'in_stock_rr']
].head(15)


Loading live cart rules...


  df[col] = pd.to_numeric(df[col], errors='ignore')


Loaded 110660 cart rule records
Basic unit cart rules: 72995
Final cart rules (product-cohort level): 72977

LIVE CART RULES ANALYSIS
Records with cart rule > 0: 91625
Records without cart rule: 55

Cart rule distribution:
count    91625.000000
mean        71.124638
std        641.179648
min          1.000000
25%         10.000000
50%         18.000000
75%         25.000000
max      10000.000000
Name: current_cart_rule, dtype: float64

Sample data with cart rules:


Unnamed: 0,product_id,cohort_id,warehouse_id,sku,current_price,current_cart_rule,in_stock_rr
0,3576,1123,703,حفاضات بيبى جوى مضغوطة جيب مانع للتسريب كبير م...,292.5,5.0,5.0
1,3577,1124,501,حفاضات بيبى جوى مضغوطة كبير جدا مقاس 5 - 58 حفاضة,326.5,3.0,3.0
2,9070,703,337,زيت كريستال عباد الشمس - 5 لتر,915.0,5.0,2.0
3,9070,703,8,زيت كريستال عباد الشمس - 5 لتر,915.0,5.0,3.0
4,448,704,339,زيت كريستال الممتاز خليط - 700 مل,618.5,5.0,3.0
5,448,704,170,زيت كريستال الممتاز خليط - 700 مل,618.5,5.0,2.0
6,13045,704,339,تمور القصيم كيس شفاف - 10 كجم,759.25,9.0,2.0
7,13045,704,170,تمور القصيم كيس شفاف - 10 كجم,759.25,9.0,2.0
8,22327,704,339,بونكس اوتوماتيك برائحة الفل - 9 كجم,704.0,10.0,3.0
9,22327,704,170,بونكس اوتوماتيك برائحة الفل - 9 كجم,704.0,10.0,2.0


In [46]:
# =============================================================================
# Commercial Constraint Minimum Price Query
# Gets the minimum price constraints from finance.minimum_prices
# =============================================================================
COMMERCIAL_MIN_PRICE_QUERY = f'''
WITH to_remove AS (
    SELECT 
        check_date AS start_date,
        (check_date + INTERVAL '1 month') + 6 AS end_date 
    FROM (
        SELECT 
            CASE 
                WHEN DATE_PART('day', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE) < 7 
                THEN DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - INTERVAL '1 month') 
                ELSE DATE_FROM_PARTS(
                    YEAR(CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE), 
                    MONTH(CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE), 
                    1
                )  
            END AS check_date
    )
) 

SELECT  
    sku_id AS product_id,
    sku,
    brand AS comm_brand,
    cat AS comm_cat,
    region,
    created_at AS comm_created_at,
    min_price AS commercial_min_price
FROM (
    SELECT 
        product_id AS sku_id,
        product_name AS sku,
        brand,
        category AS cat,
        region,
        min_price,
        created_at,
        MAX(created_at) OVER (PARTITION BY product_id, region) AS latest_date
    FROM finance.minimum_prices
    WHERE is_deleted = 'false'
        AND created_at BETWEEN (SELECT start_date FROM to_remove) AND (SELECT end_date FROM to_remove)
) comm
WHERE created_at = latest_date
'''

# Execute commercial min price query
print("Loading commercial minimum price constraints...")
df_commercial_min = query_snowflake(COMMERCIAL_MIN_PRICE_QUERY)
df_commercial_min = convert_to_numeric(df_commercial_min)
print(f"Loaded {len(df_commercial_min)} commercial min price records")

# Merge with pricing_with_discount on product_id and region
pricing_with_discount = pricing_with_discount.merge(
    df_commercial_min[['product_id', 'region', 'commercial_min_price']],
    on=['product_id', 'region'],
    how='left'
)

# Fill NaN with 0 (no commercial constraint)
pricing_with_discount['commercial_min_price'] = pricing_with_discount['commercial_min_price'].fillna(0)

print(f"\n{'='*60}")
print(f"COMMERCIAL MINIMUM PRICE CONSTRAINTS")
print(f"{'='*60}")
print(f"Records with commercial min price: {len(pricing_with_discount[pricing_with_discount['commercial_min_price'] > 0])}")
print(f"Records without commercial min price: {len(pricing_with_discount[pricing_with_discount['commercial_min_price'] == 0])}")
print(f"\nCommercial min price distribution:")
print(pricing_with_discount[pricing_with_discount['commercial_min_price'] > 0]['commercial_min_price'].describe())
print(f"\nSample data with commercial constraints:")
pricing_with_discount[pricing_with_discount['commercial_min_price'] > 0][
    ['product_id', 'region', 'warehouse_id', 'sku', 'current_price', 'commercial_min_price', 'price_after_discount']
].head(15)


Loading commercial minimum price constraints...
Loaded 979 commercial min price records

COMMERCIAL MINIMUM PRICE CONSTRAINTS
Records with commercial min price: 2107
Records without commercial min price: 89573

Commercial min price distribution:
count    2107.000000
mean      180.549976
std       171.097050
min        11.750000
25%        49.500000
50%       114.000000
75%       255.000000
max       815.000000
Name: commercial_min_price, dtype: float64

Sample data with commercial constraints:


  df[col] = pd.to_numeric(df[col], errors='ignore')


Unnamed: 0,product_id,region,warehouse_id,sku,current_price,commercial_min_price,price_after_discount
90,13023,Delta East,339,شوكولاتة تاوتاو كرانش بالكاكاو - 12 قطعه,48.5,48.5,48.5
91,13023,Delta East,170,شوكولاتة تاوتاو كرانش بالكاكاو - 12 قطعه,48.5,48.5,48.5
92,13023,Upper Egypt,501,شوكولاتة تاوتاو كرانش بالكاكاو - 12 قطعه,49.75,48.5,49.75
154,20969,Upper Egypt,501,باى لاك معمول ب السمسم محشو ب التمر - 5 جنية,50.5,48.5,50.5
172,20966,Upper Egypt,401,ﺑﺎﻯ ﻻﻙ ﻭﻳﻔﺮ ﺑﺴﻜﻮﻳﺖ ﻛﺎﻛﺎﻭ ﺑﻜﺮﻳﻤﺔ ﺟﻮﺯ ﺍﻟﻬﻨﺪ - 5 ...,49.25,48.5,49.25
178,144,Upper Egypt,632,عصير بيتى كوكتيل - 235 مل,187.0,184.5,185.160148
212,10181,Upper Egypt,401,مسحوق ليدر اتوماتيك جردل لافندر (1ك زيادة) - 5...,220.0,220.0,220.0
217,143,Upper Egypt,501,عصير بيتى تفاح - 235 مل,187.0,184.5,186.583925
218,143,Delta East,339,عصير بيتى تفاح - 235 مل,187.0,184.5,186.497084
219,143,Delta East,170,عصير بيتى تفاح - 235 مل,187.0,184.5,186.596312


In [47]:
# =============================================================================
# Active SKU Discount Query - Get current SKU discount percentage per warehouse
# =============================================================================
ACTIVE_SKU_DISCOUNT_QUERY = f'''
WITH active_sku_discount AS ( 
    SELECT 
        x.id AS sku_discount_id,
        retailer_id,
        product_id,
        packing_unit_id,
        DISCOUNT_PERCENTAGE,
        start_at,
        end_at 
    FROM (
        SELECT 
            sd.*,
            f.value::INT AS retailer_id 
        FROM SKU_DISCOUNTS sd,
        LATERAL FLATTEN(
            input => SPLIT(
                REPLACE(REPLACE(REPLACE(sd.retailer_ids, '{{', ''), '}}', ''), '"', ''), 
                ','
            )
        ) f
        WHERE start_at::DATE <= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE
            AND end_at::DATE >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE
    ) x 
    JOIN SKU_DISCOUNT_VALUES sdv ON x.id = sdv.sku_discount_id
    WHERE name_en = 'Special Discounts'
    QUALIFY MAX(start_at) OVER (PARTITION BY retailer_id, product_id, packing_unit_id) = start_at 
)

SELECT 
    product_id, 
    warehouse_id,
    AVG(DISCOUNT_PERCENTAGE) AS active_sku_disc_pct 
FROM (
    SELECT 
        asd.*,
        warehouse_id 
    FROM active_sku_discount asd 
    JOIN materialized_views.retailer_polygon rp ON rp.retailer_id = asd.retailer_id
    JOIN WAREHOUSE_DISPATCHING_RULES wdr ON wdr.product_id = asd.product_id
    JOIN DISPATCHING_POLYGONS dp ON dp.id = wdr.DISPATCHING_POLYGON_ID AND dp.district_id = rp.district_id
)
GROUP BY ALL
'''

# Execute active SKU discount query
print("Loading active SKU discount data...")
df_active_sku_disc = query_snowflake(ACTIVE_SKU_DISCOUNT_QUERY)
df_active_sku_disc = convert_to_numeric(df_active_sku_disc)
print(f"Loaded {len(df_active_sku_disc)} active SKU discount records")

# Merge with pricing_with_discount
pricing_with_discount = pricing_with_discount.merge(
    df_active_sku_disc[['product_id', 'warehouse_id', 'active_sku_disc_pct']],
    on=['product_id', 'warehouse_id'],
    how='left'
)

# Fill NaN with 0 (no active SKU discount)
pricing_with_discount['active_sku_disc_pct'] = pricing_with_discount['active_sku_disc_pct'].fillna(0)

print(f"\n{'='*60}")
print(f"ACTIVE SKU DISCOUNT ANALYSIS")
print(f"{'='*60}")
print(f"Records with active SKU discount: {len(pricing_with_discount[pricing_with_discount['active_sku_disc_pct'] > 0])}")
print(f"Records without active SKU discount: {len(pricing_with_discount[pricing_with_discount['active_sku_disc_pct'] == 0])}")
print(f"\nActive SKU discount distribution:")
print(pricing_with_discount[pricing_with_discount['active_sku_disc_pct'] > 0]['active_sku_disc_pct'].describe())
print(f"\nSample data with active SKU discounts:")
pricing_with_discount[pricing_with_discount['active_sku_disc_pct'] > 0][
    ['product_id', 'warehouse_id', 'sku', 'current_price', 'active_sku_disc_pct', 'discount_perc']
].head(15)


Loading active SKU discount data...
Loaded 14459 active SKU discount records

ACTIVE SKU DISCOUNT ANALYSIS
Records with active SKU discount: 15252
Records without active SKU discount: 76428

Active SKU discount distribution:
count    15252.000000
mean         1.451839
std          1.210029
min          0.260000
25%          0.550000
50%          1.000000
75%          1.920000
max          5.000000
Name: active_sku_disc_pct, dtype: float64

Sample data with active SKU discounts:


  df[col] = pd.to_numeric(df[col], errors='ignore')


Unnamed: 0,product_id,warehouse_id,sku,current_price,active_sku_disc_pct,discount_perc
0,3576,703,حفاضات بيبى جوى مضغوطة جيب مانع للتسريب كبير م...,292.5,1.04,0.0
3,9070,8,زيت كريستال عباد الشمس - 5 لتر,915.0,4.48,0.0448
4,448,339,زيت كريستال الممتاز خليط - 700 مل,618.5,0.63,0.0063
5,448,170,زيت كريستال الممتاز خليط - 700 مل,618.5,0.63,0.0
8,22327,339,بونكس اوتوماتيك برائحة الفل - 9 كجم,704.0,0.92,0.0
9,22327,170,بونكس اوتوماتيك برائحة الفل - 9 كجم,704.0,0.92,0.0
10,1414,1,تونة دولفين مفتتة حار - 140 جم,24.0,0.38,0.001361
11,3575,501,حفاضات بيبى جوى مضغوطة وسط مقاس 3 - 58 حفاضة,295.5,0.46,0.0
13,9609,962,سمن جنة - 2.5 كجم,923.75,1.01,0.0
17,1703,632,مناديل فاميليا تواليت 2 طبقة - 2 رول,179.25,0.97,0.0


In [48]:
final_df = pricing_with_discount[(pricing_with_discount['no_nmv_4m']==0)|(pricing_with_discount['stocks']>0)]

In [49]:
# Drop duplicates before saving
final_df = final_df.drop_duplicates(subset=['product_id', 'warehouse_id'], keep='first')
final_df.to_excel('pricing_with_discount.xlsx', index=False)
print(f"Exported {len(final_df)} records (duplicates removed)")

Exported 28491 records (duplicates removed)


In [50]:
final_df['created_at'] = TODAY
final_df['created_at'] =pd.to_datetime(final_df['created_at']).dt.date

In [53]:
status = upload_dataframe_to_snowflake("Egypt", final_df, "MATERIALIZED_VIEWS", "Pricing_data_extraction", "append", auto_create_table=True, conn=None)

# Send Slack notification
if status:
    slack_message = f"""✅ *Data Extraction Script Completed Successfully*
    
📅 Date: {TODAY}
📊 Records uploaded: {len(final_df):,}
🗄️ Table: MATERIALIZED_VIEWS.Pricing_data_extraction
⏰ Completed at: {datetime.now(CAIRO_TZ).strftime('%Y-%m-%d %H:%M:%S')} Cairo time"""
    
    send_text_slack('new-pricing-logic',slack_message)
    print("✅ Slack notification sent!")
else:
    error_message = f"""❌ *Data Extraction Script Failed*
    
📅 Date: {TODAY}
⏰ Failed at: {datetime.now(CAIRO_TZ).strftime('%Y-%m-%d %H:%M:%S')} Cairo time
⚠️ Upload to Snowflake failed - please check logs"""
    
    send_text_slack('new-pricing-logic',error_message)
    print("❌ Error notification sent to Slack!")

/home/ec2-user/service_account_key.json


  success, _, _, _ = write_pandas(


/home/ec2-user/service_account_key.json
Message Sent
✅ Slack notification sent!
