# Market Data Module

This module provides fresh market prices and margin-tier data on demand.

**NO INPUT REQUIRED** - All data is fetched directly from Snowflake.

## Functions
- `get_market_data()` - Fetch and process market prices from all sources (Ben Soliman, Marketplace, and "Scrapped", i.e. competitor prices collected by scraping)
- `get_margin_tiers()` - Calculate margin tiers for products from PRODUCT_STATISTICS

## Output Columns

**Market Data (from `get_market_data()`):**
- Raw prices: `ben_soliman_price`, `final_min_price`, `final_max_price`, etc.
- Price percentiles: `minimum`, `percentile_25`, `percentile_50`, `percentile_75`, `maximum`
- Market margins (the price percentiles converted to margins): `below_market`, `market_min`, `market_25`, `market_50`, `market_75`, `market_max`, `above_market`

**Margin Tiers (from `get_margin_tiers()`):**
- `margin_tier_below`, `margin_tier_1`, `margin_tier_2`, `margin_tier_3`, `margin_tier_4`, `margin_tier_5`, `margin_tier_above_1`, `margin_tier_above_2`

## Usage
```python
%run market_data_module.ipynb

# Get market data (no input required)
df_market = get_market_data()

# Get margin tiers (no input required)
df_margin_tiers = get_margin_tiers()
```


In [None]:
# =============================================================================
# IMPORTS & CONFIGURATION
# =============================================================================
import pandas as pd
import numpy as np
from datetime import datetime
import pytz
import snowflake.connector
import os

# Import setup_environment_2 for credentials
import sys
sys.path.append('..')
import setup_environment_2

# Initialize environment (loads Snowflake credentials).
# query_snowflake() below reads the SNOWFLAKE_* variables from os.environ.
setup_environment_2.initialize_env()

# Cairo timezone
CAIRO_TZ = pytz.timezone('Africa/Cairo')
# Snapshot taken once when this cell runs; it does not advance afterwards.
CAIRO_NOW = datetime.now(CAIRO_TZ)

# =============================================================================
# SNOWFLAKE CONNECTION
# =============================================================================
def query_snowflake(query):
    """Execute a query on Snowflake and return the result as a DataFrame.

    A new connection is opened for every call using the SNOWFLAKE_USERNAME,
    SNOWFLAKE_ACCOUNT, SNOWFLAKE_PASSWORD and SNOWFLAKE_DATABASE environment
    variables. The session switches to the COMPUTE_WH warehouse before the
    query runs, and both cursor and connection are closed afterwards, even
    when the query fails.

    Args:
        query: SQL text to execute.

    Returns:
        DataFrame whose column names are the lower-cased names reported by
        the cursor. In object-dtype columns, every value exposing
        ``__float__`` (e.g. ``decimal.Decimal``) is converted to ``float``;
        other values (strings, ``None``, dates) are left as they are.

    Raises:
        KeyError: if one of the required environment variables is missing.
        Any error raised by the Snowflake connector is propagated unchanged.
    """
    con = snowflake.connector.connect(
        user=os.environ["SNOWFLAKE_USERNAME"],
        account=os.environ["SNOWFLAKE_ACCOUNT"],
        password=os.environ["SNOWFLAKE_PASSWORD"],
        database=os.environ["SNOWFLAKE_DATABASE"]
    )
    try:
        cur = con.cursor()
        try:
            cur.execute("USE WAREHOUSE COMPUTE_WH")
            cur.execute(query)
            data = cur.fetchall()
            columns = [desc[0].lower() for desc in cur.description]
        finally:
            cur.close()

        df = pd.DataFrame(data, columns=columns)
        # Convert decimal.Decimal (and other float-able objects) to float
        for col in df.columns:
            if df[col].dtype == object:
                try:
                    df[col] = df[col].apply(lambda x: float(x) if hasattr(x, '__float__') else x)
                except (TypeError, ValueError, OverflowError):
                    # Best effort: a value that cannot be represented as a
                    # float leaves the whole column untouched. Anything else
                    # (e.g. KeyboardInterrupt) is no longer swallowed.
                    pass
        return df
    finally:
        con.close()

def get_snowflake_timezone():
    """Return the TIMEZONE parameter value reported by Snowflake.

    Falls back to "UTC" when the SHOW PARAMETERS query returns no rows.
    """
    params = query_snowflake("SHOW PARAMETERS LIKE 'TIMEZONE'")
    if len(params) == 0:
        return "UTC"
    return params['value'].iloc[0]

# Timezone reported by Snowflake; interpolated into the CONVERT_TIMEZONE calls
# of the query strings defined further down.
TIMEZONE = get_snowflake_timezone()

# Region-Cohort mapping, one row per cohort (Upper Egypt spans four cohorts)
REGION_COHORT_DF = pd.DataFrame(
    [
        ('Cairo', 700),
        ('Giza', 701),
        ('Delta West', 703),
        ('Delta East', 704),
        ('Upper Egypt', 1124),
        ('Upper Egypt', 1126),
        ('Upper Egypt', 1123),
        ('Upper Egypt', 1125),
        ('Alexandria', 702),
    ],
    columns=['region', 'cohort_id'],
)

print(f"Market Data Module loaded at {CAIRO_NOW.strftime('%Y-%m-%d %H:%M:%S')} Cairo time")
print(f"Snowflake timezone: {TIMEZONE}")


  warn_incompatible_dep(


/home/ec2-user/.Renviron
/home/ec2-user/service_account_key.json
Market Data Module loaded at 2026-01-24 01:55:57 Cairo time
Snowflake timezone: America/Los_Angeles


In [None]:
# =============================================================================
# ALL QUERIES FOR MARKET DATA MODULE
# =============================================================================
# Note: TIMEZONE is set dynamically from Snowflake in the imports cell above

# =============================================================================
# 1. BEN SOLIMAN PRICES QUERY
# =============================================================================
# Result: one row per product_id with the averaged `ben_soliman_price`.
#
# CTE `lower` (priority 2):
#   savvy_mapping rows injected within the last 5 days (Cairo date) whose
#   bs_price differs from wac_p * maxab_basic_unit_count by more than 30%
#   and where round(cu_price / bs_price) > 1, cu_price being the cohort-700
#   price of the basic packing unit. The price is bs_price multiplied by
#   `new_d`, a multiplier derived from p1 and child_quantity. Only the latest
#   INJECTION_DATE per product is kept.
#
# CTE `m_bs` (priority 1):
#   rows from the same 5-day window with diff < 0.3. Per (product, packing
#   unit) the latest injection date and the smallest diff are kept, the price
#   is normalised to a basic unit (bs_price / maxab_basic_unit_count), rows
#   outside +/-50% of wac_p are dropped, the smallest-diff row per product is
#   chosen, and finally the price must lie within 0.8x-1.3x of the current
#   wac_p.
#
# Final select: union of both CTEs restricted to each product's latest
# injection date; the lowest `prio` wins and the remaining prices are averaged.
BEN_SOLIMAN_QUERY = f'''
WITH lower as (
    select distinct product_id, new_d*bs_price as ben_soliman_price, INJECTION_DATE
    from (
        select maxab_product_id as product_id, INJECTION_DATE, wac1, wac_p,
            (bs_price) as bs_price, diff, cu_price,
            case when p1 > 1 then child_quantity else 0 end as scheck,
            round(p1/2)*2 as p1, p2,
            case when (ROUND(p1 / scheck) * scheck) = 0 then p1 else (ROUND(p1 / scheck) * scheck) end as new_d
        from (
            select sm.*, wac1, wac_p, 
                abs((bs_price)-(wac_p*maxab_basic_unit_count))/(wac_p*maxab_basic_unit_count) as diff,
                cpc.price as cu_price, pup.child_quantity,
                round((cu_price/bs_price)) as p1, 
                round(((bs_price)/cu_price)) as p2
            from materialized_views.savvy_mapping sm 
            join finance.all_cogs f on f.product_id = sm.maxab_product_id 
                and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_Date and f.to_date
            join PACKING_UNIT_PRODUCTS pu on pu.product_id = sm.maxab_product_id and pu.IS_BASIC_UNIT = 1 
            join cohort_product_packing_units cpc on cpc.PRODUCT_PACKING_UNIT_ID = pu.id and cohort_id = 700 
            join packing_unit_products pup on pup.product_id = sm.maxab_product_id and pup.is_basic_unit = 1  
            where bs_price is not null 
                and INJECTION_DATE::date >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 5 
                and diff > 0.3 and p1 > 1
        )
    )
    qualify max(INJECTION_DATE) over(partition by product_id) = INJECTION_DATE
),

m_bs as (
    select z.* from (
        select maxab_product_id as product_id, avg(bs_final_price) as ben_soliman_price, INJECTION_DATE
        from (
            select *, row_number() over(partition by maxab_product_id order by diff) as rnk_2 
            from (
                select *, (bs_final_price-wac_p)/wac_p as diff_2 
                from (
                    select *, bs_price/maxab_basic_unit_count as bs_final_price 
                    from (
                        select *, row_number() over(partition by maxab_product_id, maxab_pu order by diff) as rnk 
                        from (
                            select *, max(INJECTION_DATE::date) over(partition by maxab_product_id, maxab_pu) as max_date
                            from (
                                select sm.*, wac1, wac_p, 
                                    abs(bs_price-(wac_p*maxab_basic_unit_count))/(wac_p*maxab_basic_unit_count) as diff 
                                from materialized_views.savvy_mapping sm 
                                join finance.all_cogs f on f.product_id = sm.maxab_product_id 
                                    and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_Date and f.to_date
                                where bs_price is not null 
                                    and INJECTION_DATE::date >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 5 
                                    and diff < 0.3
                            )
                            qualify max_date = INJECTION_DATE
                        ) qualify rnk = 1 
                    )
                ) where diff_2 between -0.5 and 0.5 
            ) qualify rnk_2 = 1 
        ) group by all
    ) z 
    join finance.all_cogs f on f.product_id = z.product_id 
        and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_Date and f.to_date
    where ben_soliman_price between f.wac_p*0.8 and f.wac_p*1.3
)
select product_id,avg(ben_soliman_price) as ben_soliman_price
from (
select product_id,ben_soliman_price,INJECTION_DATE
from (
    select * from (
        select *,1 as prio from m_bs 
        union all
        select *, 2 as prio from lower
    )
    qualify max(INJECTION_DATE) over(partition by product_id) = INJECTION_DATE
)
qualify prio = min(prio)over(partition by product_id)
)
group by all
'''

# =============================================================================
# 2. MARKETPLACE PRICES QUERY (with region fallback)
# =============================================================================
# Result: one row per (region, product_id) with final_min_price,
# final_max_price, final_mod_price, final_true_min and final_true_max.
#
# CTE `MP`: marketplace prices divided by BASIC_UNIT_COUNT (per-basic-unit
#   prices), kept only when least(min_price, mod_price) lies within
#   0.9x-1.3x of the current wac_p, then collapsed per (region, product).
#   NOTE(review): every column, including max_price and true_max, is
#   collapsed with MIN() - confirm this is intended.
# CTE `region_mapping`: (region, fallback_region) pairs; a region can have
#   several fallbacks. 'Upper Egypt' and 'Alexandria' never act as fallbacks.
# CTE `full_data`: every activated product crossed with all six regions.
#
# Final select: a region's own price is used when present, otherwise the price
# from a fallback region (COALESCE). Because a region may have several
# fallbacks, the candidates are collapsed with MIN(). Rows without any
# final_min_price are dropped.
MARKETPLACE_PRICES_QUERY = f'''
WITH MP as (
    select region, product_id,
        min(min_price) as min_price, min(max_price) as max_price,
        min(mod_price) as mod_price, min(true_min) as true_min, min(true_max) as true_max
    from (
        select mp.region, mp.product_id, mp.pu_id,
            min_price/BASIC_UNIT_COUNT as min_price,
            max_price/BASIC_UNIT_COUNT as max_price,
            mod_price/BASIC_UNIT_COUNT as mod_price,
            TRUE_MIN_PRICE/BASIC_UNIT_COUNT as true_min,
            TRUE_MAX_PRICE/BASIC_UNIT_COUNT as true_max
        from materialized_views.marketplace_prices mp 
        join packing_unit_products pup on pup.product_id = mp.product_id and pup.packing_unit_id = mp.pu_id
        join finance.all_cogs f on f.product_id = mp.product_id 
            and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_date and f.to_date
        where least(min_price, mod_price) between wac_p*0.9 and wac_p*1.3 
    )
    group by all 
),

region_mapping AS (
    SELECT * FROM (VALUES
        ('Delta East', 'Delta West'), ('Delta West', 'Delta East'),
        ('Alexandria', 'Cairo'), ('Alexandria', 'Giza'),
        ('Upper Egypt', 'Cairo'), ('Upper Egypt', 'Giza'),
        ('Cairo', 'Giza'), ('Giza', 'Cairo'),
        ('Delta West', 'Cairo'), ('Delta East', 'Cairo'),
        ('Delta West', 'Giza'), ('Delta East', 'Giza')
    ) AS region_mapping(region, fallback_region)
),

all_regions as (
    SELECT * FROM (VALUES
        ('Cairo'), ('Giza'), ('Delta West'), ('Delta East'), ('Upper Egypt'), ('Alexandria')
    ) AS x(region)
),

full_data as (
    select products.id as product_id, ar.region
    from products, all_regions ar
    where activation = 'true'
)

select region, product_id,
    min(final_min_price) as final_min_price, 
    min(final_max_price) as final_max_price,
    min(final_mod_price) as final_mod_price, 
    min(final_true_min) as final_true_min,
    min(final_true_max) as final_true_max
from (
    SELECT distinct w.region, w.product_id,
        COALESCE(m1.min_price, m2.min_price) AS final_min_price,
        COALESCE(m1.max_price, m2.max_price) AS final_max_price,
        COALESCE(m1.mod_price, m2.mod_price) AS final_mod_price,
        COALESCE(m1.true_min, m2.true_min) AS final_true_min,
        COALESCE(m1.true_max, m2.true_max) AS final_true_max
    FROM full_data w
    LEFT JOIN MP m1 ON w.region = m1.region and w.product_id = m1.product_id
    LEFT JOIN region_mapping rm ON w.region = rm.region
    LEFT JOIN MP m2 ON rm.fallback_region = m2.region AND w.product_id = m2.product_id
)
where final_min_price is not null 
group by all
'''

# =============================================================================
# 3. SCRAPPED DATA QUERY (Competitor prices from scraping)
# =============================================================================
# Result: one row per (product_id, region) with the minimum, 25th/50th/75th
# percentile and maximum of `market_price`.
#
# Input rows come from CLEANED_MARKET_PRICES for the last 7 days (Cairo date),
# limited to prices within 0.8x-1.3x of the current wac_p. For every
# (region, product, competitor) only the rows from that competitor's most
# recent date are kept, so each competitor contributes its latest price(s).
SCRAPPED_QUERY = f'''
select product_id, region,
    MIN(market_price) AS min_scrapped,
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY market_price) AS scrapped25,
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY market_price) AS scrapped50,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY market_price) AS scrapped75,
    MAX(market_price) AS max_scrapped
from (
    select distinct cmp.*, max(date) over(partition by region, cmp.product_id, competitor) as max_date
    from MATERIALIZED_VIEWS.CLEANED_MARKET_PRICES cmp
    join finance.all_cogs f on f.product_id = cmp.product_id 
        and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_date and f.to_date 
    where date >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 7 
        and MARKET_PRICE between f.wac_p * 0.8 and wac_p * 1.3
    qualify date = max_date 
)
group by all
'''

# =============================================================================
# 4. PRODUCT GROUPS QUERY
# =============================================================================
# Returns every row and column of sku_commercial_groups unfiltered.
# get_market_data() merges the result on `product_id` and reads `group_id`.
GROUPS_QUERY = '''
SELECT * FROM materialized_views.sku_commercial_groups
'''

# =============================================================================
# 5. SALES DATA QUERY (for NMV weighting in group processing)
# =============================================================================
# Result: one row per (cohort_id, product_id) with the display name (`sku`),
# brand, category and `nmv` = sum of total_price.
#
# Window: orders created from 120 days ago through yesterday (Cairo date).
# Filters: order status not in (7, 12), channel telesales or retailer,
# non-zero purchased_item_count, and only the nine cohorts listed below.
SALES_QUERY = f'''
SELECT DISTINCT cpc.cohort_id, pso.product_id,
    CONCAT(products.name_ar,' ',products.size,' ',product_units.name_ar) as sku,
    brands.name_ar as brand, categories.name_ar as cat,
    sum(pso.total_price) as nmv
FROM product_sales_order pso
JOIN sales_orders so ON so.id = pso.sales_order_id
JOIN COHORT_PRICING_CHANGES cpc ON cpc.id = pso.COHORT_PRICING_CHANGE_id
JOIN products ON products.id = pso.product_id
JOIN brands ON products.brand_id = brands.id 
JOIN categories ON products.category_id = categories.id
JOIN product_units ON product_units.id = products.unit_id 
WHERE so.created_at::date BETWEEN CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 120 
    AND CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 1 
    AND so.sales_order_status_id NOT IN (7, 12)
    AND so.channel IN ('telesales', 'retailer')
    AND pso.purchased_item_count <> 0
    AND cpc.cohort_id IN (700,701,702,703,704,1123,1124,1125,1126)
GROUP BY ALL
'''

# =============================================================================
# 6. MARGIN STATS QUERY (STD and average margins)
# =============================================================================
# Result: one row per (product_id, cohort_id) with
#   std        = 0.6 * product_std + 0.3 * brand_std + 0.1 * cat_std
#   avg_margin = average of the daily product margin
# where each margin is (nmv - cogs) / nmv computed per order date and the
# standard deviations are taken across those daily values.
#
# Window: from the first day of the month containing (today - 120 days)
# through today (Cairo date). Same status / channel / item-count filters as
# the sales query; COGS uses the wac_p valid on the order date.
# NOTE(review): the brand and category window sums partition by order_date,
# cat and brand only (not cohort_id), and no cohort filter is applied here -
# confirm that cross-cohort brand/category margins are intended.
MARGIN_STATS_QUERY = f'''
select product_id, cohort_id, 
    (0.6*product_std) + (0.3*brand_std) + (0.1*cat_std) as std, 
    avg_margin
from (
    select product_id, cohort_id, 
        stddev(product_margin) as product_std, 
        stddev(brand_margin) as brand_std,
        stddev(cat_margin) as cat_std, 
        avg(product_margin) as avg_margin
    from (
        select distinct product_id, order_date, cohort_id,
            (nmv-cogs_p)/nmv as product_margin, 
            (brand_nmv-brand_cogs)/brand_nmv as brand_margin,
            (cat_nmv-cat_cogs)/cat_nmv as cat_margin
        from (
            SELECT DISTINCT so.created_at::date as order_date, cpc.cohort_id, pso.product_id,
                brands.name_ar as brand, categories.name_ar as cat,
                sum(COALESCE(f.wac_p,0) * pso.purchased_item_count * pso.basic_unit_count) as cogs_p,
                sum(pso.total_price) as nmv,
                sum(nmv) over(partition by order_date, cat, brand) as brand_nmv,
                sum(cogs_p) over(partition by order_date, cat, brand) as brand_cogs,
                sum(nmv) over(partition by order_date, cat) as cat_nmv,
                sum(cogs_p) over(partition by order_date, cat) as cat_cogs
            FROM product_sales_order pso
            JOIN sales_orders so ON so.id = pso.sales_order_id   
            JOIN COHORT_PRICING_CHANGES cpc on cpc.id = pso.cohort_pricing_change_id
            JOIN products on products.id = pso.product_id
            JOIN brands on products.brand_id = brands.id 
            JOIN categories ON products.category_id = categories.id
            JOIN finance.all_cogs f ON f.product_id = pso.product_id
                AND f.from_date::date <= so.created_at::date AND f.to_date::date > so.created_at::date
            WHERE so.created_at::date between 
                date_trunc('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 120) 
                and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date
                AND so.sales_order_status_id not in (7,12)
                AND so.channel IN ('telesales','retailer')
                AND pso.purchased_item_count <> 0
            GROUP BY ALL
        )
    ) group by all 
)
'''

# =============================================================================
# 7. TARGET MARGINS QUERY
# =============================================================================
# Result: one row per distinct (cat, brand, target_bm) with the category-level
# `cat_target_margin` attached.
#
# Month selection (both CTEs): if the newest date in commercial_targets falls
# in the current Cairo month, that month's targets are used; otherwise the
# previous month's targets are used.
#
# CTE `cat_brand_target`: distinct (cat, brand, margin) for the selected month.
# CTE `cat_target`: per category, the average brand target margin weighted by
#   each brand's share of the category's target NMV.
# get_market_data() falls back from target_bm to cat_target_margin to 0.
TARGET_MARGINS_QUERY = f'''
WITH cat_brand_target as (
    SELECT DISTINCT cat, brand, margin as target_bm
    FROM performance.commercial_targets cplan
    QUALIFY CASE 
        WHEN DATE_TRUNC('month', MAX(DATE) OVER()) = DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date) 
        THEN DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date)
        ELSE DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - INTERVAL '1 month') 
    END = DATE_TRUNC('month', date)
),
cat_target as (
    select cat, sum(target_bm * (target_nmv/cat_total)) as cat_target_margin
    from (
        select *, sum(target_nmv) over(partition by cat) as cat_total
        from (
            select cat, brand, avg(target_bm) as target_bm, sum(target_nmv) as target_nmv
            from (
                SELECT DISTINCT date, city as region, cat, brand, margin as target_bm, nmv as target_nmv
                FROM performance.commercial_targets cplan
                QUALIFY CASE 
                    WHEN DATE_TRUNC('month', MAX(DATE) OVER()) = DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date) 
                    THEN DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date)
                    ELSE DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - INTERVAL '1 month') 
                END = DATE_TRUNC('month', date)
            ) group by all
        )
    ) group by all 
)
SELECT DISTINCT cbt.cat, cbt.brand, cbt.target_bm, ct.cat_target_margin
FROM cat_brand_target cbt
LEFT JOIN cat_target ct ON ct.cat = cbt.cat
'''

# =============================================================================
# 8. PRODUCT BASE QUERY (WAC data)
# =============================================================================
# Result: one row per (cohort_id, product_id) for every activated product that
# has a finance.all_cogs record valid right now (Cairo time), with its region
# label, brand, category, wac1 and wac_p.
#
# The cohort list is produced by cross-joining against the nine cohort IDs
# found in COHORT_PRICING_CHANGES; WAC itself is not cohort-specific here.
# The CASE also maps cohorts 695-699, but the `cohorts` subquery only yields
# the nine IDs in its IN list, so those branches are not reached by this query.
PRODUCT_BASE_QUERY = f'''
SELECT DISTINCT
    CASE 
        WHEN cohort_id IN (700, 695) THEN 'Cairo'
        WHEN cohort_id IN (701) THEN 'Giza'
        WHEN cohort_id IN (704, 698) THEN 'Delta East'
        WHEN cohort_id IN (703, 697) THEN 'Delta West'
        WHEN cohort_id IN (696, 1123, 1124, 1125, 1126) THEN 'Upper Egypt'
        WHEN cohort_id IN (702, 699) THEN 'Alexandria'
    END AS region,
    cohort_id,
    f.product_id,
    brands.name_ar as brand,
    categories.name_ar as cat,
    f.wac1,
    f.wac_p
FROM finance.all_cogs f
JOIN products ON products.id = f.product_id
JOIN brands ON products.brand_id = brands.id
JOIN categories ON products.category_id = categories.id
CROSS JOIN (
    SELECT DISTINCT cohort_id 
    FROM COHORT_PRICING_CHANGES 
    WHERE cohort_id IN (700,701,702,703,704,1123,1124,1125,1126)
) cohorts
WHERE CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) BETWEEN f.from_date AND f.to_date
    AND products.activation = 'true'
'''

# =============================================================================
# 9. MARGIN BOUNDARIES QUERY (for margin tiers)
# =============================================================================
# Result: region, product_id, optimal_bm, MIN_BOUNDARY, MAX_BOUNDARY and
# MEDIAN_BM taken from the most recent PRODUCT_STATISTICS row of each
# (product_id, region).
#
# Only rows created on or after the first day of the month containing
# (today - 60 days, Cairo date) are considered. `target_bm` is selected in the
# inner query but not returned by the outer one.
MARGIN_BOUNDARIES_QUERY = f'''
SELECT 
    region,
    product_id,
    optimal_bm,
    MIN_BOUNDARY,
    MAX_BOUNDARY,
    MEDIAN_BM
FROM (
    SELECT 
        region,
        product_id,
        target_bm,
        optimal_bm,
        MIN_BOUNDARY,
        MAX_BOUNDARY,
        MEDIAN_BM,
        MAX(created_at) OVER (PARTITION BY product_id, region) AS max_date,
        created_at
    FROM materialized_views.PRODUCT_STATISTICS
    WHERE created_at::DATE >= DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::DATE - 60)
    QUALIFY max_date = created_at
)
'''

print("All queries defined ✓")


Market price queries defined ✓


In [None]:
# =============================================================================
# HELPER FUNCTIONS
# =============================================================================

def _price_at_margin(wac, margin):
    """Return the price at which a cost of ``wac`` yields ``margin`` on price.

    Margin is (price - wac) / price, hence price = wac / (1 - margin). A
    margin of 100% or more cannot be reached at any finite price, so the
    bound is +inf instead of the negative value the raw formula would give.
    """
    denominator = 1 - margin
    if denominator <= 0:
        return np.inf
    return wac / denominator


def price_analysis(row):
    """
    Analyze prices and calculate percentiles for a product.

    Collects prices from all sources (Ben Soliman, Marketplace, Scrapped),
    keeps the distinct ones inside the acceptable range, and summarizes them.

    A price ``x`` is accepted when all of the following hold:
      * it is present, not NaN and non-zero;
      * its implied margin is between ``avg_margin - 10*std`` and
        ``max(avg_margin, target_margin) + 10*std``;
      * ``x >= wac * (0.9 + 0.7 * target_margin)``.
    ``avg_margin`` falls back to ``target_margin`` when it is below 1%, and
    ``std`` is floored at 0.0025.

    Args:
        row: mapping-like record (e.g. a DataFrame row) providing ``wac_p``,
            ``avg_margin``, ``std`` and ``target_margin``; the price fields
            are optional and read with ``row.get``.

    Returns:
        Tuple ``(minimum, percentile_25, percentile_50, percentile_75,
        maximum)`` over the distinct accepted prices, or five NaNs when no
        price is accepted.
    """
    wac = row['wac_p']
    target_margin = row['target_margin']
    avg_margin = row['avg_margin'] if row['avg_margin'] >= 0.01 else target_margin
    std = np.maximum(row['std'], 0.0025)
    max_marg = np.maximum(avg_margin, target_margin)

    # Price band implied by the margin band. When the upper margin reaches
    # 100% the band is open-ended; previously the negative denominator made
    # the upper bound negative and silently rejected every price.
    lower_bound = _price_at_margin(wac, avg_margin - (10 * std))
    upper_bound = _price_at_margin(wac, max_marg + 10 * std)
    price_floor = wac * (0.9 + target_margin * 0.7)

    # Collect all price points from different sources
    price_list = [
        row.get('ben_soliman_price'),
        row.get('final_min_price'),
        row.get('final_mod_price'),
        row.get('final_max_price'),
        row.get('final_true_min'),
        row.get('final_true_max'),
        row.get('min_scrapped'),
        row.get('scrapped25'),
        row.get('scrapped50'),
        row.get('scrapped75'),
        row.get('max_scrapped')
    ]

    # Distinct valid prices; pd.isna() covers None, NaN and pd.NA, so the
    # missing-value test runs before any truthiness check.
    valid_prices = sorted({
        x for x in price_list
        if not pd.isna(x) and x != 0
        and lower_bound <= x <= upper_bound
        and x >= price_floor
    })

    if not valid_prices:
        return np.nan, np.nan, np.nan, np.nan, np.nan

    return (
        np.min(valid_prices),
        np.percentile(valid_prices, 25),
        np.percentile(valid_prices, 50),
        np.percentile(valid_prices, 75),
        np.max(valid_prices)
    )


def calculate_step_bounds(row):
    """Calculate below/above market bounds based on price steps.

    The gaps between consecutive price points (minimum, 25th, 50th, 75th
    percentile, maximum) are treated as "steps". Steps whose size relative to
    WAC is at most ``1.2 * std`` are averaged; that average is subtracted
    from the minimum and added to the maximum to obtain one extra tier on
    each side.

    When no step qualifies, the step defaults to
    ``min(2 * std, 0.2 * target_margin)``. That value is a fraction of WAC
    (``std`` is compared against ``step / wac``), so it is multiplied by
    ``wac`` to turn it into a price difference.

    Args:
        row: mapping-like record providing ``wac_p``, ``std``, ``minimum``,
            ``percentile_25``, ``percentile_50``, ``percentile_75`` and
            ``maximum``; ``target_margin`` is optional (default 0.05).

    Returns:
        Tuple ``(new_min, new_max)``. Each bound is moved by the step only if
        the moved value is still at or above WAC; otherwise the original
        minimum / maximum is returned.
    """
    wac = row['wac_p']
    std = row['std']
    target_margin = row.get('target_margin', 0.05)

    prices = [
        row['minimum'],
        row['percentile_25'],
        row['percentile_50'],
        row['percentile_75'],
        row['maximum']
    ]

    # Keep only the steps that are small relative to WAC
    valid_steps = [
        upper - lower
        for lower, upper in zip(prices, prices[1:])
        if (upper - lower) / wac <= std * 1.2
    ]

    if valid_steps:
        avg_step = np.mean(valid_steps)
    else:
        # Fallback is expressed relative to WAC; scale it to price units.
        avg_step = min(2 * std, 0.2 * target_margin) * wac

    new_min = prices[0] - avg_step if (prices[0] - avg_step) >= wac else prices[0]
    new_max = prices[-1] + avg_step if (prices[-1] + avg_step) >= wac else prices[-1]

    return new_min, new_max


def weighted_median(series, weights):
    """Return the weighted median of ``series`` using ``weights``.

    Pairs in which either the value or the weight is missing are ignored.
    The remaining values are sorted ascending and the first value whose
    cumulative weight reaches half of the total weight is returned. NaN is
    returned when no valid pair remains.
    """
    usable = series.notna() & weights.notna()
    values = series[usable]
    wts = weights[usable]

    if len(values) == 0:
        return np.nan

    # Reorder both series by ascending value
    ranking = np.argsort(values)
    values = values.iloc[ranking]
    wts = wts.iloc[ranking]

    running_weight = np.cumsum(wts)
    half_weight = wts.sum() / 2
    position = np.searchsorted(running_weight, half_weight)
    return values.iloc[position]


print("Helper functions defined ✓")


Price analysis helper functions defined ✓


In [None]:
# =============================================================================
# MAIN FUNCTION 1: get_market_data()
# =============================================================================
# This function fetches and processes all market price data from Snowflake
# NO INPUT REQUIRED - all data is fetched directly

def get_market_data() -> pd.DataFrame:
    """
    Fetch and process all market prices from Snowflake.
    
    NO INPUT REQUIRED - All data is fetched directly from Snowflake.
    
    Process:
    1. Fetch Ben Soliman, Marketplace, and Scrapped prices
    2. Outer join all price sources
    3. Add cohort IDs and supporting data (sales, margin stats, targets)
    4. Process group-level prices (weighted median)
    5. Apply price coverage filtering
    6. Calculate price percentiles
    7. Convert prices to margins
    
    Returns:
        DataFrame with columns:
        - cohort_id, product_id, region
        - Raw prices: ben_soliman_price, final_min_price, etc.
        - Percentiles: minimum, percentile_25, percentile_50, percentile_75, maximum
        - Margins: below_market, market_min, market_25, market_50, market_75, market_max, above_market
    """
    print("\n" + "="*70)
    print("FETCHING MARKET DATA")
    print("="*70)
    print(f"Timestamp: {datetime.now(CAIRO_TZ).strftime('%Y-%m-%d %H:%M:%S')} Cairo time")
    
    # =========================================================================
    # Step 1: Fetch all raw price data
    # =========================================================================
    print("\nStep 1: Fetching raw price data...")
    
    print("  1.1 Ben Soliman prices...")
    df_ben_soliman = query_snowflake(BEN_SOLIMAN_QUERY)
    print(f"      Loaded {len(df_ben_soliman)} records")
    
    print("  1.2 Marketplace prices...")
    df_marketplace = query_snowflake(MARKETPLACE_PRICES_QUERY)
    df_marketplace['final_true_min'] = np.nan
    print(f"      Loaded {len(df_marketplace)} records")
    
    print("  1.3 Scrapped prices...")
    df_scrapped = query_snowflake(SCRAPPED_QUERY)
    print(f"      Loaded {len(df_scrapped)} records")
    
    print("  1.4 Product groups...")
    df_groups = query_snowflake(GROUPS_QUERY)
    print(f"      Loaded {len(df_groups)} records")
    
    print("  1.5 Sales data (for NMV weighting)...")
    df_sales = query_snowflake(SALES_QUERY)
    print(f"      Loaded {len(df_sales)} records")
    
    print("  1.6 Margin stats...")
    df_margin_stats = query_snowflake(MARGIN_STATS_QUERY)
    print(f"      Loaded {len(df_margin_stats)} records")
    
    print("  1.7 Target margins...")
    df_targets = query_snowflake(TARGET_MARGINS_QUERY)
    print(f"      Loaded {len(df_targets)} records")
    
    print("  1.8 Product base (WAC)...")
    df_product_base = query_snowflake(PRODUCT_BASE_QUERY)
    print(f"      Loaded {len(df_product_base)} records")
    
    # =========================================================================
    # Step 2: Outer join all market price sources
    # =========================================================================
    print("\nStep 2: Joining all market price sources (outer join)...")
    
    # Start with marketplace prices (has region + product_id)
    market_data = df_marketplace.copy()
    
    # Outer join with scrapped data (by region + product_id)
    market_data = market_data.merge(df_scrapped, on=['region', 'product_id'], how='outer')
    
    # Outer join with Ben Soliman prices (by product_id only - expand to all regions)
    all_regions = pd.DataFrame({'region': ['Cairo', 'Giza', 'Delta West', 'Delta East', 'Upper Egypt', 'Alexandria']})
    df_ben_soliman_expanded = df_ben_soliman.merge(all_regions, how='cross')
    
    # Outer join with Ben Soliman
    market_data = market_data.merge(df_ben_soliman_expanded, on=['region', 'product_id'], how='outer')
    
    print(f"    Market prices base: {len(market_data)} records")
    
    # =========================================================================
    # Step 3: Add cohort IDs and supporting data
    # =========================================================================
    print("\nStep 3: Adding cohort IDs and supporting data...")
    
    market_data = market_data.merge(REGION_COHORT_DF, on='region')
    
    # Add sales data for NMV weighting
    market_data = market_data.merge(
        df_sales[['cohort_id', 'product_id', 'nmv', 'sku', 'brand', 'cat']], 
        on=['cohort_id', 'product_id'], 
        how='left'
    )
    market_data['nmv'] = market_data['nmv'].fillna(0)
    
    # Merge product groups
    market_data = market_data.merge(df_groups, on='product_id', how='left')
    
    # Remove duplicates
    market_data = market_data.drop_duplicates(subset=['cohort_id', 'product_id'])
    
    print(f"    Records after adding cohorts: {len(market_data)}")
    
    # =========================================================================
    # Step 4: Group-level price processing
    # =========================================================================
    print("\nStep 4: Processing group-level prices...")
    
    price_cols = [
        'ben_soliman_price', 'final_min_price', 'final_max_price', 'final_mod_price', 
        'final_true_min', 'final_true_max', 'min_scrapped', 'scrapped25', 
        'scrapped50', 'scrapped75', 'max_scrapped'
    ]
    
    groups_data = market_data[~market_data['group_id'].isna()].copy()
    
    if len(groups_data) > 0:
        groups_data['group_nmv'] = groups_data.groupby(['group_id', 'cohort_id'])['nmv'].transform('sum')
        groups_data['cntrb'] = (groups_data['nmv'] / groups_data['group_nmv']).fillna(1)
        
        # Flag if any price column is non-NaN
        groups_data['flag_non_nan'] = groups_data[price_cols].notna().any(axis=1).astype(int)
        
        # Perform weighted aggregation
        groups_agg = (
            groups_data[groups_data['flag_non_nan'] == 1]
            .groupby(['group_id', 'cohort_id'])
            .apply(lambda g: pd.Series({
                col: weighted_median(g[col], g['cntrb']) for col in price_cols
            }))
            .reset_index()
        )
        
        # Fill missing prices with group-level prices
        merged = market_data.merge(groups_agg, on=['group_id', 'cohort_id'], how='left', suffixes=('', '_group'))
        for col in price_cols:
            if f'{col}_group' in merged.columns:
                merged[col] = merged[col].fillna(merged[f'{col}_group'])
        
        market_data = merged.drop(columns=[f'{c}_group' for c in price_cols if f'{c}_group' in merged.columns], errors='ignore')
        
        # Add missing group SKUs
        missing_groups_skus = df_groups.merge(groups_agg, on='group_id')
        missing_groups_skus = missing_groups_skus.merge(REGION_COHORT_DF, on='cohort_id')
        market_data = pd.concat([market_data, missing_groups_skus])
        market_data = market_data.drop_duplicates(subset=['cohort_id', 'product_id'], keep='first')
    
    print(f"    Records after group processing: {len(market_data)}")
    
    # =========================================================================
    # Step 5: Add WAC and margin data
    # =========================================================================
    print("\nStep 5: Adding WAC and margin data...")
    
    # Drop nmv and re-merge sales
    market_data = market_data.drop(columns=['nmv'], errors='ignore')
    market_data = market_data.merge(
        df_sales[['cohort_id', 'product_id', 'nmv']], 
        on=['cohort_id', 'product_id'], 
        how='left'
    )
    
    # Add WAC from product base
    market_data = market_data.merge(
        df_product_base[['cohort_id', 'product_id', 'wac_p', 'brand', 'cat']].drop_duplicates(), 
        on=['cohort_id', 'product_id'], 
        how='left',
        suffixes=('', '_base')
    )
    # Fill brand/cat from base if missing
    if 'brand_base' in market_data.columns:
        market_data['brand'] = market_data['brand'].fillna(market_data['brand_base'])
        market_data['cat'] = market_data['cat'].fillna(market_data['cat_base'])
        market_data = market_data.drop(columns=['brand_base', 'cat_base'], errors='ignore')
    
    # Add margin stats
    market_data = market_data.merge(df_margin_stats, on=['cohort_id', 'product_id'], how='left')
    
    # Add target margins
    market_data = market_data.merge(df_targets, on=['brand', 'cat'], how='left')
    market_data['target_margin'] = market_data['target_bm'].fillna(market_data['cat_target_margin']).fillna(0)
    market_data = market_data.drop(columns=['target_bm', 'cat_target_margin'], errors='ignore')
    
    # Fill NaN values with defaults
    market_data['std'] = market_data['std'].fillna(0.01)
    market_data['avg_margin'] = market_data['avg_margin'].fillna(0)
    
    # Filter out records without WAC
    market_data = market_data[~market_data['wac_p'].isna()]
    
    print(f"    Records with WAC: {len(market_data)}")
    
    # =========================================================================
    # Step 6: Price coverage filtering
    # =========================================================================
    print("\nStep 6: Filtering by price coverage...")
    
    market_data['ben'] = 0
    market_data['MP'] = 0
    market_data['sp'] = 0
    
    # Ben Soliman: 1 point if present
    market_data.loc[~market_data['ben_soliman_price'].isna(), 'ben'] = 1
    
    # Marketplace: 1 point if single price, 3 points if range
    market_data.loc[(market_data['final_min_price'] == market_data['final_max_price']) & 
                    (~market_data['final_min_price'].isna()), 'MP'] = 1
    market_data.loc[(market_data['final_min_price'] != market_data['final_max_price']) & 
                    (~market_data['final_min_price'].isna()), 'MP'] = 3
    
    # Scrapped: 1 point if single price, 5 points if range
    market_data.loc[(market_data['min_scrapped'] == market_data['max_scrapped']) & 
                    (~market_data['min_scrapped'].isna()), 'sp'] = 1
    market_data.loc[(market_data['min_scrapped'] != market_data['max_scrapped']) & 
                    (~market_data['min_scrapped'].isna()), 'sp'] = 5
    
    market_data['total_p'] = market_data['ben'] + market_data['MP'] + market_data['sp']
    
    # Filter: keep only products with total_p > 2
    market_data = market_data[market_data['total_p'] > 2]
    
    print(f"    Records after price coverage filter: {len(market_data)}")
    
    # =========================================================================
    # Step 7: Apply price analysis
    # =========================================================================
    print("\nStep 7: Calculating price percentiles...")
    
    market_data[['minimum', 'percentile_25', 'percentile_50', 'percentile_75', 'maximum']] = \
        market_data.apply(price_analysis, axis=1, result_type='expand')
    
    # Filter out records without valid price analysis
    market_data = market_data[~market_data['minimum'].isna()]
    
    # Calculate below/above market bounds
    market_data[['below_market', 'above_market']] = market_data.apply(calculate_step_bounds, axis=1, result_type='expand')
    
    print(f"    Records after price analysis: {len(market_data)}")
    
    # =========================================================================
    # Step 8: Convert prices to margins
    # =========================================================================
    print("\nStep 8: Converting prices to margins...")
    
    market_data['below_market'] = (market_data['below_market'] - market_data['wac_p']) / market_data['below_market']
    market_data['market_min'] = (market_data['minimum'] - market_data['wac_p']) / market_data['minimum']
    market_data['market_25'] = (market_data['percentile_25'] - market_data['wac_p']) / market_data['percentile_25']
    market_data['market_50'] = (market_data['percentile_50'] - market_data['wac_p']) / market_data['percentile_50']
    market_data['market_75'] = (market_data['percentile_75'] - market_data['wac_p']) / market_data['percentile_75']
    market_data['market_max'] = (market_data['maximum'] - market_data['wac_p']) / market_data['maximum']
    market_data['above_market'] = (market_data['above_market'] - market_data['wac_p']) / market_data['above_market']
    
    # =========================================================================
    # Step 9: Select output columns
    # =========================================================================
    market_columns = [
        'cohort_id', 'product_id', 'region',
        # Raw prices
        'ben_soliman_price', 
        'final_min_price', 'final_max_price', 'final_mod_price', 'final_true_min', 'final_true_max',
        'min_scrapped', 'scrapped25', 'scrapped50', 'scrapped75', 'max_scrapped',
        # Price Percentiles
        'minimum', 'percentile_25', 'percentile_50', 'percentile_75', 'maximum',
        # Margin Tiers
        'below_market', 'market_min', 'market_25', 'market_50', 'market_75', 'market_max', 'above_market'
    ]
    market_data = market_data[[c for c in market_columns if c in market_data.columns]]
    
    # =========================================================================
    # Summary
    # =========================================================================
    print("\n" + "="*70)
    print("MARKET DATA COMPLETE")
    print("="*70)
    print(f"Total records: {len(market_data)}")
    print(f"  - With marketplace prices: {(~market_data['final_min_price'].isna()).sum()}")
    print(f"  - With scrapped prices: {(~market_data['min_scrapped'].isna()).sum()}")
    print(f"  - With Ben Soliman prices: {(~market_data['ben_soliman_price'].isna()).sum()}")
    
    return market_data


# Load-time confirmation: printed when this cell is executed, i.e. once the
# get_market_data() definition above has been evaluated.
print("get_market_data() function defined ✓")


Data fetching functions defined ✓


In [None]:
# =============================================================================
# MAIN FUNCTION 2: get_margin_tiers()
# =============================================================================
# This function fetches margin boundaries and calculates margin tiers
# NO INPUT REQUIRED - all data is fetched directly

def get_margin_tiers() -> pd.DataFrame:
    """
    Fetch margin boundaries from Snowflake and derive eight margin tiers.

    NO INPUT REQUIRED - All data is fetched directly from Snowflake.

    Process:
    1. Run MARGIN_BOUNDARIES_QUERY through query_snowflake().
    2. Left-join REGION_COHORT_DF on 'region' to attach cohort IDs. Rows whose
       region has no cohort mapping are kept (cohort_id is NaN) and reported
       with a warning.
    3. Calculate 8 margin tiers:
       - effective_min_margin = row-wise min(min_boundary, optimal_bm)
       - margin_step = (max_boundary - effective_min_margin) / 4
       - margin_tier_below: 1 step below effective_min_margin
       - margin_tier_1 to margin_tier_5: effective_min_margin up to
         max_boundary, one step apart
       - margin_tier_above_1, margin_tier_above_2: 1 and 2 steps above
         max_boundary

    Returns:
        DataFrame with columns (those that exist after the merge):
        - product_id, region, cohort_id
        - optimal_bm, min_boundary, max_boundary, median_bm
        - effective_min_margin, margin_step
        - margin_tier_below, margin_tier_1, margin_tier_2, margin_tier_3,
          margin_tier_4, margin_tier_5, margin_tier_above_1, margin_tier_above_2

        If the query returns no rows, an empty DataFrame that still carries
        all of the columns listed above is returned, so callers can select or
        merge on them without a KeyError.
    """
    # Declared up front so the empty-result path exposes the same schema as
    # the normal path.
    output_cols = [
        'product_id', 'region', 'cohort_id',
        'optimal_bm', 'min_boundary', 'max_boundary', 'median_bm',
        'effective_min_margin', 'margin_step',
        'margin_tier_below', 'margin_tier_1', 'margin_tier_2', 'margin_tier_3',
        'margin_tier_4', 'margin_tier_5', 'margin_tier_above_1', 'margin_tier_above_2'
    ]

    print("\n" + "="*70)
    print("FETCHING MARGIN TIERS")
    print("="*70)
    print(f"Timestamp: {datetime.now(CAIRO_TZ).strftime('%Y-%m-%d %H:%M:%S')} Cairo time")

    # =========================================================================
    # Step 1: Fetch margin boundaries
    # =========================================================================
    print("\nStep 1: Fetching margin boundaries from PRODUCT_STATISTICS...")

    df_margin_boundaries = query_snowflake(MARGIN_BOUNDARIES_QUERY)
    print(f"    Loaded {len(df_margin_boundaries)} records")

    if df_margin_boundaries.empty:
        print("    ⚠️ No margin boundaries found!")
        # Keep the documented columns even when there is no data.
        return pd.DataFrame(columns=output_cols)

    # =========================================================================
    # Step 2: Add cohort IDs
    # =========================================================================
    print("\nStep 2: Adding cohort IDs...")

    df = df_margin_boundaries.merge(REGION_COHORT_DF, on='region', how='left')
    print(f"    Records with cohorts: {len(df)}")

    # The left join keeps rows whose region has no cohort mapping; surface
    # them instead of letting NaN cohort IDs pass through unnoticed.
    if 'cohort_id' in df.columns:
        unmatched = int(df['cohort_id'].isna().sum())
        if unmatched:
            print(f"    ⚠️ {unmatched} records have no cohort mapping (cohort_id is NaN)")

    # =========================================================================
    # Step 3: Calculate margin tiers
    # =========================================================================
    print("\nStep 3: Calculating margin tiers...")

    # Effective minimum margin: the lower of min_boundary and optimal_bm
    # (row-wise; pandas' min skips NaN by default).
    df['effective_min_margin'] = df[['min_boundary', 'optimal_bm']].min(axis=1)

    # Step size: a quarter of the span, so tiers 1..5 are equally spaced and
    # tier 5 lands exactly on max_boundary.
    df['margin_step'] = (df['max_boundary'] - df['effective_min_margin']) / 4

    # Below minimum (1 step below)
    df['margin_tier_below'] = df['effective_min_margin'] - df['margin_step']

    # 5 tiers in range (equally spaced)
    df['margin_tier_1'] = df['effective_min_margin']  # Min
    df['margin_tier_2'] = df['effective_min_margin'] + df['margin_step']
    df['margin_tier_3'] = df['effective_min_margin'] + 2 * df['margin_step']
    df['margin_tier_4'] = df['effective_min_margin'] + 3 * df['margin_step']
    df['margin_tier_5'] = df['max_boundary']  # Max

    # Above maximum (2 steps above)
    df['margin_tier_above_1'] = df['max_boundary'] + df['margin_step']
    df['margin_tier_above_2'] = df['max_boundary'] + 2 * df['margin_step']

    # =========================================================================
    # Step 4: Select output columns
    # =========================================================================
    # Only keep the documented columns that are actually present (e.g. the
    # query may not provide every optional column).
    df = df[[c for c in output_cols if c in df.columns]]

    # =========================================================================
    # Summary
    # =========================================================================
    print("\n" + "="*70)
    print("MARGIN TIERS COMPLETE")
    print("="*70)
    print(f"Total records: {len(df)}")
    print("\nMargin Tier Structure:")
    print("  margin_tier_below:   effective_min - step (1 below)")
    print("  margin_tier_1:       effective_min_margin")
    print("  margin_tier_2:       effective_min + 1*step")
    print("  margin_tier_3:       effective_min + 2*step")
    print("  margin_tier_4:       effective_min + 3*step")
    print("  margin_tier_5:       max_boundary")
    print("  margin_tier_above_1: max_boundary + 1*step")
    print("  margin_tier_above_2: max_boundary + 2*step")

    return df


# Load-time confirmation: printed when this cell is executed, i.e. once the
# get_margin_tiers() definition above has been evaluated.
print("get_margin_tiers() function defined ✓")


In [None]:
# =============================================================================
# MODULE READY
# =============================================================================

# Announce that the module has finished loading and list its entry points.
# The banner is assembled as a list of lines and emitted with one print call.
print("\n".join([
    "\n" + "="*70,
    "MARKET DATA MODULE READY",
    "="*70,
    "\nAvailable functions (NO INPUT REQUIRED):",
    "  - get_market_data()   : Fetch and process all market prices",
    "  - get_margin_tiers()  : Fetch and calculate margin tiers",
    "\nUsage:",
    "  %run market_data_module.ipynb",
    "  df_market = get_market_data()",
    "  df_tiers = get_margin_tiers()",
    "="*70,
]))


In [None]:
# This cell intentionally left empty - old combined function removed
# Use get_market_data() and get_margin_tiers() instead
