## Market Prices Extraction Queries
Queries for external market price data:
1. **Ben Soliman Prices** - Competitor reference prices
2. **Marketplace Prices** - Min, Max, Mod prices from marketplace
3. **Scraped Data** - Competitor prices from scraping (identifiers in the code keep the `scrapped` spelling)


In [12]:
# =============================================================================
# 1. BEN SOLIMAN PRICES QUERY
# =============================================================================
# Snowflake SQL returning one row per product_id with an averaged
# `ben_soliman_price`, built from materialized_views.savvy_mapping rows whose
# INJECTION_DATE falls within the last 5 days (Cairo-converted current date).
#
#   * CTE `lower`: rows whose relative gap `diff` between the per-unit
#     Ben Soliman price and wac_p * maxab_basic_unit_count is above 0.3 and whose
#     ratio p1 = round(cu_price / per-unit bs_price) is above 1. cu_price is the
#     cohort 700 price of the product's basic packing unit. The per-unit price is
#     multiplied by `new_d`, and only rows at the latest INJECTION_DATE per
#     product are kept.
#   * CTE `m_bs`: rows with diff < 0.3. It keeps the rows at the latest injection
#     date per (product, packing unit), then the lowest-diff row per packing unit
#     and per product, requires diff_2 within [-0.5, 0.5], and finally requires
#     the averaged price to lie between 0.8x and 1.3x of the current wac_p.
#
# Both CTEs are unioned, reduced again to the latest INJECTION_DATE per product,
# and averaged per product_id.
#
# `{TIMEZONE}` is substituted by the f-string when this cell runs, so TIMEZONE
# must already be defined.
# NOTE(review): `lower` joins packing_unit_products twice (aliases pu and pup)
# with the same predicate - confirm whether both joins are needed.
# NOTE(review): `m_bs` compares max(INJECTION_DATE::date) with the uncast
# INJECTION_DATE; confirm the column carries no time component.
BEN_SOLIMAN_QUERY = f'''
WITH lower as (
    select distinct product_id, sku, new_d*bs_price as ben_soliman_price, INJECTION_DATE
    from (
        select maxab_product_id as product_id, maxab_sku as sku, INJECTION_DATE, wac1, wac_p,
            (bs_price/bs_unit_count) as bs_price, diff, cu_price,
            case when p1 > 1 then child_quantity else 0 end as scheck,
            round(p1/2)*2 as p1, p2,
            case when (ROUND(p1 / scheck) * scheck) = 0 then p1 else (ROUND(p1 / scheck) * scheck) end as new_d
        from (
            select sm.*, wac1, wac_p, 
                abs((bs_price/bs_unit_count)-(wac_p*maxab_basic_unit_count))/(wac_p*maxab_basic_unit_count) as diff,
                cpc.price as cu_price, pup.child_quantity,
                round((cu_price/(bs_price/bs_unit_count))) as p1, 
                round(((bs_price/bs_unit_count)/cu_price)) as p2
            from materialized_views.savvy_mapping sm 
            join finance.all_cogs f on f.product_id = sm.maxab_product_id 
                and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_Date and f.to_date
            join PACKING_UNIT_PRODUCTS pu on pu.product_id = sm.maxab_product_id and pu.IS_BASIC_UNIT = 1 
            join cohort_product_packing_units cpc on cpc.PRODUCT_PACKING_UNIT_ID = pu.id and cohort_id = 700 
            join packing_unit_products pup on pup.product_id = sm.maxab_product_id and pup.is_basic_unit = 1  
            where bs_price is not null 
                and INJECTION_DATE::date >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 5 
                and diff > 0.3 and p1 > 1
        )
    )
    qualify max(INJECTION_DATE) over(partition by product_id) = INJECTION_DATE
),

m_bs as (
    select z.* from (
        select maxab_product_id as product_id, maxab_sku as sku, avg(bs_final_price) as ben_soliman_price, INJECTION_DATE
        from (
            select *, row_number() over(partition by maxab_product_id order by diff) as rnk_2 
            from (
                select *, (bs_final_price-wac_p)/wac_p as diff_2 
                from (
                    select *, bs_price/maxab_basic_unit_count as bs_final_price 
                    from (
                        select *, row_number() over(partition by maxab_product_id, maxab_pu order by diff) as rnk 
                        from (
                            select *, max(INJECTION_DATE::date) over(partition by maxab_product_id, maxab_pu) as max_date
                            from (
                                select sm.*, wac1, wac_p, 
                                    abs(bs_price-(wac_p*maxab_basic_unit_count))/(wac_p*maxab_basic_unit_count) as diff 
                                from materialized_views.savvy_mapping sm 
                                join finance.all_cogs f on f.product_id = sm.maxab_product_id 
                                    and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_Date and f.to_date
                                where bs_price is not null 
                                    and INJECTION_DATE::date >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 5 
                                    and diff < 0.3
                            )
                            qualify max_date = INJECTION_DATE
                        ) qualify rnk = 1 
                    )
                ) where diff_2 between -0.5 and 0.5 
            ) qualify rnk_2 = 1 
        ) group by all
    ) z 
    join finance.all_cogs f on f.product_id = z.product_id 
        and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_Date and f.to_date
    where ben_soliman_price between f.wac_p*0.8 and f.wac_p*1.3
)

select product_id, avg(ben_soliman_price) as ben_soliman_price
from (
    select * from (
        select * from m_bs 
        union all
        select * from lower
    )
    qualify max(INJECTION_DATE) over(partition by product_id) = INJECTION_DATE
)
group by all
'''


In [13]:
# =============================================================================
# 2. MARKETPLACE PRICES QUERY (with region fallback)
# =============================================================================
# Snowflake SQL returning, per (region, product_id), the marketplace min / max /
# mod / true-min / true-max prices expressed per basic unit.
#
#   * CTE `MP`: materialized_views.marketplace_prices divided by BASIC_UNIT_COUNT,
#     limited to rows where least(min_price, mod_price) lies between 0.9x and
#     1.3x of the current wac_p, then reduced with min() per region and product.
#   * CTE `region_mapping`: (region, fallback_region) pairs; a region can have
#     several fallbacks.
#   * CTE `full_data`: every product with activation = 'true' paired with each
#     of the six regions.
#
# The final select takes a region's own MP values when present, otherwise the
# values of its fallback regions (COALESCE), keeps the minimum per column over
# all candidate rows, and drops rows with no min price at all.
#
# `{TIMEZONE}` is substituted by the f-string when this cell runs.
# NOTE(review): `MP` aggregates max_price and true_max with min() - confirm
# that taking the smallest "max" across packing units is intended.
MARKETPLACE_PRICES_QUERY = f'''
WITH MP as (
    select region, product_id,
        min(min_price) as min_price, min(max_price) as max_price,
        min(mod_price) as mod_price, min(true_min) as true_min, min(true_max) as true_max
    from (
        select mp.region, mp.product_id, mp.pu_id,
            min_price/BASIC_UNIT_COUNT as min_price,
            max_price/BASIC_UNIT_COUNT as max_price,
            mod_price/BASIC_UNIT_COUNT as mod_price,
            TRUE_MIN_PRICE/BASIC_UNIT_COUNT as true_min,
            TRUE_MAX_PRICE/BASIC_UNIT_COUNT as true_max
        from materialized_views.marketplace_prices mp 
        join packing_unit_products pup on pup.product_id = mp.product_id and pup.packing_unit_id = mp.pu_id
        join finance.all_cogs f on f.product_id = mp.product_id 
            and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_date and f.to_date
        where least(min_price, mod_price) between wac_p*0.9 and wac_p*1.3 
    )
    group by all 
),

region_mapping AS (
    SELECT * FROM (VALUES
        ('Delta East', 'Delta West'), ('Delta West', 'Delta East'),
        ('Alexandria', 'Cairo'), ('Alexandria', 'Giza'),
        ('Upper Egypt', 'Cairo'), ('Upper Egypt', 'Giza'),
        ('Cairo', 'Giza'), ('Giza', 'Cairo'),
        ('Delta West', 'Cairo'), ('Delta East', 'Cairo'),
        ('Delta West', 'Giza'), ('Delta East', 'Giza')
    ) AS region_mapping(region, fallback_region)
),

all_regions as (
    SELECT * FROM (VALUES
        ('Cairo'), ('Giza'), ('Delta West'), ('Delta East'), ('Upper Egypt'), ('Alexandria')
    ) AS x(region)
),

full_data as (
    select products.id as product_id, ar.region
    from products, all_regions ar
    where activation = 'true'
)

select region, product_id,
    min(final_min_price) as final_min_price, 
    min(final_max_price) as final_max_price,
    min(final_mod_price) as final_mod_price, 
    min(final_true_min) as final_true_min,
    min(final_true_max) as final_true_max
from (
    SELECT distinct w.region, w.product_id,
        COALESCE(m1.min_price, m2.min_price) AS final_min_price,
        COALESCE(m1.max_price, m2.max_price) AS final_max_price,
        COALESCE(m1.mod_price, m2.mod_price) AS final_mod_price,
        COALESCE(m1.true_min, m2.true_min) AS final_true_min,
        COALESCE(m1.true_max, m2.true_max) AS final_true_max
    FROM full_data w
    LEFT JOIN MP m1 ON w.region = m1.region and w.product_id = m1.product_id
    LEFT JOIN region_mapping rm ON w.region = rm.region
    LEFT JOIN MP m2 ON rm.fallback_region = m2.region AND w.product_id = m2.product_id
)
where final_min_price is not null 
group by all
'''


In [14]:
# =============================================================================
# 3. SCRAPPED DATA QUERY (Competitor prices from scraping)
# =============================================================================
# Snowflake SQL returning, per (product_id, region), the minimum, the 25th /
# 50th / 75th percentiles (PERCENTILE_CONT) and the maximum of market_price
# from MATERIALIZED_VIEWS.CLEANED_MARKET_PRICES.
#
# Input rows are limited to the last 7 days (Cairo-converted current date), to
# prices between 0.8x and 1.3x of the current wac_p, and to the most recent
# date per (region, product, competitor).
#
# `{TIMEZONE}` is substituted by the f-string when this cell runs.
SCRAPPED_DATA_QUERY = f'''
select product_id, region,
    MIN(market_price) AS min_scrapped,
    PERCENTILE_CONT(0.25) WITHIN GROUP (ORDER BY market_price) AS scrapped25,
    PERCENTILE_CONT(0.50) WITHIN GROUP (ORDER BY market_price) AS scrapped50,
    PERCENTILE_CONT(0.75) WITHIN GROUP (ORDER BY market_price) AS scrapped75,
    MAX(market_price) AS max_scrapped
from (
    select distinct cmp.*, max(date) over(partition by region, cmp.product_id, competitor) as max_date
    from MATERIALIZED_VIEWS.CLEANED_MARKET_PRICES cmp
    join finance.all_cogs f on f.product_id = cmp.product_id 
        and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) between f.from_date and f.to_date 
    where date >= CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 7 
        and MARKET_PRICE between f.wac_p * 0.8 and wac_p * 1.3
    qualify date = max_date 
)
group by all
'''


In [15]:
## Additional Data Queries (Sales, Groups, WAC)


In [16]:
# =============================================================================
# 4. PRODUCT BASE DATA QUERY (product_id, sku, brand, cat, wac1, wac_p, current_price)
# =============================================================================
# Snowflake SQL returning one row per (region, cohort_id, product_id) with the
# product's display name, brand, category, current WAC values and current price.
#
#   * `local_prices`: average cohort price per packing unit from
#     cohort_product_packing_units where is_customized = TRUE, skipping rows
#     created on 2023-07-31; the region label is derived from cohort_id.
#   * `live_prices`: today's rows of materialized_views.DBDP_PRICES whose
#     time_slot start hour is the current Cairo hour or the hour before it
#     (BETWEEN start AND start + 1).
#   * `prices`: union of both, keeping the live row (priority 1) over the local
#     row (priority 2) per region / cohort / product / packing unit.
#
# Only packing units with basic_unit_count = 1 are kept; product 1309 is
# additionally restricted to packing_unit_id = 2. The outer select requires
# wac1 > 0 and wac_p > 0.
#
# `{TIMEZONE}` is substituted by the f-string when this cell runs.
# NOTE(review): the reason for excluding 2023-07-31 and for the product 1309
# special case is not visible here - confirm with the data owners.
PRODUCT_BASE_QUERY = f'''
WITH skus_prices AS (
    WITH local_prices AS (
        SELECT  
            CASE 
                WHEN cpu.cohort_id IN (700, 695) THEN 'Cairo'
                WHEN cpu.cohort_id IN (701) THEN 'Giza'
                WHEN cpu.cohort_id IN (704, 698) THEN 'Delta East'
                WHEN cpu.cohort_id IN (703, 697) THEN 'Delta West'
                WHEN cpu.cohort_id IN (696, 1123, 1124, 1125, 1126) THEN 'Upper Egypt'
                WHEN cpu.cohort_id IN (702, 699) THEN 'Alexandria'
            END AS region,
            cohort_id,
            pu.product_id,
            pu.packing_unit_id,
            pu.basic_unit_count,
            AVG(cpu.price) AS price
        FROM cohort_product_packing_units cpu
        JOIN PACKING_UNIT_PRODUCTS pu ON pu.id = cpu.product_packing_unit_id
        WHERE cpu.cohort_id IN (700,701,702,703,704,695,696,697,698,699,1123,1124,1125,1126)
            AND cpu.created_at::date <> '2023-07-31'
            AND cpu.is_customized = TRUE
        GROUP BY ALL
    ),
    
    live_prices AS (
        SELECT 
            region, cohort_id, product_id, 
            pu_id AS packing_unit_id, 
            buc AS basic_unit_count, 
            NEW_PRICE AS price
        FROM materialized_views.DBDP_PRICES
        WHERE created_at = CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date
            AND DATE_PART('hour', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::time) 
                BETWEEN SPLIT_PART(time_slot, '-', 1)::int AND (SPLIT_PART(time_slot, '-', 1)::int) + 1
            AND cohort_id IN (700,701,702,703,704,695,696,697,698,699,1123,1124,1125,1126)
    ),
    
    prices AS (
        SELECT *
        FROM (
            SELECT *, 1 AS priority FROM live_prices
            UNION ALL
            SELECT *, 2 AS priority FROM local_prices
        )
        QUALIFY ROW_NUMBER() OVER (PARTITION BY region, cohort_id, product_id, packing_unit_id ORDER BY priority) = 1
    )
    
    SELECT region, cohort_id, product_id, price
    FROM prices
    WHERE basic_unit_count = 1
        AND ((product_id = 1309 AND packing_unit_id = 2) OR (product_id <> 1309))
)

SELECT distinct
    region, cohort_id, p.product_id,
    CONCAT(products.name_ar, ' ', products.size, ' ', product_units.name_ar) AS sku,
    b.name_ar AS brand,
    cat.name_ar AS cat,
    wac1, wac_p, p.price as current_price
FROM skus_prices p
JOIN finance.all_cogs c ON c.product_id = p.product_id 
    AND CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP()) BETWEEN c.from_date AND c.to_date
JOIN products ON products.id = p.product_id
JOIN categories cat ON cat.id = products.category_id
JOIN brands b ON b.id = products.brand_id
JOIN product_units ON product_units.id = products.unit_id
WHERE wac1 > 0 AND wac_p > 0
GROUP BY ALL
'''

# =============================================================================
# 5. SALES DATA QUERY (120-day NMV by cohort/product)
# =============================================================================
# Snowflake SQL summing pso.total_price as `nmv` per (cohort_id, product_id),
# alongside the product's display name, brand and category.
#
# Orders are taken from 120 days before the Cairo-converted current date up to
# and including yesterday, excluding sales_order_status_id 7 and 12, limited to
# the 'telesales' and 'retailer' channels, to lines with a non-zero
# purchased_item_count, and to the nine cohort ids listed in the filter.
#
# `{TIMEZONE}` is substituted by the f-string when this cell runs.
# NOTE(review): the cohort list here is shorter than the one used by
# PRODUCT_BASE_QUERY (695-699 are absent) - confirm this is intended.
SALES_QUERY = f'''
SELECT DISTINCT cpc.cohort_id, pso.product_id,
    CONCAT(products.name_ar,' ',products.size,' ',product_units.name_ar) as sku,
    brands.name_ar as brand, categories.name_ar as cat,
    sum(pso.total_price) as nmv
FROM product_sales_order pso
JOIN sales_orders so ON so.id = pso.sales_order_id
JOIN COHORT_PRICING_CHANGES cpc ON cpc.id = pso.COHORT_PRICING_CHANGE_id
JOIN products ON products.id = pso.product_id
JOIN brands ON products.brand_id = brands.id 
JOIN categories ON products.category_id = categories.id
JOIN product_units ON product_units.id = products.unit_id 
WHERE so.created_at::date BETWEEN CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 120 
    AND CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 1 
    AND so.sales_order_status_id NOT IN (7, 12)
    AND so.channel IN ('telesales', 'retailer')
    AND pso.purchased_item_count <> 0
    AND cpc.cohort_id IN (700,701,702,703,704,1123,1124,1125,1126)
GROUP BY ALL
'''

# =============================================================================
# 6. MARGIN STATS QUERY (STD and average margins)  
# =============================================================================
# Snowflake SQL returning, per (product_id, cohort_id):
#   * `std`: 0.6 * stddev of the daily product margin
#            + 0.3 * stddev of the daily brand margin
#            + 0.1 * stddev of the daily category margin
#   * `avg_margin`: average of the daily product margin
#
# Daily margins are (nmv - cogs) / nmv, computed from orders placed between the
# first day of the month containing (today - 120 days) and today (Cairo date),
# excluding statuses 7 and 12 and limited to the telesales / retailer channels.
#
# Each denominator is wrapped in nullif(..., 0): a day whose summed nmv is zero
# would otherwise raise a division-by-zero error and abort the whole query.
# Such days now yield a NULL margin, which stddev() and avg() ignore.
#
# `{TIMEZONE}` is substituted by the f-string when this cell runs.
# NOTE(review): the brand and category windows partition by order_date / cat /
# brand only, so they pool all cohorts - confirm this is intended.
MARGIN_STATS_QUERY = f'''
select product_id, cohort_id, 
    (0.6*product_std) + (0.3*brand_std) + (0.1*cat_std) as std, 
    avg_margin
from (
    select product_id, cohort_id, 
        stddev(product_margin) as product_std, 
        stddev(brand_margin) as brand_std,
        stddev(cat_margin) as cat_std, 
        avg(product_margin) as avg_margin
    from (
        select distinct product_id, order_date, cohort_id,
            (nmv-cogs_p)/nullif(nmv, 0) as product_margin, 
            (brand_nmv-brand_cogs)/nullif(brand_nmv, 0) as brand_margin,
            (cat_nmv-cat_cogs)/nullif(cat_nmv, 0) as cat_margin
        from (
            SELECT DISTINCT so.created_at::date as order_date, cpc.cohort_id, pso.product_id,
                brands.name_ar as brand, categories.name_ar as cat,
                sum(COALESCE(f.wac_p,0) * pso.purchased_item_count * pso.basic_unit_count) as cogs_p,
                sum(pso.total_price) as nmv,
                sum(nmv) over(partition by order_date, cat, brand) as brand_nmv,
                sum(cogs_p) over(partition by order_date, cat, brand) as brand_cogs,
                sum(nmv) over(partition by order_date, cat) as cat_nmv,
                sum(cogs_p) over(partition by order_date, cat) as cat_cogs
            FROM product_sales_order pso
            JOIN sales_orders so ON so.id = pso.sales_order_id   
            JOIN COHORT_PRICING_CHANGES cpc on cpc.id = pso.cohort_pricing_change_id
            JOIN products on products.id = pso.product_id
            JOIN brands on products.brand_id = brands.id 
            JOIN categories ON products.category_id = categories.id
            JOIN finance.all_cogs f ON f.product_id = pso.product_id
                AND f.from_date::date <= so.created_at::date AND f.to_date::date > so.created_at::date
            WHERE so.created_at::date between 
                date_trunc('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - 120) 
                and CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date
                AND so.sales_order_status_id not in (7,12)
                AND so.channel IN ('telesales','retailer')
                AND pso.purchased_item_count <> 0
            GROUP BY ALL
        )
    ) group by all 
)
'''

# =============================================================================
# 7. TARGET MARGINS QUERY
# =============================================================================
# Snowflake SQL returning, per (cat, brand), the brand target margin
# `target_bm` and the category-level `cat_target_margin`, both read from
# performance.commercial_targets.
#
# Target month selection (used by both CTEs): the current Cairo month when the
# table's latest DATE falls in that month, otherwise the previous month.
#
#   * `cat_brand_target`: distinct (cat, brand, margin) rows of the target month.
#   * `cat_target`: per category, the brand target margins weighted by each
#     brand's share of the category's target NMV.
#
# `{TIMEZONE}` is substituted by the f-string when this cell runs.
# NOTE(review): `cat_brand_target` can return several rows for one (cat, brand)
# if the margin differs between source rows - confirm downstream merges expect
# that.
TARGET_MARGINS_QUERY = f'''
WITH cat_brand_target as (
    SELECT DISTINCT cat, brand, margin as target_bm
    FROM performance.commercial_targets cplan
    QUALIFY CASE 
        WHEN DATE_TRUNC('month', MAX(DATE) OVER()) = DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date) 
        THEN DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date)
        ELSE DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - INTERVAL '1 month') 
    END = DATE_TRUNC('month', date)
),
cat_target as (
    select cat, sum(target_bm * (target_nmv/cat_total)) as cat_target_margin
    from (
        select *, sum(target_nmv) over(partition by cat) as cat_total
        from (
            select cat, brand, avg(target_bm) as target_bm, sum(target_nmv) as target_nmv
            from (
                SELECT DISTINCT date, city as region, cat, brand, margin as target_bm, nmv as target_nmv
                FROM performance.commercial_targets cplan
                QUALIFY CASE 
                    WHEN DATE_TRUNC('month', MAX(DATE) OVER()) = DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date) 
                    THEN DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date)
                    ELSE DATE_TRUNC('month', CONVERT_TIMEZONE('{TIMEZONE}', 'Africa/Cairo', CURRENT_TIMESTAMP())::date - INTERVAL '1 month') 
                END = DATE_TRUNC('month', date)
            ) group by all
        )
    ) group by all 
)
SELECT DISTINCT cbt.cat, cbt.brand, cbt.target_bm, ct.cat_target_margin
FROM cat_brand_target cbt
LEFT JOIN cat_target ct ON ct.cat = cbt.cat
'''


In [17]:
## Execute All Queries


In [18]:
# =============================================================================
# Run every extraction query and keep each result as a DataFrame
# =============================================================================
# Each Snowflake step follows the same pattern: announce the step, run the
# query, pass the result through convert_to_numeric, report the row count.
print("Loading data from Snowflake...")

# 1. Ben Soliman Prices
print("  1. Loading Ben Soliman prices...")
df_ben_soliman = convert_to_numeric(query_snowflake(BEN_SOLIMAN_QUERY))
print(f"     Loaded {len(df_ben_soliman)} Ben Soliman price records")

# 2. Marketplace Prices
print("  2. Loading marketplace prices...")
df_marketplace = convert_to_numeric(query_snowflake(MARKETPLACE_PRICES_QUERY))
print(f"     Loaded {len(df_marketplace)} marketplace price records")

# 3. Scrapped Data
print("  3. Loading scrapped data...")
df_scrapped = convert_to_numeric(query_snowflake(SCRAPPED_DATA_QUERY))
print(f"     Loaded {len(df_scrapped)} scrapped price records")

# 4. Product Base Data (product_id, sku, brand, cat, wac1, wac_p, current_price)
print("  4. Loading product base data...")
df_product_base = convert_to_numeric(query_snowflake(PRODUCT_BASE_QUERY))
print(f"     Loaded {len(df_product_base)} product base records")

# 5. Sales Data
print("  5. Loading sales data...")
df_sales = convert_to_numeric(query_snowflake(SALES_QUERY))
print(f"     Loaded {len(df_sales)} sales records")

# 6. Margin Stats
print("  6. Loading margin stats...")
df_margin_stats = convert_to_numeric(query_snowflake(MARGIN_STATS_QUERY))
print(f"     Loaded {len(df_margin_stats)} margin stat records")

# 7. Target Margins
print("  7. Loading target margins...")
df_targets = convert_to_numeric(query_snowflake(TARGET_MARGINS_QUERY))
print(f"     Loaded {len(df_targets)} target margin records")

# 8. Product Groups - the only source read through the PostgreSQL helper;
#    its column labels are lower-cased before the numeric conversion.
print("  8. Loading product groups...")
df_groups = setup_environment_2.dwh_pg_query(
    "SELECT * FROM materialized_views.sku_commercial_groups",
    columns=['product_id', 'group'],
)
df_groups.columns = df_groups.columns.str.lower()
df_groups = convert_to_numeric(df_groups)
print(f"     Loaded {len(df_groups)} group records")

# Closing banner listing the columns of df_product_base.
print("\nAll queries completed!")
print(f"\n{'='*60}")
print("df_product_base DataFrame available with columns:")
print("  - region, cohort_id, product_id, sku, brand, cat, wac1, wac_p, current_price")
print(f"{'='*60}")


Loading data from Snowflake...
  1. Loading Ben Soliman prices...


  df[col] = pd.to_numeric(df[col], errors='ignore')


     Loaded 1600 Ben Soliman price records
  2. Loading marketplace prices...


  df[col] = pd.to_numeric(df[col], errors='ignore')


     Loaded 11404 marketplace price records
  3. Loading scrapped data...


  df[col] = pd.to_numeric(df[col], errors='ignore')


     Loaded 5189 scrapped price records
  4. Loading product base data...


  df[col] = pd.to_numeric(df[col], errors='ignore')


     Loaded 101742 product base records
  5. Loading sales data...


  df[col] = pd.to_numeric(df[col], errors='ignore')


     Loaded 20617 sales records
  6. Loading margin stats...


  df[col] = pd.to_numeric(df[col], errors='ignore')


     Loaded 28648 margin stat records
  7. Loading target margins...
     Loaded 478 target margin records
  8. Loading product groups...
     Loaded 1576 group records

All queries completed!

df_product_base DataFrame available with columns:
  - region, cohort_id, product_id, sku, brand, cat, wac1, wac_p, current_price


  df[col] = pd.to_numeric(df[col], errors='ignore')
  df[col] = pd.to_numeric(df[col], errors='ignore')


In [48]:
# =============================================================================
# PART A: Assemble the market_data frame from the three market price sources
# =============================================================================
print("Building market_data DataFrame (market prices only)...")

# Lookup from region name to the cohort ids it contains
# ('Upper Egypt' spans four cohorts).
REGION_COHORT_DF = pd.DataFrame(
    [
        ('Cairo', 700),
        ('Giza', 701),
        ('Delta West', 703),
        ('Delta East', 704),
        ('Upper Egypt', 1124),
        ('Upper Egypt', 1126),
        ('Upper Egypt', 1123),
        ('Upper Egypt', 1125),
        ('Alexandria', 702),
    ],
    columns=['region', 'cohort_id'],
)

# -----------------------------------------------------------------------------
# Step 1: full outer join of marketplace, scrapped and Ben Soliman prices
# -----------------------------------------------------------------------------
print("  Step 1: Joining all market price sources (outer join)...")

# Ben Soliman prices are keyed by product only, so replicate them per region
# before joining on (region, product_id).
all_regions = pd.DataFrame({'region': ['Cairo', 'Giza', 'Delta West', 'Delta East', 'Upper Egypt', 'Alexandria']})
df_ben_soliman_expanded = df_ben_soliman.merge(all_regions, how='cross')

market_data = (
    df_marketplace
    .merge(df_scrapped, on=['region', 'product_id'], how='outer')
    .merge(df_ben_soliman_expanded, on=['region', 'product_id'], how='outer')
)

print(f"     Market prices base: {len(market_data)} records")

# -----------------------------------------------------------------------------
# Step 2: attach cohort ids and the inputs needed by the later processing
# -----------------------------------------------------------------------------
print("  Step 2: Adding cohort IDs and supporting data for processing...")

# Inner join on region, then left-join sales NMV (weights for the group-level
# weighted median); products without sales get an NMV of 0.
market_data = market_data.merge(REGION_COHORT_DF, on='region').merge(
    df_sales[['cohort_id', 'product_id', 'nmv']],
    on=['cohort_id', 'product_id'],
    how='left',
)
market_data['nmv'] = market_data['nmv'].fillna(0)

# Margin statistics, WAC / brand / category, and target margins.
market_data = (
    market_data
    .merge(df_margin_stats, on=['cohort_id', 'product_id'], how='left')
    .merge(
        df_product_base[['cohort_id', 'product_id', 'wac_p', 'brand', 'cat']].drop_duplicates(),
        on=['cohort_id', 'product_id'],
        how='left',
    )
    .merge(df_targets, on=['brand', 'cat'], how='left')
)

# Brand target first, category target as fallback, 0 when neither exists.
market_data['target_margin'] = (
    market_data['target_bm']
    .fillna(market_data['cat_target_margin'])
    .fillna(0)
)
market_data = market_data.drop(columns=['target_bm', 'cat_target_margin'], errors='ignore')

# Defaults for products without margin history.
market_data['std'] = market_data['std'].fillna(0.01)
market_data['avg_margin'] = market_data['avg_margin'].fillna(0)

# Commercial group membership, used by the group-level price fill.
market_data = market_data.merge(df_groups, on='product_id', how='left')

# One row per (cohort, product); rows without a WAC cannot be priced.
market_data = market_data.drop_duplicates(subset=['cohort_id', 'product_id'])
market_data = market_data[market_data['wac_p'].notna()]

print(f"\n{'='*60}")
print(f"MARKET DATA BASE READY FOR PROCESSING")
print(f"{'='*60}")
print(f"Total records: {len(market_data)}")
print(f"  - With marketplace prices: {len(market_data[market_data['final_min_price'].notna()])}")
print(f"  - With scrapped prices: {len(market_data[market_data['min_scrapped'].notna()])}")
print(f"  - With Ben Soliman prices: {len(market_data[market_data['ben_soliman_price'].notna()])}")


Building market_data DataFrame (market prices only)...
  Step 1: Joining all market price sources (outer join)...
     Market prices base: 16306 records
  Step 2: Adding cohort IDs and supporting data for processing...

MARKET DATA BASE READY FOR PROCESSING
Total records: 24076
  - With marketplace prices: 16747
  - With scrapped prices: 7694
  - With Ben Soliman prices: 14400


## PART A: Market Data Processing
Process market prices separately (group fill, coverage filter, price analysis, margin tiers)

In [49]:
# =============================================================================
# Group Processing - prepare per-group weights and coverage flag (on market_data)
# =============================================================================

# Every raw market price column that can be filled from the group level.
price_cols = [
    'ben_soliman_price', 'final_min_price', 'final_max_price', 'final_mod_price', 'final_true_min', 'final_true_max',
    'min_scrapped', 'scrapped25', 'scrapped50', 'scrapped75', 'max_scrapped'
]

# Work only on products that belong to a commercial group.
groups_data = market_data.loc[market_data['group'].notna()].copy()

# Each product's share of its group's NMV within the cohort; a share that
# cannot be computed (NaN) is treated as 1.
groups_data['group_nmv'] = groups_data.groupby(['group', 'cohort_id'])['nmv'].transform('sum')
groups_data['cntrb'] = groups_data['nmv'].div(groups_data['group_nmv']).fillna(1)

# 1 when the row carries at least one market price, 0 otherwise.
groups_data['flag_non_nan'] = groups_data[price_cols].notna().any(axis=1).astype(int)

# Weighted Median Function
def weighted_median(series, weights):
    """Return the lower weighted median of ``series``.

    Pairs where either the value or the weight is missing are discarded. The
    remaining values are sorted and the first value whose cumulative weight
    reaches half of the total weight is returned. ``np.nan`` is returned when
    no usable pair is left.
    """
    usable = series.notna() & weights.notna()
    values, loads = series[usable], weights[usable]
    if values.empty:
        return np.nan

    # Sort values ascending and carry the weights along in the same order.
    ranking = np.argsort(values)
    values = values.iloc[ranking]
    loads = loads.iloc[ranking]

    halfway = loads.sum() / 2
    position = np.searchsorted(np.cumsum(loads), halfway)
    return values.iloc[position]

# Perform Weighted Aggregation
# For every (group, cohort) compute the NMV-weighted median of each price
# column, using only rows that carry at least one market price.
# The needed columns are selected before .apply() so the grouping columns are
# not passed to the lambda; this avoids the pandas deprecation warning about
# apply operating on grouping columns and leaves the result unchanged, since
# the lambda reads only price_cols and 'cntrb'.
groups_agg = (
    groups_data[groups_data['flag_non_nan'] == 1]
    .groupby(['group', 'cohort_id'])[price_cols + ['cntrb']]
    .apply(lambda g: pd.Series({
        col: weighted_median(g[col], g['cntrb']) for col in price_cols
    }))
    .reset_index()
)

# Fill missing prices with group-level prices
# The group-level columns arrive with a '_group' suffix and are used only where
# the product's own price is missing.
merged = market_data.merge(groups_agg, on=['group', 'cohort_id'], how='left', suffixes=('', '_group'))
for col in price_cols:
    merged[col] = merged[col].fillna(merged[f'{col}_group'])

# Drop the helper columns once the fill is done.
market_data = merged.drop(columns=[f'{c}_group' for c in price_cols])

print(f"Market data after group processing: {len(market_data)} records")

Market data after group processing: 24076 records


  .apply(lambda g: pd.Series({


## Price Coverage Filtering

In [50]:
# =============================================================================
# Price Coverage Filtering - keep products backed by enough price evidence
# =============================================================================

# Ben Soliman: 1 point when a price is present.
ben_present = market_data['ben_soliman_price'].notna()
market_data['ben'] = 0
market_data.loc[ben_present, 'ben'] = 1

# Marketplace: 1 point when min equals max (single price), 3 points otherwise.
mp_present = market_data['final_min_price'].notna()
mp_single = market_data['final_min_price'] == market_data['final_max_price']
market_data['MP'] = 0
market_data.loc[mp_single & mp_present, 'MP'] = 1
market_data.loc[~mp_single & mp_present, 'MP'] = 3

# Scrapped: 1 point when min equals max (single price), 5 points otherwise.
sp_present = market_data['min_scrapped'].notna()
sp_single = market_data['min_scrapped'] == market_data['max_scrapped']
market_data['sp'] = 0
market_data.loc[sp_single & sp_present, 'sp'] = 1
market_data.loc[~sp_single & sp_present, 'sp'] = 5

# Combined coverage score; rows scoring 2 or less are discarded.
market_data['total_p'] = market_data[['ben', 'MP', 'sp']].sum(axis=1)
market_data = market_data.loc[market_data['total_p'].gt(2)]

print(f"Market data after price coverage filtering: {len(market_data)} records")

Market data after price coverage filtering: 13000 records


## Price Analysis & Margin Calculation


In [51]:
# =============================================================================
# Price Analysis Functions
# =============================================================================

def price_analysis(row):
    """Summarise a product's acceptable market prices as a five-number spread.

    Parameters
    ----------
    row : mapping
        Must provide 'wac_p', 'avg_margin', 'std', 'target_margin' and the
        eleven raw price columns ('ben_soliman_price', 'final_*', the scrapped
        columns). A DataFrame row or a plain dict both work.

    Returns
    -------
    tuple
        (minimum, 25th percentile, median, 75th percentile, maximum) of the
        distinct accepted prices, or five NaNs when no price is accepted.

    A price is accepted when it is non-zero, not missing, at least 90% of WAC,
    and lies between the prices implied by the margin band
    [avg_margin - 10*std, max(avg_margin, target_margin) + 10*std].
    """
    wac = row['wac_p']
    # Fall back to the target margin when the historical margin is below 1%.
    avg_margin = row['avg_margin'] if row['avg_margin'] >= 0.01 else row['target_margin']
    std = np.maximum(row['std'], 0.0025)
    target_margin = row['target_margin']
    max_marg = np.maximum(avg_margin, target_margin)

    # Price implied by a margin m is wac / (1 - m).
    lower_bound = wac / (1 - (avg_margin - (10 * std)))

    # When the upper margin reaches or exceeds 100% the formula stops being
    # meaningful: the denominator turns zero or negative, and a negative bound
    # would reject every price. No finite price reaches such a margin, so the
    # band is treated as open-ended at the top.
    upper_denominator = 1 - (max_marg + 10 * std)
    if upper_denominator <= 0:
        upper_bound = np.inf
    else:
        upper_bound = wac / upper_denominator

    # Collect all price points
    price_list = [
        row['ben_soliman_price'], row['final_min_price'], row['final_mod_price'],
        row['final_max_price'], row['final_true_min'], row['final_true_max'],
        row['min_scrapped'], row['scrapped25'], row['scrapped50'], row['scrapped75'], row['max_scrapped']
    ]

    # Keep distinct prices inside the band (the set removes duplicates).
    valid_prices = sorted({
        x for x in price_list
        if x and not pd.isna(x) and x != 0
        and lower_bound <= x <= upper_bound
        and x >= wac * 0.9
    })

    if not valid_prices:
        return np.nan, np.nan, np.nan, np.nan, np.nan

    return (
        np.min(valid_prices),
        np.percentile(valid_prices, 25),
        np.percentile(valid_prices, 50),
        np.percentile(valid_prices, 75),
        np.max(valid_prices)
    )


def calculate_step_bounds(row):
    """Derive a "below market" and an "above market" price from the price spread.

    Parameters
    ----------
    row : mapping
        Must provide 'wac_p', 'std', 'target_margin' and the five spread
        columns 'minimum', 'percentile_25', 'percentile_50', 'percentile_75',
        'maximum'.

    Returns
    -------
    tuple
        (new_min, new_max): the minimum lowered and the maximum raised by one
        average step. A shifted value that would fall below WAC is replaced by
        the unshifted minimum / maximum.
    """
    wac = row['wac_p']
    std = row['std']
    prices = [row['minimum'], row['percentile_25'], row['percentile_50'], row['percentile_75'], row['maximum']]

    # Keep only the gaps between consecutive spread points that are small
    # relative to WAC (at most 1.2 * std when expressed as a fraction of WAC).
    valid_steps = []
    for i in range(len(prices) - 1):
        step = prices[i + 1] - prices[i]
        if (step / wac) <= std * 1.2:
            valid_steps.append(step)

    if valid_steps:
        avg_step = np.mean(valid_steps)
    else:
        # Fallback when no gap qualifies. std and target_margin are fractions
        # of WAC (see the step / wac comparison above), so the fallback is
        # scaled by wac to be a price amount like the measured steps.
        avg_step = wac * min(2 * std, 0.2 * row['target_margin'])

    new_min = prices[0] - avg_step if (prices[0] - avg_step) >= wac else prices[0]
    new_max = prices[-1] + avg_step if (prices[-1] + avg_step) >= wac else prices[-1]

    return new_min, new_max


In [52]:
# =============================================================================
# Run the price analysis row by row and derive the market bounds (on market_data)
# =============================================================================

# Five-number price spread per product; 'expand' turns each returned tuple into
# one column per element.
price_spread = market_data.apply(price_analysis, axis=1, result_type='expand')
market_data[['minimum', 'percentile_25', 'percentile_50', 'percentile_75', 'maximum']] = price_spread

# Rows for which no price was accepted carry NaN in 'minimum' and are dropped.
market_data = market_data.dropna(subset=['minimum'])

# One step below the minimum and one step above the maximum.
step_bounds = market_data.apply(calculate_step_bounds, axis=1, result_type='expand')
market_data[['below_market', 'above_market']] = step_bounds

print(f"Market data after price analysis: {len(market_data)} records")


Market data after price analysis: 12329 records


In [53]:
# =============================================================================
# Turn the price tiers into margin tiers and trim market_data to its final shape
# =============================================================================

# Margin column -> price column it is derived from. 'below_market' and
# 'above_market' are converted in place; the other tiers get new columns.
# Margin = (price - wac_p) / price.
margin_sources = {
    'below_market': 'below_market',
    'market_min': 'minimum',
    'market_25': 'percentile_25',
    'market_50': 'percentile_50',
    'market_75': 'percentile_75',
    'market_max': 'maximum',
    'above_market': 'above_market',
}
for margin_col, price_col in margin_sources.items():
    market_data[margin_col] = (market_data[price_col] - market_data['wac_p']) / market_data[price_col]

# Columns kept for the later merge, in output order.
market_columns = [
    'cohort_id', 'product_id',
    # Market Prices (raw)
    'ben_soliman_price',
    'final_min_price', 'final_max_price', 'final_mod_price', 'final_true_min', 'final_true_max',
    'min_scrapped', 'scrapped25', 'scrapped50', 'scrapped75', 'max_scrapped',
    # Price Percentiles
    'minimum', 'percentile_25', 'percentile_50', 'percentile_75', 'maximum',
    # Margin Tiers
    'below_market', 'market_min', 'market_25', 'market_50', 'market_75', 'market_max', 'above_market'
]
# Silently skip any listed column that is absent from the frame.
market_data = market_data[[c for c in market_columns if c in market_data.columns]]

print(f"\n{'='*60}")
print(f"MARKET DATA PROCESSING COMPLETE")
print(f"{'='*60}")
print(f"Total processed market records: {len(market_data)}")
print(f"\nMarket data columns:")
print("  - Price columns: ben_soliman_price, final_min_price, final_max_price, etc.")
print("  - Percentiles: minimum, percentile_25, percentile_50, percentile_75, maximum")
print("  - Margin tiers: below_market, market_min, market_25, market_50, market_75, market_max, above_market")
print(f"\nSample processed market data:")
market_data.head()



MARKET DATA PROCESSING COMPLETE
Total processed market records: 12329

Market data columns:
  - Price columns: ben_soliman_price, final_min_price, final_max_price, etc.
  - Percentiles: minimum, percentile_25, percentile_50, percentile_75, maximum
  - Margin tiers: below_market, market_min, market_25, market_50, market_75, market_max, above_market

Sample processed market data:


Unnamed: 0,cohort_id,product_id,ben_soliman_price,final_min_price,final_max_price,final_mod_price,final_true_min,final_true_max,min_scrapped,scrapped25,...,percentile_50,percentile_75,maximum,below_market,market_min,market_25,market_50,market_75,market_max,above_market
0,702,3.0,258.5,255.0,279.0,255.0,255.0,300.0,254.630005,254.957504,...,255.448753,256.580002,279.0,0.050898,0.05332,0.054655,0.056355,0.060515,0.136011,0.138019
1,702,9.0,,835.4,838.6,835.0,835.0,839.0,,,...,837.0,838.7,839.0,0.018123,0.019299,0.019651,0.021643,0.023626,0.023975,0.025137
2,702,10.0,,272.0,288.0,270.0,270.0,290.0,,,...,280.0,288.5,290.0,0.025238,0.030653,0.036009,0.065273,0.092812,0.097505,0.102149
4,702,14.0,465.0,461.5,477.0,477.0,460.0,477.0,,,...,463.25,468.0,477.0,0.02265,0.028316,0.030687,0.035133,0.044926,0.062946,0.068156
5,702,17.0,604.0,596.5,603.5,600.0,595.0,605.0,598.5,598.5,...,600.0,603.75,605.0,0.018424,0.022548,0.026638,0.030693,0.036714,0.038704,0.04266
