# Segment-Level Validation with 2% Threshold

This notebook performs **segment-level validation** with a **2% tolerance threshold**.

**Key Feature:** Differences under 2% are considered as **MATCHED** ✓

**Validation Segments:**
1. Overall Totals
2. By Date
3. By Campaign
4. By Gender
5. By Age Group
6. By Campaign + Date

## Configuration: Set Threshold

In [5]:
# CONFIGURATION: Set your threshold here
THRESHOLD_PERCENT = 3.0  # Accept differences up to 2%

print("="*80)
print("VALIDATION CONFIGURATION")
print("="*80)
print(f"\nThreshold: {THRESHOLD_PERCENT}%")
print(f"Differences under {THRESHOLD_PERCENT}% will be marked as MATCHED")
print("\nYou can change THRESHOLD_PERCENT above to adjust tolerance")

VALIDATION CONFIGURATION

Threshold: 3.0%
Differences under 3.0% will be marked as MATCHED

You can change THRESHOLD_PERCENT above to adjust tolerance


## Step 1: Import Libraries

In [6]:
# Install openpyxl if needed
import sys
!{sys.executable} -m pip install openpyxl -q

import pandas as pd
import numpy as np
from datetime import datetime

# Display settings
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', 100)
pd.set_option('display.float_format', '{:.2f}'.format)

print("✓ Libraries imported successfully")
print(f"Analysis started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

✓ Libraries imported successfully
Analysis started: 2025-12-21 00:31:33


## Step 2: Define Matching Function with Threshold

In [7]:
def check_match_with_threshold(csv_val, fabric_val, threshold_pct=2.0):
    """
    Check if two values match within a percentage threshold.
    
    Args:
        csv_val: Value from CSV
        fabric_val: Value from Fabric
        threshold_pct: Acceptable difference percentage (default 2%)
    
    Returns:
        Boolean: True if difference is within threshold
    """
    # Handle NaN values
    if pd.isna(csv_val) or pd.isna(fabric_val):
        return pd.isna(csv_val) and pd.isna(fabric_val)
    
    # Handle zero values
    if fabric_val == 0:
        return csv_val == 0
    
    # Calculate percentage difference
    pct_diff = abs((csv_val - fabric_val) / fabric_val * 100)
    
    return pct_diff <= threshold_pct

print("✓ Matching function defined")
print(f"  Threshold: {THRESHOLD_PERCENT}%")

✓ Matching function defined
  Threshold: 3.0%


## Step 3: Load and Prepare Data

In [8]:
# Load CSV (skip 2 header rows)
print("Loading CSV...")
csv_df = pd.read_csv("growth/merged_age_gender(growth).csv", skiprows=2)

# Clean and map columns
csv_df['Impr.'] = csv_df['Impr.'].str.replace(',', '').astype(int)
csv_df['Cost'] = pd.to_numeric(csv_df['Cost'], errors='coerce')
csv_df['Clicks'] = pd.to_numeric(csv_df['Clicks'], errors='coerce')

csv_df = csv_df.rename(columns={
    'Campaign': 'campaign_name',
    'Day': 'day',
    'Gender': 'gender',
    'Age': 'age',
    'Cost': 'cost',
    'Impr.': 'impressions',
    'Clicks': 'clicks'
})

print(f"✓ CSV loaded: {len(csv_df):,} rows")

# Load Fabric Excel
print("\nLoading Fabric export...")
fabric_df = pd.read_excel("gold/merged_age_gender(gold)2.xlsx")
fabric_df['day'] = pd.to_datetime(fabric_df['day']).dt.strftime('%Y-%m-%d')

print(f"✓ Fabric loaded: {len(fabric_df):,} rows")

print("\n" + "="*80)
print("DATA SUMMARY")
print("="*80)
print(f"\nCSV Date Range: {csv_df['day'].min()} to {csv_df['day'].max()}")
print(f"Fabric Date Range: {fabric_df['day'].min()} to {fabric_df['day'].max()}")

Loading CSV...
✓ CSV loaded: 1,302 rows

Loading Fabric export...
✓ Fabric loaded: 6,472 rows

DATA SUMMARY

CSV Date Range: 2025-11-01 to 2025-11-30
Fabric Date Range: 2025-11-01 to 2025-11-30


## Step 4: Overall Totals Comparison (with 2% threshold)

In [9]:
print("="*80)
print("OVERALL TOTALS COMPARISON (with 2% threshold)")
print("="*80)

# Calculate totals
csv_totals = csv_df[['cost', 'impressions', 'clicks']].sum()
fabric_totals = fabric_df[['cost', 'impressions', 'clicks']].sum()

# Create comparison dataframe
overall_comparison = pd.DataFrame({
    'Metric': ['Cost (₹)', 'Impressions', 'Clicks'],
    'CSV': [csv_totals['cost'], csv_totals['impressions'], csv_totals['clicks']],
    'Fabric': [fabric_totals['cost'], fabric_totals['impressions'], fabric_totals['clicks']],
})

overall_comparison['Difference'] = overall_comparison['CSV'] - overall_comparison['Fabric']
overall_comparison['Diff %'] = (overall_comparison['Difference'] / overall_comparison['Fabric'] * 100).round(2)

# Apply threshold matching
overall_comparison['Match'] = overall_comparison['Diff %'].abs() <= THRESHOLD_PERCENT
overall_comparison['Status'] = overall_comparison['Match'].apply(lambda x: '✓ PASS' if x else '✗ FAIL')

display(overall_comparison)

# Summary
matches = overall_comparison['Match'].sum()
print(f"\n✓ Matches (within {THRESHOLD_PERCENT}%): {matches}/3 metrics")
if matches == 3:
    print(f"✓✓✓ ALL OVERALL TOTALS MATCH (within {THRESHOLD_PERCENT}% threshold)! ✓✓✓")
else:
    print(f"⚠ {3-matches} metric(s) exceed {THRESHOLD_PERCENT}% threshold")

OVERALL TOTALS COMPARISON (with 2% threshold)


Unnamed: 0,Metric,CSV,Fabric,Difference,Diff %,Match,Status
0,Cost (₹),260312.7,260320.48,-7.78,-0.0,True,✓ PASS
1,Impressions,1338526.0,1335418.0,3108.0,0.23,True,✓ PASS
2,Clicks,52059.0,50129.0,1930.0,3.85,False,✗ FAIL



✓ Matches (within 3.0%): 2/3 metrics
⚠ 1 metric(s) exceed 3.0% threshold


## Step 5: Validation by Date (with 2% threshold)

In [10]:
print("="*80)
print(f"SEGMENT VALIDATION: BY DATE (with {THRESHOLD_PERCENT}% threshold)")
print("="*80)

# Aggregate by date
csv_by_date = csv_df.groupby('day').agg({
    'cost': 'sum',
    'impressions': 'sum',
    'clicks': 'sum'
}).reset_index()
csv_by_date.columns = ['day', 'cost_csv', 'impressions_csv', 'clicks_csv']

fabric_by_date = fabric_df.groupby('day').agg({
    'cost': 'sum',
    'impressions': 'sum',
    'clicks': 'sum'
}).reset_index()
fabric_by_date.columns = ['day', 'cost_fabric', 'impressions_fabric', 'clicks_fabric']

# Merge and compare
date_comparison = pd.merge(csv_by_date, fabric_by_date, on='day', how='outer', indicator=True)

# Calculate differences and percentages
date_comparison['cost_diff'] = date_comparison['cost_csv'] - date_comparison['cost_fabric']
date_comparison['cost_diff_pct'] = (date_comparison['cost_diff'] / date_comparison['cost_fabric'] * 100).round(2)

date_comparison['impr_diff'] = date_comparison['impressions_csv'] - date_comparison['impressions_fabric']
date_comparison['impr_diff_pct'] = (date_comparison['impr_diff'] / date_comparison['impressions_fabric'] * 100).round(2)

date_comparison['clicks_diff'] = date_comparison['clicks_csv'] - date_comparison['clicks_fabric']
date_comparison['clicks_diff_pct'] = (date_comparison['clicks_diff'] / date_comparison['clicks_fabric'] * 100).round(2)

# Apply threshold matching
date_comparison['cost_match'] = date_comparison.apply(
    lambda row: check_match_with_threshold(row['cost_csv'], row['cost_fabric'], THRESHOLD_PERCENT), axis=1
)
date_comparison['impr_match'] = date_comparison.apply(
    lambda row: check_match_with_threshold(row['impressions_csv'], row['impressions_fabric'], THRESHOLD_PERCENT), axis=1
)
date_comparison['clicks_match'] = date_comparison.apply(
    lambda row: check_match_with_threshold(row['clicks_csv'], row['clicks_fabric'], THRESHOLD_PERCENT), axis=1
)

date_comparison['perfect_match'] = date_comparison['cost_match'] & date_comparison['impr_match'] & date_comparison['clicks_match']
date_comparison['status'] = date_comparison['perfect_match'].apply(lambda x: '✓ PASS' if x else '✗ FAIL')

# Display results
display_cols = ['day', 'cost_csv', 'cost_fabric', 'cost_diff_pct', 
                'impressions_csv', 'impressions_fabric', 'impr_diff_pct',
                'clicks_csv', 'clicks_fabric', 'clicks_diff_pct', 'status']

print(f"\nTotal dates compared: {len(date_comparison)}")
print(f"✓ Matches (within {THRESHOLD_PERCENT}%): {date_comparison['perfect_match'].sum()}")
print(f"✗ Exceeds threshold: {(~date_comparison['perfect_match']).sum()}")

print("\nDetailed comparison:")
display(date_comparison[display_cols].sort_values('day'))

# Save mismatches
if (~date_comparison['perfect_match']).sum() > 0:
    mismatches = date_comparison[~date_comparison['perfect_match']]
    mismatches[display_cols].to_csv('segment_validation_by_date_threshold.csv', index=False)
    print(f"\n✓ Date-level mismatches (>{THRESHOLD_PERCENT}%) saved to: segment_validation_by_date_threshold.csv")

SEGMENT VALIDATION: BY DATE (with 3.0% threshold)

Total dates compared: 29
✓ Matches (within 3.0%): 6
✗ Exceeds threshold: 23

Detailed comparison:


Unnamed: 0,day,cost_csv,cost_fabric,cost_diff_pct,impressions_csv,impressions_fabric,impr_diff_pct,clicks_csv,clicks_fabric,clicks_diff_pct,status
0,2025-11-01,5406.62,5406.67,-0.0,89790,89680,0.12,1519,1428,6.37,✗ FAIL
1,2025-11-03,958.94,958.92,0.0,6534,6422,1.74,190,137,38.69,✗ FAIL
2,2025-11-04,5404.33,5404.34,-0.0,57103,56981,0.21,1475,1392,5.96,✗ FAIL
3,2025-11-05,576.05,576.02,0.01,7172,7064,1.53,144,102,41.18,✗ FAIL
4,2025-11-06,2212.8,2212.79,0.0,22750,22634,0.51,564,501,12.57,✗ FAIL
5,2025-11-07,5691.94,5691.93,0.0,49953,49838,0.23,1396,1317,6.0,✗ FAIL
6,2025-11-08,10985.24,10985.25,-0.0,84543,84437,0.13,1955,1872,4.43,✗ FAIL
7,2025-11-09,16887.9,16887.89,0.0,113292,113181,0.1,2092,2017,3.72,✗ FAIL
8,2025-11-10,16130.8,16130.81,-0.0,90208,90093,0.13,1838,1742,5.51,✗ FAIL
9,2025-11-11,11965.48,11965.46,0.0,87841,87738,0.12,1773,1700,4.29,✗ FAIL



✓ Date-level mismatches (>3.0%) saved to: segment_validation_by_date_threshold.csv


## Step 6: Validation by Campaign (with 2% threshold)

In [11]:
print("="*80)
print(f"SEGMENT VALIDATION: BY CAMPAIGN (with {THRESHOLD_PERCENT}% threshold)")
print("="*80)

# Aggregate by campaign
csv_by_campaign = csv_df.groupby('campaign_name').agg({
    'cost': 'sum',
    'impressions': 'sum',
    'clicks': 'sum'
}).reset_index()
csv_by_campaign.columns = ['campaign_name', 'cost_csv', 'impressions_csv', 'clicks_csv']

fabric_by_campaign = fabric_df.groupby('campaign_name').agg({
    'cost': 'sum',
    'impressions': 'sum',
    'clicks': 'sum'
}).reset_index()
fabric_by_campaign.columns = ['campaign_name', 'cost_fabric', 'impressions_fabric', 'clicks_fabric']

# Merge and compare
campaign_comparison = pd.merge(csv_by_campaign, fabric_by_campaign, on='campaign_name', how='inner', indicator=True)

# Calculate differences and percentages
campaign_comparison['cost_diff_pct'] = ((
    campaign_comparison['cost_csv'] - campaign_comparison['cost_fabric']
) / campaign_comparison['cost_fabric'] * 100).round(2)

campaign_comparison['impr_diff_pct'] = ((
    campaign_comparison['impressions_csv'] - campaign_comparison['impressions_fabric']
) / campaign_comparison['impressions_fabric'] * 100).round(2)

campaign_comparison['clicks_diff_pct'] = ((
    campaign_comparison['clicks_csv'] - campaign_comparison['clicks_fabric']
) / campaign_comparison['clicks_fabric'] * 100).round(2)

# Apply threshold matching
campaign_comparison['perfect_match'] = (
    (campaign_comparison['cost_diff_pct'].abs() <= THRESHOLD_PERCENT) & 
    (campaign_comparison['impr_diff_pct'].abs() <= THRESHOLD_PERCENT) & 
    (campaign_comparison['clicks_diff_pct'].abs() <= THRESHOLD_PERCENT)
)
campaign_comparison['status'] = campaign_comparison['perfect_match'].apply(lambda x: '✓ PASS' if x else '✗ FAIL')

# Display results
display_cols = ['campaign_name', 'cost_csv', 'cost_fabric', 'cost_diff_pct',
                'impressions_csv', 'impressions_fabric', 'impr_diff_pct',
                'clicks_csv', 'clicks_fabric', 'clicks_diff_pct', 'status']

print(f"\nTotal campaigns compared: {len(campaign_comparison)}")
print(f"✓ Matches (within {THRESHOLD_PERCENT}%): {campaign_comparison['perfect_match'].sum()}")
print(f"✗ Exceeds threshold: {(~campaign_comparison['perfect_match']).sum()}")

print("\nDetailed comparison:")
display(campaign_comparison[display_cols].sort_values('campaign_name'))

# Save mismatches
if (~campaign_comparison['perfect_match']).sum() > 0:
    mismatches = campaign_comparison[~campaign_comparison['perfect_match']]
    mismatches[display_cols].to_csv('segment_validation_by_campaign_threshold.csv', index=False)
    print(f"\n✓ Campaign-level mismatches (>{THRESHOLD_PERCENT}%) saved to: segment_validation_by_campaign_threshold.csv")

SEGMENT VALIDATION: BY CAMPAIGN (with 3.0% threshold)

Total campaigns compared: 5
✓ Matches (within 3.0%): 2
✗ Exceeds threshold: 3

Detailed comparison:


Unnamed: 0,campaign_name,cost_csv,cost_fabric,cost_diff_pct,impressions_csv,impressions_fabric,impr_diff_pct,clicks_csv,clicks_fabric,clicks_diff_pct,status
0,Cadiveu_Instamart_External_20th_Nov_2025,5499.5,5499.49,0.0,342,268,27.61,26,18,44.44,✗ FAIL
1,IKONIC-AMZ-Glide-Peach-14-Oct-2025,30429.6,30439.37,-0.03,287833,287868,-0.01,10622,10625,-0.03,✓ PASS
2,ME_Search_|_Oct_25,111296.45,111296.54,-0.0,646629,645180,0.22,13091,12082,8.35,✗ FAIL
3,Nykaa_Black_Friday_Traffic,3499.34,3497.13,0.06,216816,216752,0.03,16089,16010,0.49,✓ PASS
4,PRO_Search_|_Oct_25,109587.81,109587.95,-0.0,186906,185350,0.84,12231,11394,7.35,✗ FAIL



✓ Campaign-level mismatches (>3.0%) saved to: segment_validation_by_campaign_threshold.csv


## Step 6.1: Validation by Gender (with threshold)

In [12]:
print("="*80)
print(f"SEGMENT VALIDATION: BY GENDER (with {THRESHOLD_PERCENT}% threshold)")
print("="*80)

# Aggregate by gender
csv_by_gender = csv_df.groupby('gender').agg({
    'cost': 'sum',
    'impressions': 'sum',
    'clicks': 'sum'
}).reset_index()
csv_by_gender.columns = ['gender', 'cost_csv', 'impressions_csv', 'clicks_csv']

fabric_by_gender = fabric_df.groupby('gender').agg({
    'cost': 'sum',
    'impressions': 'sum',
    'clicks': 'sum'
}).reset_index()
fabric_by_gender.columns = ['gender', 'cost_fabric', 'impressions_fabric', 'clicks_fabric']

# Merge and compare
gender_comparison = pd.merge(csv_by_gender, fabric_by_gender, on='gender', how='outer')

# Calculate differences
gender_comparison['cost_diff_pct'] = ((gender_comparison['cost_csv'] - gender_comparison['cost_fabric']) / gender_comparison['cost_fabric'] * 100).round(2)
gender_comparison['impr_diff_pct'] = ((gender_comparison['impressions_csv'] - gender_comparison['impressions_fabric']) / gender_comparison['impressions_fabric'] * 100).round(2)
gender_comparison['clicks_diff_pct'] = ((gender_comparison['clicks_csv'] - gender_comparison['clicks_fabric']) / gender_comparison['clicks_fabric'] * 100).round(2)

# Check matches
gender_comparison['perfect_match'] = (
    (gender_comparison['cost_diff_pct'].abs() <= THRESHOLD_PERCENT) & 
    (gender_comparison['impr_diff_pct'].abs() <= THRESHOLD_PERCENT) & 
    (gender_comparison['clicks_diff_pct'].abs() <= THRESHOLD_PERCENT)
)
gender_comparison['status'] = gender_comparison['perfect_match'].apply(lambda x: '✓ PASS' if x else '✗ FAIL')

print(f"\nTotal gender segments: {len(gender_comparison)}")
print(f"Perfect matches (within {THRESHOLD_PERCENT}%): {gender_comparison['perfect_match'].sum()}")
display(gender_comparison)

SEGMENT VALIDATION: BY GENDER (with 3.0% threshold)

Total gender segments: 6
Perfect matches (within 3.0%): 0


Unnamed: 0,gender,cost_csv,impressions_csv,clicks_csv,cost_fabric,impressions_fabric,clicks_fabric,cost_diff_pct,impr_diff_pct,clicks_diff_pct,perfect_match,status
0,FEMALE,,,,128266.23,664996.0,25289.0,,,,False,✗ FAIL
1,Female,128257.29,666019.0,25945.0,,,,,,,False,✗ FAIL
2,MALE,,,,68162.33,420655.0,16561.0,,,,False,✗ FAIL
3,Male,68163.55,421691.0,17199.0,,,,,,,False,✗ FAIL
4,UNDETERMINED,,,,63891.92,249767.0,8279.0,,,,False,✗ FAIL
5,Unknown,63891.86,250816.0,8915.0,,,,,,,False,✗ FAIL


## Step 6.2: Validation by Age Group (with threshold)

In [13]:
print("="*80)
print(f"SEGMENT VALIDATION: BY AGE GROUP (with {THRESHOLD_PERCENT}% threshold)")
print("="*80)

# Aggregate by age
csv_by_age = csv_df.groupby('age').agg({
    'cost': 'sum',
    'impressions': 'sum',
    'clicks': 'sum'
}).reset_index()
csv_by_age.columns = ['age', 'cost_csv', 'impressions_csv', 'clicks_csv']

fabric_by_age = fabric_df.groupby('age').agg({
    'cost': 'sum',
    'impressions': 'sum',
    'clicks': 'sum'
}).reset_index()
fabric_by_age.columns = ['age', 'cost_fabric', 'impressions_fabric', 'clicks_fabric']

# Merge and compare
age_comparison = pd.merge(csv_by_age, fabric_by_age, on='age', how='outer')

# Calculate differences
age_comparison['cost_diff_pct'] = ((age_comparison['cost_csv'] - age_comparison['cost_fabric']) / age_comparison['cost_fabric'] * 100).round(2)
age_comparison['impr_diff_pct'] = ((age_comparison['impressions_csv'] - age_comparison['impressions_fabric']) / age_comparison['impressions_fabric'] * 100).round(2)
age_comparison['clicks_diff_pct'] = ((age_comparison['clicks_csv'] - age_comparison['clicks_fabric']) / age_comparison['clicks_fabric'] * 100).round(2)

# Check matches
age_comparison['perfect_match'] = (
    (age_comparison['cost_diff_pct'].abs() <= THRESHOLD_PERCENT) & 
    (age_comparison['impr_diff_pct'].abs() <= THRESHOLD_PERCENT) & 
    (age_comparison['clicks_diff_pct'].abs() <= THRESHOLD_PERCENT)
)
age_comparison['status'] = age_comparison['perfect_match'].apply(lambda x: '✓ PASS' if x else '✗ FAIL')

print(f"\nTotal age segments: {len(age_comparison)}")
print(f"Perfect matches (within {THRESHOLD_PERCENT}%): {age_comparison['perfect_match'].sum()}")
display(age_comparison)

SEGMENT VALIDATION: BY AGE GROUP (with 3.0% threshold)

Total age segments: 13
Perfect matches (within 3.0%): 0


Unnamed: 0,age,cost_csv,impressions_csv,clicks_csv,cost_fabric,impressions_fabric,clicks_fabric,cost_diff_pct,impr_diff_pct,clicks_diff_pct,perfect_match,status
0,18 - 24,45615.77,270832.0,9324.0,,,,,,,False,✗ FAIL
1,18-24,,,,45622.5,270369.0,8997.0,,,,False,✗ FAIL
2,25 - 34,95556.48,479613.0,18357.0,,,,,,,False,✗ FAIL
3,25-34,,,,95559.48,479140.0,17993.0,,,,False,✗ FAIL
4,35 - 44,39165.5,228410.0,8823.0,,,,,,,False,✗ FAIL
5,35-44,,,,39164.34,227943.0,8480.0,,,,False,✗ FAIL
6,45 - 54,11459.89,59095.0,3467.0,,,,,,,False,✗ FAIL
7,45-54,,,,11459.57,58620.0,3216.0,,,,False,✗ FAIL
8,55 - 64,2703.13,28642.0,1812.0,,,,,,,False,✗ FAIL
9,55-64,,,,2702.92,28258.0,1657.0,,,,False,✗ FAIL


## Step 6.3: Validation by Campaign + Date (with threshold)

In [14]:
print("="*80)
print(f"SEGMENT VALIDATION: BY CAMPAIGN + DATE (with {THRESHOLD_PERCENT}% threshold)")
print("="*80)

# Aggregate by campaign and date
csv_by_camp_date = csv_df.groupby(['campaign_name', 'day']).agg({
    'cost': 'sum',
    'impressions': 'sum',
    'clicks': 'sum'
}).reset_index()
csv_by_camp_date.columns = ['campaign_name', 'day', 'cost_csv', 'impressions_csv', 'clicks_csv']

fabric_by_camp_date = fabric_df.groupby(['campaign_name', 'day']).agg({
    'cost': 'sum',
    'impressions': 'sum',
    'clicks': 'sum'
}).reset_index()
fabric_by_camp_date.columns = ['campaign_name', 'day', 'cost_fabric', 'impressions_fabric', 'clicks_fabric']

# Merge and compare
camp_date_comparison = pd.merge(csv_by_camp_date, fabric_by_camp_date, on=['campaign_name', 'day'], how='outer')

# Calculate differences
camp_date_comparison['cost_diff_pct'] = ((camp_date_comparison['cost_csv'] - camp_date_comparison['cost_fabric']) / camp_date_comparison['cost_fabric'] * 100).round(2)
camp_date_comparison['impr_diff_pct'] = ((camp_date_comparison['impressions_csv'] - camp_date_comparison['impressions_fabric']) / camp_date_comparison['impressions_fabric'] * 100).round(2)
camp_date_comparison['clicks_diff_pct'] = ((camp_date_comparison['clicks_csv'] - camp_date_comparison['clicks_fabric']) / camp_date_comparison['clicks_fabric'] * 100).round(2)

# Check matches
camp_date_comparison['perfect_match'] = (
    (camp_date_comparison['cost_diff_pct'].abs() <= THRESHOLD_PERCENT) & 
    (camp_date_comparison['impr_diff_pct'].abs() <= THRESHOLD_PERCENT) & 
    (camp_date_comparison['clicks_diff_pct'].abs() <= THRESHOLD_PERCENT)
)
camp_date_comparison['status'] = camp_date_comparison['perfect_match'].apply(lambda x: '✓ PASS' if x else '✗ FAIL')

print(f"\nTotal camp+date segments: {len(camp_date_comparison)}")
print(f"Perfect matches (within {THRESHOLD_PERCENT}%): {camp_date_comparison['perfect_match'].sum()}")
display(camp_date_comparison.head(10))

SEGMENT VALIDATION: BY CAMPAIGN + DATE (with 3.0% threshold)

Total camp+date segments: 104
Perfect matches (within 3.0%): 35


Unnamed: 0,campaign_name,day,cost_csv,impressions_csv,clicks_csv,cost_fabric,impressions_fabric,clicks_fabric,cost_diff_pct,impr_diff_pct,clicks_diff_pct,perfect_match,status
0,Cadiveu_Instamart_External_20th_Nov_2025,2025-11-20,0.0,1,0,0.0,1,0,,0.0,,False,✗ FAIL
1,Cadiveu_Instamart_External_20th_Nov_2025,2025-11-21,375.98,32,2,375.98,22,2,0.0,45.45,0.0,False,✗ FAIL
2,Cadiveu_Instamart_External_20th_Nov_2025,2025-11-22,954.31,61,2,954.31,55,2,0.0,10.91,0.0,False,✗ FAIL
3,Cadiveu_Instamart_External_20th_Nov_2025,2025-11-23,865.36,22,4,865.36,13,4,0.0,69.23,0.0,False,✗ FAIL
4,Cadiveu_Instamart_External_20th_Nov_2025,2025-11-24,362.94,49,2,362.94,42,0,0.0,16.67,inf,False,✗ FAIL
5,Cadiveu_Instamart_External_20th_Nov_2025,2025-11-25,609.54,56,3,609.54,48,1,0.0,16.67,200.0,False,✗ FAIL
6,Cadiveu_Instamart_External_20th_Nov_2025,2025-11-26,396.18,24,1,396.18,17,1,0.0,41.18,0.0,False,✗ FAIL
7,Cadiveu_Instamart_External_20th_Nov_2025,2025-11-27,339.27,26,3,339.26,15,1,0.0,73.33,200.0,False,✗ FAIL
8,Cadiveu_Instamart_External_20th_Nov_2025,2025-11-28,774.87,21,2,774.87,17,2,0.0,23.53,0.0,False,✗ FAIL
9,Cadiveu_Instamart_External_20th_Nov_2025,2025-11-29,158.54,27,1,158.54,21,1,0.0,28.57,0.0,False,✗ FAIL


## Step 7: Final Summary Report

In [15]:
print("="*80)
print(f"SEGMENT VALIDATION SUMMARY REPORT (with {THRESHOLD_PERCENT}% threshold)")
print("="*80)
print(f"\nAnalysis completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

# Create summary table
summary_data = [
    ['Overall Totals', 3, overall_comparison['Match'].sum(), 3 - overall_comparison['Match'].sum()],
    ['By Date', len(date_comparison), date_comparison['perfect_match'].sum(), (~date_comparison['perfect_match']).sum()],
    ['By Campaign', len(campaign_comparison), campaign_comparison['perfect_match'].sum(), (~campaign_comparison['perfect_match']).sum()],
    ['By Gender', len(gender_comparison), gender_comparison['perfect_match'].sum(), (~gender_comparison['perfect_match']).sum()],
    ['By Age', len(age_comparison), age_comparison['perfect_match'].sum(), (~age_comparison['perfect_match']).sum()],
    ['By Campaign+Date', len(camp_date_comparison), camp_date_comparison['perfect_match'].sum(), (~camp_date_comparison['perfect_match']).sum()]
]

summary_df = pd.DataFrame(summary_data, columns=['Segment Type', 'Total Segments', 'Matches', 'Exceeds Threshold'])
summary_df['Match %'] = (summary_df['Matches'] / summary_df['Total Segments'] * 100).round(2)

print("\n")
display(summary_df)

# Overall assessment
total_segments = summary_df['Total Segments'].sum()
total_matches = summary_df['Matches'].sum()
overall_match_pct = (total_matches / total_segments * 100)

print("\n" + "="*80)
print(f"OVERALL MATCH RATE (within {THRESHOLD_PERCENT}%): {total_matches}/{total_segments} ({overall_match_pct:.1f}%)")
print("="*80)

if overall_match_pct == 100:
    print(f"\n✓✓✓ PERFECT VALIDATION! All segments within {THRESHOLD_PERCENT}% threshold! ✓✓✓")
elif overall_match_pct >= 95:
    print(f"\n✓ EXCELLENT! {overall_match_pct:.1f}% of segments within {THRESHOLD_PERCENT}% threshold")
elif overall_match_pct >= 80:
    print(f"\n⚠ GOOD: {overall_match_pct:.1f}% within threshold. Some segments need review.")
else:
    print(f"\n⚠ ATTENTION: Only {overall_match_pct:.1f}% within {THRESHOLD_PERCENT}% threshold. Review required.")

print("\n" + "-"*80)
print("KEY INSIGHTS:")
print("-"*80)
print(f"• Threshold used: {THRESHOLD_PERCENT}%")
print(f"• Segments passing: {total_matches}/{total_segments}")
print(f"• Segments exceeding threshold: {total_segments - total_matches}")

print("\n" + "="*80)
print("VALIDATION COMPLETE")
print("="*80)


SEGMENT VALIDATION SUMMARY REPORT (with 3.0% threshold)

Analysis completed: 2025-12-21 00:31:35




Unnamed: 0,Segment Type,Total Segments,Matches,Exceeds Threshold,Match %
0,Overall Totals,3,2,1,66.67
1,By Date,29,6,23,20.69
2,By Campaign,5,2,3,40.0
3,By Gender,6,0,6,0.0
4,By Age,13,0,13,0.0
5,By Campaign+Date,104,35,69,33.65



OVERALL MATCH RATE (within 3.0%): 45/160 (28.1%)

⚠ ATTENTION: Only 28.1% within 3.0% threshold. Review required.

--------------------------------------------------------------------------------
KEY INSIGHTS:
--------------------------------------------------------------------------------
• Threshold used: 3.0%
• Segments passing: 45/160
• Segments exceeding threshold: 115

VALIDATION COMPLETE


## Step 8: Interactive HTML Dashboard

In [16]:
import os
import webbrowser
from datetime import datetime

def create_table_html(df, title):
    if df is None or len(df) == 0:
        return f"<div class='no-data'>No data available for {title}</div>"
    
    status_col = 'perfect_match' if 'perfect_match' in df.columns else ('Match' if 'Match' in df.columns else None)
    
    html = f"<h3>{title}</h3>"
    html += "<div class='table-container'><table><thead><tr>"
    for col in df.columns:
        html += f"<th>{col}</th>"
    html += "</tr></thead><tbody>"
    
    for _, row in df.iterrows():
        row_style = ""
        if status_col is not None:
            val = row[status_col]
            if val == False or str(val).lower() == 'false':
                row_style = " class='row-fail'"
            else:
                row_style = " class='row-pass'"
        
        html += f"<tr{row_style}>"
        for col in df.columns:
            val = row[col]
            if isinstance(val, (int, float)) and not isinstance(val, bool):
                html += f"<td>{val:,.2f}</td>"
            else:
                html += f"<td>{val}</td>"
        html += "</tr>"
    html += "</tbody></table></div>"
    return html

total_segments_count = summary_df['Total Segments'].sum()
matches_count = summary_df['Matches'].sum()
overall_match_rate = (matches_count / total_segments_count * 100)

report_html = f"""
<!DOCTYPE html>
<html>
<head>
    <title>Google Age/Gender Validation Report</title>
    <script src=\"https://cdn.jsdelivr.net/npm/chart.js\"></script>
    <style>
        :root {{ --primary: #2563eb; --secondary: #64748b; --bg: #f8fafc; --card: #ffffff; --text: #1e293b; --pass: #f0fdf4; --pass-text: #166534; --fail: #fef2f2; --fail-text: #991b1b; }}
        body {{ font-family: 'Inter', sans-serif; margin: 0; padding: 20px; background-color: var(--bg); color: var(--text); }}
        .header {{ background: linear-gradient(135deg, #1e3a8a 0%, #2563eb 100%); color: white; padding: 40px; border-radius: 16px; margin-bottom: 30px; }}
        .stats-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(280px, 1fr)); gap: 24px; margin-bottom: 30px; }}
        .stat-card {{ background: var(--card); padding: 30px; border-radius: 16px; text-align: center; box-shadow: 0 4px 6px -1px rgba(0,0,0,0.05); border-bottom: 4px solid var(--primary); }}
        .stat-value {{ font-size: 40px; font-weight: 800; color: var(--primary); }}
        .charts-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(400px, 1fr)); gap: 24px; margin-bottom: 30px; }}
        .chart-container {{ background: var(--card); padding: 24px; border-radius: 16px; height: 400px; }}
        .table-section {{ background: var(--card); padding: 32px; border-radius: 16px; margin-bottom: 30px; }}
        .table-container {{ overflow-x: auto; }}
        table {{ width: 100%; border-collapse: collapse; }}\n",
        th {{ background-color: #f1f5f9; padding: 12px; text-align: left; }}\n",
        td {{ padding: 12px; border-bottom: 1px solid #f1f5f9; }}\n",
        .row-pass {{ background-color: var(--pass); color: var(--pass-text); }}\n",
        .row-fail {{ background-color: var(--fail); color: var(--fail-text); }}\n",
    </style>
</head>
<body>
    <div class='header'>
        <h1>Google Age/Gender Validation Dashboard</h1>
        <p>Report generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
    </div>
    <div class='stats-grid'>
        <div class='stat-card'>
            <div class='stat-label'>Overall Match Rate</div>
            <div class='stat-value'>{overall_match_rate:.1f}%</div>
        </div>
        <div class='stat-card'>
            <div class='stat-label'>Threshold Applied</div>
            <div class='stat-value'>{THRESHOLD_PERCENT}%</div>
        </div>
    </div>
    <div class='charts-grid'>
        <div class='chart-container'><canvas id='summaryChart'></canvas></div>
        <div class='chart-container'><canvas id='segmentHealthChart'></canvas></div>
    </div>
    <div class='table-section'>
        {create_table_html(summary_df, '📊 Summary Overview')}
        {create_table_html(overall_comparison, '🔍 Overall Totals')}
        {create_table_html(date_comparison.sort_values('day', ascending=False).head(15), '📅 Date Validation')}
        {create_table_html(campaign_comparison, '🚀 Campaign Validation')}
        {create_table_html(gender_comparison, '🚻 Gender Validation')}
        {create_table_html(age_comparison, '🎂 Age Group Validation')}
    </div>
    <script>
        new Chart(document.getElementById('summaryChart'), {{ 
            type: 'bar', 
            data: {{ labels: {summary_df['Segment Type'].tolist()}, datasets: [{{ label: 'Match %', data: {summary_df['Match %'].tolist()}, backgroundColor: '#2563eb' }}] }}
        }});
        new Chart(document.getElementById('segmentHealthChart'), {{ 
            type: 'radar', 
            data: {{ labels: {summary_df['Segment Type'].tolist()}, datasets: [{{ label: 'Match %', data: {summary_df['Match %'].tolist()}, fill: true, backgroundColor: 'rgba(37, 99, 235, 0.2)', borderColor: '#2563eb' }}] }},
            options: {{ scales: {{ r: {{ min: 0, max: 100 }} }} }}
        }});
    </script>
</body>
</html>
"""

with open('google_age_gender_validation_report.html', 'w', encoding='utf-8') as f: f.write(report_html)
webbrowser.open('file://' + os.path.abspath('google_age_gender_validation_report.html'))
print("INTERACTIVE DASHBOARD GENERATED")


INTERACTIVE DASHBOARD GENERATED
