In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from scipy import stats
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import json

In [2]:
# ============================================================================
# STEP 0: Load the datasets
# ============================================================================
# Reads the two normalized CSV exports into DataFrames that every later cell
# builds on, then prints their shapes and column names as a sanity check.
# NOTE(review): both paths are absolute, machine-specific Windows paths, so the
# notebook only runs where this exact directory layout exists — consider a
# configurable data directory.
print("Loading datasets...")
enrollment_df = pd.read_csv(r'D:\Project\Hackathons\Aadhar_Hackathon\Normalized_Datasets\normalized_enrollment.csv')
demographic_df = pd.read_csv(r'D:\Project\Hackathons\Aadhar_Hackathon\Normalized_Datasets\normalized_demographic.csv')

# Quick structural summary of what was loaded.
print(f"Enrollment data shape: {enrollment_df.shape}")
print(f"Demographic data shape: {demographic_df.shape}")
print("\nEnrollment columns:", enrollment_df.columns.tolist())
print("Demographic columns:", demographic_df.columns.tolist())

Loading datasets...
Enrollment data shape: (1006007, 7)
Demographic data shape: (2045700, 6)

Enrollment columns: ['date', 'state', 'district', 'pincode', 'en_age_0_5', 'en_age_5_17', 'en_age_18_greater']
Demographic columns: ['date', 'state', 'district', 'pincode', 'demo_age_5_17', 'demo_age_17_']


In [3]:
# ============================================================================
# STEP 1 & 2: Aggregate data per pincode
# ============================================================================
print("\n" + "="*80)
print("STEP 1-2: Aggregating data by pincode...")
print("="*80)

# Child enrollments (age 0-5) summed per (state, district, pincode).
# Note: the grouping key is the full triple, so a pincode that appears under
# two districts is counted as two rows.
enrollment_agg = (
    enrollment_df
    .groupby(['state', 'district', 'pincode'], as_index=False)['en_age_0_5']
    .sum()
    .rename(columns={'en_age_0_5': 'Total_Child_Enrollments'})
)

print(f"\nEnrollment aggregated: {enrollment_agg.shape[0]} unique pincodes")
print(f"Total child enrollments (0-5 age): {enrollment_agg['Total_Child_Enrollments'].sum():,.0f}")

print("\n" + "="*80)

# Demographic updates for both available age bands, summed over the same key.
demographic_agg = (
    demographic_df
    .groupby(['state', 'district', 'pincode'], as_index=False)[['demo_age_5_17', 'demo_age_17_']]
    .sum()
)

# Combined update volume across the two age bands.
demographic_agg['Total_Demo_Updates'] = demographic_agg['demo_age_5_17'].add(
    demographic_agg['demo_age_17_']
)

print(f"Demographic aggregated: {demographic_agg.shape[0]} unique pincodes")
print(f"Total demographic updates: {demographic_agg['Total_Demo_Updates'].sum():,.0f}")


STEP 1-2: Aggregating data by pincode...

Enrollment aggregated: 23663 unique pincodes
Total child enrollments (0-5 age): 3,546,965

Demographic aggregated: 25186 unique pincodes
Total demographic updates: 49,012,513


In [4]:
# ============================================================================
# STEP 3: Merge the datasets
# ============================================================================
print("\n" + "="*80)
print("STEP 3: Merging enrollment and demographic data...")
print("="*80)

# Keep only (state, district, pincode) triples present in BOTH aggregates.
# NOTE(review): the inner join silently drops pincodes that have enrollments
# but no demographic rows at all — confirm that is intended, since those are
# the lowest possible update intensity.
merged_df = enrollment_agg.merge(
    demographic_agg.loc[:, ['state', 'district', 'pincode', 'Total_Demo_Updates']],
    how='inner',
    on=['state', 'district', 'pincode'],
)

print(f"\nMerged data shape: {merged_df.shape}")
print(f"Pincodes with both enrollment and demographic data: {merged_df.shape[0]}",'\n')
print(
    merged_df[
        ['state', 'district', 'pincode', 'Total_Child_Enrollments', 'Total_Demo_Updates']
    ].head()
)


STEP 3: Merging enrollment and demographic data...

Merged data shape: (22657, 5)
Pincodes with both enrollment and demographic data: 22657 

                         state       district  pincode  \
0  andaman_and_nicobar_islands        nicobar   744301   
1  andaman_and_nicobar_islands        nicobar   744302   
2  andaman_and_nicobar_islands        nicobar   744303   
3  andaman_and_nicobar_islands        nicobar   744304   
4  andaman_and_nicobar_islands  south_andaman   744101   

   Total_Child_Enrollments  Total_Demo_Updates  
0                       57                 694  
1                        3                  28  
2                        4                  51  
3                        1                  18  
4                       22                 392  


In [5]:
# ============================================================================
# STEP 4: Compute Update Intensity Ratio (UIR)
# ============================================================================
print("\n" + "="*80)
print("STEP 4: Computing Update Intensity Ratio (UIR)...")
print("="*80)

# UIR = demographic updates per child enrollment: how active the Aadhaar
# update ecosystem is in a pincode, relative to how many new children are
# being enrolled there.
# Rows with no child enrollments get a UIR of 0 instead of a division result.
merged_df['UIR'] = (
    merged_df['Total_Demo_Updates']
    .div(merged_df['Total_Child_Enrollments'])
    .where(merged_df['Total_Child_Enrollments'] > 0, 0)
)

print(f"\nUIR Statistics:")
print(merged_df['UIR'].describe())


STEP 4: Computing Update Intensity Ratio (UIR)...

UIR Statistics:
count    22657.000000
mean        16.989317
std         21.330357
min          0.000000
25%          8.346154
50%         12.923077
75%         19.918033
max       1139.500000
Name: UIR, dtype: float64


In [6]:
# ============================================================================
# STEP 5 (FIXED): Percentile-based categorization
# ============================================================================

# Percentile cut-offs of UIR over all merged rows, computed in one call.
p10, p25, p75 = merged_df['UIR'].quantile([0.10, 0.25, 0.75])

print("UIR Thresholds:")
print(f"10th percentile : {p10:.2f}")
print(f"25th percentile : {p25:.2f}")
print(f"75th percentile : {p75:.2f}")

def categorize_uir_percentile(uir):
    """Map a UIR value to an engagement label using the global percentile cut-offs.

    Reads the module-level thresholds ``p10``, ``p25`` and ``p75``. Bounds are
    inclusive on the upper side: a value equal to a threshold falls into the
    lower band. The label strings are matched verbatim by later cells, so they
    must not be altered here.
    """
    if uir <= p10:
        return 'High risk / exclusion-prone üö®'
    if uir <= p25:
        return 'Weak engagement ‚ö†Ô∏è'
    if uir <= p75:
        return 'Normal'
    return 'Strong engagement'

# Label every merged row with its engagement band.
merged_df['Engagement_Level'] = merged_df['UIR'].map(categorize_uir_percentile)


# Absolute and percentage breakdown of the labels.
engagement_dist = merged_df['Engagement_Level'].value_counts()
print("\nEngagement Distribution:")
print(engagement_dist)
print("\nPercentage Distribution:")
print(engagement_dist.div(len(merged_df)).mul(100).round(2))

UIR Thresholds:
10th percentile : 5.23
25th percentile : 8.35
75th percentile : 19.92

Engagement Distribution:
Engagement_Level
Normal                           11327
Strong engagement                 5664
Weak engagement ‚ö†Ô∏è                3400
High risk / exclusion-prone üö®     2266
Name: count, dtype: int64

Percentage Distribution:
Engagement_Level
Normal                           49.99
Strong engagement                25.00
Weak engagement ‚ö†Ô∏è               15.01
High risk / exclusion-prone üö®    10.00
Name: count, dtype: float64


In [7]:
# ============================================================================
# STEP 6: Filter meaningful cases
# ============================================================================
print("\n" + "="*80)
print("STEP 6: Filtering for meaningful analysis...")
print("="*80)

# Minimum child enrollments for a pincode to be analysed
# (tunable: 500, 1000, etc.).
ENROLLMENT_THRESHOLD = 500

# Independent copy so later column additions never touch merged_df.
filtered_df = merged_df.loc[
    merged_df['Total_Child_Enrollments'].ge(ENROLLMENT_THRESHOLD)
].copy()

print(f"\nThreshold: {ENROLLMENT_THRESHOLD} child enrollments")
print(f"Pincodes after filtering: {filtered_df.shape[0]} (from {merged_df.shape[0]})")
print(f"Percentage retained: {(filtered_df.shape[0] / merged_df.shape[0] * 100):.1f}%")

# Labels were assigned on the full merged set; this only re-counts them
# within the retained rows.
filtered_engagement_dist = filtered_df['Engagement_Level'].value_counts()
print("\nFiltered Engagement Distribution:")
print(filtered_engagement_dist)



STEP 6: Filtering for meaningful analysis...

Threshold: 500 child enrollments
Pincodes after filtering: 1277 (from 22657)
Percentage retained: 5.6%

Filtered Engagement Distribution:
Engagement_Level
Normal                           684
Weak engagement ‚ö†Ô∏è               244
High risk / exclusion-prone üö®    204
Strong engagement                145
Name: count, dtype: int64


In [10]:
# ============================================================================
# RESULTS: Identify high-risk areas
# ============================================================================
print("\n" + "="*80)
print("IDENTIFYING HIGH-RISK AREAS")
print("="*80)

# Rows labelled high-risk, ordered from lowest UIR upwards.
high_risk_df = (
    filtered_df
    .loc[filtered_df['Engagement_Level'].eq('High risk / exclusion-prone üö®')]
    .sort_values('UIR')
    .copy()
)

print(f"\nHigh-risk pincodes found: {len(high_risk_df)}")

if not high_risk_df.empty:
    print("\nTop 50 highest-risk pincodes (lowest UIR):")
    print(
        high_risk_df
        .head(50)[['state', 'district', 'pincode',
                   'Total_Child_Enrollments', 'Total_Demo_Updates',
                   'UIR']]
        .to_string(index=False)
    )

# Rows in the next band up (weak engagement), same ordering.
weak_engagement_df = (
    filtered_df
    .loc[filtered_df['Engagement_Level'].eq('Weak engagement ‚ö†Ô∏è')]
    .sort_values('UIR')
    .copy()
)

print(f"\n\nWeak engagement pincodes found: {len(weak_engagement_df)}")


IDENTIFYING HIGH-RISK AREAS

High-risk pincodes found: 204

Top 50 highest-risk pincodes (lowest UIR):
            state              district  pincode  Total_Child_Enrollments  Total_Demo_Updates      UIR
   madhya_pradesh            ashoknagar   473101                      516                  49 0.094961
        karnataka       bengaluru_south   562160                      518                 425 0.820463
        meghalaya south_west_garo_hills   794115                     1256                1457 1.160032
      west_bengal       purba_bardhaman   713129                      574                 685 1.193380
          gujarat           gir_somnath   362720                      873                1167 1.336770
   madhya_pradesh                ujjain   456224                     1062                1552 1.461394
          gujarat       devbhumi_dwarka   361305                      718                1079 1.502786
      west_bengal       purba_bardhaman   713512                      50

In [11]:
# ============================================================================
# SUMMARY STATISTICS BY STATE
# ============================================================================
print("\n" + "="*80)
print("STATE-WISE SUMMARY")
print("="*80)

# Per-state roll-up of the filtered pincodes. Avg_UIR is the plain mean of
# the per-pincode ratios, not total updates divided by total enrollments.
state_summary = (
    filtered_df
    .groupby('state')
    .agg(
        Pincodes=('pincode', 'count'),
        Child_Enrollments=('Total_Child_Enrollments', 'sum'),
        Demo_Updates=('Total_Demo_Updates', 'sum'),
        Avg_UIR=('UIR', 'mean'),
    )
    .round(3)
    .sort_values('Avg_UIR')
)

print("\nStates ranked by average UIR (lowest to highest):")
print(state_summary)

# Number of high-risk pincodes per state, largest first.
high_risk_by_state = (
    high_risk_df
    .groupby('state')
    .size()
    .sort_values(ascending=False)
)
print("\n\nHigh-risk pincodes by state:")
print(high_risk_by_state)


STATE-WISE SUMMARY

States ranked by average UIR (lowest to highest):
                                          Pincodes  Child_Enrollments  \
state                                                                   
meghalaya                                       12              11460   
nagaland                                         1                855   
dadra_and_nagar_haveli_and_daman_and_diu         1               1144   
assam                                           61              56789   
mizoram                                          2               1394   
odisha                                           7               4175   
gujarat                                         89              88797   
madhya_pradesh                                 215             224916   
karnataka                                       41              36246   
punjab                                          14              20451   
tamil_nadu                                       4   

In [12]:
# ============================================================================
# SAVE RESULTS
# ============================================================================
# Writes three CSVs: the threshold-filtered analysis frame, the high-risk
# subset, and the per-state summary. The high-risk file is read back by the
# next cell through `high_risk_path`.
# NOTE(review): output locations are absolute, machine-specific Windows
# paths — consider a configurable output directory.
print("\n" + "="*80)
print("SAVING RESULTS...")
print("="*80)

# Save full analysis
output_path = r'D:\Project\Hackathons\Aadhar_Hackathon\analysis_results.csv'
filtered_df.to_csv(output_path, index=False)
print(f"Full analysis saved to: {output_path}")

# Save high-risk pincodes
high_risk_path = r'D:\Project\Hackathons\Aadhar_Hackathon\high_risk_pincodes.csv'
high_risk_df.to_csv(high_risk_path, index=False)
print(f"High-risk pincodes saved to: {high_risk_path}")

# Save state summary (index is kept here because it holds the state name)
state_summary_path = r'D:\Project\Hackathons\Aadhar_Hackathon\state_summary.csv'
state_summary.to_csv(state_summary_path)
print(f"State summary saved to: {state_summary_path}")


SAVING RESULTS...
Full analysis saved to: D:\Project\Hackathons\Aadhar_Hackathon\analysis_results.csv
High-risk pincodes saved to: D:\Project\Hackathons\Aadhar_Hackathon\high_risk_pincodes.csv
State summary saved to: D:\Project\Hackathons\Aadhar_Hackathon\state_summary.csv


In [13]:
# Reload the saved high-risk pincodes and build a {state: "d1,d2,..."} lookup.
high_risk_data = pd.read_csv(high_risk_path)

# One entry per state; districts are de-duplicated in first-seen order and
# joined with commas.
state_districts_dict = {
    state: ','.join(districts.unique())
    for state, districts in high_risk_data.groupby('state')['district']
}

print(state_districts_dict)

{'assam': 'marigaon,goalpara,kamrup,golaghat,chirang,barpeta,dhemaji,cachar,dhubri,hojai,hailakandi,kokrajhar,lakhimpur,sonitpur', 'bihar': 'sitamarhi,purnia', 'chhattisgarh': 'bijapur,dakshin_bastar_dantewada', 'delhi': 'east', 'gujarat': 'gir_somnath,devbhumi_dwarka,morbi,botad,banas_kantha,amreli,arvalli,patan,valsad,bhavnagar,dahod,chhotaudepur,porbandar,surendranagar,vadodara,rajkot,panch_mahals,kachchh', 'jammu_and_kashmir': 'doda,poonch,kishtwar', 'jharkhand': 'ranchi,garhwa,pakur', 'karnataka': 'bengaluru_south,yadgir,bengaluru_rural,bengaluru_urban,raichur', 'madhya_pradesh': 'ashoknagar,ujjain,narmadapuram,dewas,guna,burhanpur,umaria,chhatarpur,sagar,panna,sehore,chhindwara,katni,vidisha,shajapur,bhind,gwalior,harda,barwani,betul,morena,raisen,jhabua,alirajpur,mandsaur,bhopal,sheopur,sidhi,satna,rewa,singrauli', 'maharashtra': 'nandurbar,dhule', 'meghalaya': 'south_west_garo_hills,west_garo_hills,west_jaintia_hills,east_khasi_hills,west_khasi_hills,south_west_khasi_hills', 'm

<h2>Classify Pincodes</h2>

In [None]:
# ============================================================================
# CLASSIFICATION DICTIONARIES
# ============================================================================

# Districts treated as tribal-dominated, keyed by normalized state name
# (lowercase, underscore-separated). classify_district() sets `is_tribal`
# when a row's normalized (state, district) pair appears here.
# NOTE(review): the list is hand-curated and no source is cited in this
# notebook — confirm against an authoritative reference before relying on it.
TRIBAL_DOMINATED_DISTRICTS = {
    'assam': [
        'chirang', 'kokrajhar', 'dhemaji', 'karbi_anglong', 'dima_hasao',
        'goalpara', 'barpeta', 'sonitpur', 'lakhimpur'
    ],
    'bihar': ['purnia'],
    'chhattisgarh': [
        'bijapur', 'dakshin_bastar_dantewada', 'bastar', 'kondagaon', 
        'sukma', 'narayanpur', 'kanker', 'gariaband', 'korea', 
        'surguja', 'jashpur', 'raigarh'
    ],
    'gujarat': [
        'dahod', 'arvalli', 'panch_mahals', 'chhotaudepur', 'valsad',
        'narmada', 'tapi', 'dang'
    ],
    'jharkhand': [
        'ranchi', 'garhwa', 'pakur', 'gumla', 'simdega', 'khunti',
        'west_singhbhum', 'east_singhbhum', 'saraikela_kharsawan',
        'lohardaga', 'dumka', 'jamtara', 'sahebganj'
    ],
    'madhya_pradesh': [
        'jhabua', 'alirajpur', 'barwani', 'burhanpur', 'dhar',
        'mandla', 'dindori', 'umaria', 'anuppur', 'shahdol',
        'sidhi', 'singrauli', 'betul', 'chhindwara'
    ],
    'maharashtra': [
        'nandurbar', 'dhule', 'palghar', 'nashik', 'thane',
        'gadchiroli', 'chandrapur', 'yavatmal', 'amravati'
    ],
    'meghalaya': [
        'south_west_garo_hills', 'west_garo_hills', 'west_jaintia_hills',
        'east_khasi_hills', 'west_khasi_hills', 'south_west_khasi_hills',
        'east_garo_hills', 'north_garo_hills', 'south_garo_hills',
        'ri_bhoi', 'east_jaintia_hills'
    ],
    'mizoram': [
        'lawngtlai', 'mamit', 'kolasib', 'aizawl', 'champhai',
        'serchhip', 'lunglei', 'saiha'
    ],
    'odisha': [
        'mayurbhanj', 'keonjhar', 'sundargarh', 'koraput', 'malkangiri',
        'rayagada', 'nabarangpur', 'kalahandi', 'kandhamal', 'gajapati'
    ],
    'rajasthan': ['udaipur', 'dungarpur', 'banswara', 'pratapgarh', 'sirohi'],
    'telangana': [
        'adilabad', 'komaram_bheem_asifabad', 'mancherial', 
        'bhadradri_kothagudem', 'khammam', 'mahabubabad'
    ],
    'west_bengal': [
        'jalpaiguri', 'alipurduar', 'darjeeling', 'purulia',
        'bankura', 'jhargram', 'paschim_medinipur'
    ]
}

# Districts treated as forest / hilly terrain, keyed by normalized state name.
# classify_district() sets `is_forest_hilly` for matching (state, district)
# pairs. A district may also appear in the other classification tables; the
# flags are independent of each other.
# NOTE(review): hand-curated list with no cited source — verify before reuse.
FOREST_HILLY_DISTRICTS = {
    'assam': [
        'golaghat', 'kamrup', 'cachar', 'hailakandi', 'marigaon',
        'dhubri', 'hojai'
    ],
    'jammu_and_kashmir': [
        'doda', 'poonch', 'kishtwar', 'ramban', 'rajouri', 'reasi',
        'kathua', 'udhampur', 'samba', 'jammu', 'anantnag', 'kulgam',
        'pulwama', 'shopian', 'budgam', 'baramulla', 'bandipora',
        'ganderbal', 'kupwara', 'srinagar'
    ],
    'himachal_pradesh': [
        'shimla', 'kinnaur', 'kullu', 'mandi', 'chamba', 'kangra',
        'una', 'hamirpur', 'bilaspur', 'solan', 'sirmaur', 'lahaul_spiti'
    ],
    'uttarakhand': [
        'almora', 'bageshwar', 'chamoli', 'champawat', 'dehradun',
        'haridwar', 'nainital', 'pauri_garhwal', 'pithoragarh',
        'rudraprayag', 'tehri_garhwal', 'udham_singh_nagar', 'uttarkashi'
    ],
    'karnataka': [
        'uttara_kannada', 'udupi', 'dakshina_kannada', 'chikmagalur',
        'hassan', 'kodagu', 'shimoga', 'chickballapur'
    ],
    'kerala': [
        'idukki', 'wayanad', 'palakkad', 'malappuram', 'kozhikode',
        'kannur', 'kasaragod', 'pathanamthitta', 'kottayam'
    ],
    'madhya_pradesh': ['panna', 'katni', 'sehore'],
    'odisha': ['jagatsinghapur', 'angul', 'dhenkanal', 'sambalpur']
}

# Districts treated as low-literacy, keyed by normalized state name.
# classify_district() sets `is_low_literacy` for matching pairs.
# NOTE(review): hand-curated list with no cited source. The Bihar entry lists
# both 'purnia' and 'purnea' — presumably alternate spellings of the same
# district kept so either form matches; confirm against the dataset.
LOW_LITERACY_DISTRICTS = {
    'bihar': [
        'sitamarhi', 'purnia', 'saharsa', 'madhepura', 'araria',
        'kishanganj', 'katihar', 'purnea', 'supaul', 'sheohar'
    ],
    'uttar_pradesh': [
        'bahraich', 'bareilly', 'sitapur', 'etah', 'siddharthnagar',
        'shravasti', 'balrampur', 'rampur', 'budaun'
    ],
    'rajasthan': [
        'jalore', 'sirohi', 'karauli', 'dhaulpur', 'pratapgarh',
        'banswara', 'dungarpur', 'sawai_madhopur'
    ],
    'madhya_pradesh': [
        'alirajpur', 'jhabua', 'barwani', 'sheopur', 'ashoknagar',
        'singrauli', 'bhind', 'morena'
    ],
    'jharkhand': ['pakur', 'sahebganj', 'godda', 'dumka', 'jamtara'],
    'chhattisgarh': [
        'bijapur', 'dakshin_bastar_dantewada', 'sukma', 'narayanpur'
    ],
    'assam': ['dhubri', 'barpeta', 'goalpara', 'chirang', 'dhemaji'],
    'odisha': ['nabarangpur', 'malkangiri', 'rayagada', 'kalahandi'],
    'west_bengal': [
        'uttar_dinajpur', 'dakshin_dinajpur', 'murshidabad', 'malda'
    ]
}

# Districts treated as sources of out-migration, keyed by normalized state
# name. classify_district() sets `is_migration_source` for matching pairs.
# NOTE(review): hand-curated list with no cited source — verify before reuse.
MIGRATION_SOURCE_DISTRICTS = {
    'bihar': [
        'sitamarhi', 'purnia', 'madhubani', 'darbhanga', 'samastipur',
        'muzaffarpur', 'gopalganj', 'siwan', 'saran', 'vaishali'
    ],
    'uttar_pradesh': [
        'bahraich', 'sitapur', 'bareilly', 'etah', 'siddharthnagar',
        'azamgarh', 'mau', 'ballia', 'deoria', 'gorakhpur', 'basti'
    ],
    'jharkhand': ['ranchi', 'garhwa', 'pakur', 'palamu', 'gumla', 'latehar'],
    'west_bengal': [
        'murshidabad', 'malda', 'uttar_dinajpur', 'dakshin_dinajpur',
        'birbhum', 'purba_bardhaman'
    ],
    'odisha': ['ganjam', 'balangir', 'kalahandi', 'nuapada', 'bargarh'],
    'rajasthan': ['nagaur', 'alwar', 'bharatpur', 'sawai_madhopur'],
    'madhya_pradesh': [
        'satna', 'rewa', 'sidhi', 'singrauli', 'shahdol', 'umaria'
    ]
}

# Districts / urban centres treated as in-migration destinations, keyed by
# normalized state name. classify_district() sets `is_migration_destination`
# for matching (state, district) pairs, which in turn drives the 'Urban'
# settlement type.
# Ghaziabad, Noida and Greater Noida are in Uttar Pradesh, not Haryana, so
# they are listed under 'uttar_pradesh'; under 'haryana' they could never
# match a correctly labelled row.
# NOTE(review): hand-curated list with no cited source. Some entries look like
# city names rather than district names (e.g. 'pimpri_chinchwad',
# 'mangaluru', 'noida') — confirm they match the dataset's district spelling.
MIGRATION_DESTINATION_DISTRICTS = {
    'delhi': [
        'east', 'north', 'south', 'west', 'central', 'north_west',
        'north_east', 'south_west', 'south_east', 'new_delhi', 'shahdara'
    ],
    'maharashtra': [
        'mumbai', 'mumbai_suburban', 'thane', 'pune', 'pimpri_chinchwad',
        'nagpur', 'nashik', 'aurangabad'
    ],
    'karnataka': [
        'bengaluru_urban', 'bengaluru_rural', 'bengaluru_south',
        'mysuru', 'mangaluru'
    ],
    'gujarat': [
        'ahmedabad', 'surat', 'vadodara', 'rajkot', 'gandhinagar',
        'bhavnagar', 'jamnagar'
    ],
    'tamil_nadu': [
        'chennai', 'coimbatore', 'madurai', 'tiruchirappalli', 'salem',
        'tiruppur', 'erode'
    ],
    'haryana': [
        'gurgaon', 'faridabad', 'sonipat', 'panipat', 'rohtak'
    ],
    'punjab': [
        'ludhiana', 'amritsar', 'jalandhar', 'patiala', 'bathinda', 'kapurthala'
    ],
    'uttar_pradesh': [
        'agra', 'lucknow', 'kanpur', 'meerut', 'ghaziabad', 'noida',
        'greater_noida'
    ]
}

# Districts treated as remote rural, keyed by normalized state name.
# classify_district() sets `is_remote_rural` for matching pairs.
# NOTE(review): hand-curated list with no cited source — verify before reuse.
REMOTE_RURAL_DISTRICTS = {
    'jammu_and_kashmir': ['kishtwar', 'doda', 'ramban', 'poonch', 'rajouri'],
    'himachal_pradesh': ['kinnaur', 'lahaul_spiti', 'chamba'],
    'uttarakhand': ['uttarkashi', 'chamoli', 'pithoragarh', 'rudraprayag'],
    'rajasthan': [
        'jaisalmer', 'barmer', 'bikaner', 'karauli', 'sirohi'
    ],
    'gujarat': [
        'kachchh', 'patan', 'banas_kantha', 'gir_somnath',
        'devbhumi_dwarka', 'morbi', 'botad', 'porbandar'
    ],
    'madhya_pradesh': [
        'sheopur', 'ashoknagar', 'guna', 'chhatarpur', 'panna'
    ],
    'chhattisgarh': [
        'bijapur', 'dakshin_bastar_dantewada', 'sukma', 'narayanpur'
    ],
    'assam': ['dhemaji', 'hojai'],
    'arunachal_pradesh': [
        'anjaw', 'changlang', 'dibang_valley', 'east_kameng',
        'east_siang', 'kurung_kumey', 'lohit', 'lower_dibang_valley',
        'lower_subansiri', 'papum_pare', 'tawang', 'tirap',
        'upper_siang', 'upper_subansiri', 'west_kameng', 'west_siang'
    ]
}

# ============================================================================
# HELPER FUNCTIONS
# ============================================================================

def normalize_name(name):
    """Normalize a state/district name into the lookup-key form.

    The result is lowercase with words joined by single underscores, which is
    the spelling used by the keys and entries of the classification tables.

    Spaces, hyphens and underscores are all treated as word separators, and
    any run of them (as in ``'North - East'`` or ``'a  b'``) collapses to one
    underscore. Previously each separator character became its own
    underscore, producing keys such as ``'north___east'`` that could never
    match a table entry.

    Parameters
    ----------
    name : object
        Raw name; non-string values are converted with ``str()``.

    Returns
    -------
    str
        The normalized key, or ``''`` when ``name`` is missing (None / NaN).
    """
    if pd.isna(name):
        return ''
    # Turn every separator into whitespace, then let split() drop empty
    # pieces so leading, trailing and repeated separators all disappear.
    words = str(name).lower().replace('-', ' ').replace('_', ' ').split()
    return '_'.join(words)

def classify_district(state, district):
    """Return the six boolean classification flags for one district.

    Both names are passed through ``normalize_name`` and then looked up in
    each module-level classification table. A flag is True only when the
    normalized state is a key of that table and the normalized district is in
    that state's list.

    Returns a dict with the keys ``is_tribal``, ``is_forest_hilly``,
    ``is_low_literacy``, ``is_migration_source``,
    ``is_migration_destination`` and ``is_remote_rural``, in that order.
    """
    state_key = normalize_name(state)
    district_key = normalize_name(district)

    # (flag name, lookup table) pairs; order fixes the key order of the result.
    flag_tables = (
        ('is_tribal', TRIBAL_DOMINATED_DISTRICTS),
        ('is_forest_hilly', FOREST_HILLY_DISTRICTS),
        ('is_low_literacy', LOW_LITERACY_DISTRICTS),
        ('is_migration_source', MIGRATION_SOURCE_DISTRICTS),
        ('is_migration_destination', MIGRATION_DESTINATION_DISTRICTS),
        ('is_remote_rural', REMOTE_RURAL_DISTRICTS),
    )

    # An unknown state falls back to an empty tuple, i.e. the flag stays False.
    return {
        flag: district_key in table.get(state_key, ())
        for flag, table in flag_tables
    }

# ============================================================================
# MAIN ENRICHMENT FUNCTION
# ============================================================================

def enrich_high_risk_pincodes(input_path, output_path=None):
    """
    Read high-risk pincodes CSV and enrich with classifications
    
    Parameters:
    -----------
    input_path : str
        Path to high_risk_pincodes.csv
    output_path : str, optional
        Path to save enriched CSV. If None, creates default path.
    
    Returns:
    --------
    enriched_df : DataFrame
        Enriched dataframe with all classifications
    """
    
    print("="*80)
    print("ENRICHING HIGH-RISK PINCODES WITH CLASSIFICATIONS")
    print("="*80)
    
    # Load data
    print(f"\nLoading: {input_path}")
    df = pd.read_csv(input_path)
    print(f"‚úì Loaded {len(df)} high-risk pincodes")
    print(f"Columns: {df.columns.tolist()}")
    
    # Apply classifications
    print("\nApplying district classifications...")
    
    classifications = df.apply(
        lambda row: classify_district(row['state'], row['district']), 
        axis=1
    )
    
    # Convert list of dicts to DataFrame
    classification_df = pd.DataFrame(classifications.tolist())
    
    # Add classification columns to original dataframe
    enriched_df = pd.concat([df, classification_df], axis=1)
    
    # ========================================================================
    # LAYER A: SETTLEMENT TYPE
    # ========================================================================
    print("\nLayer A: Classifying Settlement Type...")
    
    def get_settlement_type(row):
        """Determine settlement type based on classifications"""
        
        # Migration destinations are typically urban
        if row['is_migration_destination']:
            return 'Urban'
        
        # Remote rural takes priority
        if row['is_remote_rural']:
            return 'Remote Rural'
        
        # Tribal/forest/hilly areas are typically rural
        if row['is_tribal'] or row['is_forest_hilly']:
            return 'Rural'
        
        # Default classification
        # You could add population density logic here if you have the data
        return 'Rural'
    
    enriched_df['Settlement_Type'] = enriched_df.apply(get_settlement_type, axis=1)
    
    settlement_dist = enriched_df['Settlement_Type'].value_counts()
    print(f"Settlement Type Distribution:\n{settlement_dist}\n")
    
    # ========================================================================
    # LAYER B: TRIBAL/FOREST TYPE
    # ========================================================================
    print("Layer B: Classifying Tribal/Forest Type...")
    
    def get_tribal_forest_type(row):
        """Determine tribal/forest classification"""
        if row['is_tribal']:
            return 'Tribal-dominated'
        elif row['is_forest_hilly']:
            return 'Forest / Hilly'
        else:
            return 'Normal Rural'
    
    enriched_df['Tribal_Forest_Type'] = enriched_df.apply(get_tribal_forest_type, axis=1)
    
    tribal_dist = enriched_df['Tribal_Forest_Type'].value_counts()
    print(f"Tribal/Forest Type Distribution:\n{tribal_dist}\n")
    
    # ========================================================================
    # LAYER C: LITERACY CATEGORY
    # ========================================================================
    print("Layer C: Classifying Literacy Level...")
    
    def get_literacy_category(row):
        """Determine literacy category"""
        if row['is_low_literacy']:
            return 'Low Literacy'
        else:
            # Without actual literacy rate data, we use medium as default
            return 'Medium Literacy'
    
    enriched_df['Literacy_Category'] = enriched_df.apply(get_literacy_category, axis=1)
    
    literacy_dist = enriched_df['Literacy_Category'].value_counts()
    print(f"Literacy Category Distribution:\n{literacy_dist}\n")
    
    # ========================================================================
    # LAYER D: MIGRATION CATEGORY
    # ========================================================================
    print("Layer D: Classifying Migration Pattern...")
    
    def get_migration_category(row):
        """Determine migration pattern"""
        if row['is_migration_source']:
            return 'High Out-Migration'
        elif row['is_migration_destination']:
            return 'High In-Migration'
        else:
            return 'Low Migration'
    
    enriched_df['Migration_Category'] = enriched_df.apply(get_migration_category, axis=1)
    
    migration_dist = enriched_df['Migration_Category'].value_counts()
    print(f"Migration Category Distribution:\n{migration_dist}\n")
    
    # ========================================================================
    # CREATE COMPREHENSIVE RISK PROFILE
    # ========================================================================
    print("Creating comprehensive risk profiles...")
    
    def create_risk_profile(row):
        """Create human-readable risk profile"""
        profile_parts = []
        
        # Settlement
        profile_parts.append(row['Settlement_Type'])
        
        # Tribal/Forest
        if row['Tribal_Forest_Type'] != 'Normal Rural':
            profile_parts.append(row['Tribal_Forest_Type'])
        
        # Literacy
        if row['Literacy_Category'] == 'Low Literacy':
            profile_parts.append('Low Literacy')
        
        # Migration
        if 'High' in row['Migration_Category']:
            profile_parts.append(row['Migration_Category'])
        
        return ' | '.join(profile_parts)
    
    enriched_df['Risk_Profile'] = enriched_df.apply(create_risk_profile, axis=1)
    
    # ========================================================================
    # INTERVENTION PRIORITY SCORING
    # ========================================================================
    print("Calculating intervention priority scores...")
    
    def calculate_priority_score(row):
        """Calculate intervention priority (0-100, higher = more urgent)"""
        score = 0
        
        # Base score from UIR (inverse - lower UIR = higher priority)
        uir = row.get('UIR', 0.2)
        if uir < 0.05:
            score += 40
        elif uir < 0.1:
            score += 35
        elif uir < 0.15:
            score += 30
        elif uir < 0.2:
            score += 25
        
        # Settlement type
        if row['Settlement_Type'] == 'Remote Rural':
            score += 20
        elif row['Settlement_Type'] == 'Rural':
            score += 10
        
        # Tribal/Forest (infrastructure challenges)
        if row['is_tribal']:
            score += 15
        elif row['is_forest_hilly']:
            score += 12
        
        # Low literacy (awareness challenge)
        if row['is_low_literacy']:
            score += 15
        
        # Migration (continuity challenge)
        if row['is_migration_source']:
            score += 10
        
        return min(score, 100)  # Cap at 100
    
    enriched_df['Intervention_Priority_Score'] = enriched_df.apply(
        calculate_priority_score, axis=1
    )
    
    def categorize_priority(score):
        """Convert score to priority category"""
        if score >= 70:
            return 'Critical Priority'
        elif score >= 50:
            return 'High Priority'
        elif score >= 30:
            return 'Medium Priority'
        else:
            return 'Low Priority'
    
    enriched_df['Priority_Level'] = enriched_df['Intervention_Priority_Score'].apply(
        categorize_priority
    )
    
    priority_dist = enriched_df['Priority_Level'].value_counts()
    print(f"Priority Level Distribution:\n{priority_dist}\n")
    
    # ========================================================================
    # DISPLAY SAMPLE RESULTS
    # ========================================================================
    print("="*80)
    print("SAMPLE ENRICHED PINCODES (First 15)")
    print("="*80)
    
    display_cols = [
        'state', 'district', 'pincode',
        'Settlement_Type', 'Tribal_Forest_Type',
        'Literacy_Category', 'Migration_Category',
        'UIR', 'Priority_Level', 'Risk_Profile'
    ]
    
    # Filter to columns that exist
    display_cols = [col for col in display_cols if col in enriched_df.columns]
    
    print(enriched_df[display_cols].head(15).to_string(index=False))
    
    # ========================================================================
    # PATTERN ANALYSIS
    # ========================================================================
    print("\n" + "="*80)
    print("PATTERN ANALYSIS")
    print("="*80)
    
    print("\nTop 10 Most Common Risk Profiles:")
    risk_counts = enriched_df['Risk_Profile'].value_counts().head(10)
    for profile, count in risk_counts.items():
        print(f"  {count:4d}  {profile}")
    
    print("\n\nCritical Priority Pincodes by State:")
    critical = enriched_df[enriched_df['Priority_Level'] == 'Critical Priority']
    if len(critical) > 0:
        critical_by_state = critical.groupby('state').size().sort_values(ascending=False).head(10)
        for state, count in critical_by_state.items():
            print(f"  {state:25s}  {count:4d} pincodes")
    else:
        print("  No critical priority pincodes found")
    
    # ========================================================================
    # SAVE ENRICHED DATA
    # ========================================================================
    if output_path is None:
        output_path = input_path.replace('.csv', '_enriched.csv')
    
    print("\n" + "="*80)
    print("SAVING ENRICHED DATA")
    print("="*80)
    
    enriched_df.to_csv(output_path, index=False)
    print(f"‚úì Enriched data saved to: {output_path}")
    
    # Save critical priorities separately
    critical_path = output_path.replace('_enriched.csv', '_critical_priority.csv')
    if len(critical) > 0:
        critical.sort_values('Intervention_Priority_Score', ascending=False).to_csv(
            critical_path, index=False
        )
        print(f"‚úì Critical priorities saved to: {critical_path}")
    
    # ========================================================================
    # SUMMARY STATISTICS
    # ========================================================================
    print("\n" + "="*80)
    print("ENRICHMENT SUMMARY")
    print("="*80)
    print(f"\n‚úì Total pincodes enriched: {len(enriched_df)}")
    print(f"‚úì States covered: {enriched_df['state'].nunique()}")
    print(f"‚úì Districts covered: {enriched_df['district'].nunique()}")
    print(f"\n‚úì Tribal-dominated pincodes: {enriched_df['is_tribal'].sum()}")
    print(f"‚úì Forest/Hilly pincodes: {enriched_df['is_forest_hilly'].sum()}")
    print(f"‚úì Low literacy pincodes: {enriched_df['is_low_literacy'].sum()}")
    print(f"‚úì Migration-affected pincodes: {(enriched_df['is_migration_source'] | enriched_df['is_migration_destination']).sum()}")
    print(f"‚úì Remote rural pincodes: {enriched_df['is_remote_rural'].sum()}")
    print(f"\n‚úì Critical priority pincodes: {(enriched_df['Priority_Level'] == 'Critical Priority').sum()}")
    print(f"‚úì High priority pincodes: {(enriched_df['Priority_Level'] == 'High Priority').sum()}")
    
    print("\n" + "="*80)
    print("ENRICHMENT COMPLETE!")
    print("="*80)
    
    return enriched_df

# ============================================================================
# RUN THE ENRICHMENT
# ============================================================================

if __name__ == "__main__":
    # High-risk pincode export that the enrichment step reads
    input_file = r'D:\Project\Hackathons\Aadhar_Hackathon\high_risk_pincodes.csv'

    # Only the input path is passed to the enrichment function
    enriched_data = enrich_high_risk_pincodes(input_file)

    # Closing message, emitted as a single multi-line print
    print("\n".join([
        "\n‚úÖ Enrichment process completed successfully!",
        "\nYou can now use the enriched data for:",
        "  ‚Ä¢ Targeted intervention planning",
        "  ‚Ä¢ Resource allocation",
        "  ‚Ä¢ Policy recommendations",
        "  ‚Ä¢ Geographic prioritization",
    ]))

ENRICHING HIGH-RISK PINCODES WITH CLASSIFICATIONS

Loading: D:\Project\Hackathons\Aadhar_Hackathon\high_risk_pincodes.csv
‚úì Loaded 204 high-risk pincodes
Columns: ['state', 'district', 'pincode', 'Total_Child_Enrollments', 'Total_Demo_Updates', 'UIR', 'Engagement_Level']

Applying district classifications...

Layer A: Classifying Settlement Type...
Settlement Type Distribution:
Settlement_Type
Rural           143
Remote Rural     47
Urban            14
Name: count, dtype: int64

Layer B: Classifying Tribal/Forest Type...
Tribal/Forest Type Distribution:
Tribal_Forest_Type
Normal Rural        105
Tribal-dominated     65
Forest / Hilly       34
Name: count, dtype: int64

Layer C: Classifying Literacy Level...
Literacy Category Distribution:
Literacy_Category
Medium Literacy    162
Low Literacy        42
Name: count, dtype: int64

Layer D: Classifying Migration Pattern...
Migration Category Distribution:
Migration_Category
Low Migration         165
High Out-Migration     25
High In-Migr

<h2> Report Generation </h2>

In [2]:
# Read the enriched high-risk pincode table
file_path = r'D:\Project\Hackathons\Aadhar_Hackathon\Documentation\Invisible_Citizens\high_risk_pincodes_enriched.csv'
df = pd.read_csv(file_path)

# Strip stray whitespace from the header names
df.columns = df.columns.map(str.strip)

# Report banner
print("\n".join([
    "="*80,
    "HIGH-RISK PINCODES ANALYSIS - EXECUTIVE SUMMARY",
    "="*80,
]))

# Headline figures reused throughout the rest of the report
total_pincodes = df.shape[0]
total_enrollments, total_updates = (
    df[column].sum()
    for column in ('Total_Child_Enrollments', 'Total_Demo_Updates')
)
avg_uir = df['UIR'].mean()

print(f"\nüìä KEY METRICS:")
# Headline totals, formatted with thousands separators.
print(f"   Total High-Risk Pincodes: {total_pincodes:,}")
print(f"   Total Child Enrollments: {total_enrollments:,}")
print(f"   Total Demographic Updates: {total_updates:,}")
# NOTE(review): UIR is printed with a '%' suffix and subtracted from 100 below,
# but the recorded run shows a mean of 3.67 while total updates exceed total
# enrollments, which looks like an updates-per-enrollment ratio rather than a
# percentage. Confirm the definition before relying on the "Coverage Gap" line.
print(f"   Average Update Rate (UIR): {avg_uir:.2f}%")
print(f"   Coverage Gap: {100 - avg_uir:.2f}%")

# Priority distribution: number of pincodes per Priority_Level label
# (value_counts orders the labels by descending frequency).
priority_dist = df['Priority_Level'].value_counts()
print(f"\nüéØ PRIORITY DISTRIBUTION:")
# One line per priority label: count and share of all high-risk pincodes
for level, count in priority_dist.items():
    print(f"   {level}: {count} pincodes ({count/total_pincodes*100:.1f}%)")

# State-wise roll-up, ordered from highest to lowest mean priority score
state_analysis = (
    df.groupby('state')
    .agg(
        Pincodes=('pincode', 'count'),
        Enrollments=('Total_Child_Enrollments', 'sum'),
        Updates=('Total_Demo_Updates', 'sum'),
        Avg_UIR=('UIR', 'mean'),
        Avg_Priority_Score=('Intervention_Priority_Score', 'mean'),
    )
    .round(2)
    .sort_values('Avg_Priority_Score', ascending=False)
)

print(f"\nüó∫Ô∏è TOP 10 STATES BY PRIORITY SCORE:")
print(state_analysis.head(10).to_string())

# Risk factor analysis: report label -> number of pincodes carrying that flag
risk_factors = {
    label: df[flag_column].sum()
    for label, flag_column in (
        ('Tribal Areas', 'is_tribal'),
        ('Remote/Rural', 'is_remote_rural'),
        ('Low Literacy', 'is_low_literacy'),
        ('Migration Source', 'is_migration_source'),
        ('Migration Destination', 'is_migration_destination'),
        ('Forest/Hilly', 'is_forest_hilly'),
    )
}

print(f"\n‚ö†Ô∏è RISK FACTOR PREVALENCE:")
# Each factor is shown with its share of all high-risk pincodes
for factor, count in risk_factors.items():
    print(f"   {factor}: {count} pincodes ({count/total_pincodes*100:.1f}%)")

# Create visualizations
print("\n" + "="*80)
print("GENERATING INTERACTIVE VISUALIZATIONS...")
print("="*80)

# 1. India Map - State-wise Priority Score
# Prepare state data for choropleth: 'state' moves from the index into a
# regular column so it can be used as the `locations` key.
state_map_data = state_analysis.reset_index()
state_map_data['Pincodes_Text'] = state_map_data['Pincodes'].astype(str)

# NOTE(review): `locations` is matched against the GeoJSON's properties.ST_NM.
# The recorded output shows state keys such as 'jammu_and_kashmir' (lowercase,
# underscores); if ST_NM holds display names, no region will be coloured.
# Verify the names against the GeoJSON and map them if needed.
# The figure is built directly by px.choropleth; the empty go.Figure() that was
# previously assigned first was overwritten immediately and has been dropped.
fig1 = px.choropleth(
    state_map_data,
    geojson="https://gist.githubusercontent.com/jbrobst/56c13bbbf9d97d187fea01ca62ea5112/raw/e388c4cae20aa53cb5090210a42ebb9b765c0a36/india_states.geojson",
    featureidkey='properties.ST_NM',
    locations='state',
    color='Avg_Priority_Score',
    color_continuous_scale='Reds',
    hover_data=['Pincodes', 'Enrollments', 'Avg_UIR'],
    title='<b>India Map: State-wise Intervention Priority Scores</b><br><sub>Darker colors indicate higher priority for intervention</sub>',
    labels={'Avg_Priority_Score': 'Priority Score'}
)

# Zoom to the plotted locations and hide the base map layers
fig1.update_geos(
    fitbounds="locations",
    visible=False
)

fig1.update_layout(
    title_font_size=20,
    title_x=0.5,
    geo=dict(
        bgcolor='rgba(0,0,0,0)',
        lakecolor='lightblue',
        landcolor='#f0f0f0'
    ),
    height=700,
    margin={"r":0,"t":80,"l":0,"b":0}
)

# 2. Multi-panel Dashboard: top states | priority split | risk factor prevalence
fig2 = make_subplots(
    rows=1, cols=3,
    subplot_titles=(
        'Top 10 High-Risk States',
        'Priority Level Distribution',
        'Risk Factor Prevalence'
    ),
    specs=[[{"type": "bar"}, {"type": "pie"}, {"type": "bar"}]]
)

# Top 10 states by priority score (state_analysis is already sorted descending)
top_10_states = state_analysis.head(10).reset_index()
fig2.add_trace(
    go.Bar(
        x=top_10_states['state'],
        y=top_10_states['Avg_Priority_Score'],
        marker_color='crimson',
        name='Priority Score',
        text=top_10_states['Pincodes'],
        texttemplate='%{text} pincodes',
        textposition='outside'
    ),
    row=1, col=1
)

# Priority distribution pie chart
# Priority_Level stores the full tier labels ('Critical Priority', 'High Priority',
# ...), so the palette must be keyed by them; with only 'High'/'Medium'/'Low'
# keys every slice fell back to the default colour. The short keys are kept as
# aliases so any other lookup that uses them keeps working.
priority_colors = {
    'Critical Priority': '#7f1d1d',
    'High Priority': '#ef4444',
    'Medium Priority': '#f59e0b',
    'Low Priority': '#10b981',
    'High': '#ef4444',
    'Medium': '#f59e0b',
    'Low': '#10b981',
}
fig2.add_trace(
    go.Pie(
        labels=priority_dist.index,
        values=priority_dist.values,
        # Unknown labels still fall back to the neutral indigo
        marker_colors=[priority_colors.get(x, '#6366f1') for x in priority_dist.index],
        textinfo='label+percent',
        hole=0.3
    ),
    row=1, col=2
)

# Risk factors bar chart: share of high-risk pincodes carrying each flag
risk_df = pd.DataFrame(list(risk_factors.items()), columns=['Factor', 'Count'])
risk_df['Percentage'] = (risk_df['Count'] / total_pincodes * 100).round(1)
fig2.add_trace(
    go.Bar(
        x=risk_df['Factor'],
        y=risk_df['Percentage'],
        marker_color='#6366f1',
        text=risk_df['Percentage'],
        texttemplate='%{text}%',
        textposition='outside',
        name='Percentage'
    ),
    row=1, col=3
)

# Slant the category labels on both bar panels
fig2.update_xaxes(tickangle=-45, row=1, col=1)
fig2.update_xaxes(tickangle=-45, row=1, col=3)
fig2.update_layout(
    title_text="<b>Comprehensive Risk Analysis Dashboard</b>",
    title_font_size=22,
    title_x=0.5,
    showlegend=False,
    height=600
)

# 3. District-level detailed analysis (top 20 districts)
# Roll up per (state, district), keep the 20 highest mean priority scores
district_analysis = (
    df.groupby(['state', 'district'])
    .agg(
        Pincodes=('pincode', 'count'),
        Enrollments=('Total_Child_Enrollments', 'sum'),
        Avg_UIR=('UIR', 'mean'),
        Priority_Score=('Intervention_Priority_Score', 'mean'),
    )
    .round(2)
    .sort_values('Priority_Score', ascending=False)
    .head(20)
)
# Axis label in "district, state" form, built from the (state, district) index
district_analysis['District_State'] = [
    f"{district}, {state}" for state, district in district_analysis.index
]

# Series are reversed so the highest-scoring district ends up at the top
# of the horizontal bar chart.
fig3 = go.Figure(data=[go.Bar(
    y=district_analysis['District_State'][::-1],
    x=district_analysis['Priority_Score'][::-1],
    orientation='h',
    marker={
        'color': district_analysis['Priority_Score'][::-1],
        'colorscale': 'Reds',
        'showscale': True,
        'colorbar': {'title': "Priority<br>Score"},
    },
    text=district_analysis['Pincodes'][::-1],
    texttemplate='%{text} pincodes',
    textposition='outside'
)])

fig3.update_layout(
    title='<b>Top 20 Districts Requiring Immediate Intervention</b>',
    title_font_size=20,
    title_x=0.5,
    xaxis_title='Intervention Priority Score',
    yaxis_title='',
    height=700,
    margin={'l': 200}
)

# 4. Scatter plot: UIR vs Enrollments
# color_discrete_map must be keyed by the labels actually stored in
# Priority_Level ('Critical Priority', 'High Priority', ...); the previous
# 'High'/'Medium'/'Low' keys never matched, so the intended colours were ignored.
fig4 = px.scatter(
    df,
    x='Total_Child_Enrollments',
    y='UIR',
    color='Priority_Level',
    size='Intervention_Priority_Score',
    hover_data=['state', 'district', 'pincode'],
    color_discrete_map={
        'Critical Priority': '#7f1d1d',
        'High Priority': '#ef4444',
        'Medium Priority': '#f59e0b',
        'Low Priority': '#10b981',
    },
    title='<b>Update Rate vs Child Enrollments by Priority Level</b><br><sub>Bubble size represents intervention priority score</sub>',
    labels={'UIR': 'Update Intensity Rate %', 'Total_Child_Enrollments': 'Total Child Enrollments'}
)

fig4.update_layout(
    title_font_size=20,
    title_x=0.5,
    height=600
)

# Save all visualizations
print("\nüíæ Saving visualizations as HTML files...")
# Export each figure to a standalone HTML file and confirm it by name
for _figure, _html_name in (
    (fig1, 'india_map_priority_scores.html'),
    (fig2, 'comprehensive_dashboard.html'),
    (fig3, 'top_20_districts.html'),
    (fig4, 'uir_vs_enrollments_scatter.html'),
):
    _figure.write_html(_html_name)
    print("   ‚úì " + _html_name)

# Generate detailed insights report
print("\n" + "="*80)
print("KEY INSIGHTS FOR JUDGES")
print("="*80)

print(f"\nüéØ CRITICAL FINDINGS:")
# NOTE(review): avg_uir is presented as a percentage here; confirm that UIR is
# really a percentage before relying on the "% outdated" statement.
print(f"\n1. COVERAGE GAP SEVERITY:")
print(f"   ‚Ä¢ {total_pincodes:,} high-risk pincodes have only {avg_uir:.1f}% average update rate")
print(f"   ‚Ä¢ This means {100-avg_uir:.1f}% of demographic data is outdated")
print(f"   ‚Ä¢ {total_enrollments:,} children are affected by this data gap")

# Priority_Level stores full tier labels ('Critical Priority', 'High Priority',
# 'Medium Priority', 'Low Priority'); comparing against the bare 'High' never
# matched and always produced 0. The report plans around three tiers, so the
# Critical tier is folded into the urgent group rather than dropped.
high_priority = int(df['Priority_Level'].isin(['Critical Priority', 'High Priority']).sum())
print(f"\n2. IMMEDIATE ACTION REQUIRED:")
print(f"   ‚Ä¢ {high_priority} pincodes ({high_priority/total_pincodes*100:.1f}%) need URGENT intervention")
print(f"   ‚Ä¢ Top 3 states: {', '.join(state_analysis.head(3).index.tolist())}")
print(f"   ‚Ä¢ These areas should receive 60% of allocated resources")

tribal_count = risk_factors['Tribal Areas']
remote_count = risk_factors['Remote/Rural']
print(f"\n3. GEOGRAPHIC CHALLENGES:")
print(f"   ‚Ä¢ {tribal_count} ({tribal_count/total_pincodes*100:.1f}%) tribal areas need culturally-sensitive outreach")
print(f"   ‚Ä¢ {remote_count} ({remote_count/total_pincodes*100:.1f}%) remote areas need mobile enrollment units")
print(f"   ‚Ä¢ Traditional methods won't work - innovative solutions required")

low_lit = risk_factors['Low Literacy']
print(f"\n4. COMMUNICATION BARRIERS:")
print(f"   ‚Ä¢ {low_lit} pincodes ({low_lit/total_pincodes*100:.1f}%) have low literacy rates")
print(f"   ‚Ä¢ Requires: Visual aids, local language support, community leaders")
print(f"   ‚Ä¢ Digital-first approach will fail in these areas")

# Count pincodes flagged as source OR destination, so a pincode carrying both
# flags is counted once (adding the two per-flag totals would count it twice).
migration_total = int(
    (df['is_migration_source'].astype(bool) | df['is_migration_destination'].astype(bool)).sum()
)
print(f"\n5. POPULATION MOBILITY:")
print(f"   ‚Ä¢ {migration_total} pincodes affected by migration patterns")
print(f"   ‚Ä¢ Seasonal campaigns timed with migration cycles needed")
print(f"   ‚Ä¢ Coordination between source and destination areas critical")

print("\n" + "="*80)
print("RECOMMENDED 3-PHASE ACTION PLAN")
print("="*80)

print(f"\nüìç PHASE 1 - IMMEDIATE (0-3 months):")
# Phase 1 details (target count comes from high_priority computed earlier)
print(f"   Target: {high_priority} High Priority pincodes")
print(f"   Resources: 60% of budget and personnel")
print(f"   Actions:")
print(f"   ‚Ä¢ Deploy mobile enrollment units to top 5 states")
print(f"   ‚Ä¢ Establish temporary centers in tribal/remote areas")
print(f"   ‚Ä¢ Train local volunteers for community outreach")
print(f"   ‚Ä¢ Launch awareness campaign in local languages")

# Priority_Level stores the full label 'Medium Priority'; comparing against the
# bare 'Medium' never matched and always reported 0 pincodes.
medium_priority = int((df['Priority_Level'] == 'Medium Priority').sum())
print(f"\nüìç PHASE 2 - SECONDARY (3-6 months):")
# Phase 2 details (target count comes from medium_priority computed earlier)
print(f"   Target: {medium_priority} Medium Priority pincodes")
print(f"   Resources: 30% of budget and personnel")
print(f"   Actions:")
print(f"   ‚Ä¢ Set up semi-permanent enrollment centers")
print(f"   ‚Ä¢ Leverage schools and anganwadis for outreach")
print(f"   ‚Ä¢ Digital campaigns in areas with connectivity")
print(f"   ‚Ä¢ Partner with local NGOs and panchayats")

# Priority_Level stores the full label 'Low Priority'; comparing against the
# bare 'Low' never matched and always reported 0 pincodes.
low_priority = int((df['Priority_Level'] == 'Low Priority').sum())
print(f"\nüìç PHASE 3 - PREVENTIVE (6-12 months):")
# Phase 3 details, emitted as one multi-line print
print("\n".join([
    f"   Target: {low_priority} Low Priority pincodes",
    "   Resources: 10% of budget and personnel",
    "   Actions:",
    "   ‚Ä¢ Strengthen existing infrastructure",
    "   ‚Ä¢ Implement regular update reminder systems",
    "   ‚Ä¢ Create sustainable community engagement model",
    "   ‚Ä¢ Monitor and prevent future data degradation",
]))

print("\n" + "="*80)
print("SUCCESS METRICS TO TRACK")
print("="*80)
print("\nüìà Quarterly KPIs:")
# KPI bullet list
print("\n".join([
    "   ‚Ä¢ Update Rate (UIR) improvement in target pincodes",
    "   ‚Ä¢ Number of children with updated demographics",
    "   ‚Ä¢ Coverage of high-risk areas by mobile units",
    "   ‚Ä¢ Community engagement scores",
    "   ‚Ä¢ Cost per successful update",
    "   ‚Ä¢ Reduction in high-priority pincode count",
]))

# Completion banner and the list of exported files
print("\n".join([
    "\n" + "="*80,
    "‚úÖ ANALYSIS COMPLETE - All visualizations saved!",
    "="*80,
    "\nGenerated Files:",
    "   1. india_map_priority_scores.html - Interactive India map",
    "   2. comprehensive_dashboard.html - Multi-panel analytics dashboard",
    "   3. top_20_districts.html - District-level priority ranking",
    "   4. uir_vs_enrollments_scatter.html - Relationship analysis",
    "\nOpen these HTML files in any browser for interactive exploration.",
    "="*80,
]))


HIGH-RISK PINCODES ANALYSIS - EXECUTIVE SUMMARY

üìä KEY METRICS:
   Total High-Risk Pincodes: 204
   Total Child Enrollments: 222,945
   Total Demographic Updates: 833,991
   Average Update Rate (UIR): 3.67%
   Coverage Gap: 96.33%

üéØ PRIORITY DISTRIBUTION:
   Low Priority: 156 pincodes (76.5%)
   Medium Priority: 38 pincodes (18.6%)
   High Priority: 9 pincodes (4.4%)
   Critical Priority: 1 pincodes (0.5%)

üó∫Ô∏è TOP 10 STATES BY PRIORITY SCORE:
                   Pincodes  Enrollments  Updates  Avg_UIR  Avg_Priority_Score
state                                                                         
chhattisgarh              3         3429     9815     3.38               50.00
jharkhand                 3         4431    20686     4.43               40.00
bihar                     3         6266    27856     4.52               40.00
jammu_and_kashmir         3         2672     8419     3.51               32.00
assam                    35        35242   138108     3.89         