# Invisible Citizens - Comprehensive Analysis Pipeline
## Advanced Analytics for High-Risk Area Identification

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
import warnings
warnings.filterwarnings('ignore')

# Plot defaults: seaborn white-grid theme, then a wide default figure size
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (14, 8)})

## 1. Data Loading and Initial Exploration

In [None]:
# Read the pincode-level and state-level input tables from the working directory
high_risk_pincodes = pd.read_csv('high_risk_pincodes.csv')
state_summary = pd.read_csv('state_summary.csv')

# Horizontal rule reused around each section heading
section_rule = "=" * 60

# Pincode table: row count, column names and a short preview
print(section_rule)
print("HIGH RISK PINCODES DATASET")
print(section_rule)
print(f"Total Records: {high_risk_pincodes.shape[0]}")
print(f"Columns: {list(high_risk_pincodes.columns)}")
print("\nFirst 5 rows:")
display(high_risk_pincodes.head(5))

# State table: row count and a short preview
print("\n" + section_rule)
print("STATE SUMMARY DATASET")
print(section_rule)
print(f"Total States: {state_summary.shape[0]}")
display(state_summary.head(5))

## 2. Data Quality Assessment

In [None]:
# Per-column null counts, computed once and reused for the percentage column
null_counts = high_risk_pincodes.isnull().sum()
null_percent = (null_counts / len(high_risk_pincodes) * 100).round(2)

missing_data = pd.DataFrame({
    'Column': high_risk_pincodes.columns,
    'Missing_Count': null_counts,
    'Missing_Percent': null_percent,
})

# Show only the columns that actually contain nulls
print("Missing Data Analysis:")
has_missing = missing_data['Missing_Count'] > 0
display(missing_data[has_missing])

# Column dtypes as inferred by pandas
print("\nData Types:")
display(high_risk_pincodes.dtypes)

## 3. Statistical Summary and Risk Scoring

In [None]:
# Calculate comprehensive risk score
def calculate_risk_score(df):
    """
    Add a composite risk score, its percentile rank and a risk tier to a copy of df.

    Every numeric column is used for scoring except identifier-like columns
    ('pincode', 'id', 'index') and the numeric columns this function itself
    produces. The scoring columns are standardized with StandardScaler and the
    row-wise mean of the standardized values becomes the composite score.

    Parameters:
        df: input DataFrame. It is not modified; the function works on a copy.

    Returns:
        A copy of df. When at least one scoring column exists it gains
        'composite_risk_score', 'risk_percentile' (percentile rank scaled to
        100) and 'risk_category' (Low / Medium / High / Critical by quartile
        of the percentile). When no scoring column exists the copy is returned
        without those columns being added or changed.
    """
    df = df.copy()

    # Identify numeric columns for scoring
    numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

    # Identifier-like columns carry no risk signal. The derived score columns
    # are excluded too, so re-scoring an already enriched frame does not feed
    # the previous score back into the new one.
    exclude_cols = {'pincode', 'id', 'index', 'composite_risk_score', 'risk_percentile'}
    # str() keeps the name check working for non-string column labels.
    score_cols = [col for col in numeric_cols if str(col).lower() not in exclude_cols]

    if not score_cols:
        return df

    risk_labels = ['Low', 'Medium', 'High', 'Critical']

    if df.empty:
        # StandardScaler rejects zero-row input; add the same columns, empty,
        # so the result has the same schema as a scored frame.
        df['composite_risk_score'] = np.empty(0, dtype=float)
        df['risk_percentile'] = np.empty(0, dtype=float)
        df['risk_category'] = pd.Categorical([], categories=risk_labels, ordered=True)
        return df

    # NOTE: missing values are replaced by 0 *before* scaling, i.e. a missing
    # reading is treated as a raw value of zero for that metric.
    normalized = StandardScaler().fit_transform(df[score_cols].fillna(0))

    # Composite score = mean of the standardized metrics for each row
    df['composite_risk_score'] = normalized.mean(axis=1)
    df['risk_percentile'] = df['composite_risk_score'].rank(pct=True) * 100

    # Quartile bins are right-inclusive: (0, 25], (25, 50], (50, 75], (75, 100]
    df['risk_category'] = pd.cut(df['risk_percentile'],
                                 bins=[0, 25, 50, 75, 100],
                                 labels=risk_labels)

    return df

# Score the raw pincode table; the enriched result is used by all later cells
high_risk_enriched = calculate_risk_score(high_risk_pincodes)

print("Risk Score Distribution:")
# The category column is only present when scoring produced it
if 'risk_category' in high_risk_enriched:
    category_counts = high_risk_enriched['risk_category'].value_counts()
    display(category_counts)

## 4. Geographic Analysis

In [None]:
# State-level aggregation: per-state pincode count and risk-score statistics.
# Runs only when the frame has a state column, spelled 'state' or 'State'.
if 'state' in high_risk_enriched.columns or 'State' in high_risk_enriched.columns:
    state_col = 'state' if 'state' in high_risk_enriched.columns else 'State'

    state_analysis = high_risk_enriched.groupby(state_col).agg({
        'pincode': 'count',
        'composite_risk_score': ['mean', 'std', 'min', 'max']
    }).round(3)

    # Flatten the two-level column index produced by the multi-function agg
    state_analysis.columns = ['Total_Pincodes', 'Avg_Risk', 'StdDev_Risk', 'Min_Risk', 'Max_Risk']
    # Later cells read row 0 as the highest-risk state, so keep this ordering
    state_analysis = state_analysis.sort_values('Avg_Risk', ascending=False)

    print("\nTop 10 High-Risk States:")
    display(state_analysis.head(10))

    # Visualization
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # Left panel: the 15 states with the highest average risk
    state_analysis.head(15)['Avg_Risk'].plot(kind='barh', ax=axes[0], color='crimson')
    axes[0].set_title('Top 15 States by Average Risk Score', fontsize=14, fontweight='bold')
    axes[0].set_xlabel('Average Risk Score')

    # Right panel: rank by pincode count. Reusing the risk-sorted head(15)
    # here would chart the counts of the riskiest states instead of the states
    # with the most pincodes, contradicting the panel title.
    top_by_count = state_analysis.nlargest(15, 'Total_Pincodes')
    top_by_count['Total_Pincodes'].plot(kind='barh', ax=axes[1], color='steelblue')
    axes[1].set_title('Top 15 States by Number of High-Risk Pincodes', fontsize=14, fontweight='bold')
    axes[1].set_xlabel('Number of Pincodes')

    plt.tight_layout()
    plt.savefig('state_analysis.png', dpi=300, bbox_inches='tight')
    plt.show()

## 5. Clustering Analysis - Identifying Similar Risk Profiles

In [None]:
# Prepare data for clustering
numeric_features = high_risk_enriched.select_dtypes(include=[np.number]).columns.tolist()
# Identifiers and derived score columns are not clustering inputs. 'cluster'
# is listed so that re-running this cell does not feed the labels written by
# the previous run back in as a feature.
exclude_cols = ['pincode', 'id', 'index', 'composite_risk_score', 'risk_percentile', 'cluster']
# str() keeps the name check working for non-string column labels.
cluster_features = [col for col in numeric_features if str(col).lower() not in exclude_cols]

if len(cluster_features) >= 2:
    # Standardize the features; missing values are replaced by 0 first.
    # X_scaled is reused by the PCA cell.
    X_cluster = high_risk_enriched[cluster_features].fillna(0)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_cluster)
    n_samples = len(X_scaled)

    # Elbow method: inertia for each candidate k. The upper bound shrinks with
    # the row count, so small tables may yield no candidates at all.
    inertias = []
    K_range = range(2, min(11, n_samples // 10))

    for k in K_range:
        kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
        kmeans.fit(X_scaled)
        inertias.append(kmeans.inertia_)

    if inertias:
        # Plot elbow curve
        plt.figure(figsize=(10, 6))
        plt.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
        plt.xlabel('Number of Clusters (k)', fontsize=12)
        plt.ylabel('Inertia', fontsize=12)
        plt.title('Elbow Method for Optimal k', fontsize=14, fontweight='bold')
        plt.grid(True, alpha=0.3)
        plt.savefig('elbow_curve.png', dpi=300, bbox_inches='tight')
        plt.show()
    else:
        # Avoid drawing and saving a blank chart when no k was evaluated
        print("Too few rows to draw an elbow curve; skipping the plot.")

    # Apply K-means with the assumed k of 5, capped at the number of rows
    # because KMeans cannot form more clusters than there are samples.
    optimal_k = min(5, n_samples)
    kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
    high_risk_enriched['cluster'] = kmeans.fit_predict(X_scaled)

    print(f"\nCluster Distribution:")
    display(high_risk_enriched['cluster'].value_counts().sort_index())

## 6. PCA Visualization

In [None]:
if len(cluster_features) >= 2:
    # Project the standardized features onto two principal components
    pca = PCA(n_components=2)
    X_pca = pca.fit_transform(X_scaled)
    variance_ratio = pca.explained_variance_ratio_

    # Scatter of the 2-D projection, coloured by each row's cluster label
    point_style = dict(cmap='viridis', s=50, alpha=0.6,
                       edgecolors='black', linewidth=0.5)
    plt.figure(figsize=(12, 8))
    scatter = plt.scatter(X_pca[:, 0], X_pca[:, 1],
                          c=high_risk_enriched['cluster'],
                          **point_style)
    plt.colorbar(scatter, label='Cluster')

    # Axis labels carry the share of variance captured by each component
    plt.xlabel(f'PC1 ({variance_ratio[0]*100:.1f}% variance)', fontsize=12)
    plt.ylabel(f'PC2 ({variance_ratio[1]*100:.1f}% variance)', fontsize=12)
    plt.title('PCA Visualization of High-Risk Areas by Cluster', fontsize=14, fontweight='bold')
    plt.grid(True, alpha=0.3)
    plt.savefig('pca_clusters.png', dpi=300, bbox_inches='tight')
    plt.show()

    # Same figures as text, plus the combined share of both components
    print("\nPCA Explained Variance:")
    print(f"PC1: {variance_ratio[0]*100:.2f}%")
    print(f"PC2: {variance_ratio[1]*100:.2f}%")
    print(f"Total: {sum(variance_ratio)*100:.2f}%")

## 7. Priority Area Identification

In [None]:
# Identify top priority areas: rows at or above the 90th risk percentile.
# critical_areas is also written to CSV by the export cell.
if 'risk_percentile' in high_risk_enriched.columns:
    priority_threshold = 90
    critical_areas = high_risk_enriched[high_risk_enriched['risk_percentile'] >= priority_threshold].copy()

    print(f"\nCritical Priority Areas (Risk Percentile >= {priority_threshold}):")
    print(f"Total: {len(critical_areas)} pincodes")

    # Columns to show, with the state column placed right after the pincode.
    # Both spellings are accepted, matching the state handling in the other
    # cells; checking only 'state' silently dropped the column for 'State'.
    display_cols = ['pincode', 'composite_risk_score', 'risk_percentile', 'risk_category']
    for state_name in ('state', 'State'):
        if state_name in critical_areas.columns:
            display_cols.insert(1, state_name)
            break

    # Keep only the columns that actually exist in the frame
    available_cols = [col for col in display_cols if col in critical_areas.columns]

    print("\nTop 20 Highest Risk Pincodes:")
    display(critical_areas.nlargest(20, 'composite_risk_score')[available_cols])

## 8. Correlation Analysis

In [None]:
# Pairwise correlations between the clustering features, drawn as a heatmap
if len(cluster_features) >= 2:
    correlation_matrix = high_risk_enriched[cluster_features].corr()

    # Annotated, square-celled heatmap on a diverging palette centred at zero
    heatmap_options = dict(
        annot=True,
        fmt='.2f',
        cmap='coolwarm',
        center=0,
        square=True,
        linewidths=1,
        cbar_kws={"shrink": 0.8},
    )
    plt.figure(figsize=(12, 10))
    sns.heatmap(correlation_matrix, **heatmap_options)
    plt.title('Feature Correlation Heatmap', fontsize=14, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.savefig('correlation_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()

## 9. Export Final Results

In [None]:
# Snapshot of the enriched table; every export below is written from it or
# from tables built in earlier cells
final_report = high_risk_enriched.copy()
report_columns = final_report.columns

# Full enriched table
final_report.to_csv('FINAL_COMPREHENSIVE_REPORT.csv', index=False)
print("✓ Exported: FINAL_COMPREHENSIVE_REPORT.csv")

# Rows selected by the priority-area cell
if 'risk_percentile' in report_columns:
    critical_areas.to_csv('CRITICAL_PRIORITY_AREAS.csv', index=False)
    print("✓ Exported: CRITICAL_PRIORITY_AREAS.csv")

# Per-state table built by the geographic-analysis cell (state names are the index)
if any(name in high_risk_enriched.columns for name in ('state', 'State')):
    state_analysis.to_csv('STATE_RISK_ANALYSIS.csv')
    print("✓ Exported: STATE_RISK_ANALYSIS.csv")

# Per-cluster profile: size, risk-score mean/std and the mean of each feature
if 'cluster' in report_columns:
    profile_spec = {
        'pincode': 'count',
        'composite_risk_score': ['mean', 'std'],
    }
    profile_spec.update({col: 'mean' for col in cluster_features})
    cluster_profiles = final_report.groupby('cluster').agg(profile_spec).round(3)
    cluster_profiles.to_csv('CLUSTER_PROFILES.csv')
    print("✓ Exported: CLUSTER_PROFILES.csv")

completion_rule = "=" * 60
print("\n" + completion_rule)
print("ANALYSIS COMPLETE - ALL REPORTS GENERATED")
print(completion_rule)

## 10. Executive Summary

In [None]:
# Text summary of the results computed in the earlier cells
hash_rule = "#" * 60

print("\n" + hash_rule)
print("# EXECUTIVE SUMMARY")
print(hash_rule + "\n")

total_rows = len(high_risk_enriched)
print(f"Total High-Risk Pincodes Analyzed: {total_rows:,}")

# Row count and share per risk tier, listed from most to least severe
if 'risk_category' in high_risk_enriched.columns:
    print("\nRisk Distribution:")
    tier_column = high_risk_enriched['risk_category']
    for cat in ('Critical', 'High', 'Medium', 'Low'):
        count = (tier_column == cat).sum()
        pct = count / total_rows * 100
        print(f"  {cat}: {count:,} ({pct:.1f}%)")

if 'cluster' in high_risk_enriched.columns:
    print(f"\nIdentified {optimal_k} distinct risk profile clusters")

# state_analysis is sorted by average risk, so its first row is the top state
if 'state' in high_risk_enriched.columns or 'State' in high_risk_enriched.columns:
    state_col = 'state' if 'state' in high_risk_enriched.columns else 'State'
    top_state = state_analysis.index[0]
    top_row = state_analysis.iloc[0]
    print(f"\nHighest Risk State: {top_state}")
    print(f"  Average Risk Score: {top_row['Avg_Risk']:.3f}")
    print(f"  Number of Pincodes: {int(top_row['Total_Pincodes']):,}")

print("\n" + hash_rule)
print("# RECOMMENDED ACTIONS")
print(hash_rule + "\n")

recommended_actions = (
    "1. Prioritize resources for Critical and High-risk categories",
    "2. Deploy targeted interventions based on cluster profiles",
    "3. Focus immediate attention on top 10% risk percentile areas",
    "4. Conduct detailed field assessments in identified hotspots",
    "5. Monitor trends in state-level risk variations",
)
for action in recommended_actions:
    print(action)