In [1]:
"""
Imputation Quality Validation Script - FINAL VERSION
=====================================================

Validates imputation quality by comparing:
- PRE: /raw/Hotel_XX_competitors.csv (with NaNs)
- POST: /processed/Hotel_XX_competitors_matrix_completion.csv (filled)

Compares the 'price' column (pre) with 'competitor_price' column (post)
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import ks_2samp, spearmanr
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# Configuration
# ============================================================================

RAW_DATA_DIR = "../data/full-data/raw"
PROCESSED_DATA_DIR = "../data/full-data/processed"
OUTPUT_DIR = "imputation_validation_results"

# Create output directory
Path(OUTPUT_DIR).mkdir(exist_ok=True)

# ============================================================================
# Main Validation Function
# ============================================================================

def validate_hotel_imputation(hotel_id):
    """
    Validate imputation quality for a single hotel
    
    Parameters:
    -----------
    hotel_id : str
        Hotel identifier (e.g., 'Hotel_01', 'Hotel_02')
    """
    
    print(f"\n{'='*70}")
    print(f"Validating: {hotel_id}")
    print(f"{'='*70}")
    
    # -------------------------------------------------------------------------
    # 1. Load Data
    # -------------------------------------------------------------------------
    
    try:
        # PRE-imputation: Raw competitor data with NaNs
        # Columns: hotel_id, stay_date, price, can_check_in, min_length_of_stay
        pre_file = f"{RAW_DATA_DIR}/{hotel_id}_competitors.csv"
        
        if not Path(pre_file).exists():
            print(f"✗ Pre-imputation file not found: {pre_file}")
            return None
            
        pre_data = pd.read_csv(pre_file)
        
        # POST-imputation: Matrix completion imputed (long format)
        # Columns: date, hotel_id, competitor_price
        post_file = f"{PROCESSED_DATA_DIR}/{hotel_id}_competitors_matrix_completion.csv"
        
        if not Path(post_file).exists():
            print(f"✗ Post-imputation file not found: {post_file}")
            return None
        
        post_data = pd.read_csv(post_file)
        
        print(f"✓ Loaded pre-imputation: {len(pre_data)} rows")
        print(f"✓ Loaded post-imputation: {len(post_data)} rows")
        
    except Exception as e:
        print(f"✗ Error loading files: {e}")
        return None
    
    # -------------------------------------------------------------------------
    # 2. Align Data by Date
    # -------------------------------------------------------------------------
    
    # Ensure dates are datetime
    pre_data['date'] = pd.to_datetime(pre_data['stay_date'])
    post_data['date'] = pd.to_datetime(post_data['date'])
    
    # Find common dates and hotels
    common_dates = set(pre_data['date']).intersection(set(post_data['date']))
    common_hotels = set(pre_data['hotel_id']).intersection(set(post_data['hotel_id']))
    
    print(f"✓ Common dates: {len(common_dates)}")
    print(f"✓ Common competitors: {len(common_hotels)}")
    
    # Filter to common dates and hotels
    pre_filtered = pre_data[pre_data['date'].isin(common_dates) & 
                            pre_data['hotel_id'].isin(common_hotels)].copy()
    post_filtered = post_data[post_data['date'].isin(common_dates) & 
                              post_data['hotel_id'].isin(common_hotels)].copy()
    
    print(f"✓ Aligned datasets: {len(pre_filtered)} rows each")
    
    # -------------------------------------------------------------------------
    # 3. Missing Data Summary
    # -------------------------------------------------------------------------
    
    print(f"\n{'Missing Data Summary':-^70}")
    
    # Use 'price' from pre, 'competitor_price' from post
    missing_pre = pre_filtered['price'].isnull().sum()
    missing_post = post_filtered['competitor_price'].isnull().sum()
    
    total_cells = len(pre_filtered)
    pct_missing_pre = (missing_pre / total_cells) * 100
    pct_missing_post = (missing_post / total_cells) * 100
    
    print(f"Total rows: {total_cells:,}")
    print(f"Missing before: {missing_pre:,} ({pct_missing_pre:.1f}%)")
    print(f"Missing after: {missing_post:,} ({pct_missing_post:.1f}%)")
    print(f"Values imputed: {missing_pre - missing_post:,}")
    
    if missing_post > 0:
        print(f"⚠️  Warning: {missing_post} values still missing!")
    
    results = {
        'hotel_id': hotel_id,
        'n_rows': len(pre_filtered),
        'n_competitors': len(common_hotels),
        'total_cells': total_cells,
        'missing_before': int(missing_pre),
        'missing_after': int(missing_post),
        'pct_missing': float(pct_missing_pre),
        'n_imputed': int(missing_pre - missing_post)
    }
    
    # -------------------------------------------------------------------------
    # 4. Distribution Comparison
    # -------------------------------------------------------------------------
    
    print(f"\n{'Distribution Preservation (KS Test)':-^70}")
    
    # Get non-null values
    pre_vals = pre_filtered['price'].dropna()
    post_vals = post_filtered['competitor_price'].dropna()
    
    print(f"Sample sizes:")
    print(f"  Pre (observed): {len(pre_vals)}")
    print(f"  Post (all): {len(post_vals)}")
    
    # Basic statistics
    pre_mean = pre_vals.mean()
    post_mean = post_vals.mean()
    mean_delta = post_mean - pre_mean
    
    pre_std = pre_vals.std()
    post_std = post_vals.std()
    std_delta = post_std - pre_std
    
    pre_median = pre_vals.median()
    post_median = post_vals.median()
    median_delta = post_median - pre_median
    
    print(f"\n{'Statistic':<15s} {'Pre':<15s} {'Post':<15s} {'Delta':<15s}")
    print("-" * 70)
    print(f"{'Mean':<15s} ${pre_mean:<14.2f} ${post_mean:<14.2f} ${mean_delta:+14.2f}")
    print(f"{'Median':<15s} ${pre_median:<14.2f} ${post_median:<14.2f} ${median_delta:+14.2f}")
    print(f"{'Std Dev':<15s} ${pre_std:<14.2f} ${post_std:<14.2f} ${std_delta:+14.2f}")
    print(f"{'Min':<15s} ${pre_vals.min():<14.2f} ${post_vals.min():<14.2f}")
    print(f"{'Max':<15s} ${pre_vals.max():<14.2f} ${post_vals.max():<14.2f}")
    
    # KS Test
    ks_stat, ks_pval = ks_2samp(pre_vals, post_vals)
    
    print(f"\nKolmogorov-Smirnov Test:")
    print(f"  KS Statistic: {ks_stat:.4f}")
    print(f"  p-value: {ks_pval:.4f}")
    
    if ks_pval > 0.05:
        print(f"  Result: Distributions SIMILAR ✓ (p > 0.05)")
        ks_significant = False
    else:
        print(f"  Result: Distributions DIFFER ⚠️ (p < 0.05)")
        ks_significant = True
    
    results['distribution_stats'] = {
        'pre_mean': float(pre_mean),
        'post_mean': float(post_mean),
        'mean_delta': float(mean_delta),
        'pre_std': float(pre_std),
        'post_std': float(post_std),
        'std_delta': float(std_delta),
        'pre_median': float(pre_median),
        'post_median': float(post_median),
        'median_delta': float(median_delta),
        'ks_statistic': float(ks_stat),
        'ks_pvalue': float(ks_pval),
        'ks_significant': ks_significant
    }
    
    # -------------------------------------------------------------------------
    # 5. Overall Quality Assessment
    # -------------------------------------------------------------------------
    
    print(f"\n{'Overall Assessment':-^70}")
    
    issues = []
    warnings_list = []
    
    # Check mean change (threshold: 10% of std dev)
    if abs(mean_delta) > pre_std * 0.1:
        pct_change = (mean_delta / pre_mean) * 100
        issues.append(f"Mean shifted by ${mean_delta:+.2f} ({pct_change:+.1f}%)")
        print(f"✗ Mean shifted significantly: ${mean_delta:+.2f}")
    else:
        print(f"✓ Mean preserved (Δ = ${mean_delta:+.2f})")
    
    # Check std change
    if abs(std_delta) > pre_std * 0.1:
        pct_change = (std_delta / pre_std) * 100
        warnings_list.append(f"Std dev changed by {pct_change:+.1f}%")
        print(f"⚠  Std dev changed: ${std_delta:+.2f}")
    else:
        print(f"✓ Std dev preserved (Δ = ${std_delta:+.2f})")
    
    # Check KS test
    if ks_pval > 0.05:
        print(f"✓ Distribution similar (KS p = {ks_pval:.4f})")
    else:
        issues.append(f"Distribution changed (KS p = {ks_pval:.4f})")
        print(f"✗ Distribution differs (KS p = {ks_pval:.4f})")
    
    # Check imputation completeness
    if missing_post == 0:
        print(f"✓ All missing values imputed ({missing_pre} filled)")
    elif missing_post < missing_pre:
        pct_filled = ((missing_pre - missing_post) / missing_pre) * 100
        print(f"✓ Most missing values imputed ({missing_pre - missing_post} filled, {pct_filled:.1f}%)")
    else:
        issues.append(f"Missing values not reduced")
    
    # Determine overall quality
    n_issues = len(issues)
    n_warnings = len(warnings_list)
    
    if n_issues == 0:
        overall = "EXCELLENT ✓"
        quality_score = 5
        message = "Imputation successfully preserved original characteristics!"
    elif n_issues == 1 and n_warnings == 0:
        overall = "GOOD ✓"
        quality_score = 4
        message = "Minor differences detected but generally good preservation."
    elif n_issues == 1 or (n_issues == 0 and n_warnings > 0):
        overall = "FAIR"
        quality_score = 3
        message = "Some differences detected. Review carefully."
    else:
        overall = "POOR ⚠️"
        quality_score = 2
        message = "Significant differences detected. Imputation may have introduced bias."
    
    print(f"\n{'='*70}")
    print(f"OVERALL QUALITY: {overall} (Score: {quality_score}/5)")
    print(f"{message}")
    
    if issues:
        print(f"\nIssues:")
        for issue in issues:
            print(f"  - {issue}")
    
    if warnings_list:
        print(f"\nWarnings:")
        for warning in warnings_list:
            print(f"  - {warning}")
    
    results['overall_assessment'] = {
        'quality': overall,
        'quality_score': quality_score,
        'issues': issues,
        'warnings': warnings_list,
        'message': message
    }
    
    return results


# ============================================================================
# Process All Hotels
# ============================================================================

def validate_all_hotels():
    """Validate imputation for all hotels"""
    
    print("="*70)
    print("IMPUTATION QUALITY VALIDATION - ALL HOTELS")
    print("="*70)
    
    # Find all raw competitor files
    raw_files = list(Path(RAW_DATA_DIR).glob("Hotel_*_competitors.csv"))
    
    print(f"\nFound {len(raw_files)} hotels to validate")
    print(f"Looking for matching files in: {PROCESSED_DATA_DIR}")
    
    all_results = []
    successful = 0
    failed = 0
    
    for raw_file in sorted(raw_files):
        hotel_id = raw_file.stem.replace('_competitors', '')
        
        result = validate_hotel_imputation(hotel_id)
        
        if result:
            all_results.append(result)
            successful += 1
        else:
            failed += 1
    
    print(f"\n{'Validation Summary':-^70}")
    print(f"Successful: {successful}/{len(raw_files)}")
    print(f"Failed: {failed}/{len(raw_files)}")
    
    if not all_results:
        print("\n❌ No hotels validated successfully!")
        return None, None
    
    # -------------------------------------------------------------------------
    # Create Summary Report
    # -------------------------------------------------------------------------
    
    print("\n" + "="*70)
    print("SUMMARY REPORT - ALL HOTELS")
    print("="*70)
    
    # Convert to DataFrame
    summary_df = pd.DataFrame([
        {
            'hotel_id': r['hotel_id'],
            'n_rows': r['n_rows'],
            'n_competitors': r['n_competitors'],
            'pct_missing': r['pct_missing'],
            'mean_delta': r['distribution_stats']['mean_delta'],
            'std_delta': r['distribution_stats']['std_delta'],
            'ks_pvalue': r['distribution_stats']['ks_pvalue'],
            'ks_significant': r['distribution_stats']['ks_significant'],
            'quality_score': r['overall_assessment']['quality_score'],
            'quality': r['overall_assessment']['quality']
        }
        for r in all_results
    ])
    
    # Statistics
    print(f"\nTotal hotels validated: {len(all_results)}")
    print(f"Mean missing data: {summary_df['pct_missing'].mean():.1f}%")
    print(f"Median missing data: {summary_df['pct_missing'].median():.1f}%")
    
    print("\nQuality Distribution:")
    quality_counts = summary_df['quality'].value_counts()
    for quality in ['EXCELLENT ✓', 'GOOD ✓', 'FAIR', 'POOR ⚠️']:
        count = quality_counts.get(quality, 0)
        pct = (count / len(summary_df)) * 100 if len(summary_df) > 0 else 0
        bar = '█' * int(pct / 5)
        print(f"  {quality:<15s}: {count:2d} ({pct:5.1f}%) {bar}")
    
    print("\nMean Delta Distribution:")
    print(f"  Mean of mean deltas: ${summary_df['mean_delta'].mean():.2f}")
    print(f"  Median of mean deltas: ${summary_df['mean_delta'].median():.2f}")
    print(f"  Hotels with negative bias: {(summary_df['mean_delta'] < -10).sum()}")
    print(f"  Hotels with positive bias: {(summary_df['mean_delta'] > 10).sum()}")
    
    print("\nTop 5 Hotels (Best Quality):")
    top5 = summary_df.nlargest(5, 'quality_score')[
        ['hotel_id', 'pct_missing', 'mean_delta', 'ks_pvalue', 'quality']
    ]
    print(top5.to_string(index=False))
    
    if len(summary_df) >= 5:
        print("\nBottom 5 Hotels (Issues Detected):")
        bottom5 = summary_df.nsmallest(5, 'quality_score')[
            ['hotel_id', 'pct_missing', 'mean_delta', 'ks_pvalue', 'quality']
        ]
        print(bottom5.to_string(index=False))
    
    # Save results
    summary_df.to_csv(f'{OUTPUT_DIR}/imputation_validation_summary.csv', 
                     index=False)
    print(f"\n✓ Summary saved: {OUTPUT_DIR}/imputation_validation_summary.csv")
    
    with open(f'{OUTPUT_DIR}/imputation_validation_details.json', 'w') as f:
        json.dump(all_results, f, indent=2, default=str)
    print(f"✓ Details saved: {OUTPUT_DIR}/imputation_validation_details.json")
    
    return summary_df, all_results


# ============================================================================
# MAIN EXECUTION
# ============================================================================

if __name__ == "__main__":
    
    print("\n" + "="*70)
    print("STARTING IMPUTATION VALIDATION")
    print("="*70)
    print(f"\nRaw data directory: {RAW_DATA_DIR}")
    print(f"Processed data directory: {PROCESSED_DATA_DIR}")
    print(f"Output directory: {OUTPUT_DIR}")
    print("\nComparing:")
    print("  PRE:  /raw/Hotel_XX_competitors.csv → 'price' column")
    print("  POST: /processed/Hotel_XX_competitors_matrix_completion.csv → 'competitor_price' column")
    
    # Validate all hotels
    result = validate_all_hotels()
    
    if result is not None and result[0] is not None:
        summary_df, all_results = result
        
        print("\n" + "="*70)
        print("VALIDATION COMPLETE!")
        print("="*70)
        print(f"\nResults saved to: {OUTPUT_DIR}/")
        print("  - imputation_validation_summary.csv (summary table)")
        print("  - imputation_validation_details.json (full details)")
        
        # Print key findings
        n_fair_or_worse = (summary_df['quality_score'] <= 3).sum()
        if n_fair_or_worse > 0:
            print(f"\n⚠️  Note: {n_fair_or_worse} hotels have FAIR or POOR quality")
            print("   Review imputation_validation_summary.csv for details")
    else:
        print("\n" + "="*70)
        print("VALIDATION FAILED!")
        print("="*70)
        print("\n📋 Check that both directories exist and contain matching files")


STARTING IMPUTATION VALIDATION

Raw data directory: ../data/full-data/raw
Processed data directory: ../data/full-data/processed
Output directory: imputation_validation_results

Comparing:
  PRE:  /raw/Hotel_XX_competitors.csv → 'price' column
  POST: /processed/Hotel_XX_competitors_matrix_completion.csv → 'competitor_price' column
IMPUTATION QUALITY VALIDATION - ALL HOTELS

Found 39 hotels to validate
Looking for matching files in: ../data/full-data/processed

Validating: Hotel_01
✓ Loaded pre-imputation: 2915 rows
✓ Loaded post-imputation: 2795 rows
✓ Common dates: 559
✓ Common competitors: 5
✓ Aligned datasets: 2795 rows each

-------------------------Missing Data Summary-------------------------
Total rows: 2,795
Missing before: 859 (30.7%)
Missing after: 0 (0.0%)
Values imputed: 859

-----------------Distribution Preservation (KS Test)------------------
Sample sizes:
  Pre (observed): 1936
  Post (all): 2795

Statistic       Pre             Post            Delta          
--------

✓ Common dates: 680
✓ Common competitors: 8
✓ Aligned datasets: 5440 rows each

-------------------------Missing Data Summary-------------------------
Total rows: 5,440
Missing before: 351 (6.5%)
Missing after: 0 (0.0%)
Values imputed: 351

-----------------Distribution Preservation (KS Test)------------------
Sample sizes:
  Pre (observed): 5089
  Post (all): 5440

Statistic       Pre             Post            Delta          
----------------------------------------------------------------------
Mean            $119.00         $121.19         $         +2.20
Median          $110.00         $113.90         $         +3.90
Std Dev         $31.38          $32.45          $         +1.07
Min             $65.45          $65.45         
Max             $355.00         $355.00        

Kolmogorov-Smirnov Test:
  KS Statistic: 0.0334
  p-value: 0.0055
  Result: Distributions DIFFER ⚠️ (p < 0.05)

--------------------------Overall Assessment--------------------------
✓ Mean preserved (Δ = $+

✓ Loaded pre-imputation: 5080 rows
✓ Loaded post-imputation: 4784 rows
✓ Common dates: 598
✓ Common competitors: 8
✓ Aligned datasets: 4784 rows each

-------------------------Missing Data Summary-------------------------
Total rows: 4,784
Missing before: 1,038 (21.7%)
Missing after: 0 (0.0%)
Values imputed: 1,038

-----------------Distribution Preservation (KS Test)------------------
Sample sizes:
  Pre (observed): 3746
  Post (all): 4784

Statistic       Pre             Post            Delta          
----------------------------------------------------------------------
Mean            $372.82         $374.76         $         +1.94
Median          $348.65         $354.25         $         +5.60
Std Dev         $145.09         $143.19         $         -1.90
Min             $148.70         $148.70        
Max             $1455.30        $1455.30       

Kolmogorov-Smirnov Test:
  KS Statistic: 0.0232
  p-value: 0.2036
  Result: Distributions SIMILAR ✓ (p > 0.05)

-------------------


Kolmogorov-Smirnov Test:
  KS Statistic: 0.0403
  p-value: 0.0002
  Result: Distributions DIFFER ⚠️ (p < 0.05)

--------------------------Overall Assessment--------------------------
✓ Mean preserved (Δ = $+5.27)
✓ Std dev preserved (Δ = $+2.67)
✗ Distribution differs (KS p = 0.0002)
✓ All missing values imputed (570 filled)

OVERALL QUALITY: GOOD ✓ (Score: 4/5)
Minor differences detected but generally good preservation.

Issues:
  - Distribution changed (KS p = 0.0002)

Validating: Hotel_23
✓ Loaded pre-imputation: 7830 rows
✓ Loaded post-imputation: 6810 rows
✓ Common dates: 681
✓ Common competitors: 10
✓ Aligned datasets: 6786 rows each

-------------------------Missing Data Summary-------------------------
Total rows: 6,786
Missing before: 1,247 (18.4%)
Missing after: 0 (0.0%)
Values imputed: 1,247

-----------------Distribution Preservation (KS Test)------------------
Sample sizes:
  Pre (observed): 5539
  Post (all): 6810

Statistic       Pre             Post            Delta   

✓ Loaded pre-imputation: 4800 rows
✓ Loaded post-imputation: 3600 rows
✓ Common dates: 600
✓ Common competitors: 6
✓ Aligned datasets: 3600 rows each

-------------------------Missing Data Summary-------------------------
Total rows: 3,600
Missing before: 2,392 (66.4%)
Missing after: 0 (0.0%)
Values imputed: 2,392

-----------------Distribution Preservation (KS Test)------------------
Sample sizes:
  Pre (observed): 1208
  Post (all): 3600

Statistic       Pre             Post            Delta          
----------------------------------------------------------------------
Mean            $150.60         $147.79         $         -2.81
Median          $123.27         $148.95         $        +25.68
Std Dev         $95.77          $68.22          $        -27.56
Min             $34.50          $34.50         
Max             $775.72         $775.72        

Kolmogorov-Smirnov Test:
  KS Statistic: 0.1800
  p-value: 0.0000
  Result: Distributions DIFFER ⚠️ (p < 0.05)

-------------------

✓ Loaded pre-imputation: 3192 rows
✓ Loaded post-imputation: 3184 rows
✓ Common dates: 398
✓ Common competitors: 8
✓ Aligned datasets: 3184 rows each

-------------------------Missing Data Summary-------------------------
Total rows: 3,184
Missing before: 336 (10.6%)
Missing after: 0 (0.0%)
Values imputed: 336

-----------------Distribution Preservation (KS Test)------------------
Sample sizes:
  Pre (observed): 2848
  Post (all): 3184

Statistic       Pre             Post            Delta          
----------------------------------------------------------------------
Mean            $243.26         $245.87         $         +2.61
Median          $201.14         $206.32         $         +5.18
Std Dev         $123.10         $122.09         $         -1.01
Min             $104.06         $79.25         
Max             $1233.00        $1233.00       

Kolmogorov-Smirnov Test:
  KS Statistic: 0.0224
  p-value: 0.4274
  Result: Distributions SIMILAR ✓ (p > 0.05)

-----------------------


✓ Summary saved: imputation_validation_results/imputation_validation_summary.csv
✓ Details saved: imputation_validation_results/imputation_validation_details.json

VALIDATION COMPLETE!

Results saved to: imputation_validation_results/
  - imputation_validation_summary.csv (summary table)
  - imputation_validation_details.json (full details)

⚠️  Note: 10 hotels have FAIR or POOR quality
   Review imputation_validation_summary.csv for details
