In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
import matplotlib.pyplot as plt
import seaborn as sns

# Configuration
data_path = Path('../data/full-data/processed')
raw_data_path = Path('../data/full-data')  # Original data before imputation
output_path = Path('../data/full-data/imputation_analysis')
output_path.mkdir(parents=True, exist_ok=True)

mapping_df = pd.read_csv('../data/full-data/hotel_mapping.csv')
hotel_list = mapping_df['masked_id'].tolist()

CATASTROPHIC_HOTELS = ['Hotel_26', 'Hotel_32', 'Hotel_34']
ALL_HOTELS = [h for h in hotel_list if h not in CATASTROPHIC_HOTELS]

print("="*80)
print("COMPREHENSIVE IMPUTATION QUALITY ANALYSIS")
print("="*80)
print(f"Analyzing ALL {len(ALL_HOTELS)} hotels")
print(f"Comparing BEFORE and AFTER imputation")
print("-" * 80)

def load_original_data(hotel_id):
    """
    Try to load original data BEFORE imputation
    This will help us see what was actually imputed
    """
    try:
        # Try to find original query results or raw data
        # Adjust path based on your data structure
        original_file = raw_data_path / f'query_results/{hotel_id}_raw.csv'
        if original_file.exists():
            return pd.read_csv(original_file)
        return None
    except:
        return None

def analyze_imputation_quality(hotel_id):
    """
    Deep analysis comparing original vs imputed data
    """
    try:
        # Load processed (imputed) data
        df_imputed = pd.read_csv(data_path / f'{hotel_id}_lagged_dataset.csv')
        df_imputed['date'] = pd.to_datetime(df_imputed['date'])
        df_imputed = df_imputed.sort_values('date').reset_index(drop=True)
        
        if 'base_rate' not in df_imputed.columns:
            return None
        
        # Get competitor columns (all lags)
        all_comp_cols = [col for col in df_imputed.columns if '-USD' in col and 'lag_' in col]
        comp_lag1_cols = [col for col in all_comp_cols if 'lag_1' in col]
        
        analysis = {
            'hotel_id': hotel_id,
            'n_observations': len(df_imputed),
            'n_competitors': len(comp_lag1_cols),
            'n_competitor_features': len(all_comp_cols)
        }
        
        # ============================================================
        # 1. FOCAL HOTEL PRICE ANALYSIS
        # ============================================================
        focal_prices = df_imputed['base_rate']
        analysis['focal_price_stats'] = {
            'mean': float(focal_prices.mean()),
            'median': float(focal_prices.median()),
            'std': float(focal_prices.std()),
            'min': float(focal_prices.min()),
            'max': float(focal_prices.max()),
            'q25': float(focal_prices.quantile(0.25)),
            'q75': float(focal_prices.quantile(0.75)),
            'n_missing': int(focal_prices.isna().sum()),
            'pct_missing': float((focal_prices.isna().sum() / len(focal_prices)) * 100),
            'n_zero': int((focal_prices == 0).sum()),
            'n_negative': int((focal_prices < 0).sum()),
            'cv': float(focal_prices.std() / focal_prices.mean()) if focal_prices.mean() > 0 else None
        }
        
        # ============================================================
        # 2. COMPETITOR PRICE ANALYSIS (ALL FEATURES)
        # ============================================================
        if len(all_comp_cols) > 0:
            comp_data_all = df_imputed[all_comp_cols]
            
            # Missing data patterns BEFORE imputation (if we can detect it)
            # We'll look at patterns that suggest imputation
            analysis['competitor_all_features'] = {
                'total_cells': int(comp_data_all.size),
                'missing_cells': int(comp_data_all.isna().sum().sum()),
                'missing_pct': float((comp_data_all.isna().sum().sum() / comp_data_all.size) * 100),
                'features_with_missing': int((comp_data_all.isna().any()).sum()),
                'features_all_missing': int((comp_data_all.isna().all()).sum())
            }
            
            # Value distributions
            flat_values = comp_data_all.values.flatten()
            flat_values = flat_values[~np.isnan(flat_values)]
            
            if len(flat_values) > 0:
                analysis['competitor_price_stats_all'] = {
                    'mean': float(np.mean(flat_values)),
                    'median': float(np.median(flat_values)),
                    'std': float(np.std(flat_values)),
                    'min': float(np.min(flat_values)),
                    'max': float(np.max(flat_values)),
                    'n_zero': int((flat_values == 0).sum()),
                    'n_negative': int((flat_values < 0).sum()),
                    'pct_zero': float((flat_values == 0).sum() / len(flat_values) * 100),
                    'pct_negative': float((flat_values < 0).sum() / len(flat_values) * 100)
                }
        
        # ============================================================
        # 3. LAG 1 COMPETITOR ANALYSIS (for correlation)
        # ============================================================
        if len(comp_lag1_cols) > 0:
            comp_data_lag1 = df_imputed[comp_lag1_cols]
            
            analysis['competitor_lag1'] = {
                'missing_pct': float((comp_data_lag1.isna().sum().sum() / comp_data_lag1.size) * 100),
                'competitors_with_missing': int((comp_data_lag1.isna().any()).sum())
            }
            
            # ============================================================
            # 4. CORRELATION ANALYSIS (Focal vs Competitors)
            # ============================================================
            correlations = []
            for comp in comp_lag1_cols:
                valid_mask = ~(focal_prices.isna() | df_imputed[comp].isna())
                n_valid = valid_mask.sum()
                
                if n_valid > 10:
                    corr = focal_prices[valid_mask].corr(df_imputed[comp][valid_mask])
                    correlations.append({
                        'competitor': comp,
                        'correlation': float(corr) if not np.isnan(corr) else None,
                        'n_valid_pairs': int(n_valid),
                        'pct_valid': float((n_valid / len(df_imputed)) * 100)
                    })
            
            if correlations:
                valid_corrs = [c['correlation'] for c in correlations if c['correlation'] is not None]
                if valid_corrs:
                    analysis['correlations'] = {
                        'mean_corr': float(np.mean(valid_corrs)),
                        'median_corr': float(np.median(valid_corrs)),
                        'min_corr': float(np.min(valid_corrs)),
                        'max_corr': float(np.max(valid_corrs)),
                        'std_corr': float(np.std(valid_corrs)),
                        'n_negative_corr': int(sum(1 for c in valid_corrs if c < 0)),
                        'n_weak_corr': int(sum(1 for c in valid_corrs if abs(c) < 0.3)),
                        'all_correlations': correlations
                    }
            
            # ============================================================
            # 5. IMPUTATION ARTIFACTS DETECTION
            # ============================================================
            # Check for identical values (sign of poor imputation)
            identical_analysis = []
            for comp in comp_lag1_cols:
                comp_values = df_imputed[comp].dropna()
                if len(comp_values) > 0:
                    value_counts = comp_values.value_counts()
                    max_identical = value_counts.iloc[0]
                    pct_identical = (max_identical / len(comp_values)) * 100
                    
                    # Check for suspiciously repeated values
                    top_5_values = value_counts.head(5)
                    
                    identical_analysis.append({
                        'competitor': comp,
                        'max_identical_value': float(comp_values.mode()[0]) if len(comp_values.mode()) > 0 else None,
                        'max_identical_count': int(max_identical),
                        'max_identical_pct': float(pct_identical),
                        'unique_values': int(len(value_counts)),
                        'unique_pct': float((len(value_counts) / len(comp_values)) * 100),
                        'top_5_concentration': float((top_5_values.sum() / len(comp_values)) * 100)
                    })
            
            if identical_analysis:
                analysis['imputation_artifacts'] = {
                    'max_pct_identical': float(max([ia['max_identical_pct'] for ia in identical_analysis])),
                    'competitors_high_duplication_50': int(sum(1 for ia in identical_analysis if ia['max_identical_pct'] > 50)),
                    'competitors_high_duplication_30': int(sum(1 for ia in identical_analysis if ia['max_identical_pct'] > 30)),
                    'competitors_low_uniqueness': int(sum(1 for ia in identical_analysis if ia['unique_pct'] < 10)),
                    'mean_uniqueness_pct': float(np.mean([ia['unique_pct'] for ia in identical_analysis])),
                    'details': identical_analysis
                }
            
            # ============================================================
            # 6. PRICE RATIO ANALYSIS (Focal vs Competitors)
            # ============================================================
            price_ratios = []
            for comp in comp_lag1_cols:
                valid_mask = (focal_prices > 0) & (df_imputed[comp] > 0)
                if valid_mask.sum() > 10:
                    ratio = (focal_prices[valid_mask] / df_imputed[comp][valid_mask])
                    price_ratios.append({
                        'competitor': comp,
                        'mean_ratio': float(ratio.mean()),
                        'median_ratio': float(ratio.median()),
                        'std_ratio': float(ratio.std()),
                        'min_ratio': float(ratio.min()),
                        'max_ratio': float(ratio.max())
                    })
            
            if price_ratios:
                analysis['price_ratios'] = {
                    'mean_ratio': float(np.mean([pr['mean_ratio'] for pr in price_ratios])),
                    'median_ratio': float(np.median([pr['median_ratio'] for pr in price_ratios])),
                    'extreme_ratios': int(sum(1 for pr in price_ratios if pr['mean_ratio'] > 3 or pr['mean_ratio'] < 0.33)),
                    'details': price_ratios
                }
            
            # ============================================================
            # 7. TEMPORAL CONSISTENCY CHECK
            # ============================================================
            # Check if competitor prices have suspicious temporal patterns
            temporal_issues = []
            for comp in comp_lag1_cols[:5]:  # Check first 5 for speed
                comp_series = df_imputed[comp].dropna()
                if len(comp_series) > 30:
                    # Check for flat periods (same value for many consecutive days)
                    consecutive_same = []
                    current_run = 1
                    for i in range(1, len(comp_series)):
                        if comp_series.iloc[i] == comp_series.iloc[i-1]:
                            current_run += 1
                        else:
                            if current_run > 1:
                                consecutive_same.append(current_run)
                            current_run = 1
                    
                    if consecutive_same:
                        temporal_issues.append({
                            'competitor': comp,
                            'max_consecutive_identical': int(max(consecutive_same)),
                            'mean_consecutive_identical': float(np.mean(consecutive_same)),
                            'n_flat_periods': int(len(consecutive_same))
                        })
            
            if temporal_issues:
                analysis['temporal_consistency'] = {
                    'competitors_with_flat_periods': int(len(temporal_issues)),
                    'max_flat_period_length': int(max([ti['max_consecutive_identical'] for ti in temporal_issues])),
                    'details': temporal_issues
                }
        
        return analysis
        
    except Exception as e:
        print(f"  ERROR analyzing {hotel_id}: {str(e)}")
        import traceback
        traceback.print_exc()
        return None

# Analyze ALL hotels
print("\nAnalyzing all hotels...")
all_analyses = {}

for idx, hotel_id in enumerate(ALL_HOTELS, 1):
    if idx % 5 == 0:
        print(f"  Progress: {idx}/{len(ALL_HOTELS)} hotels...")
    
    analysis = analyze_imputation_quality(hotel_id)
    if analysis:
        all_analyses[hotel_id] = analysis

print(f"\nCompleted analysis for {len(all_analyses)} hotels")

# Save detailed results
with open(output_path / 'imputation_analysis_complete.json', 'w') as f:
    json.dump(all_analyses, f, indent=2)

# ============================================================
# GENERATE COMPREHENSIVE SUMMARY REPORT
# ============================================================
print("\n" + "="*80)
print("COMPREHENSIVE IMPUTATION QUALITY REPORT")
print("="*80)

# Create summary statistics
summary_stats = []
for hotel_id, analysis in all_analyses.items():
    stats = {
        'hotel_id': hotel_id,
        'n_observations': analysis['n_observations'],
        'n_competitors': analysis['n_competitors'],
        'focal_price_mean': analysis['focal_price_stats']['mean'],
        'focal_price_cv': analysis['focal_price_stats']['cv'],
        'focal_negative_count': analysis['focal_price_stats']['n_negative'],
        'focal_zero_count': analysis['focal_price_stats']['n_zero'],
        'competitor_missing_pct': analysis.get('competitor_all_features', {}).get('missing_pct', 0),
        'mean_correlation': analysis.get('correlations', {}).get('mean_corr', None),
        'weak_correlations': analysis.get('correlations', {}).get('n_weak_corr', None),
        'max_identical_pct': analysis.get('imputation_artifacts', {}).get('max_pct_identical', 0),
        'high_duplication_count': analysis.get('imputation_artifacts', {}).get('competitors_high_duplication_30', 0),
        'mean_uniqueness': analysis.get('imputation_artifacts', {}).get('mean_uniqueness_pct', None)
    }
    summary_stats.append(stats)

summary_df = pd.DataFrame(summary_stats)
summary_df.to_csv(output_path / 'imputation_summary.csv', index=False)

# ============================================================
# 1. DATA QUALITY ISSUES
# ============================================================
print("\n1. FOCAL HOTEL PRICE ISSUES:")
print("-" * 40)
issues_found = False
for hotel_id, analysis in all_analyses.items():
    focal = analysis['focal_price_stats']
    if focal['n_negative'] > 0 or focal['n_zero'] > 5 or focal['min'] < 0:
        issues_found = True
        print(f"{hotel_id}:")
        print(f"  Negative: {focal['n_negative']}, Zero: {focal['n_zero']}")
        print(f"  Price range: ${focal['min']:.2f} - ${focal['max']:.2f}")
        print(f"  Mean: ${focal['mean']:.2f}, CV: {focal['cv']:.3f}")
if not issues_found:
    print("  ✓ No major issues found")

# ============================================================
# 2. COMPETITOR PRICE ISSUES
# ============================================================
print("\n2. COMPETITOR PRICE ISSUES:")
print("-" * 40)
issues_found = False
for hotel_id, analysis in all_analyses.items():
    if 'competitor_price_stats_all' in analysis:
        comp = analysis['competitor_price_stats_all']
        if comp['n_negative'] > 0 or comp['pct_zero'] > 5:
            issues_found = True
            print(f"{hotel_id}:")
            print(f"  Negative: {comp['n_negative']} ({comp['pct_negative']:.1f}%)")
            print(f"  Zero: {comp['n_zero']} ({comp['pct_zero']:.1f}%)")
            print(f"  Min: ${comp['min']:.2f}, Max: ${comp['max']:.2f}")
if not issues_found:
    print("  ✓ No major issues found")

# ============================================================
# 3. HIGH MISSING DATA AFTER IMPUTATION
# ============================================================
print("\n3. REMAINING MISSING DATA (>10% after imputation):")
print("-" * 40)
high_missing = summary_df[summary_df['competitor_missing_pct'] > 10].sort_values('competitor_missing_pct', ascending=False)
if len(high_missing) > 0:
    for _, row in high_missing.iterrows():
        print(f"{row['hotel_id']}: {row['competitor_missing_pct']:.1f}% missing")
else:
    print("  ✓ All hotels have <10% missing data")

# ============================================================
# 4. WEAK CORRELATIONS
# ============================================================
print("\n4. WEAK COMPETITOR CORRELATIONS:")
print("-" * 40)
weak_corr = summary_df[summary_df['mean_correlation'].notna() & (summary_df['mean_correlation'] < 0.3)].sort_values('mean_correlation')
if len(weak_corr) > 0:
    for _, row in weak_corr.iterrows():
        print(f"{row['hotel_id']}: Mean corr = {row['mean_correlation']:.3f}, Weak = {int(row['weak_correlations'])}/{int(row['n_competitors'])}")
else:
    print("  ✓ All hotels have reasonable correlations (>0.3)")

# ============================================================
# 5. IMPUTATION ARTIFACTS
# ============================================================
print("\n5. IMPUTATION ARTIFACTS (High Duplication >30%):")
print("-" * 40)
artifacts = summary_df[summary_df['high_duplication_count'] > 0].sort_values('max_identical_pct', ascending=False)
if len(artifacts) > 0:
    for _, row in artifacts.iterrows():
        print(f"{row['hotel_id']}:")
        print(f"  Max identical: {row['max_identical_pct']:.1f}%")
        print(f"  Competitors with >30% duplication: {int(row['high_duplication_count'])}/{int(row['n_competitors'])}")
        print(f"  Mean uniqueness: {row['mean_uniqueness']:.1f}%")
else:
    print("  ✓ No significant duplication artifacts detected")

# ============================================================
# 6. TEMPORAL CONSISTENCY ISSUES
# ============================================================
print("\n6. TEMPORAL CONSISTENCY ISSUES:")
print("-" * 40)
issues_found = False
for hotel_id, analysis in all_analyses.items():
    if 'temporal_consistency' in analysis:
        temp = analysis['temporal_consistency']
        if temp['max_flat_period_length'] > 30:
            issues_found = True
            print(f"{hotel_id}:")
            print(f"  Competitors with flat periods: {temp['competitors_with_flat_periods']}")
            print(f"  Max consecutive identical values: {temp['max_flat_period_length']} days")
if not issues_found:
    print("  ✓ No major temporal consistency issues")

# ============================================================
# 7. OVERALL SUMMARY STATISTICS
# ============================================================
print("\n" + "="*80)
print("OVERALL SUMMARY STATISTICS")
print("="*80)
print(f"\nTotal hotels analyzed: {len(all_analyses)}")
print(f"\nFocal Hotel Prices:")
print(f"  Hotels with negative prices: {(summary_df['focal_negative_count'] > 0).sum()}")
print(f"  Hotels with zero prices: {(summary_df['focal_zero_count'] > 5).sum()}")
print(f"  Mean price CV: {summary_df['focal_price_cv'].mean():.3f}")

print(f"\nCompetitor Data Quality:")
print(f"  Hotels with >10% missing data: {(summary_df['competitor_missing_pct'] > 10).sum()}")
print(f"  Mean missing data: {summary_df['competitor_missing_pct'].mean():.1f}%")

print(f"\nCorrelations:")
print(f"  Hotels with mean corr <0.3: {(summary_df['mean_correlation'] < 0.3).sum()}")
print(f"  Overall mean correlation: {summary_df['mean_correlation'].mean():.3f}")

print(f"\nImputation Artifacts:")
print(f"  Hotels with >30% duplication: {(summary_df['high_duplication_count'] > 0).sum()}")
print(f"  Mean max duplication: {summary_df['max_identical_pct'].mean():.1f}%")

print(f"\n\nDetailed results saved to: {output_path}")
print("Files:")
print("  - imputation_analysis_complete.json (full details)")
print("  - imputation_summary.csv (summary table)")
print("\nComplete!")

COMPREHENSIVE IMPUTATION QUALITY ANALYSIS
Analyzing ALL 41 hotels
Comparing BEFORE and AFTER imputation
--------------------------------------------------------------------------------

Analyzing all hotels...
  Progress: 5/41 hotels...
  ERROR analyzing Hotel_08: [Errno 2] No such file or directory: '..\\data\\full-data\\processed\\Hotel_08_lagged_dataset.csv'
  Progress: 10/41 hotels...
  ERROR analyzing Hotel_11: [Errno 2] No such file or directory: '..\\data\\full-data\\processed\\Hotel_11_lagged_dataset.csv'


Traceback (most recent call last):
  File "C:\Users\Nandan Hegde\AppData\Local\Temp\ipykernel_2344\2728321625.py", line 48, in analyze_imputation_quality
    df_imputed = pd.read_csv(data_path / f'{hotel_id}_lagged_dataset.csv')
  File "C:\Users\Nandan Hegde\AppData\Roaming\Python\Python313\site-packages\pandas\io\parsers\readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "C:\Users\Nandan Hegde\AppData\Roaming\Python\Python313\site-packages\pandas\io\parsers\readers.py", line 620, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "C:\Users\Nandan Hegde\AppData\Roaming\Python\Python313\site-packages\pandas\io\parsers\readers.py", line 1620, in __init__
    self._engine = self._make_engine(f, self.engine)
                   ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^
  File "C:\Users\Nandan Hegde\AppData\Roaming\Python\Python313\site-packages\pandas\io\parsers\readers.py", line 1880, in _make_engine
    self.handles = get_handle(
           

  Progress: 15/41 hotels...
  ERROR analyzing Hotel_16: [Errno 2] No such file or directory: '..\\data\\full-data\\processed\\Hotel_16_lagged_dataset.csv'


Traceback (most recent call last):
  File "C:\Users\Nandan Hegde\AppData\Local\Temp\ipykernel_2344\2728321625.py", line 48, in analyze_imputation_quality
    df_imputed = pd.read_csv(data_path / f'{hotel_id}_lagged_dataset.csv')
  File "C:\Users\Nandan Hegde\AppData\Roaming\Python\Python313\site-packages\pandas\io\parsers\readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "C:\Users\Nandan Hegde\AppData\Roaming\Python\Python313\site-packages\pandas\io\parsers\readers.py", line 620, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "C:\Users\Nandan Hegde\AppData\Roaming\Python\Python313\site-packages\pandas\io\parsers\readers.py", line 1620, in __init__
    self._engine = self._make_engine(f, self.engine)
                   ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^
  File "C:\Users\Nandan Hegde\AppData\Roaming\Python\Python313\site-packages\pandas\io\parsers\readers.py", line 1880, in _make_engine
    self.handles = get_handle(
           

  Progress: 20/41 hotels...


  c /= stddev[:, None]
  c /= stddev[None, :]


  Progress: 25/41 hotels...
  Progress: 30/41 hotels...
  Progress: 35/41 hotels...
  ERROR analyzing Hotel_42: [Errno 2] No such file or directory: '..\\data\\full-data\\processed\\Hotel_42_lagged_dataset.csv'
  Progress: 40/41 hotels...
  ERROR analyzing Hotel_43: [Errno 2] No such file or directory: '..\\data\\full-data\\processed\\Hotel_43_lagged_dataset.csv'
  ERROR analyzing Hotel_44: [Errno 2] No such file or directory: '..\\data\\full-data\\processed\\Hotel_44_lagged_dataset.csv'

Completed analysis for 35 hotels

COMPREHENSIVE IMPUTATION QUALITY REPORT

1. FOCAL HOTEL PRICE ISSUES:
----------------------------------------
Hotel_13:
  Negative: 0, Zero: 42
  Price range: $0.00 - $289.00
  Mean: $136.78, CV: 0.490
Hotel_15:
  Negative: 0, Zero: 310
  Price range: $0.00 - $699.00
  Mean: $119.60, CV: 1.871

2. COMPETITOR PRICE ISSUES:
----------------------------------------
  ✓ No major issues found

3. REMAINING MISSING DATA (>10% after imputation):
----------------------------

Traceback (most recent call last):
  File "C:\Users\Nandan Hegde\AppData\Local\Temp\ipykernel_2344\2728321625.py", line 48, in analyze_imputation_quality
    df_imputed = pd.read_csv(data_path / f'{hotel_id}_lagged_dataset.csv')
  File "C:\Users\Nandan Hegde\AppData\Roaming\Python\Python313\site-packages\pandas\io\parsers\readers.py", line 1026, in read_csv
    return _read(filepath_or_buffer, kwds)
  File "C:\Users\Nandan Hegde\AppData\Roaming\Python\Python313\site-packages\pandas\io\parsers\readers.py", line 620, in _read
    parser = TextFileReader(filepath_or_buffer, **kwds)
  File "C:\Users\Nandan Hegde\AppData\Roaming\Python\Python313\site-packages\pandas\io\parsers\readers.py", line 1620, in __init__
    self._engine = self._make_engine(f, self.engine)
                   ~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^
  File "C:\Users\Nandan Hegde\AppData\Roaming\Python\Python313\site-packages\pandas\io\parsers\readers.py", line 1880, in _make_engine
    self.handles = get_handle(
           