In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Hotels that had inf MAPE in the original run; they are flagged in the
# summary table and inspected in detail further down.
problem_hotels = ['Hotel_13', 'Hotel_15', 'Hotel_32', 'Hotel_34']

# Accumulator: the per-hotel loop appends one summary dict per hotel here.
diagnostic_results = []

# Directory the per-hotel "<masked_id>_lagged_dataset.csv" files are read from.
data_path = Path('../data/full-data/processed')

# The mapping file provides the masked hotel ids to iterate over.
mapping_df = pd.read_csv('../data/full-data/hotel_mapping.csv')
hotel_list = mapping_df['masked_id'].to_list()

print("DIAGNOSING PRICE DATA QUALITY")
print("=" * 80)

# Column order of the per-hotel summary table. It is passed explicitly to the
# DataFrame constructor so the table keeps its schema even when no hotel
# produced a row (every file missing / no 'base_rate' column); without it an
# empty result would have no columns and later lookups such as
# diag_df['had_inf_mape'] would raise KeyError.
diagnostic_columns = [
    'hotel_id',
    'had_inf_mape',
    'n_observations',
    'min_price',
    'max_price',
    'mean_price',
    'median_price',
    'std_price',
    'cv_percent',
    'zero_count',
    'near_zero_count',
    'very_low_count',
    'outliers_low',
    'outliers_high',
]

# Build one row of price statistics for every hotel in the mapping that has a
# lagged dataset on disk with a 'base_rate' column.
for hotel_masked_id in hotel_list:
    lagged_file = data_path / f'{hotel_masked_id}_lagged_dataset.csv'

    # Hotels without a dataset file are left out of the summary.
    if not lagged_file.exists():
        continue

    df = pd.read_csv(lagged_file)

    # Nothing to diagnose without the price column.
    if 'base_rate' not in df.columns:
        continue

    base_rate = df['base_rate']

    # Calculate statistics
    min_price = base_rate.min()
    max_price = base_rate.max()
    mean_price = base_rate.mean()
    median_price = base_rate.median()
    std_price = base_rate.std()

    # Count problematic values, bucketed as: exactly 0, (0, 1), [1, 10)
    zero_count = (base_rate == 0).sum()
    near_zero_count = ((base_rate > 0) & (base_rate < 1)).sum()
    very_low_count = ((base_rate >= 1) & (base_rate < 10)).sum()

    # Price volatility: coefficient of variation in percent. Falls back to 0
    # when the mean is not strictly positive (this also covers a NaN mean).
    cv = (std_price / mean_price * 100) if mean_price > 0 else 0

    # Check for outliers using the 1.5 * IQR fences around the quartiles
    q1, q3 = base_rate.quantile([0.25, 0.75])
    iqr = q3 - q1
    outlier_low = (base_rate < (q1 - 1.5 * iqr)).sum()
    outlier_high = (base_rate > (q3 + 1.5 * iqr)).sum()

    is_problem = hotel_masked_id in problem_hotels

    diagnostic_results.append({
        'hotel_id': hotel_masked_id,
        'had_inf_mape': 'YES' if is_problem else 'NO',
        'n_observations': len(base_rate),
        'min_price': min_price,
        'max_price': max_price,
        'mean_price': mean_price,
        'median_price': median_price,
        'std_price': std_price,
        'cv_percent': cv,
        'zero_count': zero_count,
        'near_zero_count': near_zero_count,
        'very_low_count': very_low_count,
        'outliers_low': outlier_low,
        'outliers_high': outlier_high
    })

diag_df = pd.DataFrame(diagnostic_results, columns=diagnostic_columns)

# Columns displayed in both comparison tables.
summary_cols = ['hotel_id', 'min_price', 'mean_price', 'zero_count', 'near_zero_count', 'very_low_count']

# Table 1: rows flagged 'YES' in had_inf_mape.
print("\n1. PROBLEM HOTELS (Had inf MAPE)")
print("-" * 80)
problem_df = diag_df.loc[diag_df['had_inf_mape'].eq('YES')]
print(problem_df[summary_cols])

# Table 2: a fixed pair of reference hotels, shown with the same columns.
print("\n2. GOOD HOTELS (Low MAPE) - For Comparison")
print("-" * 80)
good_hotels = ['Hotel_09', 'Hotel_10']
good_df = diag_df.loc[diag_df['hotel_id'].isin(good_hotels)]
print(good_df[summary_cols])

print("\n3. DETAILED PRICE DISTRIBUTION - PROBLEM HOTELS")
print("="*80)

# For each problem hotel, print date coverage, summary statistics, counts of
# suspicious prices, percentiles, and the lowest prices (< $10) with dates.
for hotel_id in problem_hotels:
    lagged_file = data_path / f'{hotel_id}_lagged_dataset.csv'

    if not lagged_file.exists():
        print(f"\n{hotel_id}: File not found")
        continue

    df = pd.read_csv(lagged_file)

    # Validate the schema before touching any column: parsing df['date'] on a
    # file without that column would raise KeyError, and a missing 'base_rate'
    # should be reported like a missing file rather than skipped silently.
    missing_cols = [col for col in ('date', 'base_rate') if col not in df.columns]
    if missing_cols:
        print(f"\n{hotel_id}: Missing column(s): {', '.join(missing_cols)}")
        continue

    # An empty dataset would only yield NaN statistics; report it instead.
    if df.empty:
        print(f"\n{hotel_id}: No observations")
        continue

    df['date'] = pd.to_datetime(df['date'])
    base_rate = df['base_rate']
    n_obs = len(base_rate)
    n_zero = (base_rate == 0).sum()

    print(f"\n{hotel_id}")
    print("-"*80)
    print(f"Total observations: {n_obs}")
    print(f"Date range: {df['date'].min()} to {df['date'].max()}")
    print("\nPrice Statistics:")
    print(f"  Min: ${base_rate.min():.2f}")
    print(f"  Max: ${base_rate.max():.2f}")
    print(f"  Mean: ${base_rate.mean():.2f}")
    print(f"  Median: ${base_rate.median():.2f}")
    print(f"  Std: ${base_rate.std():.2f}")

    # Buckets: exactly 0, (0, 1), [1, 10), [10, 50)
    print("\nProblematic Values:")
    print(f"  Exactly zero: {n_zero} ({n_zero/n_obs*100:.1f}%)")
    print(f"  Between 0-1: {((base_rate > 0) & (base_rate < 1)).sum()}")
    print(f"  Between 1-10: {((base_rate >= 1) & (base_rate < 10)).sum()}")
    print(f"  Between 10-50: {((base_rate >= 10) & (base_rate < 50)).sum()}")

    print("\nPrice Percentiles:")
    percentile_levels = (
        ("1st", 0.01),
        ("5th", 0.05),
        ("25th", 0.25),
        ("50th", 0.50),
        ("75th", 0.75),
        ("95th", 0.95),
        ("99th", 0.99),
    )
    for label, level in percentile_levels:
        print(f"  {label} percentile: ${base_rate.quantile(level):.2f}")

    # Show actual zero/near-zero values with dates, lowest prices first
    problematic = df.loc[base_rate < 10, ['date', 'base_rate']].sort_values('base_rate')
    if len(problematic) > 0:
        print("\nActual problematic prices (< $10):")
        print(problematic.head(10))

print(f"\n{'=' * 80}")
print("SUMMARY OF FINDINGS")
print("=" * 80)

# Hotels with at least one price that is exactly zero.
print("\nHotels with zero prices:")
zero_hotels = diag_df.loc[diag_df['zero_count'].gt(0)]
n_zero_hotels = len(zero_hotels)
print(f"Count: {n_zero_hotels}")
if n_zero_hotels:
    print(zero_hotels[['hotel_id', 'zero_count', 'mean_price']])

# Hotels with at least one price strictly between 0 and 1.
print("\nHotels with prices < $1:")
near_zero_hotels = diag_df.loc[diag_df['near_zero_count'].gt(0)]
n_near_zero_hotels = len(near_zero_hotels)
print(f"Count: {n_near_zero_hotels}")
if n_near_zero_hotels:
    print(near_zero_hotels[['hotel_id', 'near_zero_count', 'mean_price']])

# Ten most volatile hotels among those whose coefficient of variation
# exceeds 50%, highest first.
print("\nPrice volatility (CV > 50% is high):")
volatile_mask = diag_df['cv_percent'] > 50
high_volatility = diag_df[volatile_mask].sort_values(by='cv_percent', ascending=False)
top_volatile = high_volatility[['hotel_id', 'mean_price', 'cv_percent']].head(10)
print(top_volatile)

# Persist the full per-hotel table next to the processed datasets.
summary_file = data_path / 'price_diagnostic_summary.csv'
diag_df.to_csv(summary_file, index=False)
print("\n✓ Full diagnostic saved to: price_diagnostic_summary.csv")

DIAGNOSING PRICE DATA QUALITY

1. PROBLEM HOTELS (Had inf MAPE)
--------------------------------------------------------------------------------
    hotel_id  min_price  mean_price  zero_count  near_zero_count  \
10  Hotel_13        0.0  136.778862          42                0   
12  Hotel_15        0.0  119.600000         310                0   
28  Hotel_32        0.0  360.080402         256                0   
30  Hotel_34        0.0  329.587065         184                0   

    very_low_count  
10               0  
12               0  
28               0  
30               0  

2. GOOD HOTELS (Low MAPE) - For Comparison
--------------------------------------------------------------------------------
   hotel_id  min_price  mean_price  zero_count  near_zero_count  \
7  Hotel_09      500.0  568.974359           0                0   
8  Hotel_10      150.0  164.182724           0                0   

   very_low_count  
7               0  
8               0  

3. DETAILED PRICE DIS