In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# Hotels flagged for investigation (the banner below describes them as R² < 0.15)
POOR_PERFORMERS = [
    'Hotel_01', 'Hotel_14', 'Hotel_17', 'Hotel_24',
    'Hotel_07', 'Hotel_13', 'Hotel_21', 'Hotel_23',
    'Hotel_31', 'Hotel_33', 'Hotel_36',
]

# Input directory with the per-hotel lagged datasets, and the directory that
# receives the diagnostic summary (created on first run if it is missing)
data_path = Path('../data/full-data/processed')
output_path = Path('../data/full-data/diagnostics')
output_path.mkdir(parents=True, exist_ok=True)

# Report banner, one item per output line
print(
    "="*80,
    "DIAGNOSTIC ANALYSIS: POOR PERFORMING HOTELS",
    "="*80,
    f"Investigating {len(POOR_PERFORMERS)} hotels with R² < 0.15",
    "-" * 80,
    sep="\n",
)

def analyze_hotel(hotel_id, data_dir=None):
    """Print a diagnostic report for one hotel and return its summary metrics.

    Reads ``<hotel_id>_lagged_dataset.csv``, sorts it by ``date`` and prints
    eight report sections: basic ``base_rate`` statistics, deviation from a
    30-row rolling mean, competitor correlation / missingness / volatility,
    monthly and day-of-week price ranges, quarter-to-quarter jumps,
    ``|z| > 3`` outliers, the list of flagged issues, and recommendations.

    Competitor columns are those whose name contains both ``'-USD'`` and
    ``'lag_1'``. Correlations that come back as NaN (for example a constant
    or entirely missing competitor column) are left out of the correlation
    summary so that one unusable column cannot turn the mean into NaN and
    silently disable the weak-correlation flag.

    Args:
        hotel_id: Identifier used to build the CSV file name.
        data_dir: Directory holding the CSV. When ``None`` (the default) the
            module-level ``data_path`` is used.

    Returns:
        A dict with the keys ``hotel_id``, ``n_obs``, ``price_cv``,
        ``trend_change``, ``mean_competitor_corr``, ``mean_missing_rate``,
        ``seasonal_range``, ``max_quarter_jump``, ``outlier_rate`` and
        ``issues`` (issue texts joined by ``', '``, or ``'None'``).
        ``mean_competitor_corr`` is ``None`` when no correlation could be
        computed; ``mean_missing_rate`` is ``None`` when there are no
        competitor columns. Returns ``None`` when the file cannot be loaded,
        contains no rows, or any analysis step raises.
    """
    # Flagging thresholds, expressed as fractions (0.20 == 20%).
    strong_trend = 0.20
    weak_correlation = 0.3
    high_missing = 0.30
    weak_seasonality = 0.10
    strong_seasonality = 0.20
    structural_break = 0.30
    high_outlier_rate = 0.05

    print(f"\n{'='*80}")
    print(f"HOTEL: {hotel_id}")
    print(f"{'='*80}")

    # Deliberately broad: one broken hotel file must not abort the whole run.
    try:
        source_dir = data_path if data_dir is None else Path(data_dir)
        df = pd.read_csv(source_dir / f'{hotel_id}_lagged_dataset.csv')
        if df.empty:
            # Every statistic below would be NaN; report it like a load failure.
            print("ERROR: dataset contains no observations")
            return None

        df['date'] = pd.to_datetime(df['date'])
        df = df.sort_values('date').reset_index(drop=True)

        rate = df['base_rate']
        n_obs = len(df)
        price_cv = rate.std() / rate.mean()

        # Basic info
        print("\n1. BASIC STATISTICS:")
        print(f"   Total observations: {n_obs}")
        print(f"   Date range: {df['date'].min()} to {df['date'].max()}")
        print(f"   Base rate range: ${rate.min():.2f} - ${rate.max():.2f}")
        print(f"   Base rate mean: ${rate.mean():.2f}")
        print(f"   Base rate std: ${rate.std():.2f}")
        print(f"   Coefficient of variation: {price_cv:.2%}")

        # Relative deviation from a 30-row rolling mean (rows, not calendar days)
        trend = rate.rolling(window=30, min_periods=1).mean()
        abs_deviation = ((rate - trend) / trend).abs()

        print("\n2. PRICE VOLATILITY:")
        print(f"   Mean absolute deviation from trend: {abs_deviation.mean():.2%}")
        print(f"   Max deviation from trend: {abs_deviation.max():.2%}")

        # Strong trend detection: compare the mean of the two halves
        midpoint = n_obs // 2
        first_half_mean = rate.iloc[:midpoint].mean()
        second_half_mean = rate.iloc[midpoint:].mean()
        trend_change = (second_half_mean - first_half_mean) / first_half_mean
        has_strong_trend = abs(trend_change) > strong_trend
        print(f"   Price change (1st half vs 2nd half): {trend_change:.2%}")

        if has_strong_trend:
            print(f"   ⚠️ STRONG TREND DETECTED: {trend_change:.2%}")

        # Competitor analysis
        competitor_cols = [col for col in df.columns if '-USD' in col and 'lag_1' in col]
        print("\n3. COMPETITOR ANALYSIS:")
        print(f"   Number of competitors: {len(competitor_cols)}")

        mean_corr = None
        mean_missing = None

        if competitor_cols:
            # Keep only defined correlations; NaN means the pair had no
            # usable variation or overlap.
            correlations = [rate.corr(df[comp]) for comp in competitor_cols]
            valid_correlations = [c for c in correlations if pd.notna(c)]

            print("   Correlation with competitors:")
            if valid_correlations:
                mean_corr = np.mean(valid_correlations)
                print(f"     Mean: {mean_corr:.3f}")
                print(f"     Max: {np.max(valid_correlations):.3f}")
                print(f"     Min: {np.min(valid_correlations):.3f}")
            else:
                print("     Not computable (competitor series are constant or empty)")

            if mean_corr is not None and mean_corr < weak_correlation:
                print("   ⚠️ WEAK CORRELATION WITH COMPETITORS")

            # Missing data in competitors (fraction of NaN rows per column)
            missing_rates = [df[comp].isna().mean() for comp in competitor_cols]
            mean_missing = np.mean(missing_rates)

            print("   Missing data in competitors:")
            print(f"     Mean: {mean_missing:.1%}")
            print(f"     Max: {np.max(missing_rates):.1%}")

            if mean_missing > high_missing:
                print("   ⚠️ HIGH MISSING DATA IN COMPETITORS")

            # Competitor price volatility, skipping columns with <= 10 values
            comp_volatility = [
                df[comp].std() / df[comp].mean()
                for comp in competitor_cols
                if df[comp].notna().sum() > 10
            ]

            if comp_volatility:
                print("   Competitor price volatility (CV):")
                print(f"     Mean: {np.mean(comp_volatility):.2%}")
                print(f"     Max: {np.max(comp_volatility):.2%}")

        has_weak_corr = mean_corr is not None and mean_corr < weak_correlation
        has_high_missing = mean_missing is not None and mean_missing > high_missing

        # Seasonality check: spread of group means relative to their average
        print("\n4. SEASONALITY PATTERNS:")
        monthly_means = rate.groupby(df['date'].dt.month).mean()
        seasonal_range = (monthly_means.max() - monthly_means.min()) / monthly_means.mean()
        print(f"   Seasonal price range: {seasonal_range:.1%}")

        if seasonal_range < weak_seasonality:
            print("   ⚠️ WEAK SEASONALITY")

        weekly_means = rate.groupby(df['date'].dt.dayofweek).mean()
        weekly_range = (weekly_means.max() - weekly_means.min()) / weekly_means.mean()
        print(f"   Weekly price range: {weekly_range:.1%}")

        # Structural breaks: four consecutive chunks, the last one absorbing
        # the remainder rows
        print("\n5. STRUCTURAL BREAKS:")
        quarter_size = n_obs // 4
        bounds = [i * quarter_size for i in range(4)] + [n_obs]
        quarter_means = [rate.iloc[lo:hi].mean() for lo, hi in zip(bounds, bounds[1:])]

        max_jump = max(
            abs(later - earlier) / earlier
            for earlier, later in zip(quarter_means, quarter_means[1:])
        )
        print(f"   Max quarter-to-quarter change: {max_jump:.2%}")

        if max_jump > structural_break:
            print("   ⚠️ POSSIBLE STRUCTURAL BREAK")

        # Outliers by z-score against the whole-series mean and std
        print("\n6. OUTLIERS:")
        z_scores = np.abs((rate - rate.mean()) / rate.std())
        outliers = (z_scores > 3).sum()
        outlier_rate = outliers / n_obs
        print(f"   Number of outliers (|z| > 3): {outliers}")
        print(f"   Percentage: {outlier_rate:.1%}")

        if outlier_rate > high_outlier_rate:
            print("   ⚠️ HIGH OUTLIER RATE")

        # Summary diagnosis
        print("\n7. DIAGNOSIS SUMMARY:")
        issue_checks = [
            (has_strong_trend, "Strong trend not captured by 30-day detrending"),
            (has_weak_corr, "Weak correlation with competitors - independent pricing"),
            (has_high_missing, "High missing competitor data - imputation quality"),
            (seasonal_range < weak_seasonality, "Weak seasonality - temporal features less useful"),
            (max_jump > structural_break, "Structural break - pricing strategy changed"),
            (outlier_rate > high_outlier_rate, "Many outliers - data quality or special events"),
        ]
        issues = [text for flagged, text in issue_checks if flagged]

        if issues:
            print("   Issues identified:")
            for i, issue in enumerate(issues, 1):
                print(f"     {i}. {issue}")
        else:
            print("   No clear issues identified - may need different features")

        # Recommendations
        print("\n8. RECOMMENDATIONS:")

        if has_strong_trend:
            print("   • Try longer detrending window (60 or 90 days)")
            print("   • Consider polynomial detrending")

        if has_weak_corr:
            print("   • Add demand-side features (occupancy, booking pace)")
            print("   • Add autoregressive terms (own historical prices)")
            print("   • Consider event/holiday data")

        if seasonal_range > strong_seasonality:
            print("   • Strong seasonality - ensure cyclic encoding is working")
            print("   • Consider holiday-specific features")

        if max_jump > structural_break:
            print("   • Split data before/after break and model separately")
            print("   • Add time period dummy variables")

        return {
            'hotel_id': hotel_id,
            'n_obs': n_obs,
            'price_cv': price_cv,
            'trend_change': trend_change,
            'mean_competitor_corr': mean_corr,
            'mean_missing_rate': mean_missing,
            'seasonal_range': seasonal_range,
            'max_quarter_jump': max_jump,
            'outlier_rate': outlier_rate,
            'issues': ', '.join(issues) if issues else 'None'
        }

    except Exception as e:
        print(f"ERROR: {str(e)}")
        return None

# Run the diagnostic for every flagged hotel; hotels whose analysis returned
# nothing (a load or analysis failure) are left out of the summary
results = []
for hotel_id in POOR_PERFORMERS:
    result = analyze_hotel(hotel_id)
    if not result:
        continue
    results.append(result)

# Write the per-hotel metrics to CSV and tally how often each issue occurs
if results:
    from collections import Counter

    summary_df = pd.DataFrame(results)
    summary_df.to_csv(output_path / 'poor_performers_diagnosis.csv', index=False)

    print(f"\n{'='*80}")
    print("OVERALL SUMMARY")
    print(f"{'='*80}")
    print(f"\nDiagnostics saved to: {output_path / 'poor_performers_diagnosis.csv'}")

    print(f"\nCommon Issues Across Hotels:")
    # 'issues' is a ', '-joined string (or the literal 'None'); split it back
    # into the individual issue texts before counting
    all_issues = [
        issue
        for r in results
        if r['issues'] != 'None'
        for issue in r['issues'].split(', ')
    ]

    issue_counts = Counter(all_issues)
    for issue, count in issue_counts.most_common():
        print(f"  • {issue}: {count} hotels")

print("\nComplete")

DIAGNOSTIC ANALYSIS: POOR PERFORMING HOTELS
Investigating 11 hotels with R² < 0.15
--------------------------------------------------------------------------------

HOTEL: Hotel_01

1. BASIC STATISTICS:
   Total observations: 554
   Date range: 2025-04-01 00:00:00 to 2026-10-06 00:00:00
   Base rate range: $169.00 - $589.00
   Base rate mean: $266.47
   Base rate std: $95.13
   Coefficient of variation: 35.70%

2. PRICE VOLATILITY:
   Mean absolute deviation from trend: 10.64%
   Max deviation from trend: 79.73%
   Price change (1st half vs 2nd half): -28.39%
   ⚠️ STRONG TREND DETECTED: -28.39%

3. COMPETITOR ANALYSIS:
   Number of competitors: 5
   Correlation with competitors:
     Mean: 0.756
     Max: 0.826
     Min: 0.685
   Missing data in competitors:
     Mean: 0.0%
     Max: 0.0%
   Competitor price volatility (CV):
     Mean: 33.19%
     Max: 41.80%

4. SEASONALITY PATTERNS:
   Seasonal price range: 72.3%
   Weekly price range: 19.8%

5. STRUCTURAL BREAKS:
   Max quarter-to-


1. BASIC STATISTICS:
   Total observations: 676
   Date range: 2024-11-30 00:00:00 to 2026-10-06 00:00:00
   Base rate range: $199.00 - $399.00
   Base rate mean: $304.07
   Base rate std: $38.96
   Coefficient of variation: 12.81%

2. PRICE VOLATILITY:
   Mean absolute deviation from trend: 6.04%
   Max deviation from trend: 49.10%
   Price change (1st half vs 2nd half): 11.10%

3. COMPETITOR ANALYSIS:
   Number of competitors: 10
   Correlation with competitors:
     Mean: 0.251
     Max: 0.498
     Min: 0.025
   ⚠️ WEAK CORRELATION WITH COMPETITORS
   Missing data in competitors:
     Mean: 0.0%
     Max: 0.0%
   Competitor price volatility (CV):
     Mean: 21.22%
     Max: 51.42%

4. SEASONALITY PATTERNS:
   Seasonal price range: 19.9%
   Weekly price range: 3.5%

5. STRUCTURAL BREAKS:
   Max quarter-to-quarter change: 19.14%

6. OUTLIERS:
   Number of outliers (|z| > 3): 0
   Percentage: 0.0%

7. DIAGNOSIS SUMMARY:
   Issues identified:
     1. Weak correlation with competitors -