In [1]:
"""
MISSING DATA MECHANISM ANALYSIS
================================
Analyze missing data patterns to determine mechanism:
- MCAR (Missing Completely at Random): Missingness unrelated to any variables
- MAR (Missing at Random): Missingness depends on observed data
- MNAR (Missing Not at Random): Missingness depends on unobserved values

This determines appropriate imputation strategy.
"""

import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

In [3]:
print("="*80)
print("MISSING DATA MECHANISM ANALYSIS")
print("="*80)

# ============================================================================
# LOAD RAW COMPETITOR DATA (BEFORE IMPUTATION)
# ============================================================================
data_path = Path().cwd().parent / 'data' / 'dataraw'

df_competitors = pd.read_csv(data_path / 'data-1757985744315.csv')
df_competitors['stay_date'] = pd.to_datetime(df_competitors['stay_date'])

print(f"\nRaw competitor data: {df_competitors.shape}")
print(f"Total missing prices: {df_competitors['price'].isnull().sum()}")
print(f"Missing rate: {df_competitors['price'].isnull().sum()/len(df_competitors)*100:.2f}%")

# ============================================================================
# MISSING DATA BY HOTEL
# ============================================================================
print("\n" + "="*80)
print("MISSING DATA PATTERNS BY HOTEL")
print("="*80)

missing_by_hotel = df_competitors.groupby('hotel_id').agg({
    'price': ['count', lambda x: x.isnull().sum(), 'mean', 'std']
})
missing_by_hotel.columns = ['total_obs', 'missing_count', 'price_mean', 'price_std']
missing_by_hotel['missing_rate'] = missing_by_hotel['missing_count'] / missing_by_hotel['total_obs']
missing_by_hotel = missing_by_hotel.sort_values('missing_rate', ascending=False)

print("\n" + missing_by_hotel.to_string())

# ============================================================================
# TEST 1: TEMPORAL PATTERNS (MCAR vs MAR/MNAR)
# ============================================================================
print("\n" + "="*80)
print("TEST 1: TEMPORAL PATTERNS")
print("="*80)

# Check if missingness varies by day of week, month, etc.
df_competitors['day_of_week'] = df_competitors['stay_date'].dt.dayofweek
df_competitors['month'] = df_competitors['stay_date'].dt.month
df_competitors['is_missing'] = df_competitors['price'].isnull().astype(int)

# Missing by day of week
missing_by_dow = df_competitors.groupby('day_of_week')['is_missing'].mean()
print("\nMissing rate by day of week:")
print(missing_by_dow)

# Chi-square test: Is missingness independent of day of week?
contingency_dow = pd.crosstab(df_competitors['day_of_week'], df_competitors['is_missing'])
chi2_dow, p_dow, dof_dow, expected_dow = stats.chi2_contingency(contingency_dow)
print(f"\nChi-square test (day of week vs missingness):")
print(f"  p-value = {p_dow:.4f}")
if p_dow < 0.05:
    print("  → Missingness is NOT independent of day of week (NOT MCAR)")
else:
    print("  → Missingness is independent of day of week")

# Missing by month
missing_by_month = df_competitors.groupby('month')['is_missing'].mean()
print("\nMissing rate by month:")
print(missing_by_month)

contingency_month = pd.crosstab(df_competitors['month'], df_competitors['is_missing'])
chi2_month, p_month, dof_month, expected_month = stats.chi2_contingency(contingency_month)
print(f"\nChi-square test (month vs missingness):")
print(f"  p-value = {p_month:.4f}")
if p_month < 0.05:
    print("  → Missingness is NOT independent of month (NOT MCAR)")
else:
    print("  → Missingness is independent of month")

# ============================================================================
# TEST 2: HOTEL-SPECIFIC PATTERNS
# ============================================================================
print("\n" + "="*80)
print("TEST 2: HOTEL-SPECIFIC PATTERNS")
print("="*80)

# Test if missingness varies by hotel
contingency_hotel = pd.crosstab(df_competitors['hotel_id'], df_competitors['is_missing'])
chi2_hotel, p_hotel, dof_hotel, expected_hotel = stats.chi2_contingency(contingency_hotel)
print(f"Chi-square test (hotel vs missingness):")
print(f"  p-value = {p_hotel:.4f}")
if p_hotel < 0.05:
    print("  → Missingness varies significantly by hotel (NOT MCAR)")
else:
    print("  → Missingness is uniform across hotels")

# ============================================================================
# TEST 3: CONSECUTIVE MISSING PATTERNS (GAP ANALYSIS)
# ============================================================================
print("\n" + "="*80)
print("TEST 3: GAP ANALYSIS")
print("="*80)

def analyze_gaps(group):
    """Analyze consecutive missing value patterns"""
    missing_mask = group['price'].isnull()
    
    # Find gaps (consecutive missing values)
    gaps = []
    gap_start = None
    
    for idx, is_missing in missing_mask.items():
        if is_missing and gap_start is None:
            gap_start = idx
        elif not is_missing and gap_start is not None:
            gaps.append(idx - gap_start)
            gap_start = None
    
    if gap_start is not None:
        gaps.append(len(missing_mask) - gap_start)
    
    return pd.Series({
        'n_gaps': len(gaps),
        'max_gap': max(gaps) if gaps else 0,
        'avg_gap': np.mean(gaps) if gaps else 0
    })

gap_analysis = df_competitors.groupby('hotel_id').apply(analyze_gaps)
print("\nGap analysis by hotel:")
print(gap_analysis.to_string())

# Kohea Kai specific
kohea_data = df_competitors[df_competitors['hotel_id'] == 'booking-us-kohea-kai-resort-maui-USD'].copy()
kohea_missing = kohea_data['price'].isnull()
print(f"\nKohea Kai Resort detailed:")
print(f"  Total observations: {len(kohea_data)}")
print(f"  Missing: {kohea_missing.sum()} ({kohea_missing.sum()/len(kohea_data)*100:.1f}%)")
print(f"  Largest consecutive gap: {gap_analysis.loc['booking-us-kohea-kai-resort-maui-USD', 'max_gap']} days")

# ============================================================================
# TEST 4: PRICE RELATIONSHIP (MAR vs MNAR)
# ============================================================================
print("\n" + "="*80)
print("TEST 4: PRICE RELATIONSHIP ANALYSIS")
print("="*80)

# For each hotel, compare observed prices when data is complete vs when other hotels are missing
# This tests if missingness in one hotel is related to prices in other hotels (MAR)

print("\nTesting if missingness relates to observed prices...")

# Create indicator: is ANY competitor missing on this date?
date_missing_counts = df_competitors.groupby('stay_date')['is_missing'].sum()
df_competitors['any_missing_on_date'] = df_competitors['stay_date'].map(date_missing_counts > 0)

# For hotels with complete data, compare prices on dates with vs without missing data
for hotel in df_competitors['hotel_id'].unique():
    hotel_data = df_competitors[df_competitors['hotel_id'] == hotel].copy()
    complete_data = hotel_data[~hotel_data['price'].isnull()]
    
    if len(complete_data) > 10:
        prices_when_others_missing = complete_data[complete_data['any_missing_on_date']]['price']
        prices_when_complete = complete_data[~complete_data['any_missing_on_date']]['price']
        
        if len(prices_when_others_missing) > 5 and len(prices_when_complete) > 5:
            t_stat, p_val = stats.ttest_ind(prices_when_others_missing, prices_when_complete)
            
            if p_val < 0.05:
                print(f"\n{hotel.split('booking-us-')[-1][:30]}:")
                print(f"  Mean price when others missing: ${prices_when_others_missing.mean():.2f}")
                print(f"  Mean price when complete: ${prices_when_complete.mean():.2f}")
                print(f"  p-value: {p_val:.4f} → Prices differ significantly")

# ============================================================================
# CONCLUSION AND RECOMMENDATIONS
# ============================================================================
print("\n" + "="*80)
print("MISSING DATA MECHANISM CONCLUSION")
print("="*80)

print("\nBased on statistical tests:")

conclusions = []

if p_hotel < 0.05:
    conclusions.append("- Missingness varies by hotel (NOT uniformly random)")
    
if p_dow < 0.05 or p_month < 0.05:
    conclusions.append("- Missingness has temporal patterns")
    
if gap_analysis['max_gap'].max() > 5:
    conclusions.append(f"- Large consecutive gaps exist (up to {int(gap_analysis['max_gap'].max())} days)")

for conclusion in conclusions:
    print(conclusion)

print("\nMechanism Assessment:")
if len(conclusions) == 0:
    print("  Likely MCAR (Missing Completely at Random)")
    print("  → Any imputation method appropriate")
elif p_hotel < 0.05 and (p_dow >= 0.05 and p_month >= 0.05):
    print("  Likely MAR (Missing at Random)")
    print("  → Missingness depends on hotel identity")
    print("  → Advanced imputation (MICE, KNN) appropriate")
else:
    print("  Likely MAR or MNAR")
    print("  → Complex patterns exist")
    
print("\nRecommended Imputation Strategy:")
print(f"  - Hotels with <10% missing & small gaps (<5 days): Time-Series Decay or MICE")
print(f"  - Hotels with >15% missing or large gaps (>5 days): Forward-fill + Interpolation")
print(f"\nHotels requiring baseline imputation:")
for hotel in gap_analysis[gap_analysis['max_gap'] > 5].index:
    print(f"  - {hotel.split('booking-us-')[-1]}")

MISSING DATA MECHANISM ANALYSIS

Raw competitor data: (1820, 5)
Total missing prices: 30
Missing rate: 1.65%

MISSING DATA PATTERNS BY HOTEL

                                                           total_obs  missing_count  price_mean   price_std  missing_rate
hotel_id                                                                                                                 
booking-us-aqua-pacific-monarch-USD                              348             16  279.358573   34.902711      0.045977
booking-us-courtyard-by-marriott-maui-kahului-airport-USD        353             11  442.033994   49.982087      0.031161
booking-us-kohea-kai-resort-maui-USD                             362              2  361.590387  207.216514      0.005525
booking-us-ohana-waikiki-malia-USD                               363              1  271.141212   42.537704      0.002755
booking-us-castle-kamaole-sands-USD                              364              0  347.425824   63.455941      0.000000

TES

  gap_analysis = df_competitors.groupby('hotel_id').apply(analyze_gaps)
