In [5]:
"""
DEBUG: Row Count Discrepancy Between Baseline and Advanced Imputation
=====================================================================
This script identifies why the two datasets have different row counts
"""

import pandas as pd
import numpy as np
from pathlib import Path

# Banner: a rule, the tool title, and a closing rule, one per line.
print("=" * 80, "ROW COUNT DISCREPANCY DEBUGGER", "=" * 80, sep="\n")

# Directory holding the processed CSV files, resolved relative to the parent
# of the current working directory.
data_path = Path.cwd().parent.joinpath('Predictive-Models', 'data', 'dataprocessed')

# ============================================================================
# LOAD BOTH DATASETS
# ============================================================================
print("\n1. LOADING DATASETS")
print("-"*80)

# Read each lagged dataset, then convert its 'date' column to datetimes so the
# later sections can do date arithmetic and set comparisons on timestamps.
baseline_csv = data_path / 'lagged_predictive_dataset.csv'
df_baseline = pd.read_csv(baseline_csv)
df_baseline['date'] = df_baseline['date'].pipe(pd.to_datetime)

advanced_csv = data_path / 'lagged_predictive_dataset_advanced_imputation.csv'
df_advanced = pd.read_csv(advanced_csv)
df_advanced['date'] = df_advanced['date'].pipe(pd.to_datetime)

# Shapes side by side; a positive difference means baseline has more rows.
baseline_rows, advanced_rows = len(df_baseline), len(df_advanced)
print(f"Baseline shape: {df_baseline.shape}")
print(f"Advanced shape: {df_advanced.shape}")
print(f"Difference: {baseline_rows - advanced_rows} rows")

# ============================================================================
# COMPARE DATE RANGES
# ============================================================================
print("\n2. DATE RANGE COMPARISON")
print("-"*80)

# Same three-line report for each dataset: first date, last date, and the
# inclusive number of calendar days between them (not the row count).
for _label, _frame in (("Baseline", df_baseline), ("Advanced", df_advanced)):
    _first_day = _frame['date'].min()
    _last_day = _frame['date'].max()
    _span_days = (_last_day - _first_day).days + 1

    print(f"\n{_label}:")
    print(f"  Start: {_first_day}")
    print(f"  End:   {_last_day}")
    print(f"  Days:  {_span_days}")

# ============================================================================
# FIND MISSING DATES
# ============================================================================
print("\n3. IDENTIFYING MISSING DATES")
print("-"*80)

# Distinct dates per dataset, and the dates that appear on one side only.
baseline_dates = set(df_baseline['date'])
advanced_dates = set(df_advanced['date'])

dates_only_in_baseline = baseline_dates.difference(advanced_dates)
dates_only_in_advanced = advanced_dates.difference(baseline_dates)

# (dataset that has the dates, dataset that lacks them, the dates themselves)
_one_sided_reports = (
    ("BASELINE", "ADVANCED", dates_only_in_baseline),
    ("ADVANCED", "BASELINE", dates_only_in_advanced),
)
for _present_in, _absent_from, _dates in _one_sided_reports:
    if not _dates:
        continue
    print(f"\nDates in {_present_in} but NOT in {_absent_from} ({len(_dates)}):")
    for date in sorted(_dates):
        print(f"  {date.date()}")

# Neither side has an exclusive date, so the date sets cannot explain the gap.
if not (dates_only_in_baseline or dates_only_in_advanced):
    print("\nBoth datasets have the same dates!")
    print("Issue must be in how rows are counted/filtered")

# ============================================================================
# CHECK FOR DUPLICATES
# ============================================================================
print("\n4. CHECKING FOR DUPLICATE DATES")
print("-"*80)

# Count rows whose date already appeared earlier in the same dataset (the
# first occurrence of each date is not counted).
baseline_dupes = df_baseline['date'].duplicated(keep='first').sum()
advanced_dupes = df_advanced['date'].duplicated(keep='first').sum()

for _label, _dupe_count in (("Baseline", baseline_dupes), ("Advanced", advanced_dupes)):
    print(f"{_label} duplicates: {_dupe_count}")

# ============================================================================
# CHECK FOR NULL VALUES IN DATE COLUMN
# ============================================================================
print("\n5. CHECKING FOR NULL DATES")
print("-"*80)

# Number of rows in each dataset whose 'date' value is missing.
baseline_null_dates = df_baseline['date'].isna().sum()
advanced_null_dates = df_advanced['date'].isna().sum()

for _label, _null_count in (
    ("Baseline", baseline_null_dates),
    ("Advanced", advanced_null_dates),
):
    print(f"{_label} null dates: {_null_count}")

# ============================================================================
# COMPARE MISSING VALUES IN KEY COLUMNS
# ============================================================================
print("\n6. MISSING VALUES IN KEY COLUMNS")
print("-"*80)

# Per-column count of missing values, restricted to the two key columns.
_key_columns = ['date', 'base_rate']
for _label, _frame in (("Baseline", df_baseline), ("Advanced", df_advanced)):
    _missing_per_column = _frame[_key_columns].isna().sum()
    print(f"\n{_label}:")
    print(_missing_per_column)

# ============================================================================
# CHECK SOURCE DATA (BEFORE LAG CREATION)
# ============================================================================
print("\n7. CHECKING SOURCE DATA (BEFORE LAG CREATION)")
print("-"*80)


def load_price_matrix(label, csv_path):
    """Load one price-matrix CSV, print a short summary, and return the frame.

    The returned frame always has a datetime ``date`` column. An existing
    ``date`` column is parsed in place; when the file has none, ``stay_date``
    is parsed into a new ``date`` column (the follow-up cell of this notebook
    reads these same files through ``stay_date``).

    Args:
        label: Name used in the printed summary, e.g. ``"Baseline"``.
        csv_path: ``pathlib.Path`` of the CSV file to read.

    Returns:
        The loaded ``DataFrame`` with a datetime ``date`` column.

    Raises:
        ValueError: If the file has neither a ``date`` nor a ``stay_date``
            column.
    """
    matrix = pd.read_csv(csv_path)
    # Captured before a derived 'date' column can widen the frame, so the
    # summary reports the shape of the file itself.
    file_shape = matrix.shape

    date_column = next(
        (name for name in ('date', 'stay_date') if name in matrix.columns),
        None,
    )
    if date_column is None:
        raise ValueError(
            f"no 'date' or 'stay_date' column in {csv_path.name} "
            f"(columns: {list(matrix.columns)})"
        )
    matrix['date'] = pd.to_datetime(matrix[date_column])

    print(f"\n{label} source ({csv_path.name}): {file_shape}")
    print(f"  Date range: {matrix['date'].min()} to {matrix['date'].max()}")
    print(f"  Total days: {len(matrix)}")
    return matrix


# The checks below (and the diagnosis at the end of the script) test whether
# these names exist. Unbind leftovers from an earlier run so a load that fails
# now cannot be masked by a stale frame.
for _stale_name in ('matrix_baseline', 'matrix_advanced'):
    globals().pop(_stale_name, None)

# Load the price matrices that were used as input. Loading is best-effort:
# any failure is reported and the script carries on, leaving the name unbound.
try:
    matrix_baseline = load_price_matrix(
        "Baseline", data_path / 'competitor_price_matrix.csv'
    )
except Exception as e:
    print(f"\nCould not load baseline source: {e}")

try:
    matrix_advanced = load_price_matrix(
        "Advanced", data_path / 'competitor_price_matrix_advanced_imputation.csv'
    )
except Exception as e:
    print(f"\nCould not load advanced source: {e}")
else:
    # Compare source data. This runs outside the try so that a comparison
    # problem is not misreported as a failure to load the advanced file.
    if 'matrix_baseline' in locals():
        # Missing dates are dropped: they cannot be ordered or formatted.
        source_baseline_dates = set(matrix_baseline['date'].dropna())
        source_advanced_dates = set(matrix_advanced['date'].dropna())

        # Check both directions; a one-sided check would report "identical"
        # when only the advanced matrix has extra dates.
        source_diff = source_baseline_dates - source_advanced_dates
        source_extra = source_advanced_dates - source_baseline_dates

        if source_diff:
            print(f"\nDates in baseline source but NOT in advanced source:")
            for date in sorted(source_diff):
                print(f"  {date.date()}")
        if source_extra:
            print("\nDates in advanced source but NOT in baseline source:")
            for date in sorted(source_extra):
                print(f"  {date.date()}")
        if not source_diff and not source_extra:
            print("\nBoth source matrices have identical dates")
            print("Row count difference happens during LAG CREATION process")

# ============================================================================
# TRACE THE PROBLEM
# ============================================================================
print("\n" + "="*80)
print("DIAGNOSIS")
print("="*80)

# Pick the first explanation that applies, in priority order: dates present in
# only one lagged dataset, then duplicate dates, then the source-matrix row
# counts (considered only when both matrices were loaded earlier).
if dates_only_in_baseline:
    diagnosis_lines = [
        "\nROOT CAUSE: Advanced imputation is missing specific dates",
        "LOCATION: Either in imputation or lag creation process",
        "ACTION: Check why these dates were dropped",
    ]
elif dates_only_in_advanced:
    diagnosis_lines = [
        "\nROOT CAUSE: Baseline is missing specific dates",
        "LOCATION: Baseline processing dropped dates",
        "ACTION: Check baseline lag creation logic",
    ]
elif baseline_dupes > 0 or advanced_dupes > 0:
    diagnosis_lines = [
        "\nROOT CAUSE: Duplicate dates in one dataset",
        "ACTION: Check for duplicate date handling in processing",
    ]
elif 'matrix_baseline' in locals() and 'matrix_advanced' in locals():
    if len(matrix_baseline) == len(matrix_advanced):
        # Inputs agree on row count, so the gap appears after them.
        diagnosis_lines = [
            "\nROOT CAUSE: Lag creation process handles the two datasets differently",
            "LOCATION: In lagged_data_preparation scripts",
            "ACTION: Compare lag creation logic between baseline and advanced versions",
        ]
    else:
        diagnosis_lines = [
            "\nROOT CAUSE: Source data has different row counts BEFORE lag creation",
            "LOCATION: In the imputation process itself",
            "ACTION: Check imputation scripts for row dropping",
        ]
else:
    diagnosis_lines = [
        "\nCannot determine root cause. Need to check source files.",
    ]

print("\n".join(diagnosis_lines))

print("\n" + "="*80)

ROW COUNT DISCREPANCY DEBUGGER

1. LOADING DATASETS
--------------------------------------------------------------------------------
Baseline shape: (360, 68)
Advanced shape: (359, 68)
Difference: 1 rows

2. DATE RANGE COMPARISON
--------------------------------------------------------------------------------

Baseline:
  Start: 2025-09-21 00:00:00
  End:   2026-09-15 00:00:00
  Days:  360

Advanced:
  Start: 2025-09-21 00:00:00
  End:   2026-09-14 00:00:00
  Days:  359

3. IDENTIFYING MISSING DATES
--------------------------------------------------------------------------------

Dates in BASELINE but NOT in ADVANCED (1):
  2026-09-15

4. CHECKING FOR DUPLICATE DATES
--------------------------------------------------------------------------------
Baseline duplicates: 0
Advanced duplicates: 0

5. CHECKING FOR NULL DATES
--------------------------------------------------------------------------------
Baseline null dates: 0
Advanced null dates: 0

6. MISSING VALUES IN KEY COLUMNS
--------

In [8]:
import pandas as pd
from pathlib import Path

# Directory holding the processed CSV files, resolved relative to the parent
# of the current working directory.
data_path = Path.cwd().parent / 'data' / 'dataprocessed'

# Load both source matrices; the date of each row is read from 'stay_date'
# and exposed as a datetime 'date' column.
baseline_matrix = pd.read_csv(data_path / 'competitor_price_matrix.csv')
baseline_matrix['date'] = pd.to_datetime(baseline_matrix['stay_date'])

advanced_matrix = pd.read_csv(data_path / 'competitor_price_matrix_advanced_imputation.csv')
advanced_matrix['date'] = pd.to_datetime(advanced_matrix['stay_date'])

print(f"Baseline matrix: {len(baseline_matrix)} rows")
print(f"  Date range: {baseline_matrix['date'].min()} to {baseline_matrix['date'].max()}")

print(f"\nAdvanced matrix: {len(advanced_matrix)} rows")
print(f"  Date range: {advanced_matrix['date'].min()} to {advanced_matrix['date'].max()}")

# Find missing dates in BOTH directions. Checking only baseline-minus-advanced
# would stay silent when the advanced matrix is the one with extra dates.
baseline_dates = set(baseline_matrix['date'])
advanced_dates = set(advanced_matrix['date'])

missing_in_advanced = baseline_dates - advanced_dates
missing_in_baseline = advanced_dates - baseline_dates

if missing_in_advanced:
    print(f"\nMissing in ADVANCED matrix:")
    for date in sorted(missing_in_advanced):
        print(f"  {date.date()}")

if missing_in_baseline:
    print("\nMissing in BASELINE matrix:")
    for date in sorted(missing_in_baseline):
        print(f"  {date.date()}")

# Say so explicitly when nothing differs, instead of printing nothing.
if not missing_in_advanced and not missing_in_baseline:
    print("\nBoth matrices cover the same dates")

Baseline matrix: 365 rows
  Date range: 2025-09-16 00:00:00 to 2026-09-15 00:00:00

Advanced matrix: 364 rows
  Date range: 2025-09-16 00:00:00 to 2026-09-14 00:00:00

Missing in ADVANCED matrix:
  2026-09-15
