In [1]:
"""
LAGGED DATA PREPARATION - INDEPENDENT IMPUTATION
Matches baseline/KNN/MICE/timeseries structure exactly
Output: lagged_predictive_dataset_independent.csv (360 observations)
"""

import pandas as pd
import numpy as np
from pathlib import Path
from itertools import combinations

banner = "=" * 80
print(banner)
print("LAGGED DATA PREPARATION - INDEPENDENT IMPUTATION")
print(banner)

# ============================================================================
# LOAD DATA
# ============================================================================
# Both inputs live in the same processed-data directory, relative to the
# current working directory.
data_path = Path('../../../data/dataprocessed')

focal_daily = pd.read_csv(data_path / 'focal_daily_aggregated.csv')
competitor_matrix = pd.read_csv(data_path / 'competitor_price_matrix_independent.csv')

# Give both frames a datetime 'date' key parsed from their 'stay_date' column.
for frame in (focal_daily, competitor_matrix):
    frame['date'] = pd.to_datetime(frame['stay_date'])

# Left join keeps every focal row. Because both frames carry 'stay_date', the
# merge emits suffixed copies (stay_date_x / stay_date_y), which are dropped.
# Sorting by date matters: every shift/rolling feature built afterwards relies
# on row order.
df = (
    focal_daily
    .merge(competitor_matrix, on='date', how='left')
    .drop(columns=['stay_date_x', 'stay_date_y'], errors='ignore')
    .sort_values('date')
    .reset_index(drop=True)
)

# Competitor price columns are identified purely by 'booking' in the name.
competitor_cols = [name for name in df.columns if 'booking' in name]

# ============================================================================
# CREATE COMPETITOR LAGS (NO SELF LAGS!)
# ============================================================================
# For every competitor column, add one column per lag holding the value found
# `lag` rows earlier (df is sorted by date). Only columns listed in
# competitor_cols are lagged.
lags = [1, 2, 3, 4, 5]

for competitor in competitor_cols:
    price_series = df[competitor]
    for n_back in lags:
        df[f'{competitor}_lag_{n_back}'] = price_series.shift(n_back)

# ============================================================================
# CREATE AGGREGATE FEATURES FROM LAGS
# ============================================================================
# The lag column names are rebuilt from competitor_cols and lags, following the
# f'{hotel}_lag_{lag}' pattern used when the lag columns are created. Matching
# on the substring '_lag_N' instead would also pick up any unrelated column
# that happens to contain it, and '_lag_1' would match '_lag_10' if the lag
# list is ever extended.
lag_cols = [f'{hotel}_lag_{lag}' for hotel in competitor_cols for lag in lags]

for lag in lags:
    lag_cols_for_lag = [f'{hotel}_lag_{lag}' for hotel in competitor_cols]
    lagged_prices = df[lag_cols_for_lag]

    # Row-wise summary across all competitors at this lag.
    df[f'comp_min_lag_{lag}'] = lagged_prices.min(axis=1)
    df[f'comp_max_lag_{lag}'] = lagged_prices.max(axis=1)
    df[f'comp_mean_lag_{lag}'] = lagged_prices.mean(axis=1)
    df[f'comp_median_lag_{lag}'] = lagged_prices.median(axis=1)

# ============================================================================
# CREATE ROLLING FEATURES
# ============================================================================
# Trailing-window mean and standard deviation of each competitor column.
# The window includes the current row (the series is not shifted first), and
# the leading rows of each window are NaN until the window is full.
windows = [3, 7]

for competitor in competitor_cols:
    for span in windows:
        roller = df[competitor].rolling(window=span)
        df[f'{competitor}_rolling_mean_{span}'] = roller.mean()
        df[f'{competitor}_rolling_std_{span}'] = roller.std()

# ============================================================================
# CREATE POLYNOMIAL FEATURES
# ============================================================================
# Square and cube of each competitor column, added in that order per hotel.
for competitor in competitor_cols:
    base_price = df[competitor]
    for suffix, exponent in (('squared', 2), ('cubed', 3)):
        df[f'{competitor}_{suffix}'] = base_price ** exponent

# ============================================================================
# CREATE INTERACTION FEATURES
# ============================================================================
# Pairwise products of competitor columns. Names drop the 'booking-us-' and
# '-USD' fragments and are capped at 50 characters.
#
# Capping can make two different pairs share a name, in which case a plain
# assignment would silently overwrite the earlier product. Names are therefore
# tracked, and a colliding name gets a numeric suffix (still within the cap).
# Names that are already unique are left exactly as before.
max_name_len = 50
taken_names = set(df.columns)

for hotel1, hotel2 in combinations(competitor_cols, 2):
    interaction_name = f'{hotel1}_x_{hotel2}'.replace('booking-us-', '').replace('-USD', '')
    interaction_name = interaction_name[:max_name_len]

    if interaction_name in taken_names:
        base_name = interaction_name
        counter = 2
        while interaction_name in taken_names:
            suffix = f'_{counter}'
            interaction_name = base_name[:max_name_len - len(suffix)] + suffix
            counter += 1

    taken_names.add(interaction_name)
    df[interaction_name] = df[hotel1] * df[hotel2]

# ============================================================================
# CLEAN AND FINALIZE
# ============================================================================
# Drop the leading rows that cannot have a complete set of lag values. The
# count is derived from `lags` (currently max = 5) so it cannot drift out of
# sync with the lag list.
df_clean = df.iloc[max(lags):].copy()

# Fill any remaining NaN in rolling features. A window longer than the number
# of dropped rows (e.g. 7 > 5) still leaves NaN at the top of df_clean.
# NOTE(review): bfill copies the next available rolling value backwards, so
# those leading rows carry information from later dates - confirm this is
# acceptable for predictive use.
rolling_cols = [col for col in df_clean.columns if 'rolling' in col]
if rolling_cols:
    df_clean[rolling_cols] = df_clean[rolling_cols].bfill()

# Any row that still has a missing value is removed; this is a no-op when the
# frame is already complete.
df_clean = df_clean.dropna()

# ============================================================================
# SAVE
# ============================================================================
# NOTE(review): inputs are read from '../../../data/dataprocessed' but the
# result is written to '../../data/dataprocessed' (one directory level fewer).
# Confirm the output location is intended; mkdir will silently create it.
output_path = Path('../../data/dataprocessed')
output_path.mkdir(parents=True, exist_ok=True)

df_clean.to_csv(output_path / 'lagged_predictive_dataset_independent.csv', index=False)

# Summary: row/column counts plus a check against the expected 360 rows.
n_obs = len(df_clean)
n_features = len(df_clean.columns)
status = '✓ PASS' if n_obs == 360 else '⚠️ WARNING'

print("\n✓ Dataset saved: lagged_predictive_dataset_independent.csv")
print(f"  Observations: {n_obs}")
print(f"  Features: {n_features}")
print(f"  Status: {status} (Expected: 360, Got: {n_obs})")
print("="*80)

LAGGED DATA PREPARATION - INDEPENDENT IMPUTATION

✓ Dataset saved: lagged_predictive_dataset_independent.csv
  Observations: 360
  Features: 96
  Status: ✓ PASS (Expected: 360, Got: 360)
