In [1]:
"""
MATRIX COMPLETION - LAGGED FEATURES CREATION
"""

import pandas as pd
import numpy as np
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Directory that holds both the inputs read here and the outputs written later.
data_path = Path('../../../data/dataprocessed')

# Load matrix completion results for the focal hotel; the merge below reads
# its 'date' and 'base_rate' columns.
focal = pd.read_csv(data_path / 'focal_matrix_completion.csv')
focal['date'] = pd.to_datetime(focal['date'])

print(f"Focal rows: {len(focal)}")
print(f"Focal date range: {focal['date'].min()} to {focal['date'].max()}")

# Competitor prices are stored wide: the first CSV column becomes the index
# (one row per hotel) and every remaining column label is a date string, which
# is parsed so it can be matched against focal['date'].
comp_matrix = pd.read_csv(data_path / 'competitor_price_matrix_matrix_completion.csv', index_col=0)
comp_matrix.columns = pd.to_datetime(comp_matrix.columns)

print(f"Competitor matrix shape: {comp_matrix.shape}")
print(f"Competitor date range: {comp_matrix.columns.min()} to {comp_matrix.columns.max()}")

# Merge focal with competitors in wide format.
# Transposing puts dates on the index and hotels on the columns, so a single
# left join on 'date' replaces a per-row boolean scan of `focal` (quadratic in
# the number of dates). Focal dates missing from the competitor matrix keep
# their row and get NaN in the hotel columns.
df_final = focal[['date', 'base_rate']].join(comp_matrix.T, on='date')

# The lag features created further down use positional shift(), so rows must
# be in chronological order; a stable sort leaves already-sorted input as is.
df_final = df_final.sort_values('date', kind='stable').reset_index(drop=True)

print(f"After merge: {len(df_final)} rows")

# Add temporal features that were in original data.
# NOTE(review): 'base_rate_normalized' is a verbatim copy of 'base_rate' (no
# scaling happens here); presumably kept so the column set matches the
# baseline dataset - confirm before relying on the name.
df_final = df_final.assign(
    base_rate_normalized=df_final['base_rate'],
    day_of_week=df_final['date'].dt.dayofweek,
    month=df_final['date'].dt.month,
    # Evaluated after day_of_week has been assigned: dayofweek 5 and 6 are
    # Saturday and Sunday.
    is_weekend=lambda frame: (frame['day_of_week'] >= 5).astype(int),
)

print(f"Loaded dataset shape: {df_final.shape}")
print(f"Date range: {df_final['date'].min()} to {df_final['date'].max()}")

# Identify price columns (everything except date and derived features).
# 'base_rate_normalized' is not excluded, so it is lagged like a price column.
price_columns = [
    name
    for name in df_final.columns
    if name not in {'date', 'day_of_week', 'month', 'is_weekend'}
]

# Create lagged features - EXACTLY like baseline
final_lags = list(range(1, 6))

def create_final_lagged_features(df, price_columns, selected_lags):
    """Return a copy of ``df`` extended with shifted copies of its price columns.

    For every column in ``price_columns`` and every lag in ``selected_lags`` a
    column named ``<column>_lag_<lag>`` is added, holding that column's values
    moved down by ``lag`` rows. The first ``lag`` rows of each new column are
    NaN. The shift is positional, so the row order of ``df`` determines what
    "previous" means. ``df`` itself is not modified.
    """
    lagged = df.copy()
    # Column-major order: all lags of the first column, then the next column.
    pairs = [(name, step) for name in price_columns for step in selected_lags]
    for name, step in pairs:
        lagged[f'{name}_lag_{step}'] = lagged[name].shift(step)
    return lagged

# Build the lagged frame from the merged data using the shared lag list.
print(f"\nCreating lagged features with lags: {final_lags}")
df_with_lags = create_final_lagged_features(
    df=df_final,
    price_columns=price_columns,
    selected_lags=final_lags,
)

print(f"Dataset shape after adding lags: {df_with_lags.shape}")
print(f"Missing values after lagging: {df_with_lags.isna().sum().sum()}")

# Keep only rows in which every column is populated; shifting leaves NaN in
# the leading rows of each lag column.
df_with_lags_clean = df_with_lags.dropna(axis=0, how='any')
print(f"Final dataset shape after removing NaN: {df_with_lags_clean.shape}")
print(f"Data retention: {len(df_with_lags_clean)/len(df_final)*100:.1f}%")

# Add temporal features - EXACTLY like baseline
def add_temporal_features(df):
    """Return a copy of ``df`` with calendar features derived from ``df['date']``.

    Sets ``day_of_week`` (Monday=0), ``month`` and ``day_of_year``, a sin/cos
    pair for each of them using periods 7, 12 and 365 respectively, and
    ``is_weekend`` (1 when ``day_of_week`` is 5 or 6, else 0). Columns that
    already exist are overwritten in place; new ones are appended. ``df`` is
    not modified. ``df['date']`` must be datetime-typed.
    """
    out = df.copy()
    stamps = out['date'].dt

    out['day_of_week'] = stamps.dayofweek
    out['month'] = stamps.month
    out['day_of_year'] = stamps.dayofyear

    # Cyclical encoding: map each calendar value onto the unit circle so the
    # end of a cycle sits next to its start.
    for name, period in (('day_of_week', 7), ('month', 12), ('day_of_year', 365)):
        angle = 2 * np.pi * out[name] / period
        out[f'sin_{name}'] = np.sin(angle)
        out[f'cos_{name}'] = np.cos(angle)

    out['is_weekend'] = (out['day_of_week'] >= 5).astype(int)
    return out

df_with_temporal = add_temporal_features(df_with_lags_clean)
print(f"\nDataset shape after adding temporal features: {df_with_temporal.shape}")

# Engineered calendar columns reported in the summary and metadata. The raw
# 'day_of_week', 'month' and 'day_of_year' columns are not listed here, so
# they fall under "Original features" in the counts below.
temporal_features = ['sin_day_of_week', 'cos_day_of_week', 'sin_month', 'cos_month',
                    'sin_day_of_year', 'cos_day_of_year', 'is_weekend']
print(f"Temporal features added: {temporal_features}")

# Summary
total_observations = len(df_with_temporal)
total_features = len(df_with_temporal.columns) - 1  # every column except 'date'

# Count lag columns by the exact names the lagging step generates
# ('<column>_lag_<lag>') rather than by the substring 'lag', which would also
# match a hotel column whose name merely contains those letters.
lag_column_names = {f'{col}_lag_{lag}' for col in price_columns for lag in final_lags}
lag_features = sum(col in lag_column_names for col in df_with_temporal.columns)
temporal_features_count = len(temporal_features)
rows_lost = len(df_final) - total_observations

print(f"\nFinal dataset summary:")
print(f"  Observations: {total_observations}")
print(f"  Total features: {total_features}")
print(f"  Lag features: {lag_features}")
print(f"  Temporal features: {temporal_features_count}")
print(f"  Original features: {total_features - lag_features - temporal_features_count}")
print(f"  Data lost to lagging: {rows_lost} rows ({rows_lost/len(df_final)*100:.1f}%)")

# Save
df_with_temporal.to_csv(data_path / 'lagged_predictive_dataset_matrix_completion.csv', index=False)

lag_metadata = {
    'imputation_method': 'matrix_completion',
    'selected_lags': final_lags,
    'lag_selection_method': 'matched_baseline',
    'focal_column': 'base_rate',
    'total_lag_features': lag_features,
    'temporal_features': temporal_features,
    'final_observations': total_observations,
    'data_retention_pct': round(len(df_with_temporal)/len(df_final)*100, 1),
    'feature_summary': {
        'total_features': total_features,
        'lag_features': lag_features,
        'temporal_features': temporal_features_count,
        'original_features': total_features - lag_features - temporal_features_count
    },
    'data_quality': {
        # NOTE(review): hard-coded constant, not computed anywhere in this
        # script - confirm it against the imputation step before citing it.
        'imputation_correlation_preserved': 99.0,
        'imputation_method': 'Matrix Completion (IterativeImputer)'
    }
}

# Explicit encoding keeps the file identical regardless of the platform's
# default text encoding.
with open(data_path / 'lag_selection_metadata_matrix_completion.json', 'w', encoding='utf-8') as f:
    json.dump(lag_metadata, f, indent=2)

print("\nDataset saved successfully!")
print("Files created:")
print("  - lagged_predictive_dataset_matrix_completion.csv")
print("  - lag_selection_metadata_matrix_completion.json")

Focal rows: 364
Focal date range: 2025-09-16 00:00:00 to 2026-09-14 00:00:00
Competitor matrix shape: (5, 364)
Competitor date range: 2025-09-16 00:00:00 to 2026-09-14 00:00:00
After merge: 364 rows
Loaded dataset shape: (364, 11)
Date range: 2025-09-16 00:00:00 to 2026-09-14 00:00:00

Creating lagged features with lags: [1, 2, 3, 4, 5]
Dataset shape after adding lags: (364, 46)
Missing values after lagging: 105
Final dataset shape after removing NaN: (359, 46)
Data retention: 98.6%

Dataset shape after adding temporal features: (359, 53)
Temporal features added: ['sin_day_of_week', 'cos_day_of_week', 'sin_month', 'cos_month', 'sin_day_of_year', 'cos_day_of_year', 'is_weekend']

Final dataset summary:
  Observations: 359
  Total features: 52
  Lag features: 35
  Temporal features: 7
  Original features: 10
  Data lost to lagging: 5 rows (1.4%)

Dataset saved successfully!
Files created:
  - lagged_predictive_dataset_matrix_completion.csv
  - lag_selection_metadata_matrix_completion.jso