In [1]:
"""
MATRIX COMPLETION IMPUTATION
"""

import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Raw input directory (relative to the current working directory).
data_path = Path('../../data/dataraw')

# Load focal: parse the stay date, keep the minimum price per stay date as
# the base rate, and order the rows chronologically.
focal = (
    pd.read_csv(data_path / 'data-1757985699237.csv')
    .assign(stay_date=lambda frame: pd.to_datetime(frame['stay_date']))
    .groupby('stay_date')['price']
    .min()
    .reset_index()
    .rename(columns={'stay_date': 'date', 'price': 'base_rate'})
    .sort_values('date')
    .reset_index(drop=True)
)

print(f"Focal: {len(focal)} days")

# Load compset: minimum price per (stay date, hotel) pair, kept in long
# format with one row per pair.
compset = (
    pd.read_csv(data_path / 'data-1757985744315.csv')
    .assign(stay_date=lambda frame: pd.to_datetime(frame['stay_date']))
    .groupby(['stay_date', 'hotel_id'])['price']
    .min()
    .reset_index()
    .rename(columns={'stay_date': 'date', 'price': 'competitor_price'})
)

print(f"Compset: {compset['hotel_id'].nunique()} hotels")

# Create price matrix
print("\n" + "="*80, "CREATING PRICE MATRIX", "="*80, sep="\n")

# Wide layout: one row per date, one column per competitor hotel_id.
comp_matrix = compset.pivot(index='date', columns='hotel_id', values='competitor_price')

# Inner join: only dates present in both the focal series and the competitor
# matrix survive, so the row count can be smaller than len(focal).
merged = (
    focal[['date', 'base_rate']]
    .merge(comp_matrix, on='date', how='inner')
    .sort_values('date')
    .reset_index(drop=True)
)

# Column labels of the numeric matrix: focal rate first, then each competitor.
hotel_names = ['base_rate', *comp_matrix.columns]

# Split the merged frame into the numeric price array and its date axis.
price_matrix = merged.drop(columns='date').to_numpy()
dates = merged['date'].to_numpy()

print(f"Matrix shape: {price_matrix.shape}")
print(f"Missing: {np.isnan(price_matrix).sum()} / {price_matrix.size} ({100 * np.isnan(price_matrix).sum() / price_matrix.size:.1f}%)")

# Apply matrix completion
print("\n" + "="*80, "APPLYING MATRIX COMPLETION", "="*80, sep="\n")

# Preferred path: fancyimpute's SoftImpute. If anything in this branch raises
# ImportError (typically fancyimpute not being installed), fall back to
# scikit-learn's IterativeImputer with a BayesianRidge estimator.
try:
    from fancyimpute import SoftImpute

    # Rank is capped at 3 and kept strictly below the smaller matrix dimension.
    n_rows, n_cols = price_matrix.shape
    max_rank = min(3, n_rows - 1, n_cols - 1)
    imputer = SoftImpute(max_rank=max_rank, verbose=False)
    completed_matrix = imputer.fit_transform(price_matrix)
    method_used = "Soft-Impute"
    print(f"Method: Soft-Impute (max_rank={max_rank})")
except ImportError:
    # The experimental flag is imported first, ahead of IterativeImputer itself.
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer
    from sklearn.linear_model import BayesianRidge

    ridge_estimator = BayesianRidge()
    imputer = IterativeImputer(
        estimator=ridge_estimator,
        max_iter=10,
        random_state=42,
        verbose=0,
    )
    completed_matrix = imputer.fit_transform(price_matrix)
    method_used = "IterativeImputer"
    print("Method: IterativeImputer")

# Handle remaining missing values
remaining_missing = np.isnan(completed_matrix).sum()
if remaining_missing:
    # Columns are processed left to right on purpose: the whole-matrix
    # fallback mean below sees the values already filled in earlier columns.
    for col_idx, column in enumerate(completed_matrix.T):
        nan_rows = np.isnan(column)
        fill_value = np.nanmean(column)
        if np.isnan(fill_value):
            # Column has no usable mean: use the mean of the whole matrix.
            fill_value = np.nanmean(completed_matrix)
        completed_matrix[nan_rows, col_idx] = fill_value

# Clip negative values
# Any imputed value below zero is floored at 0.
if (completed_matrix < 0).any():
    completed_matrix = np.maximum(completed_matrix, 0)

# Calculate correlations
# Compare the column-by-column correlation structure before and after
# imputation as a sanity check on the completed matrix.
original_df = pd.DataFrame(data=price_matrix, columns=hotel_names)
completed_df_temp = pd.DataFrame(data=completed_matrix, columns=hotel_names)
original_corr, completed_corr = original_df.corr(), completed_df_temp.corr()

corr_diff = (original_corr - completed_corr).abs()
# Only average entries where both correlation matrices hold a value.
valid_corr = original_corr.notna() & completed_corr.notna()
# Mean of the per-column means of the absolute differences.
# NOTE(review): the diagonal (self-correlation) entries are included and
# contribute ~0 difference, which pulls the reported percentage up -- confirm
# whether they should be excluded.
avg_corr_diff = corr_diff[valid_corr].mean().mean()

print(f"Correlation preserved: {(1 - avg_corr_diff)*100:.1f}%")

# Save results
print("\n" + "="*80)
print("SAVING RESULTS")
print("="*80)

# Rebuild a labelled frame from the imputed array and put the date column
# back in front of the price columns.
completed_df = pd.DataFrame(completed_matrix, columns=hotel_names)
completed_df.insert(0, 'date', dates)

# Save full matrix
output_path = Path('../../data/dataprocessed')
# to_csv does not create missing directories; ensure the target exists so the
# run does not fail at the very last step when the folder is absent.
output_path.mkdir(parents=True, exist_ok=True)
completed_df.to_csv(output_path / 'matrix_completion_imputed.csv', index=False)
print("Saved: matrix_completion_imputed.csv")

# Save focal
# Date plus the imputed focal base rate only.
focal_completed = completed_df.loc[:, ['date', 'base_rate']].copy()
focal_csv = output_path / 'focal_matrix_completion.csv'
focal_completed.to_csv(focal_csv, index=False)
print("Saved: focal_matrix_completion.csv")

# Save competitors
# Everything except the focal column, reshaped to long format with one row
# per (date, hotel_id) pair.
comp_completed = completed_df.drop(columns='base_rate').copy()
comp_melted = comp_completed.melt(
    id_vars='date',
    var_name='hotel_id',
    value_name='competitor_price',
)
competitors_csv = output_path / 'competitors_matrix_completion.csv'
comp_melted.to_csv(competitors_csv, index=False)
print("Saved: competitors_matrix_completion.csv")

# Save competitor matrix
# Transposed wide layout: one row per hotel, one column per date. The index
# is written to the CSV here.
comp_matrix_completed = comp_completed.set_index('date').transpose()
competitor_matrix_csv = output_path / 'competitor_price_matrix_matrix_completion.csv'
comp_matrix_completed.to_csv(competitor_matrix_csv)
print("Saved: competitor_price_matrix_matrix_completion.csv")

print("\nDone. Ready for lagged feature creation.")

Focal: 365 days
Compset: 5 hotels

CREATING PRICE MATRIX
Matrix shape: (364, 6)
Missing: 30 / 2184 (1.4%)

APPLYING MATRIX COMPLETION
Method: IterativeImputer
Correlation preserved: 99.0%

SAVING RESULTS
Saved: matrix_completion_imputed.csv
Saved: focal_matrix_completion.csv
Saved: competitors_matrix_completion.csv
Saved: competitor_price_matrix_matrix_completion.csv

Done. Ready for lagged feature creation.
