In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Raw per-hotel CSVs are read from here; processed outputs are written below.
data_path = Path('../data/full-data/raw')
output_path = Path('../data/full-data/processed')
# Make sure the output directory exists (parents included) before any write.
output_path.mkdir(exist_ok=True, parents=True)

# The mapping file supplies the masked ids of the focal hotels to process.
mapping_df = pd.read_csv('../data/full-data/hotel_mapping.csv')
hotel_list = mapping_df.loc[:, 'masked_id'].tolist()

print(f"Processing {len(hotel_list)} focal hotels for Matrix Completion")

# One summary dict per hotel is appended to this list during processing.
summary_results = list()

def _failure_record(hotel_id, reason):
    """Return the summary row for a hotel that produced no output files.

    The row carries the same keys as a success row, with counts zeroed and the
    date range empty, so the summary table stays rectangular. ``reason`` is
    embedded in the ``status`` field after the ``Failed: `` prefix.
    """
    return {
        'hotel_id': hotel_id,
        'matrix_rows': 0,
        'matrix_cols': 0,
        'num_competitors': 0,
        'missing_values': 0,
        'missing_pct': 0,
        'correlation_preserved_pct': 0,
        'method': 'Failed',
        'date_start': None,
        'date_end': None,
        'status': f'Failed: {reason}'
    }


# Per-hotel pipeline: load focal and competitor prices, build a date x hotel
# price matrix, impute the missing cells, report how well pairwise correlations
# survived the imputation, and write four CSV outputs. Every hotel in
# ``hotel_list`` ends up with exactly one row in ``summary_results``.
for hotel_masked_id in hotel_list:
    print(f"\n{'='*80}")
    print(f"Processing: {hotel_masked_id}")
    print(f"{'='*80}")

    try:
        focal_file = data_path / f'{hotel_masked_id}_focal.csv'
        if not focal_file.exists():
            print("Focal file not found, skipping...")
            # Record the skip so the hotel does not silently vanish from the summary.
            summary_results.append(_failure_record(hotel_masked_id, 'Focal file not found'))
            continue

        # One row per stay date, keeping the lowest price seen for that date.
        focal = pd.read_csv(focal_file)
        focal['stay_date'] = pd.to_datetime(focal['stay_date'])
        focal = focal.groupby('stay_date')['price'].min().reset_index()
        focal.columns = ['date', 'base_rate']
        focal = focal.sort_values('date').reset_index(drop=True)

        print(f"Focal: {len(focal)} days ({focal['date'].min()} to {focal['date'].max()})")

        comp_file = data_path / f'{hotel_masked_id}_competitors.csv'
        if not comp_file.exists():
            print("Competitor file not found, skipping...")
            summary_results.append(_failure_record(hotel_masked_id, 'Competitor file not found'))
            continue

        # Same reduction for competitors: lowest price per (date, hotel).
        compset = pd.read_csv(comp_file)
        compset['stay_date'] = pd.to_datetime(compset['stay_date'])
        compset = compset.groupby(['stay_date', 'hotel_id'])['price'].min().reset_index()
        compset.columns = ['date', 'hotel_id', 'competitor_price']

        num_competitors = compset['hotel_id'].nunique()
        print(f"Compset: {num_competitors} hotels")

        # Wide layout: one column per competitor. The inner join keeps only the
        # dates present in both the focal series and the competitor matrix.
        comp_matrix = compset.pivot(index='date', columns='hotel_id', values='competitor_price')
        merged = focal[['date', 'base_rate']].merge(comp_matrix, on='date', how='inner')
        merged = merged.sort_values('date').reset_index(drop=True)

        # Column labels in matrix order: focal first, then each competitor.
        original_hotel_names = ['base_rate'] + list(comp_matrix.columns)

        price_matrix = merged.drop('date', axis=1).values
        dates = merged['date'].values

        missing_count = np.isnan(price_matrix).sum()
        missing_pct = 100 * missing_count / price_matrix.size

        print(f"Matrix: {price_matrix.shape}")
        print(f"Missing: {missing_count} / {price_matrix.size} ({missing_pct:.1f}%)")

        # Prefer Soft-Impute when fancyimpute is installed; otherwise fall back
        # to scikit-learn's IterativeImputer.
        try:
            from fancyimpute import SoftImpute
            max_rank = min(3, min(price_matrix.shape) - 1)
            imputer = SoftImpute(max_rank=max_rank, verbose=False)
            completed_matrix = imputer.fit_transform(price_matrix)
            method_used = "Soft-Impute"
            print(f"Method: Soft-Impute (max_rank={max_rank})")
        except ImportError:
            # Imported for its side effect: it enables the experimental IterativeImputer.
            from sklearn.experimental import enable_iterative_imputer  # noqa: F401
            from sklearn.impute import IterativeImputer
            from sklearn.linear_model import BayesianRidge
            imputer = IterativeImputer(estimator=BayesianRidge(), max_iter=10, random_state=42, verbose=0)
            completed_matrix = imputer.fit_transform(price_matrix)
            method_used = "IterativeImputer"
            print("Method: IterativeImputer")

        # Safety net: fill anything the imputer left as NaN with the column
        # mean, or with the overall mean when the whole column is NaN. Columns
        # are handled in order, so earlier fills feed the overall mean.
        if np.isnan(completed_matrix).any():
            for col_idx in range(completed_matrix.shape[1]):
                nan_rows = np.isnan(completed_matrix[:, col_idx])
                if not nan_rows.any():
                    continue
                col_mean = np.nanmean(completed_matrix[:, col_idx])
                if np.isnan(col_mean):
                    col_mean = np.nanmean(completed_matrix)
                completed_matrix[nan_rows, col_idx] = col_mean

        # Clamp negative imputed values to zero.
        if (completed_matrix < 0).any():
            completed_matrix = np.maximum(completed_matrix, 0)

        # The imputer may drop columns that were entirely NaN. Work out which
        # input columns survived (by position) so labels line up with the output.
        completed_col_count = completed_matrix.shape[1]
        if completed_col_count != len(original_hotel_names):
            print(f"Warning: Imputer dropped {len(original_hotel_names) - completed_col_count} column(s) (all NaN)")
            keep_mask = ~np.isnan(price_matrix).all(axis=0)
            hotel_names = [name for name, keep in zip(original_hotel_names, keep_mask) if keep]
            print(f"Adjusted to {len(hotel_names)} columns: {hotel_names}")
            if len(hotel_names) != completed_col_count:
                # The all-NaN explanation does not account for the shape change;
                # labelling the columns would be a guess, so fail this hotel.
                raise ValueError(
                    f"Imputer returned {completed_col_count} columns but "
                    f"{len(hotel_names)} input columns are not all-NaN"
                )
        else:
            keep_mask = np.ones(len(original_hotel_names), dtype=bool)
            hotel_names = original_hotel_names

        price_matrix_survived = price_matrix[:, keep_mask]

        # Compare pairwise correlations before and after imputation, using only
        # the entries that are defined in both correlation matrices.
        original_df = pd.DataFrame(price_matrix_survived, columns=hotel_names)
        completed_df = pd.DataFrame(completed_matrix, columns=hotel_names)
        original_corr = original_df.corr()
        completed_corr = completed_df.corr()
        corr_diff = np.abs(original_corr - completed_corr)
        valid_corr = ~(original_corr.isna() | completed_corr.isna())
        avg_corr_diff = corr_diff[valid_corr].mean().mean()
        corr_preserved = (1 - avg_corr_diff) * 100

        print(f"Correlation preserved: {corr_preserved:.1f}%")

        completed_df.insert(0, 'date', dates)

        # Output 1: full imputed matrix (date + focal + competitors).
        completed_df.to_csv(output_path / f'{hotel_masked_id}_matrix_completion_imputed.csv', index=False)

        # Output 2: focal series only.
        focal_completed = completed_df[['date', 'base_rate']].copy()
        focal_completed.to_csv(output_path / f'{hotel_masked_id}_focal_matrix_completion.csv', index=False)

        # Output 3: competitors in long format (date, hotel_id, competitor_price).
        comp_completed = completed_df.drop('base_rate', axis=1).copy()
        comp_melted = comp_completed.melt(id_vars='date', var_name='hotel_id', value_name='competitor_price')
        comp_melted.to_csv(output_path / f'{hotel_masked_id}_competitors_matrix_completion.csv', index=False)

        # Output 4: competitors transposed (one row per hotel, one column per date).
        comp_matrix_completed = comp_completed.set_index('date').T
        comp_matrix_completed.to_csv(output_path / f'{hotel_masked_id}_competitor_price_matrix.csv')

        print("Saved: 4 output files")

        summary_results.append({
            'hotel_id': hotel_masked_id,
            'matrix_rows': price_matrix.shape[0],
            'matrix_cols': len(hotel_names),
            'num_competitors': len(hotel_names) - 1,
            'missing_values': missing_count,
            'missing_pct': missing_pct,
            'correlation_preserved_pct': corr_preserved,
            'method': method_used,
            # Date range of the focal file; the merged matrix can be narrower
            # because of the inner join above.
            'date_start': focal['date'].min(),
            'date_end': focal['date'].max(),
            'status': 'Success'
        })

    except Exception as e:
        # Batch boundary: one bad hotel must not stop the rest of the run.
        print(f"Error: {str(e)}")
        summary_results.append(_failure_record(hotel_masked_id, str(e)))

# Pin the column set so the frame has every expected column even when no hotel
# produced a record; an empty list would otherwise give a column-less frame and
# the 'status' lookups below would raise KeyError.
summary_columns = [
    'hotel_id', 'matrix_rows', 'matrix_cols', 'num_competitors',
    'missing_values', 'missing_pct', 'correlation_preserved_pct',
    'method', 'date_start', 'date_end', 'status',
]
summary_df = pd.DataFrame(summary_results, columns=summary_columns)
summary_df.to_csv(output_path / 'matrix_completion_summary.csv', index=False)

# Split once; every row whose status is not exactly 'Success' counts as failed.
success_df = summary_df[summary_df['status'] == 'Success']
failed_count = len(summary_df) - len(success_df)

print(f"\n{'='*80}")
print("MATRIX COMPLETION SUMMARY")
print(f"{'='*80}")
print(f"\nTotal hotels processed: {len(hotel_list)}")
print(f"Successful: {len(success_df)}")
print(f"Failed: {failed_count}")

# Averages are reported over successful hotels only, and skipped when there are none.
if not success_df.empty:
    print(f"\nAverage matrix size: {success_df['matrix_rows'].mean():.0f} rows × {success_df['matrix_cols'].mean():.1f} cols")
    print(f"Average competitors: {success_df['num_competitors'].mean():.1f}")
    print(f"Average missing: {success_df['missing_pct'].mean():.2f}%")
    print(f"Average correlation preserved: {success_df['correlation_preserved_pct'].mean():.1f}%")

print("\nSummary saved: matrix_completion_summary.csv")
print(summary_df[['hotel_id', 'matrix_rows', 'num_competitors', 'missing_pct', 'correlation_preserved_pct', 'status']])

Processing 44 focal hotels for Matrix Completion

Processing: Hotel_01
Focal: 563 days (2025-03-27 00:00:00 to 2026-10-10 00:00:00)
Compset: 5 hotels
Matrix: (559, 6)
Missing: 859 / 3354 (25.6%)
Method: IterativeImputer
Correlation preserved: 86.3%
Saved: 4 output files

Processing: Hotel_02
Focal: 438 days (2025-07-29 00:00:00 to 2026-10-09 00:00:00)
Compset: 4 hotels
Matrix: (430, 5)
Missing: 85 / 2150 (4.0%)
Method: IterativeImputer
Correlation preserved: 97.1%
Saved: 4 output files

Processing: Hotel_03
Focal: 401 days (2024-11-26 00:00:00 to 2025-12-31 00:00:00)
Compset: 9 hotels
Matrix: (401, 10)
Missing: 760 / 4010 (19.0%)
Method: IterativeImputer
Correlation preserved: 96.5%
Saved: 4 output files

Processing: Hotel_04
Focal: 632 days (2025-01-16 00:00:00 to 2026-10-09 00:00:00)
Compset: 4 hotels
Matrix: (625, 5)
Missing: 406 / 3125 (13.0%)
Method: IterativeImputer
Correlation preserved: 84.3%
Saved: 4 output files

Processing: Hotel_05
Focal: 683 days (2024-11-26 00:00:00 to 20

Method: IterativeImputer
Correlation preserved: 91.9%
Saved: 4 output files

Processing: Hotel_22
Focal: 675 days (2024-11-25 00:00:00 to 2026-09-30 00:00:00)
Compset: 9 hotels
Matrix: (675, 10)
Missing: 570 / 6750 (8.4%)
Method: IterativeImputer
Correlation preserved: 94.8%
Saved: 4 output files

Processing: Hotel_23
Focal: 684 days (2024-11-25 00:00:00 to 2026-10-09 00:00:00)
Compset: 10 hotels
Matrix: (681, 11)
Missing: 1271 / 7491 (17.0%)
Method: IterativeImputer
Correlation preserved: 95.8%
Saved: 4 output files

Processing: Hotel_24
Focal: 614 days (2024-11-25 00:00:00 to 2026-07-31 00:00:00)
Compset: 10 hotels
Matrix: (614, 11)
Missing: 614 / 6754 (9.1%)
Method: IterativeImputer
Correlation preserved: 92.3%
Saved: 4 output files

Processing: Hotel_25
Focal: 674 days (2024-11-26 00:00:00 to 2026-09-30 00:00:00)
Compset: 10 hotels
Matrix: (674, 11)
Missing: 1376 / 7414 (18.6%)
Method: IterativeImputer
Correlation preserved: 91.9%
Saved: 4 output files

Processing: Hotel_26
Focal: 