In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
import warnings
warnings.filterwarnings('ignore')

# Input and output both live in the processed-data directory; they are kept as
# two separate names so either side can be redirected independently.
data_path = Path('../data/full-data/processed')
output_path = Path('../data/full-data/processed')

# The mapping file supplies the masked id of every focal hotel to process.
mapping_df = pd.read_csv('../data/full-data/hotel_mapping.csv')
hotel_list = mapping_df.loc[:, 'masked_id'].to_list()

print(f"Creating lagged features for {len(hotel_list)} focal hotels")

# One record per hotel is appended here by the processing loop.
summary_results = []

# Lag offsets applied to every price column. `shift` offsets by row position,
# so a lag of k equals k calendar days only when the focal rows are consecutive
# daily dates in chronological order.
# NOTE(review): confirm the upstream focal files guarantee that ordering.
final_lags = [1, 2, 3, 4, 5]

# Temporal feature names recorded in each hotel's metadata file.
TEMPORAL_FEATURE_NAMES = [
    'sin_day_of_week', 'cos_day_of_week', 'sin_month', 'cos_month',
    'sin_day_of_year', 'cos_day_of_year', 'is_weekend',
]


def _merge_focal_with_competitors(focal, comp_matrix):
    """Return one row per focal date holding base_rate and competitor prices.

    `comp_matrix` has one row per competitor and one column per date. Dates
    that are absent from `comp_matrix` get no competitor values, which show up
    as NaN in the resulting frame. Columns are ordered date, base_rate, then
    competitors in `comp_matrix.index` order.
    """
    # Build both lookups once instead of scanning `focal` for every date.
    # drop_duplicates keeps the first row per date, i.e. the same value the
    # expression focal[focal['date'] == date]['base_rate'].values[0] yields.
    base_rate_by_date = (
        focal.drop_duplicates('date').set_index('date')['base_rate'].to_dict()
    )
    # DataFrame.to_dict() is keyed by column label: {date: {competitor: price}}
    prices_by_date = comp_matrix.to_dict()

    rows = []
    for date in focal['date']:
        row = {'date': date, 'base_rate': base_rate_by_date[date]}
        row.update(prices_by_date.get(date, {}))
        rows.append(row)
    return pd.DataFrame(rows)


def _add_lag_features(df, columns, lags):
    """Return (`df` plus `<col>_lag_<k>` columns, list of the new column names).

    New columns are ordered by source column first, then by lag.
    """
    lagged = {}
    for col in columns:
        for lag in lags:
            lagged[f'{col}_lag_{lag}'] = df[col].shift(lag)
    # A single concat avoids inserting dozens of columns one at a time.
    combined = pd.concat([df, pd.DataFrame(lagged, index=df.index)], axis=1)
    return combined, list(lagged)


def _add_cyclical_features(df):
    """Return a copy of `df` with day_of_year and sin/cos calendar encodings.

    Expects `date`, `day_of_week` and `month` columns to be present.
    """
    out = df.copy()
    out['day_of_year'] = out['date'].dt.dayofyear
    for name, period in (('day_of_week', 7), ('month', 12), ('day_of_year', 365)):
        angle = 2 * np.pi * out[name] / period
        out[f'sin_{name}'] = np.sin(angle)
        out[f'cos_{name}'] = np.cos(angle)
    return out


# Build, save and summarise the lagged dataset of every focal hotel.
# Hotels whose input files are missing are skipped without a summary record;
# any other error is recorded as a 'Failed: ...' row so the run continues.
for hotel_masked_id in hotel_list:
    print(f"\n{'='*80}")
    print(f"Processing: {hotel_masked_id}")
    print(f"{'='*80}")

    try:
        focal_file = data_path / f'{hotel_masked_id}_focal_matrix_completion.csv'
        if not focal_file.exists():
            print("Focal file not found, skipping...")
            continue

        focal = pd.read_csv(focal_file)
        focal['date'] = pd.to_datetime(focal['date'])

        print(f"Focal: {len(focal)} rows ({focal['date'].min()} to {focal['date'].max()})")

        comp_matrix_file = data_path / f'{hotel_masked_id}_competitor_price_matrix.csv'
        if not comp_matrix_file.exists():
            print("Competitor matrix not found, skipping...")
            continue

        # Rows are competitors, columns are dates (parsed from the header).
        comp_matrix = pd.read_csv(comp_matrix_file, index_col=0)
        comp_matrix.columns = pd.to_datetime(comp_matrix.columns)

        print(f"Competitor matrix: {comp_matrix.shape}")

        df_final = _merge_focal_with_competitors(focal, comp_matrix)

        df_final['day_of_week'] = df_final['date'].dt.dayofweek
        df_final['month'] = df_final['date'].dt.month
        df_final['is_weekend'] = (df_final['day_of_week'] >= 5).astype(int)

        print(f"After merge: {df_final.shape}")

        # Everything except the date and calendar columns is a price series:
        # the focal base_rate plus one column per competitor.
        non_price_columns = ('date', 'day_of_week', 'month', 'is_weekend')
        price_columns = [col for col in df_final.columns if col not in non_price_columns]

        df_lagged, lag_columns = _add_lag_features(df_final, price_columns, final_lags)

        print(f"After lagging: {df_lagged.shape}")
        print(f"Missing values: {df_lagged.isnull().sum().sum()}")

        # Drops the leading rows without a full lag history, and any row with
        # a missing price.
        df_lagged_clean = df_lagged.dropna()
        retention_pct = len(df_lagged_clean) / len(df_final) * 100
        print(f"After dropping NaN: {df_lagged_clean.shape}")
        print(f"Data retention: {retention_pct:.1f}%")

        df_with_temporal = _add_cyclical_features(df_lagged_clean)

        print(f"Final shape: {df_with_temporal.shape}")

        df_with_temporal.to_csv(output_path / f'{hotel_masked_id}_lagged_dataset.csv', index=False)

        total_observations = len(df_with_temporal)
        total_features = len(df_with_temporal.columns) - 1  # every column but 'date'
        # Count the lag columns that were actually generated; a substring test
        # on 'lag' would also match competitor names that merely contain it.
        lag_features = len(lag_columns)
        temporal_features = len(TEMPORAL_FEATURE_NAMES)

        lag_metadata = {
            'hotel_id': hotel_masked_id,
            'imputation_method': 'matrix_completion',
            'selected_lags': final_lags,
            'focal_column': 'base_rate',
            'total_lag_features': lag_features,
            'temporal_features': list(TEMPORAL_FEATURE_NAMES),
            'final_observations': total_observations,
            'original_observations': len(df_final),
            'data_retention_pct': round(retention_pct, 1),
            'feature_summary': {
                'total_features': total_features,
                'lag_features': lag_features,
                'temporal_features': temporal_features,
                'price_features': len(price_columns)
            }
        }

        with open(output_path / f'{hotel_masked_id}_lag_metadata.json', 'w') as f:
            json.dump(lag_metadata, f, indent=2)

        print("Saved: lagged dataset and metadata")

        summary_results.append({
            'hotel_id': hotel_masked_id,
            'original_rows': len(df_final),
            'final_rows': total_observations,
            'rows_lost': len(df_final) - total_observations,
            'data_retention_pct': round(retention_pct, 1),
            'total_features': total_features,
            'lag_features': lag_features,
            'price_columns': len(price_columns),
            'num_competitors': len(price_columns) - 1,  # base_rate is the focal hotel
            'status': 'Success'
        })

    except Exception as e:
        # Per-hotel boundary: record the failure and move on to the next hotel.
        print(f"Error: {str(e)}")
        summary_results.append({
            'hotel_id': hotel_masked_id,
            'original_rows': 0,
            'final_rows': 0,
            'rows_lost': 0,
            'data_retention_pct': 0,
            'total_features': 0,
            'lag_features': 0,
            'price_columns': 0,
            'num_competitors': 0,
            'status': f'Failed: {str(e)}'
        })

# Persist and print the run summary.
# The column list is given explicitly so the frame (and the CSV header) stays
# well-formed when no hotel produced a record; without it an empty
# summary_results yields a frame with no 'status' column and the lookups below
# raise KeyError.
summary_columns = [
    'hotel_id', 'original_rows', 'final_rows', 'rows_lost', 'data_retention_pct',
    'total_features', 'lag_features', 'price_columns', 'num_competitors', 'status',
]
summary_df = pd.DataFrame(summary_results, columns=summary_columns)
summary_df.to_csv(output_path / 'lagged_features_summary.csv', index=False)

success_df = summary_df[summary_df['status'] == 'Success']
failed_count = len(summary_df) - len(success_df)
# Hotels whose input files were missing are skipped by the processing loop
# without a summary record, so they are the gap between the two counts.
skipped_count = len(hotel_list) - len(summary_df)

print(f"\n{'='*80}")
print("LAGGED FEATURES SUMMARY")
print(f"{'='*80}")
print(f"\nTotal hotels processed: {len(hotel_list)}")
print(f"Successful: {len(success_df)}")
print(f"Failed: {failed_count}")
print(f"Skipped (missing input files): {skipped_count}")

if not success_df.empty:
    print(f"\nAverage original rows: {success_df['original_rows'].mean():.0f}")
    print(f"Average final rows: {success_df['final_rows'].mean():.0f}")
    print(f"Average rows lost to lagging: {success_df['rows_lost'].mean():.1f}")
    print(f"Average data retention: {success_df['data_retention_pct'].mean():.1f}%")
    print(f"Average total features: {success_df['total_features'].mean():.0f}")
    print(f"Average lag features: {success_df['lag_features'].mean():.0f}")
    print(f"Average competitors: {success_df['num_competitors'].mean():.1f}")

print("\nSummary saved: lagged_features_summary.csv")
print(summary_df[['hotel_id', 'original_rows', 'final_rows', 'data_retention_pct', 'total_features', 'lag_features', 'status']])

Creating lagged features for 44 focal hotels

Processing: Hotel_01
Focal: 559 rows (2025-03-27 00:00:00 to 2026-10-06 00:00:00)
Competitor matrix: (5, 559)
After merge: (559, 10)
After lagging: (559, 40)
Missing values: 90
After dropping NaN: (554, 40)
Data retention: 99.1%
Final shape: (554, 47)
Saved: lagged dataset and metadata

Processing: Hotel_02
Focal: 430 rows (2025-08-03 00:00:00 to 2026-10-06 00:00:00)
Competitor matrix: (4, 430)
After merge: (430, 9)
After lagging: (430, 34)
Missing values: 75
After dropping NaN: (425, 34)
Data retention: 98.8%
Final shape: (425, 41)
Saved: lagged dataset and metadata

Processing: Hotel_03
Focal: 401 rows (2024-11-26 00:00:00 to 2025-12-31 00:00:00)
Competitor matrix: (9, 401)
After merge: (401, 14)
After lagging: (401, 64)
Missing values: 150
After dropping NaN: (396, 64)
Data retention: 98.8%
Final shape: (396, 71)
Saved: lagged dataset and metadata

Processing: Hotel_04
Focal: 625 rows (2025-01-20 00:00:00 to 2026-10-06 00:00:00)
Competit

After merge: (645, 15)
After lagging: (645, 70)
Missing values: 165
After dropping NaN: (640, 70)
Data retention: 99.2%
Final shape: (640, 77)
Saved: lagged dataset and metadata

Processing: Hotel_22
Focal: 675 rows (2024-11-25 00:00:00 to 2026-09-30 00:00:00)
Competitor matrix: (9, 675)
After merge: (675, 14)
After lagging: (675, 64)
Missing values: 150
After dropping NaN: (670, 64)
Data retention: 99.3%
Final shape: (670, 71)
Saved: lagged dataset and metadata

Processing: Hotel_23
Focal: 681 rows (2024-11-25 00:00:00 to 2026-10-06 00:00:00)
Competitor matrix: (10, 681)
After merge: (681, 15)
After lagging: (681, 70)
Missing values: 165
After dropping NaN: (676, 70)
Data retention: 99.3%
Final shape: (676, 77)
Saved: lagged dataset and metadata

Processing: Hotel_24
Focal: 614 rows (2024-11-25 00:00:00 to 2026-07-31 00:00:00)
Competitor matrix: (10, 614)
After merge: (614, 15)
After lagging: (614, 70)
Missing values: 165
After dropping NaN: (609, 70)
Data retention: 99.2%
Final shape

After merge: (446, 12)
After lagging: (446, 52)
Missing values: 120
After dropping NaN: (441, 52)
Data retention: 98.9%
Final shape: (441, 59)
Saved: lagged dataset and metadata

Processing: Hotel_41
Focal: 430 rows (2025-08-03 00:00:00 to 2026-10-06 00:00:00)
Competitor matrix: (6, 430)
After merge: (430, 11)
After lagging: (430, 46)
Missing values: 105
After dropping NaN: (425, 46)
Data retention: 98.8%
Final shape: (425, 53)
Saved: lagged dataset and metadata

Processing: Hotel_42
Focal file not found, skipping...

Processing: Hotel_43
Focal file not found, skipping...

Processing: Hotel_44
Focal file not found, skipping...

LAGGED FEATURES SUMMARY

Total hotels processed: 44
Successful: 38
Failed: 0

Average original rows: 507
Average final rows: 502
Average rows lost to lagging: 5.0
Average data retention: 98.5%
Average total features: 59
Average lag features: 41
Average competitors: 7.1

Summary saved: lagged_features_summary.csv
    hotel_id  original_rows  final_rows  data_rete