In [1]:
"""
LAGGED DATA PREPARATION - ADVANCED IMPUTATION VERSION
======================================================
This notebook creates lagged features using the improved competitor data
from advanced imputation methods (Time-Series Decay).

Outputs: lagged_predictive_dataset_timeseries.csv, lag_selection_metadata_timeseries.json
"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import sys
import os
import json
from datetime import datetime, timedelta
from pathlib import Path

# Notebook title framed by two 80-character rules, emitted in a single call.
print("=" * 80, "LAGGED DATA PREPARATION - ADVANCED IMPUTATION VERSION", "=" * 80, sep="\n")

LAGGED DATA PREPARATION - ADVANCED IMPUTATION VERSION


In [2]:
# ============================================================================
# SECTION 1: DATA LOADING
# ============================================================================
# Section banner: one blank line, then the title between two 80-character rules.
print("", "=" * 80, "SECTION 1: DATA LOADING", "=" * 80, sep="\n")

def load_preprocessed_data_timeseries():
    """Read the focal and competitor CSVs and inner-join them on stay date.

    Both files are read from ``data/dataprocessed`` three directory levels
    above the current working directory. In each file the ``stay_date``
    column is parsed into a new ``date`` column, which is used as the join key.

    Returns:
        pandas.DataFrame: Inner-joined rows sorted by ``date`` with a fresh
        0..n-1 index; the ``stay_date_x`` / ``stay_date_y`` merge leftovers
        are dropped when present.
    """
    processed_dir = Path.cwd().parent.parent.parent / 'data' / 'dataprocessed'
    # Side effect kept as-is: the data directory is appended to the module
    # search path on every call.
    sys.path.append(str(processed_dir))

    # Focal data first (unchanged), then the time-series decay imputed
    # competitor matrix; both files are read before any date parsing happens.
    source_files = ('focal_daily_aggregated.csv', 'competitor_price_matrix_timeseries.csv')
    focal_daily, competitor_matrix = (
        pd.read_csv(processed_dir / file_name) for file_name in source_files
    )

    for frame in (focal_daily, competitor_matrix):
        frame['date'] = pd.to_datetime(frame['stay_date'])

    return (
        focal_daily
        .merge(competitor_matrix, on='date', how='inner')
        .drop(['stay_date_x', 'stay_date_y'], axis=1, errors='ignore')
        .sort_values('date')
        .reset_index(drop=True)
    )

# Load the merged focal/competitor frame once and report its size and date span.
df_final = load_preprocessed_data_timeseries()
_first_date = df_final['date'].min()
_last_date = df_final['date'].max()
print(
    f"Loaded dataset shape: {df_final.shape}",
    f"Date range: {_first_date} to {_last_date}",
    "Using TIME-SERIES DECAY IMPUTATION competitor data",
    sep="\n",
)

# ============================================================================
# SECTION 2: DATA STRUCTURE VALIDATION
# ============================================================================
print("\n" + "="*80)
print("SECTION 2: DATA STRUCTURE VALIDATION")
print("="*80)

print("Dataset structure:")
print(df_final.head())
print(f"\nColumns: {list(df_final.columns)}")
print(f"Data types:\n{df_final.dtypes}")
print(f"Missing values:\n{df_final.isnull().sum()}")

# The merged frame carries calendar columns (day_of_week, month, is_weekend)
# next to the prices. They are not prices, so they are excluded here instead
# of being reported (and later lagged) as if they were.
NON_PRICE_COLUMNS = {'date', 'day_of_week', 'month', 'is_weekend'}
price_columns = [col for col in df_final.columns if col not in NON_PRICE_COLUMNS]
print(f"\nPrice columns identified: {price_columns}")

# The focal column is the first price column whose lower-cased name contains
# one of these tokens; None when nothing matches.
FOCAL_NAME_TOKENS = ('focal', 'base_rate', 'price')
focal_col = next(
    (
        col for col in price_columns
        if any(token in col.lower() for token in FOCAL_NAME_TOKENS)
    ),
    None,
)

if focal_col:
    print(f"Focal hotel price column: {focal_col}")
else:
    print("Warning: Could not identify focal hotel price column")

# ============================================================================
# SECTION 3: STATISTICAL LAG ANALYSIS
# ============================================================================
print("\n" + "="*80)
print("SECTION 3: STATISTICAL LAG ANALYSIS")
print("="*80)

def create_comprehensive_lags(df, price_columns, max_lag=14):
    """Append shifted copies (lags 1..max_lag) of each listed column to ``df``.

    Args:
        df: Source frame; it is not modified.
        price_columns: Names of the columns to lag.
        max_lag: Largest shift to generate (inclusive).

    Returns:
        A new frame holding the original columns followed, column by column,
        by ``<col>_lag_<k>`` for k = 1..max_lag.
    """
    shifts = range(1, max_lag + 1)
    shifted_blocks = [
        pd.DataFrame({f'{name}_lag_{k}': df[name].shift(k) for k in shifts})
        for name in price_columns
    ]
    return pd.concat([df, *shifted_blocks], axis=1)

def analyze_autocorrelation(series, max_lag=14):
    """Return the ACF and PACF of ``series`` after dropping missing values.

    Args:
        series: pandas Series to analyse.
        max_lag: Passed as ``nlags`` to both statsmodels functions.

    Returns:
        tuple: ``(acf_values, pacf_values)`` as returned by
        ``statsmodels.tsa.stattools.acf`` (with ``fft=False``) and ``pacf``.
    """
    from statsmodels.tsa.stattools import acf, pacf

    # Drop missing values once and feed the same observations to both estimators.
    observed = series.dropna()
    return (
        acf(observed, nlags=max_lag, fft=False),
        pacf(observed, nlags=max_lag),
    )

def analyze_cross_correlation(focal_series, competitor_series, max_lag=14):
    """Correlate the focal series with lagged copies of a competitor series.

    Args:
        focal_series: pandas Series of focal values.
        competitor_series: pandas Series that is shifted by 1..max_lag rows.
        max_lag: Largest shift to evaluate (inclusive).

    Returns:
        dict: ``{lag: correlation}`` for lag = 1..max_lag, where each value is
        ``focal_series.corr(competitor_series.shift(lag))``.
    """
    return {
        shift_by: focal_series.corr(competitor_series.shift(shift_by))
        for shift_by in range(1, max_lag + 1)
    }

def select_optimal_lags_aic(df, focal_col, max_lag=14):
    """Score nested lag structures (depth 1..max_lag) with AIC on one common sample.

    For every depth ``d`` the focal column is regressed (ordinary least squares
    with an intercept) on all ``<col>_lag_<k>`` columns with ``k <= d`` that
    exist in ``df``, where ``<col>`` is the focal column or any other
    non-``date`` base column.

    All depths are fitted on the SAME rows: the rows that are complete for the
    deepest lag set. ``n * log(mse) + 2k`` is only comparable between models
    estimated on identical observations; fitting each depth on its own
    ``dropna()`` sample (different ``n`` per depth) makes the comparison
    meaningless.

    Args:
        df: Frame containing ``focal_col``, the base columns and their
            pre-computed ``_lag_<k>`` columns.
        focal_col: Name of the regression target.
        max_lag: Deepest lag structure to evaluate.

    Returns:
        dict: ``{depth: {'aic', 'mse', 'n_features', 'n_obs'}}`` for every
        depth that has at least one feature and more observations than
        ``n_features + 1``. Empty when no lag column is found.
    """
    def _is_lag_column(name):
        # A generated lag column ends in "_lag_<digits>". A plain substring
        # test for "lag" would also discard base columns such as "...village...".
        _, marker, suffix = name.rpartition('_lag_')
        return bool(marker) and suffix.isdigit()

    base_cols = [focal_col] + [
        col for col in df.columns
        if col != focal_col and col != 'date' and not _is_lag_column(col)
    ]

    def _features_up_to(depth):
        # Same ordering as before: column by column, lag 1..depth within a column.
        return [
            f'{col}_lag_{lag}'
            for col in base_cols
            for lag in range(1, depth + 1)
            if f'{col}_lag_{lag}' in df.columns
        ]

    deepest_features = _features_up_to(max_lag)
    if not deepest_features:
        return {}

    # Common estimation sample shared by every candidate depth.
    sample = df[deepest_features + [focal_col]].dropna()
    n = len(sample)
    y = sample[focal_col].to_numpy(dtype=float)

    results = {}
    for max_lag_test in range(1, max_lag + 1):
        lag_features = _features_up_to(max_lag_test)
        k = len(lag_features)
        if k == 0 or n <= k + 1:
            continue

        # Least-squares fit with an intercept column; the fitted values are the
        # orthogonal projection of y, i.e. a plain linear regression fit.
        design = np.column_stack([np.ones(n), sample[lag_features].to_numpy(dtype=float)])
        coefficients, *_ = np.linalg.lstsq(design, y, rcond=None)
        residuals = y - design @ coefficients

        # Floor the MSE so a perfect fit yields a very negative AIC, not log(0).
        mse = max(float(np.mean(residuals ** 2)), float(np.finfo(float).tiny))
        aic = n * np.log(mse) + 2 * k

        results[max_lag_test] = {
            'aic': aic,
            'mse': mse,
            'n_features': k,
            'n_obs': n
        }

    return results

# Imported up front so the block does not depend on an import buried in a branch.
from collections import Counter

print("Creating comprehensive lags for analysis...")
max_lag_analysis = 14

# The merged frame carries calendar columns (day_of_week, month, is_weekend)
# next to the prices. They are excluded so that they are neither lagged as
# prices nor mistaken for competitors in the cross-correlation step below.
NON_PRICE_COLUMNS = {'date', 'day_of_week', 'month', 'is_weekend'}
price_columns = [col for col in df_final.columns if col not in NON_PRICE_COLUMNS]
df_analysis = create_comprehensive_lags(df_final, price_columns, max_lag_analysis)

# Fallback used whenever the statistical selection cannot be carried out.
DEFAULT_LAGS = [1, 3, 7]

if focal_col:
    print(f"Analyzing focal hotel column: {focal_col}")

    autocorr, pacf_vals = analyze_autocorrelation(df_final[focal_col], max_lag_analysis)
    # Lags 1-7 are inspected, bounded by how many coefficients were returned.
    # The same bound is reused for the significance filter further down, which
    # previously indexed autocorr with a fixed range(1, 8).
    inspected_lags = range(1, min(8, len(autocorr)))

    print("\nAutocorrelation Analysis:")
    print("Lag\tACF\tPACF\tSignificant")
    for i in inspected_lags:
        significant = "Yes" if abs(autocorr[i]) > 0.1 else "No"
        print(f"{i}\t{autocorr[i]:.3f}\t{pacf_vals[i]:.3f}\t{significant}")

    print("\nCross-correlation with competitors:")
    # Competitors are price columns that are neither the focal column nor
    # named with the focal column as prefix (e.g. base_rate_normalized).
    # NOTE(review): prefix-named columns are assumed to be focal-derived — confirm.
    competitor_cols = [
        col for col in price_columns
        if col != focal_col and not col.startswith(f'{focal_col}_')
    ]

    optimal_lags_by_competitor = {}
    # Only the first three competitors are analysed, as before.
    for comp_col in competitor_cols[:3]:
        cross_corr = analyze_cross_correlation(df_final[focal_col], df_final[comp_col], max_lag_analysis)

        # Lag with the largest absolute correlation for this competitor.
        max_corr_lag = max(cross_corr.keys(), key=lambda k: abs(cross_corr[k]))
        optimal_lags_by_competitor[comp_col] = {
            'optimal_lag': max_corr_lag,
            'correlation': cross_corr[max_corr_lag]
        }

        print(f"{comp_col[:20]}... - Optimal lag: {max_corr_lag}, Correlation: {cross_corr[max_corr_lag]:.3f}")

    print("\nAIC-based lag selection:")
    aic_results = select_optimal_lags_aic(df_analysis, focal_col, max_lag_analysis)

    if aic_results:
        optimal_lag_aic = min(aic_results.keys(), key=lambda k: aic_results[k]['aic'])
        print(f"Optimal max lag (AIC): {optimal_lag_aic}")
        print(f"AIC: {aic_results[optimal_lag_aic]['aic']:.2f}")
        print(f"Features: {aic_results[optimal_lag_aic]['n_features']}")

        # Best lags of competitors whose peak correlation is non-negligible.
        most_common_lags = [
            comp_data['optimal_lag']
            for comp_data in optimal_lags_by_competitor.values()
            if abs(comp_data['correlation']) > 0.05
        ]

        if most_common_lags:
            lag_counts = Counter(most_common_lags)
            significant_autocorr_lags = [i for i in inspected_lags if abs(autocorr[i]) > 0.1]

            # Union of lag 1, significant autocorrelation lags and the two most
            # frequent competitor lags, capped at min(AIC-optimal depth, 5).
            final_lags = list(set([1] + significant_autocorr_lags + [lag for lag, count in lag_counts.most_common(2)]))
            final_lags = sorted([lag for lag in final_lags if lag <= min(optimal_lag_aic, 5)])

            print(f"\nRecommended lags based on analysis (capped at 5 days): {final_lags}")
        else:
            final_lags = list(DEFAULT_LAGS)
            print(f"\nDefaulting to standard lags: {final_lags}")
    else:
        final_lags = list(DEFAULT_LAGS)
        print(f"\nDefaulting to standard lags: {final_lags}")
else:
    final_lags = list(DEFAULT_LAGS)
    print("Could not identify focal hotel column, using default lags: [1, 3, 7]")

# ============================================================================
# SECTION 4: FINAL LAGGED FEATURE CREATION
# ============================================================================
print("\n" + "="*80)
print("SECTION 4: FINAL LAGGED FEATURE CREATION")
print("="*80)

def create_final_lagged_features(df, price_columns, selected_lags):
    """Append ``<col>_lag_<k>`` columns for the chosen lags to a copy of ``df``.

    Args:
        df: Source frame; it is not modified.
        price_columns: Names of the columns to lag.
        selected_lags: Iterable of row shifts to generate for every column.

    Returns:
        A new frame with the original columns followed, column by column, by
        one shifted copy per selected lag.
    """
    def _shifted_block(name):
        # One small frame per source column keeps the output grouped by column.
        return pd.DataFrame({f'{name}_lag_{k}': df[name].shift(k) for k in selected_lags})

    return pd.concat([df] + [_shifted_block(name) for name in price_columns], axis=1)

# Materialise only the selected lags for every price column.
print(f"Creating final lagged features with selected lags: {final_lags}")
df_with_lags = create_final_lagged_features(df_final, price_columns, final_lags)

_missing_after_lagging = df_with_lags.isnull().sum().sum()
print(
    f"Dataset shape after adding selected lags: {df_with_lags.shape}",
    f"Missing values after lagging: {_missing_after_lagging}",
    sep="\n",
)

# dropna() removes every row that has a NaN in any column, which includes the
# leading rows left incomplete by the shifts.
df_with_lags_clean = df_with_lags.dropna()
_retention_pct = len(df_with_lags_clean)/len(df_final)*100
print(
    f"Final dataset shape after removing NaN: {df_with_lags_clean.shape}",
    f"Data retention: {_retention_pct:.1f}%",
    sep="\n",
)

# ============================================================================
# SECTION 5: ADD TEMPORAL FEATURES
# ============================================================================
print("", "=" * 80, "SECTION 5: TEMPORAL FEATURE ENGINEERING", "=" * 80, sep="\n")

def add_temporal_features(df):
    """Return a copy of ``df`` with calendar and cyclical date features.

    Derived from the datetime ``date`` column: ``day_of_week``, ``month`` and
    ``day_of_year`` (overwritten if already present), a sine/cosine pair for
    each of them using periods 7, 12 and 365, and ``is_weekend`` (1 when
    ``day_of_week`` is 5 or 6, else 0).

    Args:
        df: Frame with a datetime ``date`` column; it is not modified.

    Returns:
        pandas.DataFrame: The augmented copy.
    """
    enriched = df.copy()
    stamps = enriched['date'].dt

    # Calendar component -> (values, cycle length used for the encoding).
    calendar_parts = {
        'day_of_week': (stamps.dayofweek, 7),
        'month': (stamps.month, 12),
        'day_of_year': (stamps.dayofyear, 365),
    }

    # Raw calendar columns are written first, then the sin/cos pairs, so the
    # column order matches the order of assignment in the original code.
    for part, (values, _) in calendar_parts.items():
        enriched[part] = values

    for part, (_, period) in calendar_parts.items():
        angle = 2 * np.pi * enriched[part] / period
        enriched[f'sin_{part}'] = np.sin(angle)
        enriched[f'cos_{part}'] = np.cos(angle)

    enriched['is_weekend'] = (enriched['day_of_week'] >= 5).astype(int)

    return enriched

# Enrich the NaN-free lagged frame with the calendar / cyclical features.
df_with_temporal = add_temporal_features(df_with_lags_clean)
print(f"Dataset shape after adding temporal features: {df_with_temporal.shape}")

# Names reported as "temporal features" here and in the saved metadata.
temporal_features = [
    'sin_day_of_week', 'cos_day_of_week',
    'sin_month', 'cos_month',
    'sin_day_of_year', 'cos_day_of_year',
    'is_weekend',
]
print(f"Temporal features added: {temporal_features}")

# ============================================================================
# SECTION 6: FEATURE CORRELATION ANALYSIS
# ============================================================================
print("", "=" * 80, "SECTION 6: FEATURE CORRELATION ANALYSIS", "=" * 80, sep="\n")

def analyze_selected_lag_features(df, focal_col, selected_lags):
    """Print how strongly the selected lag features correlate with the focal column.

    A column counts as a selected lag feature when its name ends in
    ``_lag_<k>`` with ``k`` in ``selected_lags``. The match is on the exact
    lag number, so selecting lag 1 does not also pick up ``_lag_10``..``_lag_19``.

    Lag features whose name starts with ``focal_col`` are reported as focal
    lags; all remaining lag features are reported as competitor lags.

    Args:
        df: Frame containing the focal column and the lag columns.
        focal_col: Name of the focal column; when falsy nothing is printed.
        selected_lags: Iterable of integer lags to look for.

    Returns:
        list: Names of the matching lag columns, in ``df`` column order.
    """
    wanted_lags = {int(lag) for lag in selected_lags}

    def _lag_number(name):
        # Lag encoded in the column name's "_lag_<digits>" suffix, or None.
        _, marker, suffix = name.rpartition('_lag_')
        return int(suffix) if marker and suffix.isdigit() else None

    lag_columns = [col for col in df.columns if _lag_number(col) in wanted_lags]

    if focal_col and lag_columns:
        # Absolute correlations, strongest first.
        correlations = df[lag_columns].corrwith(df[focal_col]).abs().sort_values(ascending=False)

        print(f"Top 10 correlations with {focal_col}:")
        print(correlations.head(10))

        # Prefix match on the full focal column name. Testing for the first
        # "_"-separated token anywhere in the name (e.g. "base") could sweep
        # competitor columns into the focal group.
        focal_lag_cols = [col for col in lag_columns if col.startswith(focal_col)]
        competitor_lag_cols = [col for col in lag_columns if col not in focal_lag_cols]

        if focal_lag_cols:
            print(f"\nFocal hotel lag correlations:")
            focal_corrs = df[focal_lag_cols].corrwith(df[focal_col]).abs().sort_values(ascending=False)
            print(focal_corrs)

        if competitor_lag_cols:
            print(f"\nTop competitor lag correlations:")
            comp_corrs = df[competitor_lag_cols].corrwith(df[focal_col]).abs().sort_values(ascending=False)
            print(comp_corrs.head(5))

    return lag_columns

# The correlation report only makes sense when a focal column was identified.
if focal_col:
    final_lag_columns = analyze_selected_lag_features(df_with_temporal, focal_col, final_lags)
    _lag_column_preview = final_lag_columns[:5]
    print(
        f"\nTotal lag features created: {len(final_lag_columns)}",
        f"Lag features: {_lag_column_preview}... (showing first 5)",
        sep="\n",
    )

# ============================================================================
# SECTION 7: SUMMARY STATISTICS
# ============================================================================
print("\n" + "="*80)
print("SECTION 7: FINAL DATASET SUMMARY")
print("="*80)

total_observations = len(df_with_temporal)
# One column is left out of the feature count (presumably 'date').
total_features = len(df_with_temporal.columns) - 1
# Count only generated lag columns, i.e. names ending in "_lag_<digits>".
# A bare substring test for "lag" also counts original columns whose name
# merely contains those letters (e.g. "...village...").
lag_features = sum(
    1 for col in df_with_temporal.columns
    if '_lag_' in col and col.rsplit('_lag_', 1)[1].isdigit()
)
temporal_features_count = len(temporal_features)

# Rows removed between the loaded frame and the final frame; the percentage
# is reported as 0.0 for an empty input instead of dividing by zero.
rows_lost = len(df_final) - total_observations
rows_lost_pct = rows_lost / len(df_final) * 100 if len(df_final) else 0.0

print(f"Final dataset summary:")
print(f"  Observations: {total_observations}")
print(f"  Total features: {total_features}")
print(f"  Lag features: {lag_features}")
print(f"  Temporal features: {temporal_features_count}")
print(f"  Original features: {total_features - lag_features - temporal_features_count}")
print(f"  Data lost to lagging: {rows_lost} rows ({rows_lost_pct:.1f}%)")

# ============================================================================
# SECTION 8: SAVE DATASET
# ============================================================================
print("", "=" * 80, "SECTION 8: SAVE PREPARED DATASET", "=" * 80, sep="\n")

# NOTE(review): the loader in this notebook reads from data/dataprocessed three
# directory levels above the working directory, while this writes two levels
# above. mkdir(parents=True) would silently create the directory if it does
# not exist — confirm which location is intended.
output_path = Path.cwd().parent.parent / 'data' / 'dataprocessed'
output_path.mkdir(parents=True, exist_ok=True)

dataset_file = output_path / 'lagged_predictive_dataset_timeseries.csv'
metadata_file = output_path / 'lag_selection_metadata_timeseries.json'

# Dataset is saved with the "timeseries" suffix.
df_with_temporal.to_csv(dataset_file, index=False)

_original_feature_count = total_features - lag_features - temporal_features_count

_feature_summary = {
    'total_features': total_features,
    'lag_features': lag_features,
    'temporal_features': temporal_features_count,
    'original_features': _original_feature_count,
}

# NOTE(review): the distortion figure is a hard-coded constant here; it is not
# computed anywhere in the visible part of this notebook.
_data_quality = {
    'imputation_correlation_distortion': 0.0039,
    'imputation_method': 'Time-Series Decay',
}

# Key order is kept as-is because it determines the layout of the JSON file.
lag_metadata = {
    'imputation_method': 'timeseries',
    'selected_lags': final_lags,
    'lag_selection_method': 'statistical_analysis',
    'focal_column': focal_col,
    'total_lag_features': lag_features,
    'temporal_features': temporal_features,
    'final_observations': total_observations,
    'data_retention_pct': round(len(df_with_temporal)/len(df_final)*100, 1),
    'feature_summary': _feature_summary,
    'data_quality': _data_quality,
}

with open(metadata_file, 'w') as metadata_handle:
    json.dump(lag_metadata, metadata_handle, indent=2)

print(
    "Dataset saved successfully!",
    "\nFiles created:",
    "  - lagged_predictive_dataset_timeseries.csv",
    "  - lag_selection_metadata_timeseries.json",
    sep="\n",
)


SECTION 1: DATA LOADING
Loaded dataset shape: (365, 11)
Date range: 2025-09-16 00:00:00 to 2026-09-15 00:00:00
Using TIME-SERIES DECAY IMPUTATION competitor data

SECTION 2: DATA STRUCTURE VALIDATION
Dataset structure:
   base_rate  base_rate_normalized  day_of_week  month  is_weekend       date  \
0      219.0               188.290            1      9           0 2025-09-16   
1      249.0               243.145            2      9           0 2025-09-17   
2      269.0               279.715            3      9           0 2025-09-18   
3      209.0               170.005            4      9           0 2025-09-19   
4      209.0               170.005            5      9           1 2025-09-20   

   booking-us-aqua-pacific-monarch-USD  booking-us-castle-kamaole-sands-USD  \
0                                179.0                                308.0   
1                                214.0                                308.0   
2                                179.0                  