In [1]:
"""
COMPREHENSIVE MODELING - ALL IMPUTATION METHODS & HOTELS
=========================================================
Builds models for all 6 hotels using all 4 imputation methods.
Total: 24 models (6 hotels × 4 methods)

Extracts complete equations with performance metrics for final report.
"""

import pandas as pd
import numpy as np
from pathlib import Path
import json
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

In [2]:
"""
EXTRACT COMPLETE MODEL EQUATIONS - ALL IMPUTATION METHODS
==========================================================
Reproduces the preprocessing pipeline used for modeling, then extracts the
fitted equation for every hotel and imputation method.
"""

import pandas as pd
import numpy as np
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm
from pathlib import Path
import json

# Opening banner for this cell's console output (three lines, same as before).
print("="*80, "EXTRACTING COMPLETE MODEL EQUATIONS - ALL METHODS", "="*80, sep="\n")

def preprocess_dataset(df, y, hotel_id, corr_threshold=0.90):
    """Build the candidate feature pool for one hotel's price model.

    Steps: drop the target hotel's own lag columns, add week-of-year and
    seasonal indicator columns, then greedily keep competitor lag columns
    (strongest absolute correlation with ``y`` first) that are not too
    correlated with a competitor column already kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime ``date`` column. The caller's frame is not
        modified; the augmented frame is returned instead.
    y : pandas.Series
        Target series, index-aligned with ``df``.
    hotel_id : str
        ``'FOCAL'`` (own lags are the ``base_rate_lag*`` columns) or the name
        of a competitor price column (own lags are ``{hotel_id}_lag*``).
    corr_threshold : float, default 0.90
        A competitor column is discarded when its absolute correlation with
        any already-kept competitor column exceeds this value.

    Returns
    -------
    tuple
        ``(df, features_D)``: the augmented frame and the de-duplicated list
        of candidate feature names. The order is deterministic: kept
        competitors, seasonal indicators, harmonic terms, polynomial terms.
    """

    # Remove the target's own lags so the model cannot predict a price from itself.
    if hotel_id == 'FOCAL':
        own_lag_cols = [col for col in df.columns if 'base_rate_lag' in col.lower()]
    else:
        own_lag_cols = [col for col in df.columns if f'{hotel_id}_lag' in col]
    # drop() returns a new frame, so the column assignments below never touch the caller's df.
    df = df.drop(columns=own_lag_cols)

    # Week features: centred polynomial terms plus one sine/cosine pair.
    df['week_of_year'] = df['date'].dt.isocalendar().week.astype(float)
    week_mean = df['week_of_year'].mean()
    df['week_centered'] = df['week_of_year'] - week_mean
    df['week_squared'] = df['week_centered'] ** 2
    df['week_cubic'] = df['week_centered'] ** 3
    df['week_quartic'] = df['week_centered'] ** 4
    # NOTE(review): ISO week numbers can reach 53 while the period used here is 52 — confirm intended.
    df['sin_week'] = np.sin(2 * np.pi * df['week_of_year'] / 52).astype(float)
    df['cos_week'] = np.cos(2 * np.pi * df['week_of_year'] / 52).astype(float)

    # Seasonal indicators (is_holiday is only derived when the dataset lacks it).
    if 'is_holiday' not in df.columns:
        df['is_holiday'] = 0
        df.loc[df['date'].dt.month.isin([12, 1]), 'is_holiday'] = 1
        df.loc[(df['date'].dt.month == 7) & (df['date'].dt.day <= 7), 'is_holiday'] = 1
        df.loc[(df['date'].dt.month == 11) & (df['date'].dt.day >= 22), 'is_holiday'] = 1

    df['is_peak_season'] = ((df['date'].dt.month.isin([6, 7, 8])) |
                            (df['date'].dt.month.isin([12, 1]))).astype(int)
    df['is_summer'] = df['date'].dt.month.isin([6, 7, 8]).astype(int)

    # Competitor lag columns, ranked by absolute correlation with the target.
    competitor_cols = [col for col in df.columns if 'booking-us' in col and 'lag' in col]
    comp_corr_with_target = df[competitor_cols].corrwith(y).abs().sort_values(ascending=False)

    # Greedy redundancy filter: a column survives only if it is not highly
    # correlated with any competitor column that was kept before it.
    all_competitors = []
    for comp in comp_corr_with_target.index:
        is_redundant = any(
            abs(df[comp].corr(df[existing])) > corr_threshold
            for existing in all_competitors
        )
        if not is_redundant:
            all_competitors.append(comp)

    seasonal_cols = ['is_holiday', 'is_peak_season', 'is_summer']

    # Feature pools
    features_A = all_competitors + seasonal_cols + ['sin_week', 'cos_week']
    features_B = all_competitors + seasonal_cols + ['week_centered', 'week_squared', 'week_cubic', 'week_quartic']
    # Order-preserving union. list(set(...)) would make the feature order depend
    # on string hash randomization and therefore differ between kernel sessions.
    features_D = list(dict.fromkeys(features_A + features_B))

    return df, features_D

def _significance_stars(pval):
    """Map a p-value to its star marker: '***' <0.001, '**' <0.01, '*' <0.05, else ''."""
    for cutoff, stars in ((0.001, '***'), (0.01, '**'), (0.05, '*')):
        if pval < cutoff:
            return stars
    return ''


def _feature_category(feat):
    """Label a feature name as 'Competitor', 'Temporal' or 'Seasonal' for reporting."""
    name = feat.lower()
    # 'lag' is tested first: a competitor column whose hotel name happens to
    # contain 'sin', 'cos' or 'week' must not be reported as a temporal term.
    if 'lag' in name:
        return 'Competitor'
    if 'week' in name or 'cos' in name or 'sin' in name:
        return 'Temporal'
    return 'Seasonal'


def extract_equation_details(df, y, features_D, hotel_name, imputation_method):
    """Select features with LassoCV, refit them with OLS, and package the equation.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding every column named in ``features_D``.
    y : pandas.Series
        Target series, index-aligned with ``df``.
    features_D : list of str
        Candidate feature names.
    hotel_name : str
        Stored under ``'hotel'`` in the result.
    imputation_method : str
        Stored under ``'imputation_method'`` in the result.

    Returns
    -------
    dict
        Keys ``'hotel'``, ``'imputation_method'``, ``'intercept'``,
        ``'coefficients'`` (one dict per selected feature with ``feature``,
        ``coefficient``, ``p_value``, ``significance``, ``category``) and
        ``'metrics'`` (``adj_r2``, ``rmse``, ``mape``, ``n_features``,
        ``n_observations``, ``mean_price``). When Lasso keeps no feature, a
        mean-only baseline is returned with an empty coefficient list.
    """

    # Use only rows where every candidate feature AND the target are observed;
    # a NaN target would otherwise be passed straight into the estimators.
    X_D = df[features_D].dropna()
    y_D = y.loc[X_D.index]
    target_observed = y_D.notna().to_numpy()
    X_D = X_D.loc[target_observed]
    y_D = y_D.loc[target_observed]

    # Lasso selection on standardized features
    scaler = StandardScaler()
    X_D_scaled = scaler.fit_transform(X_D)

    lasso = LassoCV(cv=5, random_state=42, max_iter=10000)
    lasso.fit(X_D_scaled, y_D)

    # Features with a non-zero Lasso coefficient, in their features_D order.
    selected_features = [
        feat for feat, keep in zip(features_D, lasso.coef_ != 0) if keep
    ]

    if len(selected_features) == 0:
        # No features selected - return baseline (mean-only) model
        return {
            'hotel': hotel_name,
            'imputation_method': imputation_method,
            'intercept': float(y_D.mean()),
            'coefficients': [],
            'metrics': {
                'adj_r2': 0.0,
                'rmse': float(np.std(y_D)),
                'mape': float(np.mean(np.abs(y_D - y_D.mean()) / y_D) * 100),
                'n_features': 0,
                'n_observations': int(len(y_D)),
                'mean_price': float(np.mean(y_D))
            }
        }

    # Refit the selected features, unscaled, with OLS; p-values use the HC1 covariance.
    X_selected = X_D[selected_features]
    y_selected = y_D

    X_with_const = sm.add_constant(X_selected)
    model = sm.OLS(y_selected, X_with_const).fit(cov_type='HC1')

    # Extract coefficients
    coefficients = []
    for feat in selected_features:
        pval = model.pvalues[feat]
        coefficients.append({
            'feature': feat,
            'coefficient': float(model.params[feat]),
            'p_value': float(pval),
            'significance': _significance_stars(pval),
            'category': _feature_category(feat)
        })

    # Calculate metrics (in-sample)
    y_pred = model.predict(X_with_const)
    mape = float(np.mean(np.abs((y_selected - y_pred) / y_selected)) * 100)

    return {
        'hotel': hotel_name,
        'imputation_method': imputation_method,
        'intercept': float(model.params['const']),
        'coefficients': coefficients,
        'metrics': {
            'adj_r2': float(model.rsquared_adj),
            'rmse': float(np.sqrt(model.mse_resid)),
            'mape': mape,
            'n_features': int(len(selected_features)),
            'n_observations': int(len(y_selected)),
            'mean_price': float(np.mean(y_selected))
        }
    }

# Fit one model per (imputation method, hotel) pair and persist every result.
data_path = Path().cwd().parent.parent / 'data' / 'dataprocessed'

imputation_methods = ['baseline', 'knn', 'mice', 'timeseries']
hotel_ids = [
    'FOCAL',
    'booking-us-aqua-pacific-monarch-USD',
    'booking-us-castle-kamaole-sands-USD',
    'booking-us-courtyard-by-marriott-maui-kahului-airport-USD',
    'booking-us-kohea-kai-resort-maui-USD',
    'booking-us-ohana-waikiki-malia-USD'
]

all_equations = []

for method in imputation_methods:
    print(f"\n{'='*80}")
    print(f"PROCESSING: {method.upper()}")
    print(f"{'='*80}")

    # Load this method's dataset, parse the dates and order rows chronologically.
    df = (
        pd.read_csv(data_path / f'lagged_predictive_dataset_{method}.csv')
        .assign(date=lambda frame: pd.to_datetime(frame['date']))
        .sort_values('date')
        .reset_index(drop=True)
    )

    for hotel_id in hotel_ids:
        print(f"\n  Extracting equation: {hotel_id}")

        # The focal hotel's price lives in 'base_rate'; competitors use their own id.
        hotel_col = hotel_id if hotel_id != 'FOCAL' else 'base_rate'

        if hotel_col in df.columns:
            y = df[hotel_col].copy()

            # Preprocess a copy so the loaded frame stays untouched for the next hotel.
            df_processed, features_D = preprocess_dataset(df.copy(), y, hotel_id)

            # Select features, refit, and collect the equation.
            equation = extract_equation_details(df_processed, y, features_D, hotel_id, method)
            all_equations.append(equation)

            print(f"    Adj R²: {equation['metrics']['adj_r2']:.4f}")
            print(f"    MAPE: {equation['metrics']['mape']:.2f}%")
            print(f"    Features: {equation['metrics']['n_features']}")
        else:
            print(f"    WARNING: {hotel_col} not found, skipping")

# Save results as JSON in the current working directory.
with open('complete_model_equations.json', 'w') as f:
    f.write(json.dumps(all_equations, indent=2))

# Display sample equation
print("\n" + "="*80)
print("SAMPLE EQUATION - BASELINE FOCAL")
print("="*80)

# The baseline/FOCAL entry is absent when its target column was missing and the
# hotel was skipped, so look it up without assuming it exists.
baseline_focal = next(
    (eq for eq in all_equations
     if eq['imputation_method'] == 'baseline' and eq['hotel'] == 'FOCAL'),
    None,
)

if baseline_focal is None:
    print("\nWARNING: no BASELINE / FOCAL equation available, nothing to display")
else:
    print(f"\nHotel: FOCAL")
    print(f"Imputation: BASELINE")
    print(f"Performance: Adj R² = {baseline_focal['metrics']['adj_r2']:.4f}, MAPE = {baseline_focal['metrics']['mape']:.2f}%")
    print(f"\nEquation:")
    print(f"base_rate = {baseline_focal['intercept']:.4f}")

    # Group the terms by category so the printed equation reads in sections.
    for category in ['Temporal', 'Seasonal', 'Competitor']:
        cat_coefs = [c for c in baseline_focal['coefficients'] if c['category'] == category]
        if cat_coefs:
            print(f"\n  # {category} Features")
            for coef in cat_coefs:
                # Negative values already carry their own '-' from the number format.
                sign = '+' if coef['coefficient'] > 0 else ''
                print(f"    {sign}{coef['coefficient']:.6f} × {coef['feature']} {coef['significance']}")

    print(f"\nSignificance: *** p<0.001, ** p<0.01, * p<0.05")

print("\n" + "="*80)
print("COMPLETE EQUATIONS SAVED")
print("="*80)
print("✓ Saved: complete_model_equations.json")

EXTRACTING COMPLETE MODEL EQUATIONS - ALL METHODS

PROCESSING: BASELINE

  Extracting equation: FOCAL
    Adj R²: 0.5833
    MAPE: 6.36%
    Features: 13

  Extracting equation: booking-us-aqua-pacific-monarch-USD
    Adj R²: 0.5535
    MAPE: 4.48%
    Features: 11

  Extracting equation: booking-us-castle-kamaole-sands-USD
    Adj R²: 0.6234
    MAPE: 4.29%
    Features: 13

  Extracting equation: booking-us-courtyard-by-marriott-maui-kahului-airport-USD
    Adj R²: 0.6103
    MAPE: 4.66%
    Features: 11

  Extracting equation: booking-us-kohea-kai-resort-maui-USD
    Adj R²: 0.5596
    MAPE: 2.89%
    Features: 13

  Extracting equation: booking-us-ohana-waikiki-malia-USD
    Adj R²: 0.7371
    MAPE: 6.31%
    Features: 16

PROCESSING: KNN

  Extracting equation: FOCAL
    Adj R²: 0.5933
    MAPE: 6.32%
    Features: 18

  Extracting equation: booking-us-aqua-pacific-monarch-USD
    Adj R²: 0.7214
    MAPE: 4.48%
    Features: 8

  Extracting equation: booking-us-castle-kamaole-sand

In [3]:
# import pandas as pd
# from pathlib import Path

# data_path = Path().cwd().parent.parent / 'data' / 'dataprocessed'

# for method in ['baseline', 'knn', 'mice', 'timeseries']:
#     df = pd.read_csv(data_path / f'lagged_predictive_dataset_{method}.csv')
    
#     print(f"\n{'='*80}")
#     print(f"{method.upper()} DATASET COLUMNS")
#     print(f"{'='*80}")
#     print(f"Total columns: {len(df.columns)}")
    
#     # Show all columns that contain 'base_rate'
#     base_rate_cols = [col for col in df.columns if 'base_rate' in col.lower()]
#     print(f"\nBase rate related columns ({len(base_rate_cols)}):")
#     for col in base_rate_cols:
#         print(f"  - {col}")
    
#     # Show temporal features
#     temporal_cols = [col for col in df.columns if any(x in col for x in ['week', 'month', 'sin', 'cos', 'peak', 'holiday', 'summer'])]
#     print(f"\nTemporal/Seasonal columns ({len(temporal_cols)}):")
#     for col in temporal_cols:
#         print(f"  - {col}")