In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import xgboost as xgb
import lightgbm as lgb
import shap
import warnings
warnings.filterwarnings('ignore')

print("="*80)
print("NON-PARAMETRIC MODELS & SHAP ANALYSIS (FIXED - NO DATA LEAKAGE)")
print("="*80)

# Load datasets
matrix_path = Path('../../../data/dataprocessed')
baseline_path = Path('../../results/data/output-data')

df_matrix = pd.read_csv(matrix_path / 'lagged_predictive_dataset_matrix_completion.csv')
df_matrix['date'] = pd.to_datetime(df_matrix['date'])
df_matrix = df_matrix.sort_values('date').reset_index(drop=True)

df_baseline = pd.read_csv(baseline_path / 'lagged_predictive_dataset_baseline.csv')
df_baseline['date'] = pd.to_datetime(df_baseline['date'])
df_baseline = df_baseline.sort_values('date').reset_index(drop=True)

print(f"\nMatrix Completion data: {df_matrix.shape}")
print(f"Baseline data: {df_baseline.shape}")

# Hotel configuration
hotel_ids = [
    'FOCAL',
    'booking-us-aqua-pacific-monarch-USD',
    'booking-us-castle-kamaole-sands-USD',
    'booking-us-courtyard-by-marriott-maui-kahului-airport-USD',
    'booking-us-kohea-kai-resort-maui-USD',
    'booking-us-ohana-waikiki-malia-USD'
]

hotel_names_display = {
    'FOCAL': 'FOCAL HOTEL',
    'booking-us-aqua-pacific-monarch-USD': 'AQUA PACIFIC MONARCH',
    'booking-us-castle-kamaole-sands-USD': 'CASTLE KAMAOLE SANDS',
    'booking-us-courtyard-by-marriott-maui-kahului-airport-USD': 'COURTYARD MARRIOTT MAUI',
    'booking-us-kohea-kai-resort-maui-USD': 'KOHEA KAI RESORT',
    'booking-us-ohana-waikiki-malia-USD': 'OHANA WAIKIKI MALIA'
}

# Determine which dataset to use for each hotel
hotel_datasets = {
    'FOCAL': 'matrix',
    'booking-us-aqua-pacific-monarch-USD': 'matrix',
    'booking-us-castle-kamaole-sands-USD': 'matrix',
    'booking-us-courtyard-by-marriott-maui-kahului-airport-USD': 'matrix',
    'booking-us-kohea-kai-resort-maui-USD': 'baseline',
    'booking-us-ohana-waikiki-malia-USD': 'matrix'
}

def prepare_features(df, hotel_id):
    """Prepare features for modeling - FIXED TO PREVENT DATA LEAKAGE"""
    # Add temporal features
    df['week_of_year'] = df['date'].dt.isocalendar().week.astype(float)
    week_mean = df['week_of_year'].mean()
    df['week_centered'] = df['week_of_year'] - week_mean
    df['week_squared'] = df['week_centered'] ** 2
    df['week_cubic'] = df['week_centered'] ** 3
    df['sin_week'] = np.sin(2 * np.pi * df['week_of_year'] / 52).astype(float)
    df['cos_week'] = np.cos(2 * np.pi * df['week_of_year'] / 52).astype(float)
    
    if 'is_holiday' not in df.columns:
        df['is_holiday'] = 0
        df.loc[df['date'].dt.month.isin([12, 1]), 'is_holiday'] = 1
        df.loc[(df['date'].dt.month == 7) & (df['date'].dt.day <= 7), 'is_holiday'] = 1
        df.loc[(df['date'].dt.month == 11) & (df['date'].dt.day >= 22), 'is_holiday'] = 1
    
    df['is_peak_season'] = ((df['date'].dt.month.isin([6, 7, 8])) | 
                            (df['date'].dt.month.isin([12, 1]))).astype(int)
    df['is_summer'] = df['date'].dt.month.isin([6, 7, 8]).astype(int)
    
    # CRITICAL FIX: Remove ALL columns containing the target variable or its lags
    if hotel_id == 'FOCAL':
        # Exclude ALL columns with 'base_rate' in them (including normalized and lags)
        leakage_cols = [col for col in df.columns if 'base_rate' in col.lower() and col != 'base_rate']
        if leakage_cols:
            df = df.drop(columns=leakage_cols, errors='ignore')
            print(f"  ✓ Excluded {len(leakage_cols)} base_rate-related columns to prevent leakage")
            print(f"    Excluded: {', '.join(leakage_cols[:5])}{'...' if len(leakage_cols) > 5 else ''}")
    else:
        # Exclude own price and own lags for competitor hotels
        own_lag_cols = [col for col in df.columns if f'{hotel_id}_lag' in col]
        if own_lag_cols:
            df = df.drop(columns=own_lag_cols, errors='ignore')
            print(f"  ✓ Excluded {len(own_lag_cols)} own-lag columns to prevent leakage")
    
    return df

def train_nonparametric_models_insample(X, y, hotel_name):
    """Train Random Forest, XGBoost, and LightGBM - IN-SAMPLE"""
    results = {}
    
    # Random Forest
    print(f"\n  Training Random Forest...")
    rf = RandomForestRegressor(
        n_estimators=200,
        max_depth=10,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X, y)
    y_pred_rf = rf.predict(X)
    
    results['random_forest'] = {
        'model': rf,
        'r2': r2_score(y, y_pred_rf),
        'rmse': np.sqrt(mean_squared_error(y, y_pred_rf)),
        'mape': mean_absolute_percentage_error(y, y_pred_rf) * 100,
        'predictions': y_pred_rf
    }
    
    # XGBoost
    print(f"  Training XGBoost...")
    xgb_model = xgb.XGBRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1
    )
    xgb_model.fit(X, y)
    y_pred_xgb = xgb_model.predict(X)
    
    results['xgboost'] = {
        'model': xgb_model,
        'r2': r2_score(y, y_pred_xgb),
        'rmse': np.sqrt(mean_squared_error(y, y_pred_xgb)),
        'mape': mean_absolute_percentage_error(y, y_pred_xgb) * 100,
        'predictions': y_pred_xgb
    }
    
    # LightGBM
    print(f"  Training LightGBM...")
    lgb_model = lgb.LGBMRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.05,
        subsample=0.8,
        colsample_bytree=0.8,
        random_state=42,
        n_jobs=-1,
        verbose=-1
    )
    lgb_model.fit(X, y)
    y_pred_lgb = lgb_model.predict(X)
    
    results['lightgbm'] = {
        'model': lgb_model,
        'r2': r2_score(y, y_pred_lgb),
        'rmse': np.sqrt(mean_squared_error(y, y_pred_lgb)),
        'mape': mean_absolute_percentage_error(y, y_pred_lgb) * 100,
        'predictions': y_pred_lgb
    }
    
    return results

# Store all results
all_results = {}
shap_results = {}

print("\n" + "="*80)
print("PART C: NON-PARAMETRIC MODELS (IN-SAMPLE, NO LEAKAGE)")
print("="*80)

for hotel_id in hotel_ids:
    hotel_col = 'base_rate' if hotel_id == 'FOCAL' else hotel_id
    dataset_type = hotel_datasets[hotel_id]
    
    df = df_matrix.copy() if dataset_type == 'matrix' else df_baseline.copy()
    
    print(f"\n{hotel_names_display[hotel_id]}")
    print(f"Dataset: {dataset_type.upper()}")
    
    if hotel_col not in df.columns:
        print(f"  ✗ Column {hotel_col} not found")
        continue
    
    # Prepare data
    df_prep = prepare_features(df.copy(), hotel_id)
    y = df_prep[hotel_col].copy()
    
    # Select features (exclude date, target, and own lags)
    exclude_cols = ['date', hotel_col]
    feature_cols = [col for col in df_prep.columns if col not in exclude_cols and df_prep[col].dtype in ['int64', 'float64']]
    
    X = df_prep[feature_cols].fillna(0)
    
    print(f"  Sample size: {len(X)}")
    print(f"  Features: {len(feature_cols)}")
    
    # Train models (IN-SAMPLE)
    results = train_nonparametric_models_insample(X, y, hotel_id)
    all_results[hotel_id] = results
    
    # Print results
    print(f"\n  Model Performance (IN-SAMPLE, NO LEAKAGE):")
    for model_name, res in results.items():
        print(f"    {model_name.upper():<15} R²: {res['r2']:.4f}  RMSE: ${res['rmse']:.2f}  MAPE: {res['mape']:.2f}%")

print("\n" + "="*80)
print("PART D: SHAP FEATURE IMPORTANCE (ALL HOTELS)")
print("="*80)

# Calculate SHAP for all hotels
for hotel_id in hotel_ids:
    hotel_col = 'base_rate' if hotel_id == 'FOCAL' else hotel_id
    dataset_type = hotel_datasets[hotel_id]
    
    if hotel_id not in all_results:
        continue
    
    print(f"\n{hotel_names_display[hotel_id]}")
    print("-" * 60)
    
    df = df_matrix.copy() if dataset_type == 'matrix' else df_baseline.copy()
    df_prep = prepare_features(df.copy(), hotel_id)
    y = df_prep[hotel_col].copy()
    
    exclude_cols = ['date', hotel_col]
    feature_cols = [col for col in df_prep.columns if col not in exclude_cols and df_prep[col].dtype in ['int64', 'float64']]
    X = df_prep[feature_cols].fillna(0)
    
    # Get the best performing model for this hotel
    hotel_results = all_results[hotel_id]
    best_model_name = max(hotel_results.keys(), key=lambda k: hotel_results[k]['r2'])
    best_model = hotel_results[best_model_name]['model']
    
    print(f"Best model: {best_model_name.upper()} (R² = {hotel_results[best_model_name]['r2']:.4f})")
    print(f"Samples: {len(X)}")
    
    # Calculate SHAP values
    try:
        if best_model_name == 'random_forest':
            explainer = shap.TreeExplainer(best_model)
            shap_values = explainer.shap_values(X)
        elif best_model_name in ['xgboost', 'lightgbm']:
            explainer = shap.TreeExplainer(best_model)
            shap_values = explainer.shap_values(X)
        
        # Get feature importance
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': np.abs(shap_values).mean(axis=0)
        }).sort_values('importance', ascending=False)
        
        print(f"\nTop 15 Most Important Features:")
        for idx, row in feature_importance.head(15).iterrows():
            print(f"  {row['feature']:<50} {row['importance']:.6f}")
        
        # Save SHAP results for this hotel
        shap_results[hotel_id] = {
            'hotel_name': hotel_names_display[hotel_id],
            'feature_importance': feature_importance.to_dict('records'),
            'best_model': best_model_name,
            'model_performance': {
                'r2': float(hotel_results[best_model_name]['r2']),
                'rmse': float(hotel_results[best_model_name]['rmse']),
                'mape': float(hotel_results[best_model_name]['mape'])
            },
            'dataset_type': dataset_type
        }
        
    except Exception as e:
        print(f"  ✗ SHAP calculation failed: {str(e)}")
        continue

# Create output directory
output_path = Path('../../../data/dataprocessed')
output_path.mkdir(parents=True, exist_ok=True)

# Save numerical results
results_summary = {}
for hotel_id, res in all_results.items():
    results_summary[hotel_id] = {
        model_name: {
            'r2': float(metrics['r2']),
            'rmse': float(metrics['rmse']),
            'mape': float(metrics['mape'])
        }
        for model_name, metrics in res.items()
    }

with open(output_path / 'nonparametric_results_insample_no_leakage.json', 'w') as f:
    json.dump(results_summary, f, indent=2)

with open(output_path / 'shap_feature_importance_all_hotels_no_leakage.json', 'w') as f:
    json.dump(shap_results, f, indent=2)

print("\n" + "="*80)
print("SUMMARY TABLE - ALL MODELS & HOTELS (IN-SAMPLE, NO LEAKAGE)")
print("="*80)
print(f"\n{'Hotel':<35} {'Model':<15} {'R²':<8} {'RMSE ($)':<10} {'MAPE (%)':<10}")
print("-" * 80)

for hotel_id in hotel_ids:
    if hotel_id in all_results:
        for model_name, res in all_results[hotel_id].items():
            hotel_display = hotel_names_display[hotel_id].title()
            model_display = model_name.replace('_', ' ').title()
            print(f"{hotel_display:<35} {model_display:<15} {res['r2']:<8.3f} {res['rmse']:<10.2f} {res['mape']:<10.2f}")
        print()

# Calculate means
all_r2 = [res['r2'] for results in all_results.values() for res in results.values()]
all_rmse = [res['rmse'] for results in all_results.values() for res in results.values()]
all_mape = [res['mape'] for results in all_results.values() for res in results.values()]

print("-" * 80)
print(f"{'Mean (All Models)':<35} {'':<15} {np.mean(all_r2):<8.3f} {np.mean(all_rmse):<10.2f} {np.mean(all_mape):<10.2f}")

# Print SHAP summary
print("\n" + "="*80)
print("SHAP ANALYSIS SUMMARY")
print("="*80)
print(f"\nSHAP values calculated for {len(shap_results)}/6 hotels")
print("\nTop 5 Features by Hotel (based on SHAP importance):")
print("-" * 80)

for hotel_id, shap_data in shap_results.items():
    print(f"\n{shap_data['hotel_name']}:")
    print(f"  Best Model: {shap_data['best_model'].upper()}")
    print(f"  R²: {shap_data['model_performance']['r2']:.4f}")
    print(f"  Top 5 Features:")
    for i, feat in enumerate(shap_data['feature_importance'][:5], 1):
        print(f"    {i}. {feat['feature']:<45} (importance: {feat['importance']:.6f})")

print("\n" + "="*80)
print("COMPARISON WITH LINEAR REGRESSION")
print("="*80)
print("\nLinear Regression (Hybrid): Avg R² = 0.6650, Avg MAPE = 5.92%")
print(f"Non-Parametric (No Leakage): Avg R² = {np.mean(all_r2):.4f}, Avg MAPE = {np.mean(all_mape):.2f}%")

print("\n" + "="*80)
print("FILES SAVED:")
print("  - nonparametric_results_insample_no_leakage.json")
print("  - shap_feature_importance_all_hotels_no_leakage.json")
print("="*80)

NON-PARAMETRIC MODELS & SHAP ANALYSIS (FIXED - NO DATA LEAKAGE)

Matrix Completion data: (359, 53)
Baseline data: (360, 68)

PART C: NON-PARAMETRIC MODELS (IN-SAMPLE, NO LEAKAGE)

FOCAL HOTEL
Dataset: MATRIX
  ✓ Excluded 11 base_rate-related columns to prevent leakage
    Excluded: base_rate_normalized, base_rate_lag_1, base_rate_lag_2, base_rate_lag_3, base_rate_lag_4...
  Sample size: 359
  Features: 49

  Training Random Forest...
  Training XGBoost...
  Training LightGBM...

  Model Performance (IN-SAMPLE, NO LEAKAGE):
    RANDOM_FOREST   R²: 0.9199  RMSE: $9.58  MAPE: 2.31%
    XGBOOST         R²: 0.9969  RMSE: $1.89  MAPE: 0.52%
    LIGHTGBM        R²: 0.9302  RMSE: $8.94  MAPE: 2.32%

AQUA PACIFIC MONARCH
Dataset: MATRIX
  ✓ Excluded 5 own-lag columns to prevent leakage
  Sample size: 359
  Features: 55

  Training Random Forest...
  Training XGBoost...
  Training LightGBM...

  Model Performance (IN-SAMPLE, NO LEAKAGE):
    RANDOM_FOREST   R²: 0.9503  RMSE: $7.42  MAPE: 1.72%
 


Top 15 Most Important Features:
  week_of_year                                       7.381121
  day_of_week_lag_4                                  2.233213
  booking-us-courtyard-by-marriott-maui-kahului-airport-USD_lag_3 1.748221
  base_rate_lag_1                                    1.282248
  week_centered                                      1.114531
  booking-us-ohana-waikiki-malia-USD                 0.913147
  booking-us-ohana-waikiki-malia-USD_lag_2           0.776066
  month                                              0.727995
  booking-us-ohana-waikiki-malia-USD_lag_1           0.717090
  booking-us-ohana-waikiki-malia-USD_lag_3           0.679130
  day_of_year                                        0.622628
  base_rate_lag_2                                    0.621947
  base_rate_lag_3                                    0.593408
  day_of_week_lag_3                                  0.562861
  sin_day_of_year                                    0.559259

OHANA WAIKIKI MALIA
---

In [2]:
"""
EXTRACT FEATURES USED BY TREE-BASED MODELS
===========================================
Show which features each model actually used and their importance
"""

import pandas as pd
import numpy as np
from pathlib import Path
import json

# Load the saved results
data_path = Path('../../../data/dataprocessed')

with open(data_path / 'shap_feature_importance_all_hotels_no_leakage.json', 'r') as f:
    shap_data = json.load(f)

print("="*80)
print("FEATURES USED BY TREE-BASED MODELS (NO DATA LEAKAGE)")
print("="*80)

hotel_names = {
    'FOCAL': 'FOCAL HOTEL',
    'booking-us-aqua-pacific-monarch-USD': 'AQUA PACIFIC MONARCH',
    'booking-us-castle-kamaole-sands-USD': 'CASTLE KAMAOLE SANDS',
    'booking-us-courtyard-by-marriott-maui-kahului-airport-USD': 'COURTYARD MARRIOTT MAUI',
    'booking-us-kohea-kai-resort-maui-USD': 'KOHEA KAI RESORT',
    'booking-us-ohana-waikiki-malia-USD': 'OHANA WAIKIKI MALIA'
}

for hotel_id, data in shap_data.items():
    print(f"\n{'='*80}")
    print(f"{data['hotel_name']}")
    print(f"{'='*80}")
    
    print(f"\nBest Model: {data['best_model'].upper()}")
    print(f"Performance: R² = {data['model_performance']['r2']:.4f}, MAPE = {data['model_performance']['mape']:.2f}%")
    print(f"Dataset: {data['dataset_type'].upper()}")
    
    # Get all features
    all_features = data['feature_importance']
    total_features = len(all_features)
    
    # Filter features with importance > 0.01 (actually used)
    used_features = [f for f in all_features if f['importance'] > 0.01]
    
    print(f"\nTotal available features: {total_features}")
    print(f"Features actually used (importance > 0.01): {len(used_features)}")
    
    # Categorize features
    temporal = []
    competitor = []
    seasonal = []
    other = []
    
    for feat in used_features:
        fname = feat['feature']
        if any(x in fname.lower() for x in ['month', 'week', 'day_of_year', 'day_of_week']):
            temporal.append(feat)
        elif 'booking-us' in fname or 'base_rate' in fname:
            competitor.append(feat)
        elif any(x in fname.lower() for x in ['sin_', 'cos_', 'holiday', 'peak', 'summer', 'weekend']):
            seasonal.append(feat)
        else:
            other.append(feat)
    
    print(f"\nFeature Breakdown:")
    print(f"  Temporal features: {len(temporal)}")
    print(f"  Competitor features: {len(competitor)}")
    print(f"  Seasonal features: {len(seasonal)}")
    print(f"  Other features: {len(other)}")
    
    # Show top 20 features with importance
    print(f"\nTop 20 Features by Importance:")
    print(f"{'Rank':<6} {'Feature':<55} {'Importance':<12} {'Category'}")
    print("-" * 90)
    
    for i, feat in enumerate(all_features[:20], 1):
        fname = feat['feature']
        importance = feat['importance']
        
        # Determine category
        if any(x in fname.lower() for x in ['month', 'week', 'day_of_year', 'day_of_week']):
            cat = 'Temporal'
        elif 'booking-us' in fname or 'base_rate' in fname:
            cat = 'Competitor'
        elif any(x in fname.lower() for x in ['sin_', 'cos_', 'holiday', 'peak', 'summer', 'weekend']):
            cat = 'Seasonal'
        else:
            cat = 'Other'
        
        print(f"{i:<6} {fname:<55} {importance:<12.6f} {cat}")
    
    # Calculate cumulative importance
    cumsum = np.cumsum([f['importance'] for f in all_features])
    total_importance = cumsum[-1]
    
    # Find how many features explain 90%, 95%, 99% of importance
    idx_90 = np.argmax(cumsum >= 0.90 * total_importance) + 1
    idx_95 = np.argmax(cumsum >= 0.95 * total_importance) + 1
    idx_99 = np.argmax(cumsum >= 0.99 * total_importance) + 1
    
    print(f"\nCumulative Feature Importance:")
    print(f"  Top {idx_90} features explain 90% of model importance")
    print(f"  Top {idx_95} features explain 95% of model importance")
    print(f"  Top {idx_99} features explain 99% of model importance")

print("\n" + "="*80)
print("KEY DIFFERENCES FROM LINEAR REGRESSION")
print("="*80)

print("\nLINEAR REGRESSION:")
print("  ✓ Provides explicit equations: Y = β₀ + β₁X₁ + β₂X₂ + ...")
print("  ✓ Each coefficient shows exact impact")
print("  ✓ Easy to interpret and explain")
print("  ✓ Can write down the exact formula")
print("  ✗ Assumes linear relationships")
print("  ✗ Lower accuracy (R² ≈ 0.67)")

print("\nTREE-BASED MODELS (XGBoost, Random Forest, LightGBM):")
print(" Capture non-linear relationships")
print(" Much higher accuracy (R² ≈ 0.97-0.99)")
print(" Feature importance via SHAP values")
print(" Can show which features matter most")
print(" No simple equation")
print(" Predictions come from complex decision trees")
print(" 'Black box' - harder to explain exactly HOW predictions are made")

print("\n" + "="*80)
print("WHAT YOU CAN REPORT FOR TREE-BASED MODELS")
print("="*80)

print("\nInstead of equations, report:")
print("  1. Model type and hyperparameters")
print("  2. Performance metrics (R², RMSE, MAPE)")
print("  3. Top N most important features (via SHAP)")
print("  4. Feature importance rankings")
print("  5. Partial dependence plots (how each feature affects price)")
print("  6. Example predictions with SHAP force plots")

print("\nFor your thesis/report:")
print("  - Use LINEAR REGRESSION for interpretability (show equations)")
print("  - Use TREE-BASED MODELS for accuracy (show feature importance)")
print("  - Compare both approaches")
print("  - Explain the accuracy vs interpretability tradeoff")

print("\n" + "="*80)
print("SUMMARY TABLE - ALL MODELS")
print("="*80)

print(f"\n{'Hotel':<35} {'Model':<15} {'R²':<8} {'MAPE':<8} {'Features Used':<15} {'Top Driver'}")
print("-" * 100)

for hotel_id, data in shap_data.items():
    hotel_name = data['hotel_name'].title()
    model = data['best_model'].upper()
    r2 = data['model_performance']['r2']
    mape = data['model_performance']['mape']
    
    used_feats = len([f for f in data['feature_importance'] if f['importance'] > 0.01])
    top_feat = data['feature_importance'][0]['feature']
    
    # Shorten long feature names
    if len(top_feat) > 35:
        top_feat = top_feat[:32] + "..."
    
    print(f"{hotel_name:<35} {model:<15} {r2:<8.4f} {mape:<8.2f} {used_feats:<15} {top_feat}")

print("\n" + "="*80)

FEATURES USED BY TREE-BASED MODELS (NO DATA LEAKAGE)

FOCAL HOTEL

Best Model: XGBOOST
Performance: R² = 0.9969, MAPE = 0.52%
Dataset: MATRIX

Total available features: 49
Features actually used (importance > 0.01): 47

Feature Breakdown:
  Temporal features: 14
  Competitor features: 30
  Seasonal features: 3
  Other features: 0

Top 20 Features by Importance:
Rank   Feature                                                 Importance   Category
------------------------------------------------------------------------------------------
1      month                                                   8.408547     Temporal
2      booking-us-courtyard-by-marriott-maui-kahului-airport-USD_lag_1 2.844317     Competitor
3      booking-us-kohea-kai-resort-maui-USD                    2.424810     Competitor
4      day_of_year                                             2.257157     Temporal
5      booking-us-kohea-kai-resort-maui-USD_lag_1              2.176940     Competitor
6      cos_day_of_yea