In [1]:
import json
import warnings
from pathlib import Path

import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
warnings.filterwarnings('ignore')

## Iteration 2 Baseline

In [2]:
# import pandas as pd
# import numpy as np
# from pathlib import Path
# import json
# from sklearn.metrics import r2_score, mean_squared_error
# import statsmodels.api as sm
# from statsmodels.stats.diagnostic import het_breuschpagan
# from statsmodels.stats.stattools import durbin_watson
# import warnings
# warnings.filterwarnings('ignore')

# print("="*80)
# print("LINEAR VS LOG-LOG REGRESSION MODELS")
# print("="*80)

# # Load data
# data_path = Path().cwd().parent.parent / 'data' / 'dataprocessed'
# df = pd.read_csv(data_path / 'lagged_predictive_dataset.csv')

# with open(data_path / 'lag_selection_metadata.json', 'r') as f:
#     metadata = json.load(f)

# df['date'] = pd.to_datetime(df['date'])
# df = df.sort_values('date').reset_index(drop=True)
# y = df[metadata['focal_column']]

# print(f"\nDataset: {len(df)} observations")
# print(f"Target: {metadata['focal_column']}")

# # ============================================================================
# # FEATURE PREPARATION
# # ============================================================================
# print("\n" + "="*80)
# print("FEATURE PREPARATION")
# print("="*80)

# # 1. ALL FOCAL LAGS
# focal_lags = [f'base_rate_lag_{i}' for i in range(1, 6)]
# print(f"\n1. Focal lags: {len(focal_lags)} features")

# # 2. ALL COMPETITORS (remove only severe multicollinearity > 0.90)
# competitor_cols = [col for col in df.columns if 'booking-us' in col and 'lag' in col]
# comp_corr_with_target = df[competitor_cols].corrwith(y).abs().sort_values(ascending=False)

# all_competitors = []
# for comp in comp_corr_with_target.index:
#     is_redundant = False
#     for existing in all_competitors:
#         if abs(df[comp].corr(df[existing])) > 0.90:
#             is_redundant = True
#             break
#     if not is_redundant:
#         all_competitors.append(comp)

# print(f"2. All competitors (correlation < 0.90): {len(all_competitors)} features")

# # 3. WEEK OF YEAR
# df['week_of_year'] = df['date'].dt.isocalendar().week.astype(float)
# df['sin_week'] = np.sin(2 * np.pi * df['week_of_year'] / 52).astype(float)
# df['cos_week'] = np.cos(2 * np.pi * df['week_of_year'] / 52).astype(float)
# print(f"3. Week of year: 2 features (sin_week, cos_week)")

# # 4. TEMPORAL FEATURES
# temporal_features = ['sin_day_of_week', 'cos_day_of_week', 'sin_month', 'cos_month', 
#                      'sin_day_of_year', 'cos_day_of_year', 'is_weekend']
# print(f"4. Temporal features: {len(temporal_features)} features")

# # 5. HOLIDAY FLAG
# if 'is_holiday' not in df.columns:
#     df['is_holiday'] = 0
#     df.loc[df['date'].dt.month.isin([12, 1]), 'is_holiday'] = 1
#     df.loc[(df['date'].dt.month == 7) & (df['date'].dt.day <= 7), 'is_holiday'] = 1
#     df.loc[(df['date'].dt.month == 11) & (df['date'].dt.day >= 22), 'is_holiday'] = 1
# print(f"5. Holiday flag: 1 feature")

# # 6. PEAK SEASON (Summer + Winter holidays)
# df['is_peak_season'] = ((df['date'].dt.month.isin([6, 7, 8])) | 
#                         (df['date'].dt.month.isin([12, 1]))).astype(int)
# print(f"6. Peak season flag: 1 feature")

# # 7. SUMMER TRAVEL SEASON (June-August specifically)
# df['is_summer'] = df['date'].dt.month.isin([6, 7, 8]).astype(int)
# print(f"7. Summer season flag: 1 feature")

# print(f"\nTotal features available: {len(focal_lags) + len(all_competitors) + 2 + len(temporal_features) + 3}")

# # ============================================================================
# # LINEAR MODELS
# # ============================================================================
# print("\n" + "="*80)
# print("LINEAR MODELS")
# print("="*80)

# def estimate_linear_model(feature_list, model_name, description):
#     """Estimate OLS model with robust standard errors"""
#     valid_features = [f for f in feature_list if f in df.columns]
#     X = df[valid_features]
#     X_const = sm.add_constant(X)
    
#     model = sm.OLS(y, X_const).fit(cov_type='HC1')
#     y_pred = model.predict(X_const)
#     residuals = y - y_pred
    
#     n, k = len(y), len(valid_features)
#     r2 = r2_score(y, y_pred)
#     adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)
#     rmse = np.sqrt(mean_squared_error(y, y_pred))
    
#     sig_count = sum(model.pvalues[1:] < 0.05)
#     dw = durbin_watson(residuals)
#     bp_stat, bp_pval = het_breuschpagan(residuals, X_const)[0:2]
    
#     return {
#         'name': model_name,
#         'description': description,
#         'type': 'Linear',
#         'model': model,
#         'n_features': k,
#         'r2': r2,
#         'adj_r2': adj_r2,
#         'rmse': rmse,
#         'aic': model.aic,
#         'bic': model.bic,
#         'sig_features': sig_count,
#         'sig_ratio': sig_count / k if k > 0 else 0,
#         'durbin_watson': dw,
#         'heteroscedasticity_pval': bp_pval,
#         'condition_number': np.linalg.cond(X_const)
#     }

# models = {}

# # Linear Model 1: All focal + all competitors + week + holiday + season
# linear_full = focal_lags + all_competitors + ['sin_week', 'cos_week', 'is_holiday', 'is_peak_season', 'is_summer']
# print(f"\nEstimating Linear Model 1: Full specification")
# models['Linear_1_Full'] = estimate_linear_model(linear_full, 'Linear_1_Full', 
#                                                  'All focal + all competitors + seasonal')

# # Linear Model 2: Add temporal features
# linear_with_temporal = linear_full + temporal_features
# print(f"Estimating Linear Model 2: With temporal features")
# models['Linear_2_Temporal'] = estimate_linear_model(linear_with_temporal, 'Linear_2_Temporal', 
#                                                      'Model 1 + temporal features')

# # Linear Model 3: Simplified - top competitors only
# top_10_competitors = all_competitors[:10]
# linear_simplified = focal_lags + top_10_competitors + ['sin_week', 'cos_week', 'is_holiday', 'is_peak_season', 'is_summer']
# print(f"Estimating Linear Model 3: Top 10 competitors")
# models['Linear_3_Top10'] = estimate_linear_model(linear_simplified, 'Linear_3_Top10', 
#                                                   'All focal + top 10 competitors + seasonal')

# # ============================================================================
# # LOG-LOG MODELS
# # ============================================================================
# print("\n" + "="*80)
# print("LOG-LOG MODELS")
# print("="*80)

# # Create log transformations
# df['log_base_rate'] = np.log(y)

# # Log focal lags
# log_focal_lags = []
# for lag in focal_lags:
#     if lag in df.columns and (df[lag] > 0).all():
#         log_name = f'log_{lag}'
#         df[log_name] = np.log(df[lag])
#         log_focal_lags.append(log_name)

# # Log competitors
# log_competitors = []
# for comp in all_competitors:
#     if comp in df.columns and (df[comp] > 0).all():
#         log_name = f'log_{comp}'
#         df[log_name] = np.log(df[comp])
#         log_competitors.append(log_name)

# print(f"\nLog-transformed focal lags: {len(log_focal_lags)}")
# print(f"Log-transformed competitors: {len(log_competitors)}")

# def estimate_loglog_model(feature_list, model_name, description):
#     """Estimate log-log model"""
#     valid_features = [f for f in feature_list if f in df.columns]
#     X = df[valid_features]
#     X_const = sm.add_constant(X)
    
#     # Estimate in log space
#     model = sm.OLS(df['log_base_rate'], X_const).fit(cov_type='HC1')
    
#     # Back-transform predictions to original scale
#     y_pred_log = model.predict(X_const)
#     y_pred = np.exp(y_pred_log)
#     residuals = y - y_pred
    
#     n, k = len(y), len(valid_features)
#     r2 = r2_score(y, y_pred)
#     adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)
#     rmse = np.sqrt(mean_squared_error(y, y_pred))
    
#     sig_count = sum(model.pvalues[1:] < 0.05)
#     dw = durbin_watson(residuals)
#     bp_stat, bp_pval = het_breuschpagan(residuals, X_const)[0:2]
    
#     return {
#         'name': model_name,
#         'description': description,
#         'type': 'Log-Log',
#         'model': model,
#         'n_features': k,
#         'r2': r2,
#         'adj_r2': adj_r2,
#         'rmse': rmse,
#         'aic': model.aic,
#         'bic': model.bic,
#         'sig_features': sig_count,
#         'sig_ratio': sig_count / k if k > 0 else 0,
#         'durbin_watson': dw,
#         'heteroscedasticity_pval': bp_pval,
#         'condition_number': np.linalg.cond(X_const)
#     }

# # Log-Log Model 1: All log features + seasonal indicators
# loglog_full = log_focal_lags + log_competitors + ['sin_week', 'cos_week', 'is_holiday', 'is_peak_season', 'is_summer']
# print(f"\nEstimating Log-Log Model 1: Full specification")
# models['LogLog_1_Full'] = estimate_loglog_model(loglog_full, 'LogLog_1_Full', 
#                                                  'All log(focal) + log(competitors) + seasonal')

# # Log-Log Model 2: Add temporal
# loglog_with_temporal = loglog_full + temporal_features
# print(f"Estimating Log-Log Model 2: With temporal features")
# models['LogLog_2_Temporal'] = estimate_loglog_model(loglog_with_temporal, 'LogLog_2_Temporal', 
#                                                      'Model 1 + temporal features')

# # Log-Log Model 3: Top 10 competitors
# log_top_10_competitors = log_competitors[:10]
# loglog_simplified = log_focal_lags + log_top_10_competitors + ['sin_week', 'cos_week', 'is_holiday', 'is_peak_season', 'is_summer']
# print(f"Estimating Log-Log Model 3: Top 10 competitors")
# models['LogLog_3_Top10'] = estimate_loglog_model(loglog_simplified, 'LogLog_3_Top10', 
#                                                   'All log(focal) + log(top 10 competitors) + seasonal')

# # ============================================================================
# # MODEL COMPARISON
# # ============================================================================
# print("\n" + "="*80)
# print("MODEL COMPARISON")
# print("="*80)

# comparison = []
# for name, result in models.items():
#     comparison.append({
#         'Model': result['name'],
#         'Type': result['type'],
#         'Description': result['description'],
#         'Features': result['n_features'],
#         'R²': result['r2'],
#         'Adj_R²': result['adj_r2'],
#         'RMSE': result['rmse'],
#         'AIC': result['aic'],
#         'BIC': result['bic'],
#         'Sig_Feat': result['sig_features'],
#         'Sig_%': result['sig_ratio'],
#         'DW': result['durbin_watson'],
#         'Het_p': result['heteroscedasticity_pval'],
#         'Cond_No': result['condition_number']
#     })

# comp_df = pd.DataFrame(comparison).sort_values('Adj_R²', ascending=False)
# print("\n" + comp_df.to_string(index=False))

# # ============================================================================
# # BEST MODEL ANALYSIS
# # ============================================================================
# print("\n" + "="*80)
# print("BEST MODEL ANALYSIS")
# print("="*80)

# best_model_name = comp_df.iloc[0]['Model']
# best_result = models[best_model_name]

# print(f"\nBest Model: {best_result['name']}")
# print(f"Type: {best_result['type']}")
# print(f"Description: {best_result['description']}")
# print(f"\nPerformance:")
# print(f"  R²: {best_result['r2']:.4f}")
# print(f"  Adjusted R²: {best_result['adj_r2']:.4f}")
# print(f"  RMSE: ${best_result['rmse']:.2f}")
# print(f"  AIC: {best_result['aic']:.1f}")
# print(f"  BIC: {best_result['bic']:.1f}")
# print(f"\nDiagnostics:")
# print(f"  Features: {best_result['n_features']}")
# print(f"  Significant: {best_result['sig_features']} ({best_result['sig_ratio']:.1%})")
# print(f"  Durbin-Watson: {best_result['durbin_watson']:.3f}")
# print(f"  Heteroscedasticity p-value: {best_result['heteroscedasticity_pval']:.4f}")
# print(f"  Condition number: {best_result['condition_number']:.1f}")

# if best_result['type'] == 'Log-Log':
#     print("\nNOTE: Coefficients represent elasticities")
#     print("      1% increase in X → β% increase in base_rate")

# print("\n" + "="*80)
# print("FULL REGRESSION SUMMARY")
# print("="*80)
# print(best_result['model'].summary())

# # ============================================================================
# # TOP FEATURES
# # ============================================================================
# print("\n" + "="*80)
# print("TOP 20 FEATURES BY COEFFICIENT MAGNITUDE")
# print("="*80)

# coefs = best_result['model'].params[1:]
# pvals = best_result['model'].pvalues[1:]

# feature_importance = pd.DataFrame({
#     'Feature': coefs.index,
#     'Coefficient': coefs.values,
#     'P_Value': pvals.values,
#     'Significant': pvals.values < 0.05,
#     'Abs_Coef': np.abs(coefs.values)
# }).sort_values('Abs_Coef', ascending=False)

# print(feature_importance.head(20).to_string(index=False))

# print("\n" + "="*80)
# print(f"ANALYSIS COMPLETE - Best Adj R²: {best_result['adj_r2']:.4f}")
# print("="*80)

## Iteration 2a

In [3]:
# Cell banner.
print("=" * 80, "REGULARIZED REGRESSION MODELS", "=" * 80, sep="\n")

# Processed inputs are read from <cwd>/../../data/dataprocessed.
data_path = Path.cwd().parent.parent / 'data' / 'dataprocessed'

# Read the lagged dataset, parse the date column and order rows chronologically
# with a fresh 0..n-1 index.
df = (
    pd.read_csv(data_path / 'lagged_predictive_dataset.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

# Lag-selection metadata; its 'focal_column' entry names the regression target.
with (data_path / 'lag_selection_metadata.json').open('r') as f:
    metadata = json.load(f)

y = df[metadata['focal_column']]

print(f"\nDataset: {len(df)} observations")
print(f"Target: {metadata['focal_column']}")

# ============================================================================
# FEATURE PREPARATION (Same as Linear_2_Temporal)
# ============================================================================
print("", "=" * 80, "FEATURE PREPARATION", "=" * 80, sep="\n")

# Own-price history: base_rate_lag_1 ... base_rate_lag_5.
focal_lags = [f'base_rate_lag_{lag}' for lag in range(1, 6)]

# Competitor lag columns, ranked by absolute correlation with the target
# (strongest first).
competitor_cols = [
    name for name in df.columns
    if 'booking-us' in name and 'lag' in name
]
comp_corr_with_target = (
    df[competitor_cols].corrwith(y).abs().sort_values(ascending=False)
)

# Greedy de-duplication: walk the ranked competitors and keep one only when its
# absolute correlation with every competitor kept so far is at most 0.90.
all_competitors = []
for candidate in comp_corr_with_target.index:
    too_similar = any(
        abs(df[candidate].corr(df[kept])) > 0.90 for kept in all_competitors
    )
    if not too_similar:
        all_competitors.append(candidate)

# Cyclical encoding of the ISO week number on a 52-week period.
df['week_of_year'] = df['date'].dt.isocalendar().week.astype(float)
week_angle = 2 * np.pi * df['week_of_year'] / 52
df['sin_week'] = np.sin(week_angle).astype(float)
df['cos_week'] = np.cos(week_angle).astype(float)

# Temporal columns that are used only if they already exist in the dataset
# (the membership filter further down drops any that are missing).
temporal_features = [
    'sin_day_of_week', 'cos_day_of_week',
    'sin_month', 'cos_month',
    'sin_day_of_year', 'cos_day_of_year',
    'is_weekend',
]

month_of_row = df['date'].dt.month
day_of_month = df['date'].dt.day

# Fallback holiday flag, built only when the dataset does not ship one:
# all of December and January, July 1-7, and November 22-30.
if 'is_holiday' not in df.columns:
    holiday_rows = (
        month_of_row.isin([12, 1])
        | ((month_of_row == 7) & (day_of_month <= 7))
        | ((month_of_row == 11) & (day_of_month >= 22))
    )
    df['is_holiday'] = 0
    df.loc[holiday_rows, 'is_holiday'] = 1

# Season indicators: peak = June-August or December-January; summer = June-August.
in_summer = month_of_row.isin([6, 7, 8])
df['is_peak_season'] = (in_summer | month_of_row.isin([12, 1])).astype(int)
df['is_summer'] = in_summer.astype(int)

# Final feature list, restricted to columns that are actually present.
candidate_features = (
    focal_lags
    + all_competitors
    + ['sin_week', 'cos_week']
    + ['is_holiday', 'is_peak_season', 'is_summer']
    + temporal_features
)
all_features = [name for name in candidate_features if name in df.columns]

print(
    f"\nTotal features: {len(all_features)}",
    f"  Focal lags: {len(focal_lags)}",
    f"  Competitors: {len(all_competitors)}",
    f"  Temporal: {len(temporal_features) + 2}",  # +2 for sin_week / cos_week
    f"  Seasonal: 3",
    sep="\n",
)

# ============================================================================
# PREPARE DATA FOR REGULARIZATION
# ============================================================================
print("", "=" * 80, "DATA PREPARATION FOR REGULARIZATION", "=" * 80, sep="\n")

# Independent copy of the design matrix so later edits to df cannot alter it.
X = df[all_features].copy()

# Penalized regressions shrink all coefficients with one penalty, so every
# feature is put on a common zero-mean / unit-variance scale first.
# NOTE(review): the scaler is fit on every row, before any cross-validation
# split is made — confirm that is acceptable for the reported CV scores.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=all_features, index=X.index)

print(f"\nFeatures standardized (mean=0, std=1)")
print(f"Original X shape: {X.shape}")

# ============================================================================
# RIDGE REGRESSION
# ============================================================================
print("", "=" * 80, "RIDGE REGRESSION", "=" * 80, sep="\n")

# Penalty grid: 100 log-spaced values from 1e-2 to 1e4. RidgeCV keeps the alpha
# with the best mean R² over 5 folds.
alphas_ridge = np.logspace(-2, 4, 100)
ridge_cv = RidgeCV(alphas=alphas_ridge, cv=5, scoring='r2').fit(X_scaled, y)

# In-sample fit statistics: predictions are made on the same rows used to fit.
y_pred_ridge = ridge_cv.predict(X_scaled)
n = len(y)
k = len(all_features)
r2_ridge = r2_score(y, y_pred_ridge)
adj_r2_ridge = 1 - (1 - r2_ridge) * (n - 1) / (n - k - 1)
rmse_ridge = np.sqrt(mean_squared_error(y, y_pred_ridge))

# Coefficients are on the standardized-feature scale; no back-transformation
# to the original units is applied here.
ridge_coefs = pd.Series(ridge_cv.coef_, index=all_features)
non_zero_ridge = ridge_coefs.abs().gt(1e-10).sum()

# NOTE(review): the recorded run shows an in-sample R² of 0.6610 against a CV
# score of 0.0096 — the in-sample figures likely overstate out-of-sample fit.
print(
    f"\nRidge Results:",
    f"  Optimal alpha: {ridge_cv.alpha_:.4f}",
    f"  R²: {r2_ridge:.4f}",
    f"  Adjusted R²: {adj_r2_ridge:.4f}",
    f"  RMSE: ${rmse_ridge:.2f}",
    f"  Non-zero coefficients: {non_zero_ridge}/{len(all_features)}",
    f"  CV score: {ridge_cv.best_score_:.4f}",
    sep="\n",
)

# ============================================================================
# LASSO REGRESSION
# ============================================================================
print("", "=" * 80, "LASSO REGRESSION", "=" * 80, sep="\n")

# Penalty grid: 100 log-spaced values from 1e-4 to 1e1, selected by 5-fold CV.
alphas_lasso = np.logspace(-4, 1, 100)
lasso_cv = LassoCV(
    alphas=alphas_lasso, cv=5, max_iter=10000, random_state=42
).fit(X_scaled, y)

# In-sample fit statistics. n and k come from the ridge section, so the
# adjustment uses the full feature count rather than the selected count.
y_pred_lasso = lasso_cv.predict(X_scaled)
r2_lasso = r2_score(y, y_pred_lasso)
adj_r2_lasso = 1 - (1 - r2_lasso) * (n - 1) / (n - k - 1)
rmse_lasso = np.sqrt(mean_squared_error(y, y_pred_lasso))

# Standardized coefficients; features with an exactly-zero coefficient were
# dropped by the L1 penalty. Survivors are ordered by absolute size.
lasso_coefs = pd.Series(lasso_cv.coef_, index=all_features)
kept_by_lasso = lasso_coefs != 0
non_zero_lasso = kept_by_lasso.sum()
selected_features_lasso = lasso_coefs[kept_by_lasso].sort_values(
    key=abs, ascending=False
)

print(
    f"\nLasso Results:",
    f"  Optimal alpha: {lasso_cv.alpha_:.4f}",
    f"  R²: {r2_lasso:.4f}",
    f"  Adjusted R²: {adj_r2_lasso:.4f}",
    f"  RMSE: ${rmse_lasso:.2f}",
    f"  Non-zero coefficients: {non_zero_lasso}/{len(all_features)}",
    f"  Features automatically selected: {non_zero_lasso}",
    sep="\n",
)

print(f"\nTop 15 Lasso-selected features:")
print(selected_features_lasso.head(15).to_string())

# ============================================================================
# ELASTIC NET
# ============================================================================
print("\n" + "="*80)
print("ELASTIC NET")
print("="*80)

# ElasticNetCV comes from sklearn.linear_model and must be listed in the
# notebook's import cell; without it this section raises NameError on a fresh
# kernel (Restart & Run All).
#
# Joint grid search by 5-fold CV over the penalty strength (50 log-spaced
# alphas from 1e-4 to 1e1) and the L1/L2 mix (l1_ratio).
alphas_enet = np.logspace(-4, 1, 50)
l1_ratios = [0.1, 0.3, 0.5, 0.7, 0.9, 0.95, 0.99]
enet_cv = ElasticNetCV(alphas=alphas_enet, l1_ratio=l1_ratios, cv=5,
                       max_iter=10000, random_state=42)
enet_cv.fit(X_scaled, y)

# In-sample fit statistics. n and k are the observation and feature counts
# defined in the ridge section.
y_pred_enet = enet_cv.predict(X_scaled)
r2_enet = r2_score(y, y_pred_enet)
adj_r2_enet = 1 - (1 - r2_enet) * (n - 1) / (n - k - 1)
rmse_enet = np.sqrt(mean_squared_error(y, y_pred_enet))

# Standardized coefficients; exact zeros mark features removed by the penalty.
enet_coefs = pd.Series(enet_cv.coef_, index=all_features)
non_zero_enet = (enet_coefs != 0).sum()
selected_features_enet = enet_coefs[enet_coefs != 0].sort_values(key=abs, ascending=False)

print(f"\nElasticNet Results:")
print(f"  Optimal alpha: {enet_cv.alpha_:.4f}")
print(f"  Optimal l1_ratio: {enet_cv.l1_ratio_:.4f}")
print(f"  R²: {r2_enet:.4f}")
print(f"  Adjusted R²: {adj_r2_enet:.4f}")
print(f"  RMSE: ${rmse_enet:.2f}")
print(f"  Non-zero coefficients: {non_zero_enet}/{len(all_features)}")

# ============================================================================
# RE-ESTIMATE LASSO-SELECTED FEATURES WITH OLS
# ============================================================================
print("", "=" * 80, "OLS WITH LASSO-SELECTED FEATURES", "=" * 80, sep="\n")

# Refit only when Lasso kept at least three features. Every name assigned in
# this branch is undefined otherwise, so later uses must repeat the same guard.
if non_zero_lasso >= 3:
    # Unscaled columns for the Lasso survivors, plus an intercept column.
    lasso_selected_list = list(selected_features_lasso.index)
    X_lasso_selected = df[lasso_selected_list]
    X_lasso_const = sm.add_constant(X_lasso_selected)

    # OLS with HC1 heteroscedasticity-robust standard errors.
    model_lasso_ols = sm.OLS(y, X_lasso_const).fit(cov_type='HC1')
    y_pred_lasso_ols = model_lasso_ols.predict(X_lasso_const)
    residuals_lasso_ols = y - y_pred_lasso_ols

    # In-sample fit statistics; n is the observation count from the ridge section.
    k_lasso = len(lasso_selected_list)
    r2_lasso_ols = r2_score(y, y_pred_lasso_ols)
    adj_r2_lasso_ols = 1 - (1 - r2_lasso_ols) * (n - 1) / (n - k_lasso - 1)
    rmse_lasso_ols = np.sqrt(mean_squared_error(y, y_pred_lasso_ols))

    # Diagnostics: count of slope p-values below 0.05 (position 0 is the
    # intercept and is skipped), Durbin-Watson and Breusch-Pagan.
    sig_count_lasso = sum(model_lasso_ols.pvalues[1:] < 0.05)
    dw_lasso = durbin_watson(residuals_lasso_ols)
    bp_result = het_breuschpagan(residuals_lasso_ols, X_lasso_const)
    bp_stat, bp_pval = bp_result[0], bp_result[1]

    print(
        f"\nOLS Re-estimation Results:",
        f"  Features: {k_lasso}",
        f"  R²: {r2_lasso_ols:.4f}",
        f"  Adjusted R²: {adj_r2_lasso_ols:.4f}",
        f"  RMSE: ${rmse_lasso_ols:.2f}",
        f"  AIC: {model_lasso_ols.aic:.1f}",
        f"  BIC: {model_lasso_ols.bic:.1f}",
        f"  Significant features: {sig_count_lasso}/{k_lasso} ({sig_count_lasso/k_lasso:.1%})",
        f"  Durbin-Watson: {dw_lasso:.3f}",
        f"  Heteroscedasticity p-value: {bp_pval:.4f}",
        f"  Condition number: {np.linalg.cond(X_lasso_const):.1f}",
        sep="\n",
    )

# ============================================================================
# MODEL COMPARISON
# ============================================================================
print("", "=" * 80, "MODEL COMPARISON", "=" * 80, sep="\n")

# One record per model; the table is built once from the finished list.
comparison_rows = []

# Reference OLS row. NOTE(review): these metrics are hard-coded literals, not
# computed in this cell — presumably copied from an earlier baseline run;
# re-verify them whenever the data or the feature list changes.
comparison_rows.append({
    'Model': 'Linear_Full_OLS',
    'Method': 'OLS',
    'Features': len(all_features),
    'R²': 0.6824,
    'Adj_R²': 0.6503,
    'RMSE': 19.20,
    'Sig_Features': 7,
    'Sig_Ratio': 0.212,
    'Cond_No': 85694,
    'Note': 'Original full model'
})

# For the penalized models the 'Sig_*' columns hold non-zero coefficient
# counts, not p-value based significance.
comparison_rows.append({
    'Model': 'Ridge',
    'Method': 'Ridge',
    'Features': len(all_features),
    'R²': r2_ridge,
    'Adj_R²': adj_r2_ridge,
    'RMSE': rmse_ridge,
    'Sig_Features': non_zero_ridge,
    'Sig_Ratio': non_zero_ridge/len(all_features),
    'Cond_No': None,
    'Note': f'Alpha={ridge_cv.alpha_:.4f}'
})
comparison_rows.append({
    'Model': 'Lasso',
    'Method': 'Lasso',
    'Features': non_zero_lasso,
    'R²': r2_lasso,
    'Adj_R²': adj_r2_lasso,
    'RMSE': rmse_lasso,
    'Sig_Features': non_zero_lasso,
    'Sig_Ratio': 1.0,
    'Cond_No': None,
    'Note': f'Auto-selected {non_zero_lasso} features'
})
comparison_rows.append({
    'Model': 'ElasticNet',
    'Method': 'ElasticNet',
    'Features': non_zero_enet,
    'R²': r2_enet,
    'Adj_R²': adj_r2_enet,
    'RMSE': rmse_enet,
    'Sig_Features': non_zero_enet,
    'Sig_Ratio': 1.0,
    'Cond_No': None,
    'Note': f'L1_ratio={enet_cv.l1_ratio_:.2f}'
})

# The OLS refit exists only when Lasso kept at least three features.
if non_zero_lasso >= 3:
    comparison_rows.append({
        'Model': 'Lasso_OLS_Refit',
        'Method': 'OLS',
        'Features': k_lasso,
        'R²': r2_lasso_ols,
        'Adj_R²': adj_r2_lasso_ols,
        'RMSE': rmse_lasso_ols,
        'Sig_Features': sig_count_lasso,
        'Sig_Ratio': sig_count_lasso/k_lasso,
        'Cond_No': np.linalg.cond(X_lasso_const),
        'Note': 'OLS on Lasso features'
    })

# Rank by adjusted R², best model first.
comparison = pd.DataFrame(comparison_rows).sort_values('Adj_R²', ascending=False)
print("\n" + comparison.to_string(index=False))

# ============================================================================
# BEST MODEL ANALYSIS
# ============================================================================
print("", "=" * 80, "BEST REGULARIZED MODEL ANALYSIS", "=" * 80, sep="\n")

# The comparison table is already sorted by adjusted R², so row 0 is the winner.
best_idx = comparison.iloc[0]
best_name = best_idx['Model']

print(
    f"\nBest Model: {best_name}",
    f"Method: {best_idx['Method']}",
    f"Features: {int(best_idx['Features'])}",
    f"R²: {best_idx['R²']:.4f}",
    f"Adjusted R²: {best_idx['Adj_R²']:.4f}",
    f"RMSE: ${best_idx['RMSE']:.2f}",
    sep="\n",
)

# Detailed coefficients are shown only for the two Lasso-based winners;
# any other winner gets the headline numbers above and nothing more.
if best_name == 'Lasso_OLS_Refit' and non_zero_lasso >= 3:
    print("", "=" * 80, "FULL REGRESSION SUMMARY (LASSO-SELECTED FEATURES)", "=" * 80, sep="\n")
    print(model_lasso_ols.summary())

    print("", "=" * 80, "FEATURE IMPORTANCE", "=" * 80, sep="\n")

    # Skip position 0 (the intercept) and rank slopes by absolute size.
    coefs = model_lasso_ols.params[1:]
    pvals = model_lasso_ols.pvalues[1:]

    feature_importance = (
        pd.DataFrame({
            'Feature': coefs.index,
            'Coefficient': coefs.to_numpy(),
            'P_Value': pvals.to_numpy(),
        })
        .assign(
            Significant=lambda table: table['P_Value'] < 0.05,
            Abs_Coef=lambda table: table['Coefficient'].abs(),
        )
        .sort_values('Abs_Coef', ascending=False)
    )

    print(feature_importance.to_string(index=False))

elif best_name == 'Lasso':
    print("", "=" * 80, "LASSO COEFFICIENTS (STANDARDIZED)", "=" * 80, sep="\n")
    print(selected_features_lasso.to_string())

print(
    "",
    "=" * 80,
    "REGULARIZATION COMPLETE",
    f"Best Adj R²: {best_idx['Adj_R²']:.4f}",
    "Multicollinearity handled by regularization",
    "=" * 80,
    sep="\n",
)

REGULARIZED REGRESSION MODELS

Dataset: 360 observations
Target: base_rate

FEATURE PREPARATION

Total features: 33
  Focal lags: 5
  Competitors: 16
  Temporal: 9
  Seasonal: 3

DATA PREPARATION FOR REGULARIZATION

Features standardized (mean=0, std=1)
Original X shape: (360, 33)

RIDGE REGRESSION

Ridge Results:
  Optimal alpha: 57.2237
  R²: 0.6610
  Adjusted R²: 0.6267
  RMSE: $19.83
  Non-zero coefficients: 33/33
  CV score: 0.0096

LASSO REGRESSION

Lasso Results:
  Optimal alpha: 0.0850
  R²: 0.6787
  Adjusted R²: 0.6461
  RMSE: $19.31
  Non-zero coefficients: 27/33
  Features automatically selected: 27

Top 15 Lasso-selected features:
booking-us-courtyard-by-marriott-maui-kahului-airport-USD_lag_1    15.030575
base_rate_lag_1                                                    10.082170
booking-us-kohea-kai-resort-maui-USD_lag_3                          7.413877
cos_week                                                            7.054884
base_rate_lag_5                          