## Iteration1a

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
import statsmodels.api as sm
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# ---------------------------------------------------------------------------
# Run configuration: input/output locations, hotel roster, CV window sizes.
# ---------------------------------------------------------------------------
data_path = Path('../data/full-data/processed')
output_path = Path('../data/full-data/oos_results_iteration1')
# Create the results directory up front so later writes cannot fail on it.
output_path.mkdir(parents=True, exist_ok=True)

# The mapping file supplies the masked hotel identifiers to iterate over.
mapping_df = pd.read_csv('../data/full-data/hotel_mapping.csv')
hotel_list = mapping_df['masked_id'].tolist()

# Hotels left out of this run (labelled "catastrophic failures" below).
CATASTROPHIC_HOTELS = ['Hotel_26', 'Hotel_32', 'Hotel_34']
TEST_HOTELS = [
    hotel for hotel in hotel_list
    if hotel not in CATASTROPHIC_HOTELS
]

# Expanding-window CV sizes, counted in dataset rows:
# the first training window length and the length of every test window.
MIN_TRAIN_DAYS = 300
TEST_WINDOW = 50

# Run banner, one line per positional argument.
print(
    "=" * 80,
    "ITERATION 1: DIRECT MODELING (OUT-OF-SAMPLE VALIDATION)",
    "=" * 80,
    "Strategy: Predict absolute base_rate using raw competitor prices",
    "Features: All competitor lags (1-5) + temporal features",
    f"Testing {len(TEST_HOTELS)} hotels",
    f"Excluded: {len(CATASTROPHIC_HOTELS)} catastrophic failures",
    "-" * 80,
    sep="\n",
)

def prepare_features(df):
    """Assemble the model design matrix for one hotel.

    The feature set is every column whose name contains both ``'-USD'`` and
    ``'lag_'`` (in the frame's column order), followed by a fixed list of
    seven calendar columns.

    Args:
        df: DataFrame holding the lagged competitor columns and the
            calendar columns listed below.

    Returns:
        A ``(X, feature_names)`` tuple: ``X`` is a copy of the selected
        columns and ``feature_names`` is the list of their names.

    Raises:
        KeyError: if any of the calendar columns is missing from ``df``.
    """
    calendar_columns = [
        'sin_day_of_week', 'cos_day_of_week',
        'sin_month', 'cos_month',
        'sin_day_of_year', 'cos_day_of_year',
        'is_weekend',
    ]
    required_tags = ('-USD', 'lag_')
    lag_columns = [
        name for name in df.columns
        if all(tag in name for tag in required_tags)
    ]

    feature_names = lag_columns + calendar_columns
    design_matrix = df[feature_names].copy()
    return design_matrix, feature_names

def time_series_cv_splits(df, min_train_days, test_window):
    """Generate expanding-window train/test index splits.

    The first fold trains on rows ``[0, min_train_days)`` and tests on the
    next ``test_window`` rows. Each later fold extends the training range by
    ``test_window`` rows. Only complete test windows are produced, so
    trailing rows that do not fill a window are never used for testing.

    Args:
        df: Any sized object; only ``len(df)`` is used.
        min_train_days: Number of rows in the first training window (>= 0).
        test_window: Number of rows in every test window (>= 1).

    Returns:
        A list of dicts with keys ``'train_idx'`` and ``'test_idx'``, each a
        list of positional row indices. The list is empty when ``df`` is too
        short for a single fold.

    Raises:
        ValueError: if ``test_window`` is not positive or ``min_train_days``
            is negative.
    """
    # A non-positive window would never advance the split point, so the
    # original loop would not terminate; reject it explicitly.
    if test_window < 1:
        raise ValueError(f"test_window must be >= 1, got {test_window}")
    if min_train_days < 0:
        raise ValueError(f"min_train_days must be >= 0, got {min_train_days}")

    n = len(df)
    # The last admissible split point is n - test_window (inclusive).
    return [
        {
            'train_idx': list(range(train_end)),
            'test_idx': list(range(train_end, train_end + test_window)),
        }
        for train_end in range(min_train_days, n - test_window + 1, test_window)
    ]

def train_and_evaluate(X_train, y_train, X_test, y_test, fold_num):
    """Fit a LASSO-selected OLS model on one fold and score it out of sample.

    Steps:
      1. Standardize the training features and fit ``LassoCV`` on them.
      2. Keep the columns with non-zero LASSO coefficients. If none survive,
         fall back to the 10 columns with the largest absolute correlation
         with ``y_train``.
      3. Refit an OLS with intercept on the *unscaled* selected columns
         (``cov_type='HC1'``) and predict the test rows.

    Args:
        X_train: Training feature DataFrame.
        y_train: Training target Series.
        X_test: Test feature DataFrame with the same columns as ``X_train``.
        y_test: Test target, aligned row-for-row with ``X_test``.
        fold_num: Fold label copied into the result.

    Returns:
        Dict with keys ``'fold'``, ``'model'`` (fitted statsmodels results),
        ``'selected_features'``, ``'r2'``, ``'rmse'``, ``'mape'`` and
        ``'n_features'``. ``'r2'`` is the sentinel ``-999`` when the test
        target has zero variance. ``'mape'`` is computed over test rows with
        a non-zero actual value only, and is ``nan`` if there are none.
    """
    # Scaling is used only for the LASSO selection step; the OLS refit below
    # works on the raw columns.
    X_train_scaled = StandardScaler().fit_transform(X_train)

    lasso = LassoCV(cv=5, alphas=np.logspace(-4, 1, 100), max_iter=10000, random_state=42, n_jobs=-1)
    lasso.fit(X_train_scaled, y_train)

    selected_mask = lasso.coef_ != 0
    if selected_mask.sum() == 0:
        # LASSO shrank everything to zero: fall back to a correlation screen.
        correlations = X_train.corrwith(y_train).abs()
        selected_features = correlations.nlargest(10).index.tolist()
    else:
        selected_features = X_train.columns[selected_mask].tolist()

    X_train_selected = X_train[selected_features].copy().astype(float)
    X_test_selected = X_test[selected_features].copy().astype(float)

    # has_constant='add' always appends an intercept column, even when a
    # selected feature happens to be constant within this window.
    X_train_ols = sm.add_constant(X_train_selected, has_constant='add')
    model = sm.OLS(y_train.astype(float), X_train_ols).fit(cov_type='HC1')

    X_test_ols = sm.add_constant(X_test_selected, has_constant='add')
    y_pred = model.predict(X_test_ols)

    # Compare positionally on plain arrays so the metrics cannot be distorted
    # by pandas index alignment between y_test and y_pred.
    actual = np.asarray(y_test, dtype=float)
    predicted = np.asarray(y_pred, dtype=float)
    residuals = actual - predicted

    ss_res = np.sum(residuals ** 2)
    ss_tot = np.sum((actual - actual.mean()) ** 2)
    # -999 marks an undefined R² (constant test target); kept for
    # compatibility with downstream aggregation.
    r2 = 1 - (ss_res / ss_tot) if ss_tot > 0 else -999

    rmse = np.sqrt(np.mean(residuals ** 2))

    # Percentage error is undefined where the actual value is zero; dividing
    # there would turn the whole fold's MAPE into inf/nan.
    nonzero = actual != 0
    if nonzero.any():
        mape = np.mean(np.abs(residuals[nonzero] / actual[nonzero])) * 100
    else:
        mape = np.nan

    return {
        'fold': fold_num,
        'model': model,
        'selected_features': selected_features,
        'r2': r2,
        'rmse': rmse,
        'mape': mape,
        'n_features': len(selected_features)
    }

# Per-hotel outcome, keyed by hotel id. Every hotel in TEST_HOTELS gets an
# entry (Success / Skipped / Failed) so the summary accounts for all of them.
all_results = {}

for idx, hotel_id in enumerate(TEST_HOTELS, 1):
    print(f"\n[{idx}/{len(TEST_HOTELS)}] Processing {hotel_id}")

    # One hotel failing (missing file, missing column, solver error) must not
    # abort the whole run, so the failure is recorded and the loop continues.
    try:
        df = pd.read_csv(data_path / f'{hotel_id}_lagged_dataset.csv')
        df['date'] = pd.to_datetime(df['date'])
        # Chronological order is required for the expanding-window splits.
        df = df.sort_values('date').reset_index(drop=True)

        X_all, feature_list = prepare_features(df)
        y_all = df['base_rate'].copy()

        competitor_features = [f for f in feature_list if 'USD' in f]
        if not competitor_features:
            print("  No competitors - skipping")
            # Record the skip so the hotel still shows up in the summary.
            all_results[hotel_id] = {
                'status': 'Skipped: no competitor features',
                'cv_mean_r2': np.nan
            }
            continue

        print(f"  Observations: {len(df)}, Features: {len(feature_list)}")

        cv_splits = time_series_cv_splits(df, MIN_TRAIN_DAYS, TEST_WINDOW)

        if not cv_splits:
            print("  Not enough data for CV")
            all_results[hotel_id] = {
                'status': 'Skipped: not enough data for CV',
                'cv_mean_r2': np.nan
            }
            continue

        print(f"  CV folds: {len(cv_splits)}")

        # Out-of-sample evaluation: one model per expanding-window fold.
        cv_results = []
        for i, split in enumerate(cv_splits, 1):
            X_train = X_all.iloc[split['train_idx']]
            y_train = y_all.iloc[split['train_idx']]
            X_test = X_all.iloc[split['test_idx']]
            y_test = y_all.iloc[split['test_idx']]

            result = train_and_evaluate(X_train, y_train, X_test, y_test, i)
            cv_results.append(result)

        cv_r2_scores = [r['r2'] for r in cv_results]
        mean_r2 = np.mean(cv_r2_scores)
        # Population std (ddof=0): a single fold therefore reports 0.0.
        std_r2 = np.std(cv_r2_scores)
        mean_features = np.mean([r['n_features'] for r in cv_results])

        print(f"  CV Results: Mean R² = {mean_r2:.4f} ± {std_r2:.4f}, Avg Features = {mean_features:.1f}")

        # Final model: same LASSO-select-then-OLS recipe, refit on all rows.
        # Its R² is in-sample and is reported only for comparison with the
        # CV (out-of-sample) figure.
        scaler_final = StandardScaler()
        X_all_scaled = scaler_final.fit_transform(X_all)

        lasso_final = LassoCV(cv=5, alphas=np.logspace(-4, 1, 100), max_iter=10000, random_state=42, n_jobs=-1)
        lasso_final.fit(X_all_scaled, y_all)

        selected_mask = lasso_final.coef_ != 0
        if selected_mask.sum() == 0:
            # Nothing survived LASSO: fall back to the 10 most correlated columns.
            correlations = X_all.corrwith(y_all).abs()
            final_features = correlations.nlargest(10).index.tolist()
        else:
            final_features = X_all.columns[selected_mask].tolist()

        X_final = X_all[final_features].copy().astype(float)
        X_final_ols = sm.add_constant(X_final, has_constant='add')
        final_model = sm.OLS(y_all.astype(float), X_final_ols).fit(cov_type='HC1')

        print(f"  Final Model: Training R² = {final_model.rsquared:.4f}, Features = {len(final_features)}")

        # Values are cast to built-in float so json can serialize them.
        cv_summary = {
            'hotel_id': hotel_id,
            'approach': 'direct_modeling',
            'cv_folds': len(cv_results),
            'mean_r2': float(mean_r2),
            'std_r2': float(std_r2),
            'min_r2': float(min(cv_r2_scores)),
            'max_r2': float(max(cv_r2_scores)),
            'mean_mape': float(np.mean([r['mape'] for r in cv_results])),
            'mean_rmse': float(np.mean([r['rmse'] for r in cv_results])),
            'generalization_quality': 'GOOD' if mean_r2 > 0.20 else 'POOR'
        }

        with open(output_path / f'{hotel_id}_cv_iteration1.json', 'w') as f:
            json.dump(cv_summary, f, indent=2)

        all_results[hotel_id] = {
            'cv_mean_r2': mean_r2,
            'cv_std_r2': std_r2,
            'final_training_r2': final_model.rsquared,
            'n_features': len(final_features),
            'status': 'Success'
        }

    except Exception as e:
        print(f"  ERROR: {str(e)}")
        all_results[hotel_id] = {'status': f'Failed: {str(e)}', 'cv_mean_r2': np.nan}

print("\n" + "=" * 80)
print("FINAL SUMMARY - ITERATION 1")
print("=" * 80)

# Explicit columns keep the frame well-formed even when all_results is empty;
# without them an empty frame has no 'status' column and the filter below
# raises KeyError.
SUMMARY_COLUMNS = [
    'hotel_id', 'cv_mean_r2', 'cv_std_r2',
    'final_training_r2', 'n_features', 'status',
]
summary_df = pd.DataFrame(
    [
        {
            'hotel_id': hotel_id,
            'cv_mean_r2': results.get('cv_mean_r2', np.nan),
            'cv_std_r2': results.get('cv_std_r2', np.nan),
            'final_training_r2': results.get('final_training_r2', np.nan),
            'n_features': results.get('n_features', 0),
            'status': results.get('status', 'Unknown')
        }
        for hotel_id, results in all_results.items()
    ],
    columns=SUMMARY_COLUMNS,
)

# Aggregate statistics are reported over successfully modelled hotels only.
successful = summary_df[summary_df['status'] == 'Success']

if len(successful) > 0:
    cv_r2 = successful['cv_mean_r2']

    print(f"\nSuccessful: {len(successful)}/{len(TEST_HOTELS)}")
    print(f"Excluded: {', '.join(CATASTROPHIC_HOTELS)} (Training R² < 0.10)")

    # The four bands partition the R² axis: (0.40, inf), [0.25, 0.40],
    # [0.15, 0.25) and (-inf, 0.15).
    print("\nPerformance Distribution:")
    print(f"  Excellent (R² > 0.40): {int((cv_r2 > 0.40).sum())}")
    print(f"  Good (R² 0.25-0.40): {int(((cv_r2 >= 0.25) & (cv_r2 <= 0.40)).sum())}")
    print(f"  Acceptable (R² 0.15-0.25): {int(((cv_r2 >= 0.15) & (cv_r2 < 0.25)).sum())}")
    print(f"  Poor (R² < 0.15): {int((cv_r2 < 0.15).sum())}")

    print("\nMetrics:")
    print(f"  Mean CV R²: {cv_r2.mean():.4f}")
    print(f"  Median CV R²: {cv_r2.median():.4f}")
    print(f"  Mean Training R²: {successful['final_training_r2'].mean():.4f}")

    print("\nTop 10 Hotels:")
    print(successful.nlargest(10, 'cv_mean_r2')[['hotel_id', 'cv_mean_r2', 'cv_std_r2']])

    print("\nBottom 5 Hotels:")
    print(successful.nsmallest(5, 'cv_mean_r2')[['hotel_id', 'cv_mean_r2', 'cv_std_r2']])

# Save catastrophic failures note
catastrophic_note = {
    'excluded_hotels': CATASTROPHIC_HOTELS,
    'reason': 'Training R² < 0.10 - No learnable relationship with competitors',
    'recommendation': 'Use alternative predictors (occupancy, events, seasonality) for these hotels'
}

with open(output_path / 'catastrophic_exclusions.json', 'w') as f:
    json.dump(catastrophic_note, f, indent=2)

# The CSV holds one row per hotel in all_results, whatever its status.
summary_df.to_csv(output_path / 'summary_iteration1.csv', index=False)
print(f"\nSummary saved: {output_path / 'summary_iteration1.csv'}")
print(f"Catastrophic exclusions noted: {output_path / 'catastrophic_exclusions.json'}")
print("Complete")

ITERATION 1: DIRECT MODELING (OUT-OF-SAMPLE VALIDATION)
Strategy: Predict absolute base_rate using raw competitor prices
Features: All competitor lags (1-5) + temporal features
Testing 41 hotels
Excluded: 3 catastrophic failures
--------------------------------------------------------------------------------

[1/41] Processing Hotel_01
  Observations: 554, Features: 32
  CV folds: 5
  CV Results: Mean R² = -42.2452 ± 71.6751, Avg Features = 14.2
  Final Model: Training R² = 0.7593, Features = 8

[2/41] Processing Hotel_02
  Observations: 425, Features: 27
  CV folds: 2
  CV Results: Mean R² = 0.2375 ± 0.3190, Avg Features = 16.5
  Final Model: Training R² = 0.7179, Features = 25

[3/41] Processing Hotel_03
  Observations: 396, Features: 52
  CV folds: 1
  CV Results: Mean R² = -0.5275 ± 0.0000, Avg Features = 17.0
  Final Model: Training R² = 0.8917, Features = 34

[4/41] Processing Hotel_04
  Observations: 620, Features: 27
  CV folds: 6
  CV Results: Mean R² = 0.2674 ± 0.3002, Avg Fe