In [None]:
"""
ADAPTIVE LOG DETRENDING MODEL
==============================

Key innovation: Different hotels need different settings!

For EACH hotel, test combinations of:
1. Detrending window: 7, 14, 21, 28 days
2. Min training size: 180, 200, 250 days
3. Feature sets: Simple (lags 1-5) vs Extended (all lags)

Select best combination per hotel via CV.

Goal: 
- Get 22+ hotels working (like baseline)
- With better R² than baseline (0.1156)
"""

import pandas as pd
import numpy as np
from pathlib import Path
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import json
import pickle
import warnings
warnings.filterwarnings('ignore')

# ============================================================================
# CONFIGURATION
# ============================================================================

# Where the per-hotel lagged datasets are read from and where models are written.
processed_data_path = Path('../data/full-data/processed')
output_path = Path('../models/adaptive_log_method')
output_path.mkdir(exist_ok=True, parents=True)

# Hotel universe: one masked id per row of the mapping file.
mapping_df = pd.read_csv('../data/full-data/hotel_mapping.csv')
hotel_list = mapping_df['masked_id'].tolist()

# Rows dated after this day are discarded when a hotel's data is loaded.
DATA_END_DATE = '2025-12-31'

# XGBoost params (same as baseline)
XGB_PARAMS = dict(
    max_depth=4,
    n_estimators=100,
    learning_rate=0.05,
    reg_alpha=0.3,
    reg_lambda=2.0,
    min_child_weight=5,
    gamma=0.1,
    random_state=42,
    n_jobs=-1,
)

# Search grid tried for every hotel: trend window x min training size x feature set.
# The CV test window is fixed at 30 rows for all entries.
CONFIGURATIONS = []
for trend_window in (7, 14, 21, 28):
    for min_train in (180, 200, 250):
        for feature_set in ('simple', 'extended'):
            CONFIGURATIONS.append(dict(
                trend_window=trend_window,
                min_train_days=min_train,
                test_window=30,
                feature_set=feature_set,
                name=f'trend{trend_window}_train{min_train}_{feature_set}',
            ))

# Startup banner describing the search that is about to run.
print(
    "="*80,
    "ADAPTIVE LOG DETRENDING MODEL",
    "="*80,
    f"Hotels to process: {len(hotel_list)}",
    f"Configurations per hotel: {len(CONFIGURATIONS)}",
    "\nWill test:",
    "  - Detrending windows: 7, 14, 21, 28 days",
    "  - Min training: 180, 200, 250 days",
    "  - Feature sets: simple (lags 1-5), extended (all lags)",
    "\nGoal: Find best config for EACH hotel!",
    "="*80,
    sep="\n",
)

# ============================================================================
# FEATURE ENGINEERING
# ============================================================================

def detrend_log_series(series, window):
    """
    Split a series into a trailing rolling-mean trend and the residual around it.

    The trend at each position is the mean of up to `window` most recent
    values, the current one included; with min_periods=1 the first positions
    use however many values are available, so the trend has no leading NaN
    of its own.

    Returns (detrended, trend), where detrended = series - trend.
    """
    rolling_mean = series.rolling(window, min_periods=1).mean()
    residual = series - rolling_mean
    return residual, rolling_mean

def prepare_features(df, config):
    """
    Prepare features with ADAPTIVE configuration.

    Builds the model matrix and the log-detrended target for one hotel.

    Parameters
    ----------
    df : DataFrame
        Must contain 'base_rate'. Competitor lag columns are recognised by
        name: they contain '_lag_', contain one of the currency tags
        '-USD', '-EUR', '-HKD', '-CNY', and do not contain 'base_rate'
        (case-insensitive). The calendar columns 'day_of_week', 'month',
        'is_weekend' and 'day_of_year' are used when present.
    config : dict
        'trend_window' : rolling window passed to detrend_log_series
                         (7/14/21/28 in the search grid)
        'feature_set'  : 'simple' keeps competitor lags 1-5 only;
                         any other value keeps every competitor lag and,
                         when more than one lag-1 column exists, adds
                         cross-competitor mean/min/max/std of log price.

    Returns
    -------
    None if no competitor lag column qualifies; otherwise the tuple
    (X, y_detrended, y_trend, y_original, feature_cols). All four data
    objects are row-aligned with a fresh 0..n-1 index; rows with any
    missing feature or missing log target have been dropped.
    """
    # Function-scope import keeps this block self-contained.
    import re

    lag_pattern = re.compile(r'_lag_(\d+)')

    def lag_numbers(col):
        """Return the integer lag orders named in a column label."""
        return {int(digits) for digits in lag_pattern.findall(col)}

    df_processed = df.copy()
    feature_cols = []

    trend_window = config['trend_window']
    feature_set = config['feature_set']

    # ========================================================================
    # COMPETITOR FEATURES
    # ========================================================================

    currency_tags = ('-USD', '-EUR', '-HKD', '-CNY')
    all_comp_lag_cols = [col for col in df.columns
                         if '_lag_' in col
                         and 'base_rate' not in col.lower()
                         and any(tag in col for tag in currency_tags)]

    if not all_comp_lag_cols:
        return None

    if feature_set == 'simple':
        # Only lags 1-5 (like baseline). The lag order is parsed as a number:
        # a plain substring test for '_lag_1' would also accept '_lag_10',
        # '_lag_14', ... and silently widen the "simple" set.
        simple_lags = {1, 2, 3, 4, 5}
        comp_lag_cols = [col for col in all_comp_lag_cols
                         if lag_numbers(col) & simple_lags]
    else:
        # All available lags (extended)
        comp_lag_cols = all_comp_lag_cols

    if not comp_lag_cols:
        return None

    # Log + detrend with ADAPTIVE window. Zero prices are treated as missing
    # so that log() never sees 0.
    for col in comp_lag_cols:
        log_prices = np.log(df_processed[col].replace(0, np.nan))
        detrended, _ = detrend_log_series(log_prices, window=trend_window)
        df_processed[f'{col}_log_detrended'] = detrended
        feature_cols.append(f'{col}_log_detrended')

    # Market aggregates (only for extended), computed across competitors'
    # lag-1 log prices. Exact lag match for the same reason as above.
    if feature_set == 'extended':
        lag1_cols = [col for col in comp_lag_cols if 1 in lag_numbers(col)]

        if len(lag1_cols) > 1:
            for col in lag1_cols:
                df_processed[f'{col}_log'] = np.log(df_processed[col].replace(0, np.nan))

            lag1_log_cols = [f'{col}_log' for col in lag1_cols]

            df_processed['comp_mean_log'] = df_processed[lag1_log_cols].mean(axis=1)
            df_processed['comp_min_log'] = df_processed[lag1_log_cols].min(axis=1)
            df_processed['comp_max_log'] = df_processed[lag1_log_cols].max(axis=1)
            df_processed['comp_std_log'] = df_processed[lag1_log_cols].std(axis=1)

            feature_cols.extend(['comp_mean_log', 'comp_min_log', 'comp_max_log', 'comp_std_log'])

    # ========================================================================
    # TEMPORAL FEATURES (10 baseline features)
    # ========================================================================

    temporal_cols = ['day_of_week', 'month', 'is_weekend', 'day_of_year']
    temporal_cols = [col for col in temporal_cols if col in df_processed.columns]
    feature_cols.extend(temporal_cols)

    # Cyclical encoding: sin/cos pair per calendar column, with the period
    # used as the divisor (7, 12, 365).
    for source_col, period in (('day_of_week', 7), ('month', 12), ('day_of_year', 365)):
        if source_col in df_processed.columns:
            angle = 2 * np.pi * df_processed[source_col] / period
            df_processed[f'sin_{source_col}'] = np.sin(angle)
            df_processed[f'cos_{source_col}'] = np.cos(angle)
            feature_cols.extend([f'sin_{source_col}', f'cos_{source_col}'])

    # ========================================================================
    # TARGET (with ADAPTIVE detrending window)
    # ========================================================================

    y_original = df_processed['base_rate']
    y_log = np.log(y_original.replace(0, np.nan))
    y_detrended, y_trend = detrend_log_series(y_log, window=trend_window)

    X = df_processed[feature_cols].copy()

    # Keep only rows where every feature and the log target are present.
    valid_idx = ~(X.isnull().any(axis=1) | y_detrended.isnull() | y_log.isnull())
    X = X[valid_idx].reset_index(drop=True)
    y_detrended = y_detrended[valid_idx].reset_index(drop=True)
    y_trend = y_trend[valid_idx].reset_index(drop=True)
    y_original = y_original[valid_idx].reset_index(drop=True)

    return X, y_detrended, y_trend, y_original, feature_cols

# ============================================================================
# CV AND EVALUATION
# ============================================================================

def time_series_cv_splits(n_samples, min_train, test_window):
    """
    Build expanding-window CV folds over row positions 0..n_samples-1.

    The first fold trains on the first `min_train` rows and tests on the next
    `test_window` rows; each later fold extends the training range by one
    test window. Only folds whose test range fits entirely inside the data
    are produced, and at most 6 folds are generated.

    Returns a list of {'train_idx': [...], 'test_idx': [...]} dicts, or None
    when fewer than 3 folds fit.
    """
    folds = []
    cutoff = min_train

    for _ in range(6):
        test_stop = cutoff + test_window
        if test_stop > n_samples:
            break
        folds.append({
            'train_idx': list(range(cutoff)),
            'test_idx': list(range(cutoff, test_stop)),
        })
        cutoff = test_stop

    return folds if len(folds) >= 3 else None

def evaluate_configuration(X_all, y_detrended, y_trend, y_original, config):
    """
    Cross-validate one configuration and report mean R² on the price scale.

    For every fold from time_series_cv_splits a scaler and an XGBoost model
    are fitted on the training rows only. Predictions of the detrended log
    target are turned back into prices by adding the matching y_trend values
    and exponentiating, then scored against y_original with r2_score.

    Returns None when the data cannot supply enough folds; otherwise a dict
    with 'mean_test_r2', 'mean_train_r2' and 'n_folds'.

    NOTE(review): y_trend is used as given for the test rows. If it was
    computed from the same-day target (as a trailing mean that includes the
    current row would be), the reconstruction is handed part of the answer -
    confirm how y_trend is produced upstream.
    """
    folds = time_series_cv_splits(
        len(X_all), config['min_train_days'], config['test_window']
    )
    if folds is None:
        return None

    def price_scale_r2(fitted_model, scaled_features, rows):
        # Undo the detrending and the log transform before scoring.
        log_prediction = fitted_model.predict(scaled_features) + y_trend.iloc[rows].values
        return r2_score(y_original.iloc[rows], np.exp(log_prediction))

    train_scores = []
    test_scores = []

    for fold in folds:
        train_rows = fold['train_idx']
        test_rows = fold['test_idx']

        # Scaler statistics come from the training rows of this fold only.
        fold_scaler = StandardScaler()
        train_scaled = fold_scaler.fit_transform(X_all.iloc[train_rows])
        test_scaled = fold_scaler.transform(X_all.iloc[test_rows])

        fold_model = XGBRegressor(**XGB_PARAMS)
        fold_model.fit(train_scaled, y_detrended.iloc[train_rows], verbose=False)

        train_scores.append(price_scale_r2(fold_model, train_scaled, train_rows))
        test_scores.append(price_scale_r2(fold_model, test_scaled, test_rows))

    return {
        'mean_test_r2': float(np.mean(test_scores)),
        'mean_train_r2': float(np.mean(train_scores)),
        'n_folds': len(folds)
    }

def train_final_model(X_all, y_detrended):
    """
    Fit the deployment model on every available row.

    A fresh StandardScaler is fitted on X_all and an XGBRegressor built from
    XGB_PARAMS is trained on the scaled features against y_detrended.

    Returns (model, scaler); the scaler must be applied to new features
    before calling the model.
    """
    feature_scaler = StandardScaler()
    scaled_features = feature_scaler.fit_transform(X_all)

    final_model = XGBRegressor(**XGB_PARAMS)
    final_model.fit(scaled_features, y_detrended, verbose=False)

    return final_model, feature_scaler

# ============================================================================
# ADAPTIVE PROCESSING
# ============================================================================

def process_hotel_adaptive(hotel_id):
    """
    Process one hotel with ADAPTIVE approach:
    1. Try all configurations
    2. Select best based on test R²
    3. Train final model with best config

    Reads '<hotel_id>_lagged_dataset.csv' from processed_data_path, keeps rows
    dated on or before DATA_END_DATE, evaluates every entry of CONFIGURATIONS
    by cross-validation, retrains on all rows with the winning configuration
    and pickles the model bundle to output_path / '<hotel_id>_model.pkl'.

    Returns a dict that always has 'hotel_id' and 'status'. Status values:
      'missing_file'       - dataset file does not exist
      'insufficient_data'  - fewer rows than the smallest configuration
                             needs (also carries 'observations')
      'all_configs_failed' - no configuration produced a CV score
      'error'              - an exception was raised (carries 'error')
      'success'            - model saved; carries the chosen configuration,
                             its CV scores and the score of every
                             configuration tried

    NOTE(review): 'test_r2' is the maximum over the searched configurations
    of the same CV scores used for selection, so it is an optimistic
    estimate of out-of-sample performance.
    """

    try:
        data_file = processed_data_path / f'{hotel_id}_lagged_dataset.csv'

        if not data_file.exists():
            return {'hotel_id': hotel_id, 'status': 'missing_file'}

        df = pd.read_csv(data_file)
        df['date'] = pd.to_datetime(df['date'])
        df = df[df['date'] <= DATA_END_DATE].copy()
        df = df.sort_values('date').reset_index(drop=True)

        # Need minimum data for any config. Derived from the grid so the
        # guard stays correct when CONFIGURATIONS is edited.
        min_rows_needed = min(
            (c['min_train_days'] + c['test_window'] for c in CONFIGURATIONS),
            default=0,
        )
        if len(df) < min_rows_needed:
            return {'hotel_id': hotel_id, 'status': 'insufficient_data',
                   'observations': len(df)}

        # Try all configurations, remembering the prepared data of the best
        # one so it does not have to be rebuilt for the final fit.
        config_results = {}
        best_config_name = None
        best_test_r2 = None
        best_prepared = None

        for config in CONFIGURATIONS:
            prepared = prepare_features(df, config)

            if prepared is None:
                continue

            X, y_detrended, y_trend, y_original, feature_cols = prepared

            # Need enough data for this config
            if len(X) < config['min_train_days'] + config['test_window']:
                continue

            eval_result = evaluate_configuration(X, y_detrended, y_trend, y_original, config)

            if eval_result is None:
                continue

            config_results[config['name']] = {
                'config': config,
                'test_r2': eval_result['mean_test_r2'],
                'train_r2': eval_result['mean_train_r2'],
                'n_folds': eval_result['n_folds'],
                'n_features': len(feature_cols)
            }

            # Strict '>' keeps the earliest configuration on ties, matching
            # what max() over the insertion-ordered results would pick.
            if best_config_name is None or eval_result['mean_test_r2'] > best_test_r2:
                best_config_name = config['name']
                best_test_r2 = eval_result['mean_test_r2']
                best_prepared = prepared

        if best_config_name is None:
            return {'hotel_id': hotel_id, 'status': 'all_configs_failed'}

        best_entry = config_results[best_config_name]
        best_config = best_entry['config']

        # Train final model with best config on all valid rows.
        X, y_detrended, y_trend, y_original, feature_cols = best_prepared

        model, scaler = train_final_model(X, y_detrended)

        # Save model together with what is needed to rebuild features and
        # re-add the trend at prediction time.
        model_data = {
            'model': model,
            'scaler': scaler,
            'feature_cols': feature_cols,
            'config': best_config,
            'trend_window': best_config['trend_window'],
            'y_trend_last': y_trend.iloc[-1],
            'method': 'adaptive_log_detrending'
        }

        model_file = output_path / f'{hotel_id}_model.pkl'
        with open(model_file, 'wb') as f:
            pickle.dump(model_data, f)

        return {
            'hotel_id': hotel_id,
            'status': 'success',
            'best_config_name': best_config_name,
            'trend_window': best_config['trend_window'],
            'min_train_days': best_config['min_train_days'],
            'feature_set': best_config['feature_set'],
            'test_r2': best_test_r2,
            'train_r2': best_entry['train_r2'],
            'n_features': len(feature_cols),
            'n_folds': best_entry['n_folds'],
            'n_configs_tried': len(config_results),
            'all_configs': {k: {'test_r2': v['test_r2'], 'n_features': v['n_features']}
                           for k, v in config_results.items()}
        }

    except Exception as e:
        # Batch boundary: one bad hotel must not stop the run. The exception
        # type is recorded because str(e) alone can be cryptic (a KeyError
        # stringifies to just the missing key).
        return {'hotel_id': hotel_id, 'status': 'error',
                'error': f'{type(e).__name__}: {e}'}

# ============================================================================
# RUN FOR ALL HOTELS
# ============================================================================

# Run the adaptive search for every hotel in the mapping, printing one
# progress line per hotel.
print(f"\nProcessing hotels with ADAPTIVE configurations...")
print("-"*80)

all_results = {}
successful = []

for idx, hotel_id in enumerate(hotel_list, 1):
    print(f"[{idx}/{len(hotel_list)}] {hotel_id}...", end=' ')

    result = process_hotel_adaptive(hotel_id)
    all_results[hotel_id] = result

    if result['status'] == 'success':
        successful.append(hotel_id)
        config_info = f"trend={result['trend_window']}, {result['feature_set']}"
        print(f"R²={result['test_r2']:.4f} ({config_info})")
    elif 'error' in result:
        # Show the exception text so failures can be diagnosed from the log.
        print(f"{result['status']}: {result['error']}")
    else:
        print(f"{result['status']}")

# ============================================================================
# SUMMARY
# ============================================================================

print(f"\n{'='*80}")
print("ADAPTIVE LOG DETRENDING - RESULTS")
print(f"{'='*80}")

print(f"\nTotal hotels: {len(hotel_list)}")
print(f"Successful: {len(successful)}")
print(f"Failed: {len(hotel_list) - len(successful)}")

# Persist every hotel's outcome, including failures. Writing this only when
# some hotel succeeded would discard the diagnostics exactly when the whole
# run failed.
with open(output_path / 'adaptive_summary.json', 'w') as f:
    json.dump(all_results, f, indent=2)

if len(successful) > 0:
    test_r2_values = [all_results[h]['test_r2'] for h in successful]
    n_successful = len(successful)

    print(f"\nPERFORMANCE STATISTICS:")
    print(f"  Mean Test R²: {np.mean(test_r2_values):.4f}")
    print(f"  Median Test R²: {np.median(test_r2_values):.4f}")
    print(f"  Min Test R²: {np.min(test_r2_values):.4f}")
    print(f"  Max Test R²: {np.max(test_r2_values):.4f}")
    print(f"  Std Test R²: {np.std(test_r2_values):.4f}")

    # Reference figures of the earlier baseline run, used for the comparison.
    baseline_mean_r2 = 0.1156
    baseline_median_r2 = 0.2236

    print(f"\n{'='*80}")
    print("COMPARISON TO YOUR BASELINE:")
    print(f"{'='*80}")
    print(f"  YOUR BASELINE:    Mean = {baseline_mean_r2:.4f}, Median = {baseline_median_r2:.4f} (22 hotels)")
    print(f"  ADAPTIVE METHOD:  Mean = {np.mean(test_r2_values):.4f}, Median = {np.median(test_r2_values):.4f} ({len(successful)} hotels)")

    # Relative change versus the baseline, in percent.
    improvement_mean = ((np.mean(test_r2_values) - baseline_mean_r2) / baseline_mean_r2) * 100
    improvement_median = ((np.median(test_r2_values) - baseline_median_r2) / baseline_median_r2) * 100

    print(f"\n  Mean improvement: {improvement_mean:+.1f}%")
    print(f"  Median improvement: {improvement_median:+.1f}%")

    # Configuration analysis
    print(f"\n{'='*80}")
    print("CONFIGURATION ANALYSIS:")
    print(f"{'='*80}")

    # How often each trend window was selected as best.
    trend_windows = {}
    for h in successful:
        tw = all_results[h]['trend_window']
        trend_windows[tw] = trend_windows.get(tw, 0) + 1

    print(f"\nBest Trend Window Distribution:")
    for tw in sorted(trend_windows):
        count = trend_windows[tw]
        print(f"  {tw}-day window: {count} hotels ({count/n_successful*100:.1f}%)")

    # How often each feature set was selected as best.
    feature_sets = {}
    for h in successful:
        fs = all_results[h]['feature_set']
        feature_sets[fs] = feature_sets.get(fs, 0) + 1

    print(f"\nBest Feature Set Distribution:")
    for fs, count in feature_sets.items():
        print(f"  {fs}: {count} hotels ({count/n_successful*100:.1f}%)")

    # Performance tiers. GOOD overlaps HIGH/MEDIUM/LOW; those three plus
    # NEGATIVE partition the hotels.
    good = sum(1 for r2 in test_r2_values if r2 > 0)
    high = sum(1 for r2 in test_r2_values if r2 > 0.4)
    medium = sum(1 for r2 in test_r2_values if 0.2 < r2 <= 0.4)
    low = sum(1 for r2 in test_r2_values if 0 < r2 <= 0.2)
    negative = sum(1 for r2 in test_r2_values if r2 <= 0)

    print(f"\n{'='*80}")
    print("PERFORMANCE TIERS:")
    print(f"{'='*80}")
    print(f"  GOOD (R² > 0): {good} hotels ({good/n_successful*100:.1f}%)")
    print(f"  HIGH (R² > 0.4): {high} hotels ({high/n_successful*100:.1f}%)")
    print(f"  MEDIUM (0.2-0.4): {medium} hotels ({medium/n_successful*100:.1f}%)")
    print(f"  LOW (0-0.2): {low} hotels ({low/n_successful*100:.1f}%)")
    print(f"  NEGATIVE: {negative} hotels ({negative/n_successful*100:.1f}%)")

    # Deployment table: one row per successfully modelled hotel, best first.
    deployment_df = pd.DataFrame([
        {
            'hotel_id': h,
            'test_r2': all_results[h]['test_r2'],
            'train_r2': all_results[h]['train_r2'],
            'trend_window': all_results[h]['trend_window'],
            'feature_set': all_results[h]['feature_set'],
            'n_features': all_results[h]['n_features'],
            'n_folds': all_results[h]['n_folds'],
            'n_configs_tried': all_results[h]['n_configs_tried'],
            'model_file': f'{h}_model.pkl'
        }
        for h in successful
    ])
    deployment_df = deployment_df.sort_values('test_r2', ascending=False)
    deployment_df.to_csv(output_path / 'deployment_adaptive.csv', index=False)

    print(f"\n Models saved to: {output_path}/")
    print(f" Summary: adaptive_summary.json")
    print(f" Deployment: deployment_adaptive.csv")

print(f"\n{'='*80}")
print("KEY INSIGHT: Each hotel got its optimal configuration!")
print(f"{'='*80}")