In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import json
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
import statsmodels.api as sm
import warnings
# Install a blanket 'ignore' warnings filter, then print the notebook banner.
warnings.filterwarnings('ignore')

print("=" * 80, "COMPETITOR SET MODELING - ADVANCED IMPUTATION", "=" * 80, sep="\n")

COMPETITOR SET MODELING - ADVANCED IMPUTATION


## Baseline Imputation

In [2]:
# ----------------------------------------------------------------------------
# Load the baseline lagged dataset and its lag-selection metadata
# ----------------------------------------------------------------------------
data_path = Path.cwd().parent.parent / 'data' / 'dataprocessed'

# Parse the date column and put the rows in chronological order.
df = (
    pd.read_csv(data_path / 'lagged_predictive_dataset.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

with open(data_path / 'lag_selection_metadata.json', 'r') as f:
    metadata = json.load(f)

# A hotel's name is whatever precedes '_lag_' in its competitor lag columns.
competitor_lag_cols = [
    column for column in df.columns
    if 'booking-us' in column and 'lag' in column
]
all_hotels = {column.split('_lag_')[0] for column in competitor_lag_cols}
all_hotels_list = sorted(all_hotels)

# The "+ 1" counts the focal hotel alongside the competitors.
print(f"\nHotels to model: {len(all_hotels_list) + 1}")
print(f"  Focal: {metadata['focal_column']}")
for competitor_name in all_hotels_list:
    print(f"  Competitor: {competitor_name}")

# ============================================================================
# MODEL BUILDING FUNCTION (IDENTICAL TO FOCAL METHODOLOGY)
# ============================================================================

def build_hotel_model(df, target_col):
    """Build a Lasso-selected, OLS-refitted model for one hotel's price column.

    Steps, in order:
      1. Drop every column whose name contains the target's base name and 'lag'.
      2. Add week-of-year polynomial and sin/cos terms plus seasonal dummies.
      3. Rank the 'booking-us' lag columns by absolute correlation with the
         target and greedily keep those whose absolute pairwise correlation
         with every already-kept column is at most 0.90.
      4. Standardise the pooled features and select with 5-fold LassoCV.
      5. Refit the selected features with OLS (HC1 covariance) and compute
         in-sample error metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime 'date' column (the ``.dt`` accessor is used),
        the ``target_col`` column and the lagged competitor columns.
    target_col : str
        Name of the column to model.

    Returns
    -------
    dict
        Keys: 'target', 'model' (fitted statsmodels results), 'features',
        'n_features', 'adj_r2', 'rmse', 'mape', 'median_ape',
        'n_significant' (count of p-values < 0.05, first term excluded) and
        'avg_price' (mean of the target over the OLS sample).
    """
    y = df[target_col].dropna()

    # Remove the target's own lags so the model cannot use them as predictors.
    target_base = target_col.split('_lag_')[0]
    self_lag_cols = [col for col in df.columns if target_base in col and 'lag' in col]
    df_work = df.drop(columns=self_lag_cols, errors='ignore')

    # Week-of-year features: centred polynomial terms and a sin/cos pair.
    df_work['week_of_year'] = df_work['date'].dt.isocalendar().week.astype(float)
    week_mean = df_work['week_of_year'].mean()
    df_work['week_centered'] = df_work['week_of_year'] - week_mean
    df_work['week_squared'] = df_work['week_centered'] ** 2
    df_work['week_cubic'] = df_work['week_centered'] ** 3
    df_work['week_quartic'] = df_work['week_centered'] ** 4
    df_work['sin_week'] = np.sin(2 * np.pi * df_work['week_of_year'] / 52).astype(float)
    df_work['cos_week'] = np.cos(2 * np.pi * df_work['week_of_year'] / 52).astype(float)

    # Seasonal indicators; an existing 'is_holiday' column is used as-is.
    if 'is_holiday' not in df_work.columns:
        df_work['is_holiday'] = 0
        df_work.loc[df_work['date'].dt.month.isin([12, 1]), 'is_holiday'] = 1
        df_work.loc[(df_work['date'].dt.month == 7) & (df_work['date'].dt.day <= 7), 'is_holiday'] = 1
        df_work.loc[(df_work['date'].dt.month == 11) & (df_work['date'].dt.day >= 22), 'is_holiday'] = 1

    df_work['is_peak_season'] = ((df_work['date'].dt.month.isin([6, 7, 8])) |
                                  (df_work['date'].dt.month.isin([12, 1]))).astype(int)
    df_work['is_summer'] = df_work['date'].dt.month.isin([6, 7, 8]).astype(int)

    # Rank competitor lags by |corr| with the target, then greedily drop any
    # candidate that is > 0.90 correlated with one already kept.
    competitor_cols = [col for col in df_work.columns if 'booking-us' in col and 'lag' in col]
    comp_corr_with_target = df_work[competitor_cols].corrwith(y).abs().sort_values(ascending=False)

    all_competitors = []
    for comp in comp_corr_with_target.index:
        is_redundant = any(
            abs(df_work[comp].corr(df_work[existing])) > 0.90
            for existing in all_competitors
        )
        if not is_redundant:
            all_competitors.append(comp)

    seasonal_cols = ['is_holiday', 'is_peak_season', 'is_summer']

    # Pool the candidate features for Lasso. dict.fromkeys de-duplicates while
    # keeping first-seen order; list(set(...)) ordered the columns by string
    # hash, which varies between interpreter sessions.
    features_A = all_competitors + seasonal_cols + ['sin_week', 'cos_week']
    features_B = all_competitors + seasonal_cols + ['week_centered', 'week_squared', 'week_cubic', 'week_quartic']
    features_D = list(dict.fromkeys(features_A + features_B))

    # Keep only rows where both the features and the target are present;
    # indexing y with labels it dropped as NaN would raise KeyError.
    X_D = df_work[features_D].dropna()
    X_D = X_D.loc[X_D.index.isin(y.index)]
    y_D = y.loc[X_D.index]

    scaler = StandardScaler()
    X_D_scaled = scaler.fit_transform(X_D)

    lasso = LassoCV(cv=5, random_state=42, max_iter=10000, n_jobs=-1)
    lasso.fit(X_D_scaled, y_D)

    selected_mask = lasso.coef_ != 0
    selected_features = np.array(features_D)[selected_mask]
    n_selected = int(selected_mask.sum())

    # Refit with OLS on the selected features only (same row alignment rule).
    X_selected = df_work[list(selected_features)].dropna()
    X_selected = X_selected.loc[X_selected.index.isin(y.index)]
    y_selected = y.loc[X_selected.index]

    X_with_const = sm.add_constant(X_selected)
    model = sm.OLS(y_selected, X_with_const).fit(cov_type='HC1')

    # In-sample metrics. Percentage errors are undefined where the actual
    # value is zero, so those rows are left out of MAPE / median APE.
    y_pred = model.predict(X_with_const)
    nonzero = y_selected != 0
    percentage_errors = np.abs((y_selected[nonzero] - y_pred[nonzero]) / y_selected[nonzero]) * 100
    mape = percentage_errors.mean()
    median_ape = percentage_errors.median()
    rmse = np.sqrt(model.mse_resid)

    return {
        'target': target_col,
        'model': model,
        'features': list(selected_features),
        'n_features': n_selected,
        'adj_r2': model.rsquared_adj,
        'rmse': rmse,
        'mape': mape,
        'median_ape': median_ape,
        # Positional slice: skip the first term (the constant added above).
        'n_significant': (model.pvalues.iloc[1:] < 0.05).sum(),
        'avg_price': y_selected.mean()
    }

# ============================================================================
# BUILD MODELS
# ============================================================================
# ----------------------------------------------------------------------------
# Fit one model per hotel
# ----------------------------------------------------------------------------
print("\n" + "=" * 80, "BUILDING MODELS", "=" * 80, sep="\n")

# The focal hotel is fitted outside any try/except, so a failure here stops the cell.
focal_column = metadata['focal_column']
print(f"\n{focal_column}")
focal_result = build_hotel_model(df, focal_column)
results = {'FOCAL': focal_result}
print(f"Adj R²: {focal_result['adj_r2']:.4f}")
print(f"Features: {focal_result['n_features']}")
print(f"MAPE: {focal_result['mape']:.2f}%")

# Competitors are best-effort: a failed fit is reported and stored as None.
for hotel in all_hotels_list:
    print(f"\n{hotel}")
    try:
        fit = build_hotel_model(df, hotel)
        results[hotel] = fit
        print(f"Adj R²: {fit['adj_r2']:.4f}")
        print(f"Features: {fit['n_features']}")
        print(f"MAPE: {fit['mape']:.2f}%")
    except Exception as e:
        print(f"Error: {str(e)}")
        results[hotel] = None

# ----------------------------------------------------------------------------
# Summarise the successful fits, best adjusted R² first
# ----------------------------------------------------------------------------
print("\n" + "=" * 80, "SUMMARY", "=" * 80, sep="\n")

summary_data = [
    {
        'Hotel': name,
        'Adj_R2': fit['adj_r2'],
        'RMSE': fit['rmse'],
        'MAPE': fit['mape'],
        'Features': fit['n_features'],
        'Significant': fit['n_significant'],
        'Avg_Price': fit['avg_price'],
    }
    for name, fit in results.items()
    if fit is not None
]

summary_df = pd.DataFrame(summary_data).sort_values('Adj_R2', ascending=False)

print("\n" + summary_df.to_string(index=False))

adj_r2 = summary_df['Adj_R2']
print(f"\nAverage Adj R²: {adj_r2.mean():.4f}")
print(f"Std Dev: {adj_r2.std():.4f}")
print(f"Range: {adj_r2.min():.4f} to {adj_r2.max():.4f}")

# ----------------------------------------------------------------------------
# Persist the comparison table and the aggregate statistics
# ----------------------------------------------------------------------------
output_path = Path.cwd().parent.parent / 'data' / 'dataprocessed'
summary_df.to_csv(output_path / 'compset_model_comparison.csv', index=False)

results_export = {
    'mean_adj_r2': float(adj_r2.mean()),
    'std_adj_r2': float(adj_r2.std()),
    'mean_mape': float(summary_df['MAPE'].mean()),
    'n_hotels': len(summary_df)
}

with open(output_path / 'compset_modeling_results.json', 'w') as f:
    json.dump(results_export, f, indent=2)

print("\nFiles saved:")
print("  compset_model_comparison.csv")
print("  compset_modeling_results.json")


Hotels to model: 6
  Focal: base_rate
  Competitor: booking-us-aqua-pacific-monarch-USD
  Competitor: booking-us-castle-kamaole-sands-USD
  Competitor: booking-us-courtyard-by-marriott-maui-kahului-airport-USD
  Competitor: booking-us-kohea-kai-resort-maui-USD
  Competitor: booking-us-ohana-waikiki-malia-USD

BUILDING MODELS

base_rate
Adj R²: 0.5833
Features: 13
MAPE: 6.36%

booking-us-aqua-pacific-monarch-USD
Adj R²: 0.5535
Features: 11
MAPE: 4.48%

booking-us-castle-kamaole-sands-USD
Adj R²: 0.6234
Features: 13
MAPE: 4.29%

booking-us-courtyard-by-marriott-maui-kahului-airport-USD
Adj R²: 0.6103
Features: 11
MAPE: 4.66%

booking-us-kohea-kai-resort-maui-USD
Adj R²: 0.5596
Features: 13
MAPE: 2.89%

booking-us-ohana-waikiki-malia-USD
Adj R²: 0.7371
Features: 16
MAPE: 6.31%

SUMMARY

                                                    Hotel   Adj_R2      RMSE     MAPE  Features  Significant  Avg_Price
                       booking-us-ohana-waikiki-malia-USD 0.737148 21.738551 6.31299

## Advanced Imputation

In [3]:
# ----------------------------------------------------------------------------
# Load the advanced-imputation lagged dataset and its lag-selection metadata
# ----------------------------------------------------------------------------
data_path = Path.cwd().parent.parent / 'data' / 'dataprocessed'

# Parse the date column and put the rows in chronological order.
df = (
    pd.read_csv(data_path / 'lagged_predictive_dataset_advanced_imputation.csv')
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .sort_values('date')
    .reset_index(drop=True)
)

with open(data_path / 'lag_selection_metadata_advanced_imputation.json', 'r') as f:
    metadata = json.load(f)

# A hotel's name is whatever precedes '_lag_' in its competitor lag columns.
competitor_lag_cols = [
    column for column in df.columns
    if 'booking-us' in column and 'lag' in column
]
all_hotels = {column.split('_lag_')[0] for column in competitor_lag_cols}
all_hotels_list = sorted(all_hotels)

# The "+ 1" counts the focal hotel alongside the competitors.
print(f"\nHotels to model: {len(all_hotels_list) + 1}")
print(f"  Focal: {metadata['focal_column']}")
for competitor_name in all_hotels_list:
    print(f"  Competitor: {competitor_name}")

# ============================================================================
# MODEL BUILDING FUNCTION (IDENTICAL TO FOCAL METHODOLOGY)
# ============================================================================

def build_hotel_model(df, target_col):
    """Build a Lasso-selected, OLS-refitted model for one hotel's price column.

    Steps, in order:
      1. Drop every column whose name contains the target's base name and 'lag'.
      2. Add week-of-year polynomial and sin/cos terms plus seasonal dummies.
      3. Rank the 'booking-us' lag columns by absolute correlation with the
         target and greedily keep those whose absolute pairwise correlation
         with every already-kept column is at most 0.90.
      4. Standardise the pooled features and select with 5-fold LassoCV.
      5. Refit the selected features with OLS (HC1 covariance) and compute
         in-sample error metrics.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a datetime 'date' column (the ``.dt`` accessor is used),
        the ``target_col`` column and the lagged competitor columns.
    target_col : str
        Name of the column to model.

    Returns
    -------
    dict
        Keys: 'target', 'model' (fitted statsmodels results), 'features',
        'n_features', 'adj_r2', 'rmse', 'mape', 'median_ape',
        'n_significant' (count of p-values < 0.05, first term excluded) and
        'avg_price' (mean of the target over the OLS sample).
    """
    y = df[target_col].dropna()

    # Remove the target's own lags so the model cannot use them as predictors.
    target_base = target_col.split('_lag_')[0]
    self_lag_cols = [col for col in df.columns if target_base in col and 'lag' in col]
    df_work = df.drop(columns=self_lag_cols, errors='ignore')

    # Week-of-year features: centred polynomial terms and a sin/cos pair.
    df_work['week_of_year'] = df_work['date'].dt.isocalendar().week.astype(float)
    week_mean = df_work['week_of_year'].mean()
    df_work['week_centered'] = df_work['week_of_year'] - week_mean
    df_work['week_squared'] = df_work['week_centered'] ** 2
    df_work['week_cubic'] = df_work['week_centered'] ** 3
    df_work['week_quartic'] = df_work['week_centered'] ** 4
    df_work['sin_week'] = np.sin(2 * np.pi * df_work['week_of_year'] / 52).astype(float)
    df_work['cos_week'] = np.cos(2 * np.pi * df_work['week_of_year'] / 52).astype(float)

    # Seasonal indicators; an existing 'is_holiday' column is used as-is.
    if 'is_holiday' not in df_work.columns:
        df_work['is_holiday'] = 0
        df_work.loc[df_work['date'].dt.month.isin([12, 1]), 'is_holiday'] = 1
        df_work.loc[(df_work['date'].dt.month == 7) & (df_work['date'].dt.day <= 7), 'is_holiday'] = 1
        df_work.loc[(df_work['date'].dt.month == 11) & (df_work['date'].dt.day >= 22), 'is_holiday'] = 1

    df_work['is_peak_season'] = ((df_work['date'].dt.month.isin([6, 7, 8])) |
                                  (df_work['date'].dt.month.isin([12, 1]))).astype(int)
    df_work['is_summer'] = df_work['date'].dt.month.isin([6, 7, 8]).astype(int)

    # Rank competitor lags by |corr| with the target, then greedily drop any
    # candidate that is > 0.90 correlated with one already kept.
    competitor_cols = [col for col in df_work.columns if 'booking-us' in col and 'lag' in col]
    comp_corr_with_target = df_work[competitor_cols].corrwith(y).abs().sort_values(ascending=False)

    all_competitors = []
    for comp in comp_corr_with_target.index:
        is_redundant = any(
            abs(df_work[comp].corr(df_work[existing])) > 0.90
            for existing in all_competitors
        )
        if not is_redundant:
            all_competitors.append(comp)

    seasonal_cols = ['is_holiday', 'is_peak_season', 'is_summer']

    # Pool the candidate features for Lasso. dict.fromkeys de-duplicates while
    # keeping first-seen order; list(set(...)) ordered the columns by string
    # hash, which varies between interpreter sessions.
    features_A = all_competitors + seasonal_cols + ['sin_week', 'cos_week']
    features_B = all_competitors + seasonal_cols + ['week_centered', 'week_squared', 'week_cubic', 'week_quartic']
    features_D = list(dict.fromkeys(features_A + features_B))

    # Keep only rows where both the features and the target are present;
    # indexing y with labels it dropped as NaN would raise KeyError.
    X_D = df_work[features_D].dropna()
    X_D = X_D.loc[X_D.index.isin(y.index)]
    y_D = y.loc[X_D.index]

    scaler = StandardScaler()
    X_D_scaled = scaler.fit_transform(X_D)

    lasso = LassoCV(cv=5, random_state=42, max_iter=10000, n_jobs=-1)
    lasso.fit(X_D_scaled, y_D)

    selected_mask = lasso.coef_ != 0
    selected_features = np.array(features_D)[selected_mask]
    n_selected = int(selected_mask.sum())

    # Refit with OLS on the selected features only (same row alignment rule).
    X_selected = df_work[list(selected_features)].dropna()
    X_selected = X_selected.loc[X_selected.index.isin(y.index)]
    y_selected = y.loc[X_selected.index]

    X_with_const = sm.add_constant(X_selected)
    model = sm.OLS(y_selected, X_with_const).fit(cov_type='HC1')

    # In-sample metrics. Percentage errors are undefined where the actual
    # value is zero, so those rows are left out of MAPE / median APE.
    y_pred = model.predict(X_with_const)
    nonzero = y_selected != 0
    percentage_errors = np.abs((y_selected[nonzero] - y_pred[nonzero]) / y_selected[nonzero]) * 100
    mape = percentage_errors.mean()
    median_ape = percentage_errors.median()
    rmse = np.sqrt(model.mse_resid)

    return {
        'target': target_col,
        'model': model,
        'features': list(selected_features),
        'n_features': n_selected,
        'adj_r2': model.rsquared_adj,
        'rmse': rmse,
        'mape': mape,
        'median_ape': median_ape,
        # Positional slice: skip the first term (the constant added above).
        'n_significant': (model.pvalues.iloc[1:] < 0.05).sum(),
        'avg_price': y_selected.mean()
    }

# ============================================================================
# BUILD MODELS
# ============================================================================
# ============================================================================
# BUILD MODELS (advanced-imputation dataset)
# ============================================================================
print("\n" + "="*80)
print("BUILDING MODELS")
print("="*80)

results = {}

# Focal hotel: fitted outside try/except, so a failure here stops the cell.
print(f"\n{metadata['focal_column']}")
focal_result = build_hotel_model(df, metadata['focal_column'])
results['FOCAL'] = focal_result
print(f"Adj R²: {focal_result['adj_r2']:.4f}")
print(f"Features: {focal_result['n_features']}")
print(f"MAPE: {focal_result['mape']:.2f}%")

# Competitors: best-effort; a failed fit is reported and stored as None.
for hotel in all_hotels_list:
    print(f"\n{hotel}")
    try:
        result = build_hotel_model(df, hotel)
        results[hotel] = result
        print(f"Adj R²: {result['adj_r2']:.4f}")
        print(f"Features: {result['n_features']}")
        print(f"MAPE: {result['mape']:.2f}%")
    except Exception as e:
        print(f"Error: {str(e)}")
        results[hotel] = None

# ============================================================================
# SUMMARY
# ============================================================================
print("\n" + "="*80)
print("SUMMARY")
print("="*80)

# One row per successfully fitted hotel.
summary_data = []
for name, result in results.items():
    if result is not None:
        summary_data.append({
            'Hotel': name,
            'Adj_R2': result['adj_r2'],
            'RMSE': result['rmse'],
            'MAPE': result['mape'],
            'Features': result['n_features'],
            'Significant': result['n_significant'],
            'Avg_Price': result['avg_price']
        })

summary_df = pd.DataFrame(summary_data)
summary_df = summary_df.sort_values('Adj_R2', ascending=False)

print("\n" + summary_df.to_string(index=False))

print(f"\nAverage Adj R²: {summary_df['Adj_R2'].mean():.4f}")
print(f"Std Dev: {summary_df['Adj_R2'].std():.4f}")
print(f"Range: {summary_df['Adj_R2'].min():.4f} to {summary_df['Adj_R2'].max():.4f}")

# ============================================================================
# SAVE
# ============================================================================
# The baseline cell earlier in this notebook writes 'compset_model_comparison.csv'
# and 'compset_modeling_results.json'. Writing the same names here would silently
# overwrite the baseline results on Run All, so this run uses the
# '_advanced_imputation' suffix that its input files already carry.
# NOTE(review): if a downstream step expects the advanced results under the
# unsuffixed names, point it at these files instead.
output_path = Path().cwd().parent.parent / 'data' / 'dataprocessed'
summary_df.to_csv(output_path / 'compset_model_comparison_advanced_imputation.csv', index=False)

results_export = {
    'mean_adj_r2': float(summary_df['Adj_R2'].mean()),
    'std_adj_r2': float(summary_df['Adj_R2'].std()),
    'mean_mape': float(summary_df['MAPE'].mean()),
    'n_hotels': len(summary_df)
}

with open(output_path / 'compset_modeling_results_advanced_imputation.json', 'w') as f:
    json.dump(results_export, f, indent=2)

print("\nFiles saved:")
print("  compset_model_comparison_advanced_imputation.csv")
print("  compset_modeling_results_advanced_imputation.json")


Hotels to model: 6
  Focal: base_rate
  Competitor: booking-us-aqua-pacific-monarch-USD
  Competitor: booking-us-castle-kamaole-sands-USD
  Competitor: booking-us-courtyard-by-marriott-maui-kahului-airport-USD
  Competitor: booking-us-kohea-kai-resort-maui-USD
  Competitor: booking-us-ohana-waikiki-malia-USD

BUILDING MODELS

base_rate
Adj R²: 0.5931
Features: 12
MAPE: 6.30%

booking-us-aqua-pacific-monarch-USD
Adj R²: 0.7250
Features: 8
MAPE: 4.48%

booking-us-castle-kamaole-sands-USD
Adj R²: 0.5349
Features: 5
MAPE: 9.08%

booking-us-courtyard-by-marriott-maui-kahului-airport-USD
Adj R²: 0.6852
Features: 13
MAPE: 4.01%

booking-us-kohea-kai-resort-maui-USD
Adj R²: 0.0000
Features: 0
MAPE: 34.08%

booking-us-ohana-waikiki-malia-USD
Adj R²: 0.7551
Features: 13
MAPE: 5.91%

SUMMARY

                                                    Hotel       Adj_R2       RMSE      MAPE  Features  Significant  Avg_Price
                       booking-us-ohana-waikiki-malia-USD 7.550950e-01  20.98593