## SECTION 1: IMPORTS AND SETUP

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
from sklearn.metrics import r2_score
from scipy import stats

import os
import json
import warnings
warnings.filterwarnings('ignore')

## SECTION 2: DATA LOADING

In [2]:
# Focal hotel: one row per stay date; `stay_date` is parsed to datetime.
focal_daily = pd.read_csv('../data/dataprocessed/focal_daily_aggregated.csv')
focal_daily['stay_date'] = pd.to_datetime(focal_daily['stay_date'])

# Competitor price matrix: one column per competitor, indexed by `stay_date`.
comp_price_pivot = pd.read_csv('../data/dataprocessed/competitor_price_matrix.csv')
comp_price_pivot['stay_date'] = pd.to_datetime(comp_price_pivot['stay_date'])
comp_price_pivot = comp_price_pivot.set_index('stay_date')

# Metadata written by the preprocessing step; read again by later cells.
with open('../data/dataprocessed/preprocessing_metadata.json', 'r') as metadata_file:
    metadata = json.load(metadata_file)

# Short load report.
for report_line in (
    f"Focal hotel data: {len(focal_daily)} observations",
    f"Competitor data: {comp_price_pivot.shape}",
    f"Competitor hotels: {list(comp_price_pivot.columns)}",
    f"Data completeness from preprocessing: {metadata.get('data_completeness', 'Unknown')}",
):
    print(report_line)

Focal hotel data: 365 observations
Competitor data: (365, 5)
Competitor hotels: ['booking-us-aqua-pacific-monarch-USD', 'booking-us-castle-kamaole-sands-USD', 'booking-us-courtyard-by-marriott-maui-kahului-airport-USD', 'booking-us-kohea-kai-resort-maui-USD', 'booking-us-ohana-waikiki-malia-USD']
Data completeness from preprocessing: True


## SECTION 3: TEMPORAL FEATURE CONSTRUCTION

In [3]:
# Align the focal and competitor series on the stay dates they have in common.
focal_daily_indexed = focal_daily.set_index('stay_date')

print(f"Original focal observations: {len(focal_daily_indexed)}")
print(f"Original competitor observations: {comp_price_pivot.shape[0]}")

valid_dates = focal_daily_indexed.index.intersection(comp_price_pivot.index)
print(f"Overlapping dates: {len(valid_dates)}")

focal_final = focal_daily_indexed.loc[valid_dates]
comp_final = comp_price_pivot.loc[valid_dates]

# Any gap in the aligned data is treated as a hard error.
missing_focal = focal_final['base_rate'].isnull().sum()
missing_comp = comp_final.isnull().sum().sum()

print(f"Missing values - Competitors: {missing_comp}, Focal: {missing_focal}")

if missing_comp or missing_focal:
    print("ERROR: Missing values detected after preprocessing")
    print("Competitor missing by hotel:")
    print(comp_final.isnull().sum())
    raise ValueError("Data should be complete after preprocessing")

print(f"Final aligned sample: {len(comp_final)}")

if not len(comp_final):
    raise ValueError("No valid observations after alignment")

# Cyclical (sin/cos) encodings of month, day of week and ISO week, plus a
# December/January indicator, all derived from the aligned date index.
temporal_features = pd.DataFrame(index=valid_dates)

months = temporal_features.index.month
month_angle = 2 * np.pi * months / 12
temporal_features['sin_month'] = np.sin(month_angle)
temporal_features['cos_month'] = np.cos(month_angle)

weekday_angle = 2 * np.pi * temporal_features.index.dayofweek / 7
temporal_features['sin_day'] = np.sin(weekday_angle)
temporal_features['cos_day'] = np.cos(weekday_angle)

# ISO week number; the encoding uses a fixed period of 52 weeks.
iso_week = temporal_features.index.isocalendar().week.astype(int)
week_angle = 2 * np.pi * iso_week / 52
temporal_features['sin_week'] = np.sin(week_angle)
temporal_features['cos_week'] = np.cos(week_angle)

is_december = months == 12
is_january = months == 1
temporal_features['is_holiday_season'] = (is_december | is_january).astype(int)

print(f"Temporal features shape: {temporal_features.shape}")
print(f"Features: {list(temporal_features.columns)}")

Original focal observations: 365
Original competitor observations: 365
Overlapping dates: 365
Missing values - Competitors: 0, Focal: 0
Final aligned sample: 365
Temporal features shape: (365, 7)
Features: ['sin_month', 'cos_month', 'sin_day', 'cos_day', 'sin_week', 'cos_week', 'is_holiday_season']


## SECTION 4: COMPETITOR PRICE DECOMPOSITION

In [4]:
# Stage 1: regress each competitor's price series on the shared temporal
# features with ordinary least squares. For every hotel the fit statistics are
# stored in `competitor_results` and the in-sample residuals in
# `competitor_residuals` (one column per hotel, indexed by `valid_dates`).
linear_model = LinearRegression()

competitor_results = {}
competitor_residuals = pd.DataFrame(index=valid_dates)

sample_size = len(valid_dates)
print(f"Stage 1: Linear Competitor Price Decomposition (n={sample_size} for ALL hotels)")

# The design matrix is the same for every hotel, so its shape and NaN scan
# are computed once instead of on every iteration.
X = temporal_features
n, k = X.shape
features_have_nan = X.isnull().any().any()

for hotel_id in comp_final.columns:
    y = comp_final[hotel_id]

    # Hotels with missing inputs are reported and left out of the results.
    if features_have_nan or y.isnull().any():
        print(f"ERROR: NaN values found for {hotel_id}")
        continue

    linear_model.fit(X, y)
    predictions = linear_model.predict(X)
    residuals = y - predictions

    full_r2 = r2_score(y, predictions)
    tss = np.sum((y - y.mean()) ** 2)

    # Overall regression F-statistic expressed through R²:
    #   F = (R² / k) / ((1 - R²) / (n - k - 1))
    # It is undefined for a constant target (tss == 0) or a perfect fit.
    if tss > 0 and full_r2 < 1:
        f_statistic = (full_r2 / (1 - full_r2)) * ((n - k - 1) / k)
    else:
        f_statistic = np.nan

    competitor_results[hotel_id] = {
        'model_type': 'LinearRegression',
        'r2_score': full_r2,
        'f_statistic': f_statistic,
        'n_observations': len(y),
        # Copy: the estimator is reused, so coef_ is overwritten by the next fit.
        'coefficients': linear_model.coef_.copy(),
        'intercept': linear_model.intercept_,
        'feature_names': list(temporal_features.columns)
    }

    competitor_residuals[hotel_id] = residuals

    # F > 10 is the cut-off this notebook uses for "Strong"; NaN is reported as "N/A".
    strength = "Strong" if f_statistic > 10 else "Weak" if not np.isnan(f_statistic) else "N/A"
    print(f"{hotel_id}: R²={full_r2:.3f}, F={f_statistic:.1f} ({strength}), n={len(y)}")

Stage 1: Linear Competitor Price Decomposition (n=365 for ALL hotels)
booking-us-aqua-pacific-monarch-USD: R²=0.306, F=22.5 (Strong), n=365
booking-us-castle-kamaole-sands-USD: R²=0.214, F=13.9 (Strong), n=365
booking-us-courtyard-by-marriott-maui-kahului-airport-USD: R²=0.391, F=32.8 (Strong), n=365
booking-us-kohea-kai-resort-maui-USD: R²=0.254, F=17.4 (Strong), n=365
booking-us-ohana-waikiki-malia-USD: R²=0.486, F=48.2 (Strong), n=365


## SECTION 6: INSTRUMENT STRENGTH VALIDATION

In [5]:
# Classify every Stage 1 model by its F-statistic using the F > 10 cut-off.
# `f_statistics` maps hotel id -> F; `strong_instruments` lists hotels above
# the cut-off. An undefined (NaN) F-statistic is reported as "N/A" -- the same
# label the Stage 1 cell prints -- and is not counted as weak.
f_statistics = {}
strong_instruments = []
undefined_f_hotels = []

print("Instrument Strength Validation")
for hotel_id, results in competitor_results.items():
    f_stat = results['f_statistic']
    f_statistics[hotel_id] = f_stat

    if np.isnan(f_stat):
        # NaN compares False against any threshold, so test for it explicitly.
        undefined_f_hotels.append(hotel_id)
        strength = "N/A"
    elif f_stat > 10:
        strong_instruments.append(hotel_id)
        strength = "Strong"
    else:
        strength = "Weak"

    print(f"{hotel_id}: F-statistic = {f_stat:.1f} ({strength})")

weak_count = len(f_statistics) - len(strong_instruments) - len(undefined_f_hotels)
print(f"Strong instruments (F>10): {len(strong_instruments)}")
print(f"Weak instruments: {weak_count}")
if undefined_f_hotels:
    # Only shown when it applies, so the usual output is unchanged.
    print(f"Undefined F-statistic (not counted above): {len(undefined_f_hotels)}")

Instrument Strength Validation
booking-us-aqua-pacific-monarch-USD: F-statistic = 22.5 (Strong)
booking-us-castle-kamaole-sands-USD: F-statistic = 13.9 (Strong)
booking-us-courtyard-by-marriott-maui-kahului-airport-USD: F-statistic = 32.8 (Strong)
booking-us-kohea-kai-resort-maui-USD: F-statistic = 17.4 (Strong)
booking-us-ohana-waikiki-malia-USD: F-statistic = 48.2 (Strong)
Strong instruments (F>10): 5
Weak instruments: 0


## SECTION 7: STATISTICAL INFERENCE FOR INDIVIDUAL MODELS

In [6]:
# Part 1: sanity-check the stored Stage 1 residuals (mean and spread per hotel).
print("Residuals Quality Assessment")
for hotel_id in competitor_residuals.columns:
    residuals = competitor_residuals[hotel_id].dropna()
    if len(residuals) > 0:
        mean_resid = residuals.mean()
        std_resid = residuals.std()
        print(f"{hotel_id}: Mean={mean_resid:.6f}, Std={std_resid:.3f}, n={len(residuals)}")

print(f"Final residuals matrix shape: {competitor_residuals.shape}")

# Part 2: classical OLS inference for each Stage 1 model.
#   Cov(beta) = sigma² (X'X)^-1,  sigma² = SSR / (n - k - 1)
# t-statistics and two-sided p-values use a t distribution with n - k - 1
# degrees of freedom.
print("SECTION 7: STATISTICAL INFERENCE FOR INDIVIDUAL MODELS")

# Every hotel shares the same design matrix, so (X'X)^-1 is computed once.
n, k = temporal_features.shape
dof = n - k - 1
X_with_intercept = np.column_stack([np.ones(n), temporal_features])

try:
    XTX_inv = np.linalg.inv(X_with_intercept.T @ X_with_intercept)
except np.linalg.LinAlgError:
    # Singular design matrix: fall back to printing coefficients only.
    XTX_inv = None

for hotel_id, results in competitor_results.items():
    # Format F before branching so that both the normal and the fallback
    # report can use it.
    f_stat = results.get('f_statistic', 'N/A')
    if isinstance(f_stat, (int, float)):
        f_stat_str = f"{f_stat:.1f}"
    else:
        f_stat_str = str(f_stat)

    if XTX_inv is None:
        print(f"\n{hotel_id}: Could not compute standard errors")
        print(f"  R² = {results['r2_score']:.3f}, F = {f_stat_str}")
        print(f"  Coefficients without significance tests:")
        print(f"  Intercept: {results['intercept']:.3f}")
        for i, feature in enumerate(temporal_features.columns):
            print(f"  {feature}: {results['coefficients'][i]:.3f}")
        continue

    # Reuse the residuals stored by Stage 1 instead of refitting the model;
    # they belong to exactly the coefficients kept in `results`.
    residuals = competitor_residuals[hotel_id]
    residual_variance = np.sum(residuals ** 2) / dof
    standard_errors = np.sqrt(np.diag(residual_variance * XTX_inv))

    coefficients_with_intercept = np.concatenate([[results['intercept']], results['coefficients']])
    t_stats = coefficients_with_intercept / standard_errors
    # Survival function rather than 1 - cdf: keeps precision for large |t|.
    p_values = 2 * stats.t.sf(np.abs(t_stats), dof)

    print(f"\n{hotel_id} (R²={results['r2_score']:.3f}, F={f_stat_str}):")
    print(f"  Intercept: {results['intercept']:.3f} (t={t_stats[0]:.2f}, p={p_values[0]:.3f})")

    for i, feature in enumerate(temporal_features.columns):
        coeff = results['coefficients'][i]
        # Position 0 of t_stats / p_values is the intercept.
        t_stat = t_stats[i + 1]
        p_val = p_values[i + 1]
        significance = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else ""
        print(f"  {feature}: {coeff:.3f} (t={t_stat:.2f}, p={p_val:.3f}) {significance}")

print("\nSIGNIFICANCE LEGEND: *** p<0.001, ** p<0.01, * p<0.05")

Residuals Quality Assessment
booking-us-aqua-pacific-monarch-USD: Mean=-0.000000, Std=22.641, n=365
booking-us-castle-kamaole-sands-USD: Mean=0.000000, Std=27.667, n=365
booking-us-courtyard-by-marriott-maui-kahului-airport-USD: Mean=-0.000000, Std=38.645, n=365
booking-us-kohea-kai-resort-maui-USD: Mean=0.000000, Std=21.572, n=365
booking-us-ohana-waikiki-malia-USD: Mean=-0.000000, Std=30.473, n=365
Final residuals matrix shape: (365, 5)
SECTION 7: STATISTICAL INFERENCE FOR INDIVIDUAL MODELS

booking-us-aqua-pacific-monarch-USD (R²=0.306, F=22.5):
  Intercept: 273.798 (t=197.79, p=0.000)
  sin_month: 18.389 (t=1.67, p=0.096) 
  cos_month: -9.324 (t=-0.84, p=0.400) 
  sin_day: -3.039 (t=-1.80, p=0.073) 
  cos_day: -1.290 (t=-0.76, p=0.447) 
  sin_week: -1.899 (t=-0.17, p=0.863) 
  cos_week: -5.319 (t=-0.48, p=0.629) 
  is_holiday_season: 20.735 (t=5.10, p=0.000) ***

booking-us-castle-kamaole-sands-USD (R²=0.214, F=13.9):
  Intercept: 338.811 (t=200.29, p=0.000)
  sin_month: 9.561 (t=0

## SECTION 8: COEFFICIENT ANALYSIS

In [7]:
# Collect one record per hotel (feature coefficients, intercept, hotel id) and
# build the summary frame in a single call instead of growing it with
# pd.concat inside the loop.
coef_records = []
for hotel_id, results in competitor_results.items():
    record = dict(zip(temporal_features.columns, results['coefficients']))
    record['intercept'] = results['intercept']
    record['hotel_id'] = hotel_id
    coef_records.append(record)

# Explicit columns keep the layout stable even when no hotel was modelled.
coef_summary = pd.DataFrame(
    coef_records,
    columns=[*temporal_features.columns, 'intercept', 'hotel_id'],
)

# Cross-hotel mean and standard deviation of each temporal coefficient.
print("Linear Model Coefficients Analysis")
for feature in temporal_features.columns:
    mean_coef = coef_summary[feature].mean()
    std_coef = coef_summary[feature].std()
    print(f"{feature}: Mean={mean_coef:.3f}, Std={std_coef:.3f}")

# If preprocessing recorded a correlation, print it next to the average Stage 1 R².
if 'final_correlation' in metadata and metadata['final_correlation'] is not None:
    baseline_corr = metadata['final_correlation']
    avg_r2 = np.mean([results['r2_score'] for results in competitor_results.values()])
    print(f"Preprocessing correlation baseline: {baseline_corr:.3f}")
    print(f"Stage 1 linear average R²: {avg_r2:.3f}")

Linear Model Coefficients Analysis
sin_month: Mean=-15.005, Std=33.683
cos_month: Mean=-5.956, Std=19.096
sin_day: Mean=-1.887, Std=2.830
cos_day: Mean=-1.949, Std=2.666
sin_week: Mean=21.985, Std=31.601
cos_week: Mean=-6.552, Std=14.185
is_holiday_season: Mean=15.238, Std=27.150
Preprocessing correlation baseline: 0.362
Stage 1 linear average R²: 0.330


## SECTION 9: EXPORT RESULTS

In [8]:
# Write the Stage 1 artefacts (four CSV files plus a JSON diagnostics file)
# to ../data/stage1_linear_results, creating the directory if needed.
os.makedirs('../data/stage1_linear_results', exist_ok=True)

csv_exports = (
    (competitor_residuals, '../data/stage1_linear_results/competitor_residuals_linear.csv'),
    (comp_final, '../data/stage1_linear_results/competitor_prices_actual.csv'),
    (temporal_features, '../data/stage1_linear_results/temporal_features.csv'),
    (focal_final[['base_rate']], '../data/stage1_linear_results/focal_base_rates.csv'),
)
for frame, csv_path in csv_exports:
    frame.to_csv(csv_path)

# Values are cast to built-in int/float before being handed to json.dump.
model_summaries = list(competitor_results.values())
r2_values = [summary['r2_score'] for summary in model_summaries]

stage1_metadata = {
    'model_type': 'LinearRegression',
    'competitors_modeled': list(competitor_results.keys()),
    'uniform_sample_size': int(sample_size),
    'observations_per_hotel': [int(summary['n_observations']) for summary in model_summaries],
    'r2_range': [float(min(r2_values)), float(max(r2_values))],
    'f_statistics': {hotel: float(value) for hotel, value in f_statistics.items()},
    'strong_instruments': strong_instruments,
    'residuals_shape': list(competitor_residuals.shape),
    'temporal_features': list(temporal_features.columns),
    'stage1_approach': 'linear_complete_data',
    'data_retention': {
        'preprocessing_complete': metadata.get('data_completeness', False),
        'final_sample_size': int(sample_size),
        'missing_values_stage1': int(missing_comp + missing_focal)
    }
}

with open('../data/stage1_linear_results/stage1_linear_metadata.json', 'w') as metadata_out:
    json.dump(stage1_metadata, metadata_out, indent=2)

# Summary of what was written.
for summary_line in (
    "Stage 1 Linear Results Exported:",
    "- competitor_residuals_linear.csv (instruments for Stage 2)",
    "- competitor_prices_actual.csv (endogenous variables)",
    "- temporal_features.csv (exogenous controls)",
    "- focal_base_rates.csv (dependent variable for Stage 2)",
    "- stage1_linear_metadata.json (model diagnostics)",
):
    print(summary_line)
print(f"Complete dataset retained: {sample_size} observations per model")

Stage 1 Linear Results Exported:
- competitor_residuals_linear.csv (instruments for Stage 2)
- competitor_prices_actual.csv (endogenous variables)
- temporal_features.csv (exogenous controls)
- focal_base_rates.csv (dependent variable for Stage 2)
- stage1_linear_metadata.json (model diagnostics)
Complete dataset retained: 365 observations per model
