In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
from scipy import stats
import os
import json
import warnings
warnings.filterwarnings('ignore')

## SECTION 1: DATA LOADING

In [2]:
# Load the Stage 1 artefacts. Every CSV is read with its first column as the
# index; the indexes are converted to datetimes after all four reads succeed.
competitor_residuals = pd.read_csv(
    '../data/stage1_linear_results/competitor_residuals_linear.csv', index_col=0
)
competitor_prices = pd.read_csv(
    '../data/stage1_linear_results/competitor_prices_actual.csv', index_col=0
)
temporal_features = pd.read_csv(
    '../data/stage1_linear_results/temporal_features.csv', index_col=0
)
focal_base_rates = pd.read_csv(
    '../data/stage1_linear_results/focal_base_rates.csv', index_col=0
)

# Report label -> frame, in the order the shapes are printed below.
stage1_frames = {
    "Competitor residuals": competitor_residuals,
    "Competitor prices": competitor_prices,
    "Temporal features": temporal_features,
    "Focal base rates": focal_base_rates,
}
for frame in stage1_frames.values():
    frame.index = pd.to_datetime(frame.index)

with open('../data/stage1_linear_results/stage1_linear_metadata.json', 'r') as metadata_file:
    stage1_metadata = json.load(metadata_file)

print("Loaded Stage 1 results:")
for label, frame in stage1_frames.items():
    print(f"  {label}: {frame.shape}")

# The temporal feature columns must be exactly this set (order is not checked).
expected_features = ['sin_month', 'cos_month', 'sin_day', 'cos_day', 'sin_week', 'cos_week', 'is_holiday_season']
current_features = list(temporal_features.columns)

features_match = set(current_features) == set(expected_features)
if not features_match:
    for message in (
        "ERROR: Temporal features mismatch",
        f"Current: {current_features}",
        f"Expected: {expected_features}",
    ):
        print(message)
    raise ValueError("Stage 1 temporal features need updating")

Loaded Stage 1 results:
  Competitor residuals: (365, 5)
  Competitor prices: (365, 5)
  Temporal features: (365, 7)
  Focal base rates: (365, 1)


## SECTION 2: DATA ALIGNMENT AND CLEANING

In [3]:
# Restrict all four Stage 1 frames to the dates they have in common, then
# refuse to continue if any cell in the aligned frames is missing.
stage1_sources = (competitor_residuals, competitor_prices, temporal_features, focal_base_rates)

common_dates = stage1_sources[0].index
for source in stage1_sources[1:]:
    common_dates = common_dates.intersection(source.index)

residuals_final, prices_final, features_final, focal_final = (
    source.loc[common_dates] for source in stage1_sources
)

# Total count of null cells across the aligned frames.
total_missing = sum(
    aligned.isnull().sum().sum()
    for aligned in (residuals_final, prices_final, features_final, focal_final)
)

has_missing = total_missing > 0
if has_missing:
    print(f"ERROR: {total_missing} missing values detected")
    raise ValueError("Complete data expected")

print(f"Final sample size: {len(common_dates)} observations")

Final sample size: 365 observations


## SECTION 3: 2SRI STAGE 2 MODEL CONSTRUCTION

In [4]:
# Assemble the Stage 2 regressors column-wise in the fixed order
# [competitor prices | Stage 1 residuals | temporal features]; later cells
# slice coefficients by these block widths.
X_competitors = prices_final.to_numpy()
X_residuals = residuals_final.to_numpy()
X_temporal = features_final.to_numpy()
y = focal_final.to_numpy().flatten()

regressor_blocks = (X_competitors, X_residuals, X_temporal)
X_stage2 = np.concatenate(regressor_blocks, axis=1)

# Width of each block.
n_competitors, n_residuals, n_temporal = (block.shape[1] for block in regressor_blocks)

specification = (
    ("Sample size", len(y)),
    ("Competitor prices", n_competitors),
    ("Residual instruments", n_residuals),
    ("Temporal features", n_temporal),
    ("Total features", X_stage2.shape[1]),
)
print("Model specification:")
for label, value in specification:
    print(f"  {label}: {value}")

Model specification:
  Sample size: 365
  Competitor prices: 5
  Residual instruments: 5
  Temporal features: 7
  Total features: 17


## SECTION 4: 2SRI MODEL ESTIMATION

In [5]:
# Fit the Stage 2 regression on the stacked regressors and report in-sample fit.
stage2_model = LinearRegression().fit(X_stage2, y)

y_predicted = stage2_model.predict(X_stage2)
r2_stage2 = r2_score(y, y_predicted)
rmse_stage2 = np.sqrt(mean_squared_error(y, y_predicted))

# Split the coefficient vector at the block boundaries used to build X_stage2:
# competitor prices first, then residuals, then temporal features.
intercept = stage2_model.intercept_
competitor_coefficients, residual_coefficients, temporal_coefficients = np.split(
    stage2_model.coef_, [n_competitors, n_competitors + n_residuals]
)

print("Model Performance:")
print(f"  R²: {r2_stage2:.3f}")
print(f"  RMSE: ${rmse_stage2:.2f}")
print(f"  Intercept: {intercept:.3f}")

# Short labels are the first four characters of the second-to-last
# '-'-separated token of each column name, upper-cased.
print("Competitor Coefficients (β):")
for competitor, coeff in zip(prices_final.columns, competitor_coefficients):
    name = competitor.split('-')[-2][:4].upper()
    print(f"  β_{name}: {coeff:.3f}")

print("Residual Coefficients (θ - Endogeneity Tests):")
for competitor, coeff in zip(residuals_final.columns, residual_coefficients):
    name = competitor.split('-')[-2][:4].upper()
    print(f"  θ_{name}: {coeff:.3f}")

print("Temporal Coefficients (γ):")
for feature, coeff in zip(features_final.columns, temporal_coefficients):
    print(f"  γ_{feature}: {coeff:.3f}")

Model Performance:
  R²: 0.512
  RMSE: $23.91
  Intercept: -72.289
Competitor Coefficients (β):
  β_MONA: 0.563
  β_SAND: -0.862
  β_AIRP: 0.105
  β_MAUI: 1.758
  β_MALI: -0.241
Residual Coefficients (θ - Endogeneity Tests):
  θ_MONA: -0.597
  θ_SAND: 1.144
  θ_AIRP: 0.202
  θ_MAUI: -1.659
  θ_MALI: 0.330
Temporal Coefficients (γ):
  γ_sin_month: 0.490
  γ_cos_month: 1.770
  γ_sin_day: 8.696
  γ_cos_day: 4.175
  γ_sin_week: 0.851
  γ_cos_week: 2.578
  γ_is_holiday_season: 2.691


## SECTION 5: STATISTICAL INFERENCE

In [6]:
# Classical OLS inference for the Stage 2 fit.
#
# Coefficients are ordered [intercept, competitor prices, Stage 1 residuals,
# temporal features], so t_stats / p_values use index 0 for the intercept and
# index i + 1 for the i-th column of X_stage2.
#
# Sets: n, k, residuals_2sri, t_stats, p_values, statistical_inference_successful
# (t_stats and p_values are None when inference fails).
n = len(y)
k = X_stage2.shape[1]
residual_dof = n - k - 1
residuals_2sri = y - y_predicted

X_stage2_with_intercept = np.column_stack([np.ones(n), X_stage2])

t_stats = None
p_values = None
statistical_inference_successful = False

try:
    if residual_dof <= 0:
        raise ValueError(
            f"need more observations than parameters (n = {n}, parameters = {k + 1})"
        )

    # The pseudo-inverse below never raises on a singular matrix, so exact
    # collinearity among the regressors has to be detected explicitly.
    design_rank = int(np.linalg.matrix_rank(X_stage2_with_intercept))
    if design_rank < k + 1:
        print(f"WARNING: Stage 2 design matrix is rank-deficient (rank {design_rank}, {k + 1} columns).")
        print("  Coefficients are not uniquely identified; the t-statistics and p-values below are unreliable.")

    residual_variance = np.sum(residuals_2sri**2) / residual_dof
    XTX_inv = np.linalg.pinv(X_stage2_with_intercept.T @ X_stage2_with_intercept)
    cov_matrix = residual_variance * XTX_inv
    standard_errors = np.sqrt(np.diag(cov_matrix))

    coefficients_with_intercept = np.concatenate([[intercept], stage2_model.coef_])
    t_stats = coefficients_with_intercept / standard_errors
    # Survival function instead of 1 - cdf: avoids cancellation, so very small
    # p-values are not rounded to exactly 0.
    p_values = 2 * stats.t.sf(np.abs(t_stats), residual_dof)

    statistical_inference_successful = True

except (np.linalg.LinAlgError, ValueError) as e:
    # Only numerical failures are treated as "inference unavailable"; any
    # other error is a bug and is allowed to surface.
    print(f"Statistical inference failed: {str(e)}")
    statistical_inference_successful = False
    t_stats = None
    p_values = None

if statistical_inference_successful:
    print("Statistical Significance:")
    print(f"  Intercept: t = {t_stats[0]:.2f}, p = {p_values[0]:.3f}")

    # (symbol, column labels, index of the block's first coefficient)
    reported_blocks = (
        ("β", prices_final.columns, 1),
        ("θ", residuals_final.columns, n_competitors + 1),
    )
    for symbol, columns, first_index in reported_blocks:
        for i, competitor in enumerate(columns):
            t_stat = t_stats[first_index + i]
            p_val = p_values[first_index + i]
            name = competitor.split('-')[-2][:4].upper()
            significance = "***" if p_val < 0.001 else "**" if p_val < 0.01 else "*" if p_val < 0.05 else ""
            print(f"  {symbol}_{name}: t = {t_stat:.2f}, p = {p_val:.3f} {significance}")

Statistical Significance:
  Intercept: t = -236.71, p = 0.000
  β_MONA: t = 1.88, p = 0.061 
  β_SAND: t = -1.71, p = 0.089 
  β_AIRP: t = 0.76, p = 0.449 
  β_MAUI: t = 3.05, p = 0.002 **
  β_MALI: t = -1.25, p = 0.212 
  θ_MONA: t = -1.94, p = 0.053 
  θ_SAND: t = 2.25, p = 0.025 *
  θ_AIRP: t = 1.41, p = 0.161 
  θ_MAUI: t = -2.86, p = 0.004 **
  θ_MALI: t = 1.64, p = 0.102 


## SECTION 6: ENDOGENEITY TESTING

In [7]:
# Count the Stage 1 residual coefficients whose p-value is below 0.05 and
# print a per-competitor verdict. Nothing is printed when inference failed.
significant_residuals = 0
if statistical_inference_successful:
    print("Endogeneity Test Results:")
    first_residual_idx = n_competitors + 1  # position 0 is the intercept
    residual_terms = zip(residuals_final.columns, residual_coefficients)
    for idx, (competitor, coeff) in enumerate(residual_terms, start=first_residual_idx):
        p_val, t_stat = p_values[idx], t_stats[idx]
        name = competitor.split('-')[-2][:4].upper()
        is_endogenous = p_val < 0.05
        verdict = "ENDOGENOUS" if is_endogenous else "exogenous"
        significant_residuals += int(is_endogenous)
        print(f"  θ_{name}: {coeff:.3f} (t={t_stat:.2f}, p={p_val:.3f}) {verdict}")

    print("Endogeneity Summary:")
    print(f"  Significant θ coefficients: {significant_residuals}/{n_residuals}")
    conclusion = (
        "  Endogeneity detected - 2SRI correction warranted"
        if significant_residuals > 0
        else "  No endogeneity detected"
    )
    print(conclusion)

Endogeneity Test Results:
  θ_MONA: -0.597 (t=-1.94, p=0.053) exogenous
  θ_SAND: 1.144 (t=2.25, p=0.025) ENDOGENOUS
  θ_AIRP: 0.202 (t=1.41, p=0.161) exogenous
  θ_MAUI: -1.659 (t=-2.86, p=0.004) ENDOGENOUS
  θ_MALI: 0.330 (t=1.64, p=0.102) exogenous
Endogeneity Summary:
  Significant θ coefficients: 2/5
  Endogeneity detected - 2SRI correction warranted


## SECTION 7: COMPARISON WITH OLS BASELINE

In [8]:
# Baseline: the same regression without the Stage 1 residual columns.
X_ols = np.concatenate((X_competitors, X_temporal), axis=1)
ols_model = LinearRegression().fit(X_ols, y)
y_ols = ols_model.predict(X_ols)
r2_ols = r2_score(y, y_ols)
rmse_ols = np.sqrt(mean_squared_error(y, y_ols))

r2_gap = r2_stage2 - r2_ols
print("Model Performance Comparison:")
print(f"  2SRI: R² = {r2_stage2:.3f}, RMSE = ${rmse_stage2:.2f}")
print(f"  OLS:  R² = {r2_ols:.3f}, RMSE = ${rmse_ols:.2f}")
print(f"  R² Difference: {r2_gap:.6f}")

Model Performance Comparison:
  2SRI: R² = 0.512, RMSE = $23.91
  OLS:  R² = 0.512, RMSE = $23.91
  R² Difference: 0.000000


## SECTION 8: ECONOMIC INTERPRETATION

In [9]:
# Map the 4-letter codes derived from column names to full hotel names.
hotel_names = {
    'MONA': 'Aqua Pacific Monarch',
    'SAND': 'Castle Kamaole Sands', 
    'AIRP': 'Courtyard Marriott Airport',
    'MAUI': 'Kohea Kai Resort Maui',
    'MALI': 'Ohana Waikiki Malia'
}

# (p-value cutoff, marker) pairs, checked in order; the first match wins and
# no match (including a NaN p-value) yields an empty marker.
significance_cutoffs = ((0.001, " ***"), (0.01, " **"), (0.05, " *"))

# Position of each coefficient block inside t_stats / p_values (0 = intercept).
first_residual_idx = n_competitors + 1
first_temporal_idx = n_competitors + n_residuals + 1

total_competitor_effect = competitor_coefficients.sum()
print("Competitive Response Analysis:")
print(f"  Total competitive effect: {total_competitor_effect:.3f}")

if total_competitor_effect > 0.1:
    strategy = "COMPLEMENTARY"
elif total_competitor_effect < -0.1:
    strategy = "COMPETITIVE"
else:
    strategy = "NEUTRAL"
print(f"  Strategy: {strategy} pricing")

# Competitor with the largest coefficient in absolute value.
strongest_idx = np.abs(competitor_coefficients).argmax()
strongest_short = prices_final.columns[strongest_idx].split('-')[-2][:4].upper()
strongest_name = hotel_names.get(strongest_short, strongest_short)
strongest_effect = competitor_coefficients[strongest_idx]
print(f"  Dominant competitor: {strongest_name} (β = {strongest_effect:.3f})")

print("DETAILED COMPETITIVE ANALYSIS:")
print("Direct Price Elasticities (β coefficients):")
for i, (competitor, coeff) in enumerate(zip(prices_final.columns, competitor_coefficients)):
    short_name = competitor.split('-')[-2][:4].upper()
    full_name = hotel_names.get(short_name, short_name)

    significance = ""
    if statistical_inference_successful:
        p_val = p_values[i + 1]
        significance = next((mark for cutoff, mark in significance_cutoffs if p_val < cutoff), "")

    interpretation = "Complementary" if coeff > 0 else "Competitive"
    if abs(coeff) > 0.5:
        magnitude = "Strong"
    elif abs(coeff) > 0.1:
        magnitude = "Moderate"
    else:
        magnitude = "Weak"

    print(f"  {full_name}: ${coeff:.3f} - {magnitude} {interpretation}{significance}")

if statistical_inference_successful:
    print("Endogeneity Corrections (θ coefficients):")
    for i, competitor in enumerate(residuals_final.columns):
        p_val = p_values[first_residual_idx + i]
        # Only residual terms with p < 0.05 are listed.
        if not p_val < 0.05:
            continue
        coeff = residual_coefficients[i]
        short_name = competitor.split('-')[-2][:4].upper()
        full_name = hotel_names.get(short_name, short_name)
        print(f"  {full_name}: Simultaneity bias corrected (θ = {coeff:.3f})")

print("Temporal Effects:")
for i, feature in enumerate(features_final.columns):
    coeff = temporal_coefficients[i]
    significance = ""
    is_significant = False
    if statistical_inference_successful:
        p_val = p_values[first_temporal_idx + i]
        significance = next((mark for cutoff, mark in significance_cutoffs if p_val < cutoff), "")
        is_significant = p_val < 0.05

    # A feature is shown when its coefficient exceeds 0.5 in absolute value
    # or when it is significant at the 5% level.
    if abs(coeff) > 0.5 or is_significant:
        effect_desc = "Positive" if coeff > 0 else "Negative"
        print(f"  {feature}: {effect_desc} (${coeff:.2f}){significance}")

Competitive Response Analysis:
  Total competitive effect: 1.323
  Strategy: COMPLEMENTARY pricing
  Dominant competitor: Kohea Kai Resort Maui (β = 1.758)
DETAILED COMPETITIVE ANALYSIS:
Direct Price Elasticities (β coefficients):
  Aqua Pacific Monarch: $0.563 - Strong Complementary
  Castle Kamaole Sands: $-0.862 - Strong Competitive
  Courtyard Marriott Airport: $0.105 - Moderate Complementary
  Kohea Kai Resort Maui: $1.758 - Strong Complementary **
  Ohana Waikiki Malia: $-0.241 - Moderate Competitive
Endogeneity Corrections (θ coefficients):
  Castle Kamaole Sands: Simultaneity bias corrected (θ = 1.144)
  Kohea Kai Resort Maui: Simultaneity bias corrected (θ = -1.659)
Temporal Effects:
  cos_month: Positive ($1.77)
  sin_day: Positive ($8.70) **
  cos_day: Positive ($4.18)
  sin_week: Positive ($0.85)
  cos_week: Positive ($2.58)
  is_holiday_season: Positive ($2.69)


## SECTION 9: MODEL DIAGNOSTICS

In [10]:
# Summary diagnostics on the Stage 2 residuals: mean, standard deviation, and
# the Durbin-Watson ratio (sum of squared first differences over the sum of
# squared residuals).
residuals_mean = residuals_2sri.mean()
residuals_std = residuals_2sri.std()
squared_first_differences = np.square(np.diff(residuals_2sri))
durbin_watson = squared_first_differences.sum() / np.square(residuals_2sri).sum()

print("Residual Diagnostics:")
print(f"  Mean: {residuals_mean:.6f}")
print(f"  Std Dev: {residuals_std:.3f}")
print(f"  Durbin-Watson: {durbin_watson:.3f}")

mean_message = "  Residual mean acceptable (≈ 0)"
if abs(residuals_mean) > 0.01:
    mean_message = "  WARNING: Non-zero residual mean suggests model misspecification"
print(mean_message)

# Values outside [1.5, 2.5] are flagged as possible serial correlation.
dw_message = "  Durbin-Watson statistic acceptable (no strong serial correlation)"
if durbin_watson < 1.5 or durbin_watson > 2.5:
    dw_message = "  WARNING: Potential serial correlation in residuals"
print(dw_message)


Residual Diagnostics:
  Mean: 0.000000
  Std Dev: 23.906
  Durbin-Watson: 1.106
  Residual mean acceptable (≈ 0)


## SECTION 10: RESULTS EXPORT

In [11]:
# Persist the per-date predictions and a JSON summary of the fitted model,
# then print the estimated equation term by term.
output_dir = '../data/stage2_2sri_results'
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): 288 looks like the sample size of an earlier run that this
# notebook is compared against — confirm where the number comes from.
baseline_sample_size = 288
sample_improvement = int(n) - baseline_sample_size


def _hotel_label(column):
    """Return the full hotel name for a column label, or its 4-letter code if unmapped.

    The fallback avoids None dictionary keys (serialized by json as "null"),
    which would also make every unmapped hotel overwrite the same entry.
    """
    short_name = column.split('-')[-2][:4].upper()
    return hotel_names.get(short_name, short_name)


def _significance_suffix(p_val):
    """Return ' ***', ' **', ' *' or '' for p < 0.001, < 0.01, < 0.05, otherwise."""
    if p_val < 0.001:
        return " ***"
    if p_val < 0.01:
        return " **"
    if p_val < 0.05:
        return " *"
    return ""


def _print_equation_terms(labels, coefficients, first_p_index):
    """Print one '+/- coefficient × label' line per term.

    first_p_index is the position in p_values of the first coefficient of this
    block (position 0 is the intercept). Stars are added only when inference
    succeeded.
    """
    for position, (label, coeff) in enumerate(zip(labels, coefficients)):
        sign = "+" if coeff >= 0 else "-"
        significance = ""
        if statistical_inference_successful:
            significance = _significance_suffix(p_values[first_p_index + position])
        # The sign is printed separately so negative terms line up with positive ones.
        print(f"          {sign} {abs(coeff):.3f} × {label}{significance}")


competitor_labels = [_hotel_label(column) for column in prices_final.columns]
residual_labels = [f"{_hotel_label(column)}_residual" for column in residuals_final.columns]

# Position of each coefficient block inside t_stats / p_values (0 = intercept).
first_competitor_idx = 1
first_residual_idx = n_competitors + 1
first_temporal_idx = n_competitors + n_residuals + 1

results_df = pd.DataFrame({
    'date': focal_final.index,
    'actual': y,
    'predicted_2sri': y_predicted,
    'predicted_ols': y_ols,
    'residuals_2sri': residuals_2sri
})
results_df.to_csv(f'{output_dir}/predictions.csv', index=False)

# Values are cast to built-in types so that json can serialize them.
model_results = {
    'model_type': '2SRI_corrected_temporal',
    'sample_size': int(n),
    'temporal_features_used': list(features_final.columns),
    'professor_feedback_implemented': True,
    'intercept': float(intercept),
    'competitor_coefficients': {
        label: float(coeff) for label, coeff in zip(competitor_labels, competitor_coefficients)
    },
    'residual_coefficients': {
        label: float(coeff) for label, coeff in zip(residual_labels, residual_coefficients)
    },
    'temporal_coefficients': {
        feature: float(coeff) for feature, coeff in
        zip(features_final.columns, temporal_coefficients)
    },
    'model_performance': {
        'r2_2sri': float(r2_stage2),
        'r2_ols': float(r2_ols),
        'rmse_2sri': float(rmse_stage2),
        'rmse_ols': float(rmse_ols),
        'total_competitor_effect': float(total_competitor_effect),
        'strongest_competitor': strongest_name,
        'significant_residuals': int(significant_residuals) if statistical_inference_successful else None,
        'endogeneity_detected': bool(significant_residuals > 0) if statistical_inference_successful else None,
        'durbin_watson': float(durbin_watson)
    },
    'data_quality': {
        'complete_data_pipeline': bool(stage1_metadata.get('data_retention', {}).get('preprocessing_complete', False)),
        'sample_size_improvement': sample_improvement,
        'zero_missing_values': bool(total_missing == 0)
    }
}

if statistical_inference_successful:
    def _inference_section(values):
        """Lay out one statistic (t or p) by intercept, competitors and residuals."""
        return {
            'intercept': float(values[0]),
            'competitors': {
                label: float(values[first_competitor_idx + i])
                for i, label in enumerate(competitor_labels)
            },
            'residuals': {
                label: float(values[first_residual_idx + i])
                for i, label in enumerate(residual_labels)
            }
        }

    model_results['statistical_inference'] = {
        't_statistics': _inference_section(t_stats),
        'p_values': _inference_section(p_values)
    }

with open(f'{output_dir}/model_results.json', 'w') as f:
    json.dump(model_results, f, indent=2)

print("COMPLETE 2SRI EQUATION:")
print(f"P_focal = {intercept:.3f}")

_print_equation_terms(
    [f"{label}_Price" for label in competitor_labels],
    competitor_coefficients,
    first_competitor_idx,
)
_print_equation_terms(
    [f"{_hotel_label(column)}_Residual" for column in residuals_final.columns],
    residual_coefficients,
    first_residual_idx,
)
_print_equation_terms(features_final.columns, temporal_coefficients, first_temporal_idx)

print("         + u")

print(f"Sample Size: {n} observations")
print(f"R² = {r2_stage2:.3f}, RMSE = ${rmse_stage2:.2f}")
if statistical_inference_successful and significant_residuals > 0:
    print(f"Endogeneity detected and corrected in {significant_residuals}/{n_residuals} competitors")

if sample_improvement > 0:
    print(f"Sample size improvement: +{sample_improvement} observations")

print("Files exported: predictions.csv, model_results.json")

COMPLETE 2SRI EQUATION:
P_focal = -72.289
          + 0.563 × Aqua Pacific Monarch_Price
          -0.862 × Castle Kamaole Sands_Price
          + 0.105 × Courtyard Marriott Airport_Price
          + 1.758 × Kohea Kai Resort Maui_Price **
          -0.241 × Ohana Waikiki Malia_Price
          -0.597 × Aqua Pacific Monarch_Residual
          + 1.144 × Castle Kamaole Sands_Residual *
          + 0.202 × Courtyard Marriott Airport_Residual
          -1.659 × Kohea Kai Resort Maui_Residual **
          + 0.330 × Ohana Waikiki Malia_Residual
          + 0.490 × sin_month
          + 1.770 × cos_month
          + 8.696 × sin_day **
          + 4.175 × cos_day
          + 0.851 × sin_week
          + 2.578 × cos_week
          + 2.691 × is_holiday_season
         + u
Sample Size: 365 observations
R² = 0.512, RMSE = $23.91
Endogeneity detected and corrected in 2/5 competitors
Sample size improvement: +77 observations
Files exported: predictions.csv, model_results.json
