# In-Sample Linear Regression - Matrix Completion Data

**CSE 847: Machine Learning - Fall 2025**

This notebook performs in-sample Linear Regression modeling across the 34-hotel portfolio.

## Methodology
- Same preprocessing as Non-Parametric models (polynomials, sin/cos, seasonal)
- Features: ALL competitor lags 1-5, after correlation filtering
- Feature selection: LassoCV with cross-validation
- Final model: OLS with HC1 robust standard errors
- Metrics: Adjusted R², RMSE, MAPE

## Output
- `linear_regression_summary.csv` - Table 4 in the report
- `all_linear_models.json` - full results for every modeled hotel in a single file
- Individual `[hotel_id]_linear_model.json` files per hotel

## Expected Results (Table 4)
- 34 hotels successfully modeled
- Mean Adjusted R² = 0.603
- Mean MAPE = 12.05%

In [None]:
"""
IN-SAMPLE LINEAR REGRESSION - MATRIX COMPLETION DATA

This script performs in-sample Linear Regression modeling across the hotel portfolio.
Methodology:
- Same preprocessing as Non-Parametric models (polynomials, sin/cos, seasonal)
- Features: ALL competitor lags 1-5, after correlation filtering
- Feature selection: LassoCV with cross-validation
- Final model: OLS with HC1 robust standard errors
- Metrics: Adjusted R², RMSE, MAPE

Output: linear_regression_summary.csv matching Table 4 in the report
"""

import pandas as pd
import numpy as np
from pathlib import Path
import json
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error, r2_score
import statsmodels.api as sm
import warnings
warnings.filterwarnings('ignore')

# =============================================================================
# CONFIGURATION
# =============================================================================
# Inputs (per-hotel lagged datasets) and outputs (JSON/CSV results) share the
# same processed-data directory.
data_path = Path('../data/full-data/processed')
output_path = Path('../data/full-data/processed')

# The 'masked_id' column of the mapping file defines the hotels to iterate over.
mapping_df = pd.read_csv('../data/full-data/hotel_mapping.csv')
hotel_list = mapping_df['masked_id'].tolist()

# EXCLUDE HOTELS WITH DATA QUALITY ISSUES
# These hotels have zero-price data or insufficient observations
EXCLUDED_HOTELS = ['Hotel_13', 'Hotel_15', 'Hotel_32', 'Hotel_34']

# Run banner: method, exclusions, and how many hotels remain after exclusions.
_banner_rule = "=" * 80
_n_to_process = sum(1 for _hotel in hotel_list if _hotel not in EXCLUDED_HOTELS)

print(_banner_rule)
print("IN-SAMPLE LINEAR REGRESSION MODELING")
print(_banner_rule)
print("Method: LassoCV feature selection + OLS with HC1 robust standard errors")
print(f"Excluded hotels: {EXCLUDED_HOTELS}")
print(f"Total hotels to process: {_n_to_process}")

# =============================================================================
# FEATURE ENGINEERING FUNCTIONS
# =============================================================================
def prepare_features(df, hotel_id):
    """
    Add calendar and seasonal features to a copy of ``df``.

    The input frame is left untouched. ``df`` must contain a datetime-typed
    ``date`` column (the ``.dt`` accessor is used on it). ``hotel_id`` is
    accepted but not used inside this function.

    What the returned frame contains, relative to the input:
    - every column whose lower-cased name contains 'base_rate' is removed,
      except the column named exactly 'base_rate'
    - week features: week_of_year (ISO week), week_centered, week_squared,
      sin_week, cos_week
    - is_holiday, only when the input has no such column already
    - is_peak_season, is_summer, is_weekend
    - day_of_week, sin_dow, cos_dow
    - month, sin_month, cos_month
    """
    out = df.copy()

    # Anything derived from base_rate (other than the target itself) would
    # leak the target into the features, so it is removed up front.
    leaky_cols = [name for name in out.columns
                  if 'base_rate' in name.lower() and name != 'base_rate']
    if leaky_cols:
        out = out.drop(columns=leaky_cols, errors='ignore')
        print(f"  Excluded {len(leaky_cols)} base_rate-related columns")

    # Date parts reused by several features below.
    when = out['date']
    month = when.dt.month
    day_of_month = when.dt.day
    weekday = when.dt.dayofweek
    two_pi = 2 * np.pi

    # Week of year: centered linear and quadratic terms plus a 52-period cycle.
    out['week_of_year'] = when.dt.isocalendar().week.astype(float)
    out['week_centered'] = out['week_of_year'] - out['week_of_year'].mean()
    out['week_squared'] = out['week_centered'] ** 2
    out['sin_week'] = np.sin(two_pi * out['week_of_year'] / 52).astype(float)
    out['cos_week'] = np.cos(two_pi * out['week_of_year'] / 52).astype(float)

    # Holiday flag, built only if the dataset does not already provide one:
    # all of December and January, July 1-7, and November 22 onward.
    if 'is_holiday' not in out.columns:
        holiday_rows = (
            month.isin([12, 1])
            | ((month == 7) & (day_of_month <= 7))
            | ((month == 11) & (day_of_month >= 22))
        )
        out['is_holiday'] = 0
        out.loc[holiday_rows, 'is_holiday'] = 1

    # Season and weekend flags (Saturday = 5, Sunday = 6).
    summer_rows = month.isin([6, 7, 8])
    out['is_peak_season'] = (summer_rows | month.isin([12, 1])).astype(int)
    out['is_summer'] = summer_rows.astype(int)
    out['is_weekend'] = weekday.isin([5, 6]).astype(int)

    # Day of week: raw value plus a 7-period cycle.
    out['day_of_week'] = weekday
    out['sin_dow'] = np.sin(two_pi * out['day_of_week'] / 7)
    out['cos_dow'] = np.cos(two_pi * out['day_of_week'] / 7)

    # Month: raw value plus a 12-period cycle.
    out['month'] = month
    out['sin_month'] = np.sin(two_pi * out['month'] / 12)
    out['cos_month'] = np.cos(two_pi * out['month'] / 12)

    return out


def train_linear_model(X, y, hotel_id):
    """
    Train Linear Regression model using LassoCV for feature selection + OLS

    Steps:
    1. Fill missing feature values with 0 and standardize the features
    2. LassoCV (5-fold, 50 log-spaced alphas from 1e-4 to 10) for feature
       selection; if every coefficient is shrunk to zero, fall back to the
       5 features with the largest absolute correlation with y
    3. OLS with an intercept on the selected (standardized) features, using
       HC1 robust standard errors
    4. Calculate in-sample R², adjusted R², RMSE, MAPE

    Args:
        X: DataFrame of candidate features, one row per observation.
        y: Target values. When y is a pandas Series it is expected to share
            X's index (as it does when both come from the same frame).
        hotel_id: Identifier copied verbatim into the returned dict.

    Returns:
        dict. On success it holds 'hotel_id', 'adj_r2', 'r2', 'rmse',
        'mape' (in percent), 'n_features', 'n_observations', 'lasso_alpha',
        'selected_features', 'coefficients' and 'pvalues' (both keyed by
        feature name; coefficients are on the standardized scale),
        'equation' (intercept plus at most nine terms) and
        'status' == 'Success'.
        If the OLS stage raises, the dict holds only 'hotel_id', zeroed
        'adj_r2'/'r2'/'rmse'/'mape'/'n_features'/'n_observations' and
        'status' == 'Error: <message>'.

    Raises:
        Anything raised while filling, scaling or running LassoCV is not
        caught here and propagates to the caller.
    """
    # Handle missing values
    X = X.fillna(0)

    # Standardize features. The scaled frame keeps X's own index: statsmodels
    # refuses to fit when a pandas endog and exog carry different indices, so
    # a default RangeIndex here would break every caller whose frame was
    # filtered or otherwise not re-indexed from 0.
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns, index=X.index)

    # LassoCV for feature selection
    alphas = np.logspace(-4, 1, 50)
    lasso = LassoCV(alphas=alphas, cv=5, random_state=42, max_iter=10000)
    lasso.fit(X_scaled, y)

    # Get selected features (non-zero coefficients)
    selected_mask = np.abs(lasso.coef_) > 1e-10
    selected_features = X.columns[selected_mask].tolist()

    if len(selected_features) == 0:
        # Fallback so the OLS stage still has regressors to work with
        print("  WARNING: No features selected by LASSO, using top 5 by correlation")
        correlations = X.apply(lambda col: col.corr(y)).abs()
        selected_features = correlations.nlargest(5).index.tolist()

    print(f"  Selected {len(selected_features)} features via LassoCV (alpha={lasso.alpha_:.6f})")

    # OLS on selected features with robust standard errors
    X_selected = X_scaled_df[selected_features]
    X_ols = sm.add_constant(X_selected)

    try:
        model = sm.OLS(y, X_ols).fit(cov_type='HC1')

        # Predictions (in-sample: same rows the model was fitted on)
        y_pred = model.predict(X_ols)

        # Metrics
        r2 = r2_score(y, y_pred)
        n = len(y)
        p = len(selected_features)
        adj_r2 = 1 - (1 - r2) * (n - 1) / (n - p - 1)
        rmse = np.sqrt(mean_squared_error(y, y_pred))
        # Multiplied by 100 so the value is reported as a percentage
        mape = mean_absolute_percentage_error(y, y_pred) * 100

        # Build equation string: intercept first, then one signed term per
        # feature, starred by significance level
        equation_terms = [f"{model.params['const']:.3f}"]
        for feat in selected_features:
            coef = model.params[feat]
            pval = model.pvalues[feat]
            sig = "***" if pval < 0.001 else "**" if pval < 0.01 else "*" if pval < 0.05 else ""
            equation_terms.append(f"{coef:+.4f}×{feat}{sig}")

        results = {
            'hotel_id': hotel_id,
            'adj_r2': float(adj_r2),
            'r2': float(r2),
            'rmse': float(rmse),
            'mape': float(mape),
            'n_features': len(selected_features),
            'n_observations': int(n),
            'lasso_alpha': float(lasso.alpha_),
            'selected_features': selected_features,
            'coefficients': {feat: float(model.params[feat]) for feat in selected_features},
            'pvalues': {feat: float(model.pvalues[feat]) for feat in selected_features},
            'equation': ' '.join(equation_terms[:10]),  # First 10 terms
            'status': 'Success'
        }

    except Exception as e:
        # Best-effort: a failed fit is reported through 'status' with zeroed
        # metrics instead of aborting the caller's loop over hotels
        print(f"  ERROR in OLS fitting: {e}")
        results = {
            'hotel_id': hotel_id,
            'adj_r2': 0.0,
            'r2': 0.0,
            'rmse': 0.0,
            'mape': 0.0,
            'n_features': 0,
            'n_observations': 0,
            'status': f'Error: {str(e)}'
        }

    return results


# =============================================================================
# MAIN PROCESSING LOOP
# =============================================================================
all_results = {}       # hotel id -> full result dict returned by train_linear_model
summary_results = []   # one summary row per hotel in hotel_list, modeled or not

# Fields copied from a model's result dict into its summary row, in order.
_SUMMARY_FIELDS = ('adj_r2', 'rmse', 'mape', 'n_features', 'n_observations', 'status')


def _placeholder_row(hotel_id, status):
    """Return a zero-metric summary row for a hotel that yielded no model."""
    return {
        'hotel_id': hotel_id,
        'adj_r2': 0.0,
        'rmse': 0.0,
        'mape': 0.0,
        'n_features': 0,
        'n_observations': 0,
        'status': status,
    }


for hotel_masked_id in hotel_list:

    # Hotels on the exclusion list are recorded in the summary but never modeled
    if hotel_masked_id in EXCLUDED_HOTELS:
        print(f"\n{hotel_masked_id}: EXCLUDED (Data quality issues)")
        summary_results.append(
            _placeholder_row(hotel_masked_id, 'Excluded: Data quality issues'))
        continue

    _rule = '=' * 80
    print(f"\n{_rule}")
    print(f"Processing: {hotel_masked_id}")
    print(_rule)

    try:
        # Each hotel has its own lagged dataset (from matrix completion)
        lagged_file = data_path / f'{hotel_masked_id}_lagged_dataset.csv'
        if not lagged_file.exists():
            print("  Lagged dataset not found, skipping...")
            summary_results.append(
                _placeholder_row(hotel_masked_id, 'Skipped: No lagged dataset'))
            continue

        # Chronological order with a fresh 0..n-1 index
        df = pd.read_csv(lagged_file)
        df['date'] = pd.to_datetime(df['date'])
        df = df.sort_values('date').reset_index(drop=True)

        # Without the target column there is nothing to fit
        if 'base_rate' not in df.columns:
            print("  base_rate not found, skipping...")
            summary_results.append(
                _placeholder_row(hotel_masked_id, 'Skipped: No base_rate column'))
            continue

        # Target is taken before feature engineering touches the frame
        y = df['base_rate'].copy()
        df_prep = prepare_features(df.copy(), hotel_masked_id)

        # Candidate features: everything except the date and the target whose
        # dtype is exactly int64 or float64; columns of any other dtype are
        # left out by this test.
        feature_cols = [
            col for col in df_prep.columns
            if col not in ('date', 'base_rate')
            and df_prep[col].dtype in ['int64', 'float64']
        ]
        X = df_prep[feature_cols]

        print(f"  Sample size: {len(X)}")
        print(f"  Available features: {len(feature_cols)}")

        # Fit, and register the result before any reporting or file output
        results = train_linear_model(X, y, hotel_masked_id)
        all_results[hotel_masked_id] = results

        print("\n  Model Performance (IN-SAMPLE):")
        print(f"    Adj R²: {results['adj_r2']:.4f}")
        print(f"    RMSE:   ${results['rmse']:.2f}")
        print(f"    MAPE:   {results['mape']:.2f}%")
        print(f"    Features selected: {results['n_features']}")

        # Per-hotel JSON with the complete result dict
        with open(output_path / f'{hotel_masked_id}_linear_model.json', 'w') as fh:
            json.dump(results, fh, indent=2)

        # Summary row carries only the headline fields
        summary_results.append({
            'hotel_id': hotel_masked_id,
            **{field: results[field] for field in _SUMMARY_FIELDS},
        })

    except Exception as e:
        # One hotel failing must not stop the run; record the error and move on
        print(f"  ERROR: {e}")
        summary_results.append(_placeholder_row(hotel_masked_id, f'Error: {e!s}'))

# =============================================================================
# SAVE SUMMARY RESULTS
# =============================================================================
# One CSV row per hotel (modeled, skipped, excluded or failed)
summary_df = pd.DataFrame(summary_results)
summary_df.to_csv(output_path / 'linear_regression_summary.csv', index=False)

# Save all models to single JSON
with open(output_path / 'all_linear_models.json', 'w') as fh:
    json.dump(all_results, fh, indent=2)

_rule = '=' * 80
print(f"\n{_rule}")
print("SUMMARY")
print(_rule)

# "Successful" here means adj_r2 above 0.1; the status column is not consulted.
# NOTE(review): a model that fitted fine but scored at or below 0.1 is therefore
# counted under Excluded/Failed and left out of the portfolio means - confirm
# this threshold is intended.
success_df = summary_df.loc[summary_df['adj_r2'] > 0.1]
_n_success = len(success_df)

print(f"\nTotal hotels processed: {len(all_results)}")
print(f"Successful models: {_n_success}")
print(f"Excluded/Failed: {len(summary_df) - _n_success}")

if _n_success > 0:
    _adj_r2 = success_df['adj_r2']
    print("\nPortfolio Performance (Successful Hotels):")
    print(f"  Mean Adj R²: {_adj_r2.mean():.3f}")
    print(f"  Mean RMSE:   ${success_df['rmse'].mean():.2f}")
    print(f"  Mean MAPE:   {success_df['mape'].mean():.2f}%")
    print(f"  Median Adj R²: {_adj_r2.median():.3f}")

print(f"\nFiles saved to: {output_path}/")
for _saved in (
    "linear_regression_summary.csv",
    "all_linear_models.json",
    "[hotel_id]_linear_model.json (per hotel)",
):
    print(f"  - {_saved}")

print(f"\n{_rule}")
print("DETAILED RESULTS")
print(_rule)
_report_cols = ['hotel_id', 'adj_r2', 'rmse', 'mape', 'n_features', 'status']
print(summary_df[_report_cols].to_string())
