In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
import statsmodels.api as sm
from statsmodels.stats.diagnostic import het_breuschpagan
from statsmodels.stats.stattools import durbin_watson
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

## SECTION 1: SETUP AND DATA LOADING

In [2]:
# Display the working directory: the data paths in this notebook are built
# relative to it (two levels up, then data/dataprocessed).
Path.cwd()

WindowsPath('C:/Users/Nandan Hegde/OneDrive/Documents/Open_Source_projects/PricingService.ai/Dynamic-Pricing-Model-Experiments/predictive_models/notebooks/02_LR_Models')

In [3]:
def load_lagged_dataset():
    """Read the lagged modelling table together with its lag-selection metadata.

    Both files are looked up in ``data/dataprocessed``, two directory levels
    above the current working directory.

    Returns:
        tuple: ``(df, metadata)`` where ``df`` is the CSV content with its
        ``date`` column parsed to datetimes, sorted by date and re-indexed
        from zero, and ``metadata`` is the decoded JSON document.
    """
    processed_dir = Path.cwd().parent.parent.joinpath('data', 'dataprocessed')

    # The CSV is read before the JSON, so a missing CSV is reported first.
    frame = pd.read_csv(processed_dir / 'lagged_predictive_dataset.csv')
    with open(processed_dir / 'lag_selection_metadata.json', 'r') as handle:
        metadata = json.load(handle)

    frame['date'] = pd.to_datetime(frame['date'])
    ordered = frame.sort_values('date').reset_index(drop=True)
    return ordered, metadata

# Load the table once; the regression target is whichever column the
# lag-selection metadata names under 'focal_column'.
df_lagged, lag_metadata = load_lagged_dataset()
focal_col = lag_metadata['focal_column']
y = df_lagged[focal_col]

print(
    f"Dataset shape: {df_lagged.shape}",
    f"Target variable: {focal_col}",
    sep="\n",
)

Dataset shape: (360, 68)
Target variable: base_rate


## SECTION 2: MULTICOLLINEARITY REMOVAL

In [4]:
# Every column whose name contains "lag" counts as a lagged feature.
all_lag_features = [name for name in df_lagged.columns if 'lag' in name]

# Lags of the base rate itself; names containing "normalized" are excluded.
base_rate_lags = [
    name for name in all_lag_features
    if 'base_rate_lag' in name and 'normalized' not in name
]

# Lags selected by name fragment: 'booking-us' columns plus calendar variables.
_COMPETITOR_MARKERS = ('booking-us', 'day_of_week', 'month', 'is_weekend')
competitor_lags = [
    name for name in all_lag_features
    if any(marker in name for marker in _COMPETITOR_MARKERS)
]

def remove_high_correlations(df, features, threshold=0.95, target_col=None):
    """Greedily drop one feature from every highly correlated pair.

    Pairs are scanned in column order. When the absolute pairwise correlation
    exceeds ``threshold``, the member with the weaker absolute correlation to
    the target is dropped (on a tie the second member goes). A feature that
    has already been dropped is never used to eliminate another one, so a
    survivor is only removed because of a feature that is itself kept.

    Args:
        df: DataFrame holding the feature columns and the target column.
        features: Candidate feature column names.
        threshold: Absolute correlation above which a pair counts as redundant.
        target_col: Name of the target column. Defaults to the notebook-level
            ``focal_col`` when omitted.

    Returns:
        list: The surviving feature names, in their original order.
    """
    if target_col is None:
        target_col = focal_col  # notebook-level target name
    features = list(features)
    if len(features) < 2:
        return features

    pairwise = df[features].corr().abs()
    columns = list(pairwise.columns)
    # Target correlations are computed once instead of once per correlated pair.
    strength = {name: abs(df[name].corr(df[target_col])) for name in columns}

    dropped = set()
    for i, first in enumerate(columns):
        if first in dropped:
            continue
        for j in range(i + 1, len(columns)):
            second = columns[j]
            if second in dropped:
                continue
            # NaN correlations compare False and therefore never form a pair.
            if pairwise.iloc[i, j] > threshold:
                if strength[first] < strength[second]:
                    dropped.add(first)
                    break  # `first` is gone; it must not eliminate anything else
                dropped.add(second)

    return [name for name in features if name not in dropped]

# Only the competitor/calendar lags are pruned; all base-rate lags are kept.
clean_competitor_lags = remove_high_correlations(df_lagged, competitor_lags, 0.90)
clean_lag_features = [*base_rate_lags, *clean_competitor_lags]

temporal_features = [
    'sin_day_of_week', 'cos_day_of_week',
    'sin_month', 'cos_month',
    'sin_day_of_year', 'cos_day_of_year',
    'is_weekend',
]

print(
    f"Original lag features: {len(all_lag_features)}",
    f"After multicollinearity removal: {len(clean_lag_features)}",
    sep="\n",
)

Original lag features: 50
After multicollinearity removal: 30


## SECTION 3: FEATURE SELECTION STRATEGIES

In [5]:
# Coefficient table from a previously saved in-sample fit; only its p-values
# are used here, to shortlist features.
with open(Path().cwd().parent.parent / 'data' / 'dataprocessed' / 'insample_model_results.json', 'r') as f:
    previous_results = json.load(f)

significant_features_raw = [
    feat['Feature'] for feat in previous_results['feature_importance']
    if feat['P_Value'] < 0.05 and feat['Feature'] != 'const'
]

# Keep only shortlisted names that are still candidates after pruning
# (clean lag features or temporal features). The pool is built once.
candidate_pool = set(clean_lag_features + temporal_features)
significant_features = [f for f in significant_features_raw if f in candidate_pool]

correlations = df_lagged[clean_lag_features].corrwith(y).abs().sort_values(ascending=False)
top_corr_features = correlations.head(8).index.tolist()


def _is_first_lag(column_name):
    """Return True for '..._lag_1' columns but not for '_lag_10', '_lag_12', etc.

    A plain ``'_lag_1' in name`` substring test would also match two-digit
    lags, so the character following the marker is checked as well.
    """
    _, marker, tail = column_name.partition('_lag_1')
    return bool(marker) and not tail[:1].isdigit()


focal_first_lag = [col for col in base_rate_lags if _is_first_lag(col)]
significant_competitors = [f for f in significant_features if 'booking-us' in f]

feature_strategies = {
    'significant_only': significant_features,
    'focal_lag_1': list(focal_first_lag),
    'focal_plus_competitors': focal_first_lag + significant_competitors[:3],
    'parsimonious': focal_first_lag + significant_competitors[:2] + temporal_features[:3],
    'correlation_based': top_corr_features + temporal_features[:2]
}

print(f"Significant features available: {len(significant_features)}")
print("\nFeature strategies:")
for name, features in feature_strategies.items():
    # Strategies that ended up empty are not listed.
    if features:
        print(f"{name}: {len(features)} features")

Significant features available: 5

Feature strategies:
significant_only: 5 features
focal_lag_1: 1 features
focal_plus_competitors: 4 features
parsimonious: 6 features
correlation_based: 10 features


## SECTION 4: MODEL TRAINING AND EVALUATION

In [6]:
def fit_and_evaluate_model(X, y, strategy_name):
    """Fit an OLS regression with an intercept and collect in-sample results.

    All metrics are computed on the same rows the model was fitted on.

    Args:
        X: DataFrame of regressors; an intercept column is added internally.
        y: Target values aligned with ``X``.
        strategy_name: Label of the feature strategy. Accepted for the
            caller's convenience; it does not influence the computation.

    Returns:
        dict with the keys
        ``model`` (fitted statsmodels results object),
        ``metrics`` (r2, adj_r2, rmse, mae, mape, n_features, n_obs, obs_per_param),
        ``diagnostics`` (durbin_watson, autocorr_concern, breusch_pagan_pval,
        heteroscedasticity; the last two are ``None`` if the test failed),
        ``y_pred`` (fitted values),
        ``significant_features`` (number of non-intercept terms with p < 0.05),
        ``feature_names`` (column names of ``X``).
    """
    # Build the design matrix once and reuse it for fit, prediction and tests.
    design = sm.add_constant(X)
    model = sm.OLS(y, design).fit()
    y_pred = model.predict(design)

    n_obs = len(y)
    n_features = X.shape[1]
    r2 = r2_score(y, y_pred)

    metrics = {
        'r2': r2,
        'adj_r2': 1 - (1 - r2) * (n_obs - 1) / (n_obs - n_features - 1),
        'rmse': np.sqrt(mean_squared_error(y, y_pred)),
        'mae': mean_absolute_error(y, y_pred),
        # Percentage error divides by y, so it is not finite if y contains zeros.
        'mape': np.mean(np.abs((y - y_pred) / y)) * 100,
        'n_features': n_features,
        'n_obs': n_obs,
        'obs_per_param': n_obs / (n_features + 1)
    }

    # Diagnostics
    residuals = y - y_pred
    dw_stat = durbin_watson(residuals)

    try:
        _, bp_pval, _, _ = het_breuschpagan(residuals, design)
        heterosced = bp_pval < 0.05
    except Exception:
        # Best effort: a failing test is reported as "not available" rather
        # than aborting the fit. Catching Exception (not a bare except) lets
        # KeyboardInterrupt / SystemExit propagate.
        bp_pval = None
        heterosced = None

    diagnostics = {
        'durbin_watson': dw_stat,
        'autocorr_concern': dw_stat < 1.5 or dw_stat > 2.5,
        'breusch_pagan_pval': bp_pval,
        'heteroscedasticity': heterosced
    }

    # Count significant regressors, excluding the intercept by label so the
    # count stays right even if no 'const' column was added.
    slope_pvalues = model.pvalues.drop('const', errors='ignore')
    sig_features = int((slope_pvalues < 0.05).sum())

    return {
        'model': model,
        'metrics': metrics,
        'diagnostics': diagnostics,
        'y_pred': y_pred,
        'significant_features': sig_features,
        'feature_names': list(X.columns)
    }

results = {}

for strategy_name, features in feature_strategies.items():
    # Empty strategies are skipped without any message.
    if not features:
        continue

    print(f"\nTraining {strategy_name} model...")

    # Restrict the strategy to columns that actually exist in the table.
    available_features = [f for f in features if f in df_lagged.columns]
    if len(available_features) != len(features):
        missing = set(features) - set(available_features)
        print(f"  Warning: Missing features {missing}")
        features = available_features

    if len(features) == 0:
        print(f"  Skipping {strategy_name} - no valid features")
        continue

    X = df_lagged[features]

    # A failure in one strategy is reported and the loop moves on to the next.
    try:
        result = fit_and_evaluate_model(X, y, strategy_name)
        results[strategy_name] = result

        print(f"  R²: {result['metrics']['r2']:.4f} | "
              f"Adj R²: {result['metrics']['adj_r2']:.4f} | "
              f"RMSE: ${result['metrics']['rmse']:.2f}")
        print(f"  Features: {result['metrics']['n_features']} | "
              f"Obs/Param: {result['metrics']['obs_per_param']:.1f} | "
              f"Significant: {result['significant_features']}/{result['metrics']['n_features']}")

        if result['diagnostics']['autocorr_concern']:
            print(f"  Autocorrelation concern (DW: {result['diagnostics']['durbin_watson']:.3f})")
        if result['diagnostics']['heteroscedasticity']:
            print(f"  Heteroscedasticity detected (p: {result['diagnostics']['breusch_pagan_pval']:.4f})")

    except Exception as err:
        print(f"  Error: {str(err)}")


Training significant_only model...
  R²: 0.2359 | Adj R²: 0.2252 | RMSE: $29.77
  Features: 5 | Obs/Param: 60.0 | Significant: 3/5
  Autocorrelation concern (DW: 0.698)
  Heteroscedasticity detected (p: 0.0000)

Training focal_lag_1 model...
  R²: 0.5409 | Adj R²: 0.5396 | RMSE: $23.08
  Features: 1 | Obs/Param: 180.0 | Significant: 1/1

Training focal_plus_competitors model...
  R²: 0.5701 | Adj R²: 0.5653 | RMSE: $22.33
  Features: 4 | Obs/Param: 72.0 | Significant: 4/4
  Heteroscedasticity detected (p: 0.0000)

Training parsimonious model...
  R²: 0.5683 | Adj R²: 0.5609 | RMSE: $22.38
  Features: 6 | Obs/Param: 51.4 | Significant: 3/6
  Heteroscedasticity detected (p: 0.0000)

Training correlation_based model...
  R²: 0.6007 | Adj R²: 0.5893 | RMSE: $21.52
  Features: 10 | Obs/Param: 32.7 | Significant: 3/10


## SECTION 5: MODEL COMPARISON AND FINAL SELECTION

In [7]:
# One summary row per fitted strategy.
comparison_data = []
for name, result in results.items():
    fit_metrics = result['metrics']
    fit_diagnostics = result['diagnostics']
    n_significant = result['significant_features']
    comparison_data.append({
        'Model': name,
        'R²': fit_metrics['r2'],
        'Adj_R²': fit_metrics['adj_r2'],
        'RMSE': fit_metrics['rmse'],
        'Features': fit_metrics['n_features'],
        'Obs_Per_Param': fit_metrics['obs_per_param'],
        'Sig_Features': n_significant,
        'Sig_Ratio': n_significant / fit_metrics['n_features'],
        'DW_Stat': fit_diagnostics['durbin_watson'],
        'Heterosced': fit_diagnostics['heteroscedasticity']
    })

# Best adjusted R² first.
comparison_df = pd.DataFrame(comparison_data).sort_values('Adj_R²', ascending=False)

print("\nModel Performance Comparison:")
print(comparison_df.round(4))

def score_model(row):
    """Combine fit quality, significance share and sample support into one score.

    The score is ``Adj_R² * 100`` plus ``Sig_Ratio * 20`` plus twice the
    observations-per-parameter ratio (capped at 20), minus 5 when
    ``Heterosced`` is truthy. A missing (``None``) flag carries no penalty.

    Args:
        row: Mapping-like row with the keys ``Adj_R²``, ``Sig_Ratio``,
            ``Obs_Per_Param`` and ``Heterosced``.

    Returns:
        The numeric score; higher is better.
    """
    heterosced_penalty = 5 if row['Heterosced'] else 0
    capped_support = min(row['Obs_Per_Param'], 20)
    return (
        row['Adj_R²'] * 100
        + row['Sig_Ratio'] * 20
        + capped_support * 2
        - heterosced_penalty
    )

# Rank the strategies by the composite score; the top row is the winner.
comparison_df['Score'] = comparison_df.apply(score_model, axis=1)
comparison_df = comparison_df.sort_values('Score', ascending=False)

best_model_name = comparison_df['Model'].iloc[0]
best_result = results[best_model_name]

print(
    f"\nBest model: {best_model_name}",
    f"Adjusted R²: {best_result['metrics']['adj_r2']:.4f}",
    f"RMSE: ${best_result['metrics']['rmse']:.2f}",
    f"Features: {best_result['metrics']['n_features']}",
    f"Significant features: {best_result['significant_features']}/{best_result['metrics']['n_features']}",
    f"Observations per parameter: {best_result['metrics']['obs_per_param']:.1f}",
    sep="\n",
)


Model Performance Comparison:
                    Model      R²  Adj_R²     RMSE  Features  Obs_Per_Param  \
4       correlation_based  0.6007  0.5893  21.5240        10        32.7273   
2  focal_plus_competitors  0.5701  0.5653  22.3332         4        72.0000   
3            parsimonious  0.5683  0.5609  22.3812         6        51.4286   
1             focal_lag_1  0.5409  0.5396  23.0800         1       180.0000   
0        significant_only  0.2359  0.2252  29.7745         5        60.0000   

   Sig_Features  Sig_Ratio  DW_Stat  Heterosced  
4             3        0.3   1.8886       False  
2             4        1.0   2.1988        True  
3             3        0.5   2.1970        True  
1             1        1.0   2.3014       False  
0             3        0.6   0.6981        True  

Best model: focal_lag_1
Adjusted R²: 0.5396
RMSE: $23.08
Features: 1
Significant features: 1/1
Observations per parameter: 180.0


## SECTION 6: DETAILED ANALYSIS OF BEST MODEL

In [8]:
# Full statsmodels regression table for the selected model.
best_model = best_result['model']
print("\nDetailed Model Summary:", best_model.summary(), sep="\n")


Detailed Model Summary:
                            OLS Regression Results                            
Dep. Variable:              base_rate   R-squared:                       0.541
Model:                            OLS   Adj. R-squared:                  0.540
Method:                 Least Squares   F-statistic:                     421.8
Date:                Sun, 28 Sep 2025   Prob (F-statistic):           1.73e-62
Time:                        19:24:28   Log-Likelihood:                -1640.8
No. Observations:                 360   AIC:                             3286.
Df Residuals:                     358   BIC:                             3293.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const            

## SECTION 7: EXACT ESTIMATED EQUATIONS WITH NUMERICAL VALUES

In [9]:
def _signed_term(coeff, name, suffix=""):
    """Format one regressor as ' + 0.123456*name' or ' - 0.123456*name' plus suffix."""
    sign = "+" if coeff >= 0 else "-"
    return f" {sign} {abs(coeff):.6f}*{name}{suffix}"


def _significance_stars(pval):
    """Map a p-value to '***' (<0.001), '**' (<0.01), '*' (<0.05) or ''."""
    return "***" if pval < 0.001 else "**" if pval < 0.01 else "*" if pval < 0.05 else ""


print("\nEXACT ESTIMATED EQUATIONS:")
print("=" * 60)

for model_name, result in results.items():
    model = result['model']
    coefficients = model.params

    print(f"\n{model_name.upper()} MODEL:")

    # Intercept is written bare; every other term carries an explicit sign.
    equation_parts = [
        f"{coeff:.6f}" if param_name == 'const' else _signed_term(coeff, param_name)
        for param_name, coeff in coefficients.items()
    ]

    # The left-hand side is the fitted target column, not a fixed label.
    equation = f"{focal_col} = {''.join(equation_parts)}"
    print(equation)

    # Show key statistics
    print(f"R² = {result['metrics']['r2']:.4f}, Adj R² = {result['metrics']['adj_r2']:.4f}, RMSE = ${result['metrics']['rmse']:.2f}")

print("\n" + "=" * 60)
print("BEST MODEL EQUATION WITH SIGNIFICANCE:")

best_coefficients = best_model.params
best_pvalues = best_model.pvalues

equation_parts = []
significant_terms = []
insignificant_terms = []

for param_name, coeff in best_coefficients.items():
    significance = _significance_stars(best_pvalues[param_name])

    if param_name == 'const':
        label = "Intercept"
        equation_parts.append(f"{coeff:.6f}{significance}")
    else:
        label = param_name
        equation_parts.append(_signed_term(coeff, param_name, significance))

    # Terms without a star (p >= 0.05, or an undefined p-value) are listed separately.
    if significance:
        significant_terms.append(f"{label}: {coeff:.6f}{significance}")
    else:
        insignificant_terms.append(f"{label}: {coeff:.6f}")

final_equation = f"{focal_col} = {''.join(equation_parts)}"
print(final_equation)

print(f"\nSignificance: *** p<0.001, ** p<0.01, * p<0.05")
print(f"\nSignificant terms ({len(significant_terms)}):")
for term in significant_terms:
    print(f"  {term}")

if insignificant_terms:
    print(f"\nInsignificant terms ({len(insignificant_terms)}):")
    for term in insignificant_terms[:5]:  # Show first 5
        print(f"  {term}")
    if len(insignificant_terms) > 5:
        print(f"  ... and {len(insignificant_terms) - 5} more")


EXACT ESTIMATED EQUATIONS:

SIGNIFICANT_ONLY MODEL:
base_rate = -32.203655 + 0.591052*booking-us-kohea-kai-resort-maui-USD_lag_3 + 0.140472*booking-us-courtyard-by-marriott-maui-kahului-airport-USD_lag_1 + 0.317595*booking-us-aqua-pacific-monarch-USD_lag_2 - 0.875570*day_of_week_lag_4 - 0.046131*booking-us-aqua-pacific-monarch-USD_lag_1
R² = 0.2359, Adj R² = 0.2252, RMSE = $29.77

FOCAL_LAG_1 MODEL:
base_rate = 70.312142 + 0.738333*base_rate_lag_1
R² = 0.5409, Adj R² = 0.5396, RMSE = $23.08

FOCAL_PLUS_COMPETITORS MODEL:
base_rate = -26.627881 + 0.652243*base_rate_lag_1 + 0.224452*booking-us-kohea-kai-resort-maui-USD_lag_3 + 0.055216*booking-us-courtyard-by-marriott-maui-kahului-airport-USD_lag_1 + 0.118431*booking-us-aqua-pacific-monarch-USD_lag_2
R² = 0.5701, Adj R² = 0.5653, RMSE = $22.33

PARSIMONIOUS MODEL:
base_rate = -4.901800 + 0.654861*base_rate_lag_1 + 0.238168*booking-us-kohea-kai-resort-maui-USD_lag_3 + 0.070247*booking-us-courtyard-by-marriott-maui-kahului-airport-USD_lag

## SECTION 8: FEATURE IMPORTANCE ANALYSIS

In [10]:
# Skip position 0 of every statistic; this assumes the intercept is the first
# parameter, as fit_and_evaluate_model adds it via sm.add_constant.
_slopes = slice(1, None)
_slope_pvalues = best_model.pvalues.values[_slopes]
_slope_tstats = best_model.tvalues.values[_slopes]

feature_importance = (
    pd.DataFrame({
        'Feature': best_model.params.index[_slopes],
        'Coefficient': best_model.params.values[_slopes],
        'P_Value': _slope_pvalues,
        'T_Stat': _slope_tstats,
        'Significant': _slope_pvalues < 0.05
    })
    .assign(Abs_T_Stat=lambda table: np.abs(table['T_Stat']))
    .sort_values('Abs_T_Stat', ascending=False)
)

print("\nFeature Importance (Top 10):")
print(feature_importance.head(10))


Feature Importance (Top 10):
           Feature  Coefficient       P_Value     T_Stat  Significant  \
0  base_rate_lag_1     0.738333  1.730046e-62  20.537556         True   

   Abs_T_Stat  
0   20.537556  


## SECTION 9: RESIDUAL ANALYSIS AND DIAGNOSTICS

In [11]:
# In-sample residuals of the selected model (actual minus fitted).
residuals = y - best_result['y_pred']

print("\nResidual Analysis:")
print(f"Mean: {residuals.mean():.6f}")
print(f"Std: {residuals.std():.4f}")
print(f"Min: {residuals.min():.4f}")
print(f"Max: {residuals.max():.4f}")

residuals_df = pd.DataFrame({
    'date': df_lagged['date'],
    'residuals': residuals,
    'fitted': best_result['y_pred'],
    'actual': y
})

# Mean and spread of the residuals per calendar month.
print(f"\nTemporal patterns in residuals:")
monthly_residuals = residuals_df.groupby(residuals_df['date'].dt.month)['residuals'].agg(['mean', 'std'])
print(monthly_residuals)

shapiro_stat, shapiro_pval = stats.shapiro(residuals)
print(f"\nDiagnostic Tests:")
print(f"Shapiro-Wilk normality test p-value: {shapiro_pval:.6f}")
print(f"Durbin-Watson statistic: {best_result['diagnostics']['durbin_watson']:.4f}")
# None marks a failed test; a p-value of exactly 0.0 is valid and must be shown,
# so test for None explicitly instead of relying on truthiness.
if best_result['diagnostics']['breusch_pagan_pval'] is not None:
    print(f"Breusch-Pagan heteroscedasticity test p-value: {best_result['diagnostics']['breusch_pagan_pval']:.6f}")


Residual Analysis:
Mean: 0.000000
Std: 23.1121
Min: -82.0737
Max: 124.3763

Temporal patterns in residuals:
           mean        std
date                      
1      6.596740  22.592382
2      4.304879  13.664939
3      4.911786   7.212573
4      2.692968   0.000000
5     -8.358659   7.050863
6     -9.097606   5.377365
7     -1.825858   3.978255
8      3.169311   2.652170
9     -4.562384  36.194437
10    -4.005431  46.363775
11    -5.530932  28.163057
12    10.853726  33.873295

Diagnostic Tests:
Shapiro-Wilk normality test p-value: 0.000000
Durbin-Watson statistic: 2.3014
Breusch-Pagan heteroscedasticity test p-value: 0.412010


## SECTION 10: SAVE RESULTS

In [12]:
# Results are written next to the input data: two levels above the working
# directory, under data/dataprocessed.
output_path = Path.cwd().parent.parent.joinpath('data', 'dataprocessed')

def convert_numpy_types(obj):
    """Recursively replace NumPy values with JSON-serialisable builtins.

    NumPy integers, floats and booleans become ``int``, ``float`` and
    ``bool``; NumPy arrays become (nested) lists. Dicts, lists and tuples
    are walked recursively; tuples come back as lists, which is how JSON
    represents them anyway. NumPy scalar dict keys are converted too. Any
    other object is returned unchanged.

    Args:
        obj: Arbitrary (possibly nested) Python object.

    Returns:
        A structure equal in content with NumPy types replaced as described.
    """
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.ndarray):
        # tolist() already yields builtins for numeric dtypes; recurse so that
        # object arrays holding NumPy scalars are handled as well.
        return convert_numpy_types(obj.tolist())
    if isinstance(obj, dict):
        return {
            (key.item() if isinstance(key, np.generic) else key): convert_numpy_types(value)
            for key, value in obj.items()
        }
    if isinstance(obj, (list, tuple)):
        return [convert_numpy_types(item) for item in obj]
    return obj

# Save all model equations. The left-hand side is the fitted target column
# (focal_col) rather than a fixed label, so the saved text always matches the
# variable the models were estimated on.
all_equations = {}
for model_name, result in results.items():
    model = result['model']
    coefficients = model.params

    equation_parts = []
    for param_name, coeff in coefficients.items():
        if param_name == 'const':
            # Intercept is written bare, without sign or variable name.
            equation_parts.append(f"{coeff:.6f}")
        else:
            sign = "+" if coeff >= 0 else "-"
            equation_parts.append(f" {sign} {abs(coeff):.6f}*{param_name}")

    all_equations[model_name] = f"{focal_col} = {''.join(equation_parts)}"

model_results = {
    'best_model_name': best_model_name,
    'model_comparison': comparison_df.to_dict('records'),
    'best_model_metrics': best_result['metrics'],
    'best_model_diagnostics': best_result['diagnostics'],
    'feature_importance': feature_importance.to_dict('records'),
    'selected_features': best_result['feature_names'],
    'all_model_equations': all_equations,
    'best_model_equation': all_equations[best_model_name],
    'improvement_summary': {
        'multicollinearity_removed': True,
        'features_reduced': f"{len(all_lag_features)} → {best_result['metrics']['n_features']}",
        'obs_per_param_ratio': best_result['metrics']['obs_per_param'],
        'significant_feature_ratio': best_result['significant_features'] / best_result['metrics']['n_features']
    }
}

# json cannot serialise NumPy scalars, so convert them to builtins first.
model_results = convert_numpy_types(model_results)

with open(output_path / 'clean_model_results.json', 'w') as f:
    json.dump(model_results, f, indent=2)

# Per-date actuals, fitted values and residuals of the selected model.
predictions_df = pd.DataFrame({
    'date': df_lagged['date'],
    'actual': y,
    'predicted': best_result['y_pred'],
    'residuals': residuals
})

predictions_df.to_csv(output_path / 'clean_model_predictions.csv', index=False)

print(f"\nResults saved to:")
print(f"- clean_model_results.json")
print(f"- clean_model_predictions.csv")
print(f"\nModeling analysis complete!")
print(f"Best model achieved {best_result['metrics']['obs_per_param']:.1f} observations per parameter")
print(f"Significant features: {best_result['significant_features']}/{best_result['metrics']['n_features']}")


Results saved to:
- clean_model_results.json
- clean_model_predictions.csv

Modeling analysis complete!
Best model achieved 180.0 observations per parameter
Significant features: 1/1
