In [32]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from statsmodels.stats.anova import anova_lm


## Reading and cleaning data

In [33]:
# Load the cleaned numeric dataset, then discard every column whose name
# contains "Id" as well as the "SalePrice.1" column
# (presumably a duplicated target column -- TODO confirm against the CSV).
df = pd.read_csv("../data/clean_reclean/numerical_cleaned.csv").pipe(
    lambda frame: frame.drop(
        columns=[
            name
            for name in frame.columns
            if "Id" in name or name == "SalePrice.1"
        ]
    )
)

# Keep only the rows that have no missing value in any remaining column.
df_clean = df.dropna()


In [34]:
# Outlier cut-off: observations whose initial-model residual exceeds this many
# residual standard deviations (in absolute value) are excluded from the refit.
OUTLIER_SD_THRESHOLD = 3

# Predictors: every column except the target and "TotRmsAbvGrd"
# (NOTE(review): presumably dropped because it overlaps with other size
# features -- confirm the reason).
X = df_clean.drop(columns=["SalePrice", "TotRmsAbvGrd"])

# np.log returns -inf / NaN for non-positive inputs, which would put
# non-finite values into the regression target, so fail loudly instead.
if (df_clean["SalePrice"] <= 0).any():
    raise ValueError("SalePrice must be strictly positive to be log-transformed")
y_log = np.log(df_clean["SalePrice"])  # log-transformed target

# Step 3: Fit initial model with all data
X_const = sm.add_constant(X)
log_model_initial = sm.OLS(y_log, X_const).fit()

# Step 4: Detect and remove outliers
# (|log residual| > OUTLIER_SD_THRESHOLD * sample std of the residuals)
residuals_log = log_model_initial.resid
outliers = np.abs(residuals_log) > OUTLIER_SD_THRESHOLD * residuals_log.std()
# `outliers` shares its index with X_const / y_log, so the boolean mask
# selects the matching rows of both.
X_final = X_const.loc[~outliers]
y_final = y_log.loc[~outliers]

# Step 5: Fit final model on the outlier-free observations
log_model_final = sm.OLS(y_final, X_final).fit()

# Step 6: Output summary
print(log_model_final.summary())

# Optional: Residual statistics
print("\nResidual summary:\n", log_model_final.resid.describe())
print("\nFitted value summary:\n", log_model_final.fittedvalues.describe())



                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.894
Model:                            OLS   Adj. R-squared:                  0.891
Method:                 Least Squares   F-statistic:                     302.0
Date:                Tue, 20 May 2025   Prob (F-statistic):               0.00
Time:                        17:11:31   Log-Likelihood:                 728.37
No. Observations:                1107   AIC:                            -1395.
Df Residuals:                    1076   BIC:                            -1239.
Df Model:                          30                                         
Covariance Type:            nonrobust                                         
                    coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------
const             5.3607      5.902      0.908