# $2^k$ Design

In [309]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

# Read the dataset into a DataFrame.
# The location is relative to the notebook's working directory; adjust it if needed.
raw_csv_path = "../data/raw/new_train.csv"
df = pd.read_csv(raw_csv_path)

In [None]:
# Rank candidate predictors by the absolute value of their correlation with
# SalePrice (strongest first) and display the 15 highest.
#
# The response ('SalePrice') and the row identifier ('Id') are excluded, and only
# numeric/boolean columns are kept: a correlation coefficient is undefined for
# text columns, and leaving them in can make `corrwith` raise on pandas versions
# where `numeric_only` defaults to False.
numeric_features = (
    df.drop(columns=['SalePrice', 'Id'])
      .select_dtypes(include=['number', 'bool'])
)
correlations = (
    numeric_features
    .corrwith(df['SalePrice'])
    .abs()
    .sort_values(ascending=False)
)
# Alias of the complete ranking (not truncated); only the display below is cut to 15.
top_numeric_features = correlations
top_numeric_features.head(15)

OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
ExterQual       0.636884
GarageArea      0.623431
BsmtQual        0.618025
TotalBsmtSF     0.613581
1stFlrSF        0.605852
KitchenQual     0.589189
FullBath        0.560664
GarageFinish    0.537242
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
MasVnrArea      0.472614
dtype: float64

In [None]:
# The ten features most correlated with SalePrice.
top_features = [
    "OverallQual",
    "GrLivArea",
    "GarageCars",
    "ExterQual",
    "GarageArea",
    "BsmtQual",
    "TotalBsmtSF",
    "1stFlrSF",
    "KitchenQual",
    "FullBath",
]

# Working frame: an independent copy holding only those features and the response.
df_selected = df.loc[:, top_features + ["SalePrice"]].copy()

In [312]:
# Binarise every selected feature into a low (0) / high (1) level around its median.
#
# Each flag is computed once and written straight to its final column name, so
# re-running this cell yields the same frame. (Creating "1stFlrSF_bin" first and
# renaming it afterwards would, on a second run, leave two columns that are both
# called "FirstFlr_bin".)

# Features whose repartition is better when the median itself counts as the high
# level (">= median") instead of the default strict "> median" split.
median_inclusive = {"ExterQual", "GarageCars", "KitchenQual", "FullBath"}

# Column names that start with a digit are not allowed in formulas, so the
# "1stFlrSF" flag gets an explicit name; every other flag is "<feature>_bin".
bin_column_names = {"1stFlrSF": "FirstFlr_bin"}

for col in top_features:
    median = df_selected[col].median()
    if col in median_inclusive:
        is_high = df_selected[col] >= median
    else:
        is_high = df_selected[col] > median
    # NOTE(review): with float columns a missing value compares False and is
    # therefore coded 0 — confirm the data has no NaN in these features.
    df_selected[bin_column_names.get(col, col + "_bin")] = is_high.astype(int)

In [313]:
# Binary factors of the design, in the order they enter the formula.
binary_features = [
    "OverallQual_bin",
    "GrLivArea_bin",
    "GarageCars_bin",
    "GarageArea_bin",
    "ExterQual_bin",
    "BsmtQual_bin",
    "TotalBsmtSF_bin",
    "FirstFlr_bin",
    "KitchenQual_bin",
    "FullBath_bin",
]

# In a formula, "a * b" stands for both main effects plus their interaction, so
# chaining all ten factors with "*" requests the full 2^10 factorial model
# (every main effect and every interaction of every order).
factorial_terms = " * ".join(binary_features)
formula = f"SalePrice ~ {factorial_terms}"

# Ordinary least squares fit on the binarised frame, then print the regression table.
model = smf.ols(formula=formula, data=df_selected).fit()
model_summary = model.summary()
print(model_summary)

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.717
Model:                            OLS   Adj. R-squared:                  0.656
Method:                 Least Squares   F-statistic:                     11.65
Date:                Mon, 26 May 2025   Prob (F-statistic):          1.83e-201
Time:                        23:18:06   Log-Likelihood:                -17622.
No. Observations:                1460   AIC:                         3.577e+04
Df Residuals:                    1198   BIC:                         3.715e+04
Df Model:                         261                                         
Covariance Type:            nonrobust                                         
                                                                                                                                                       coef    std err          t      P>|t|      [0.025      0.97

In [None]:
# 2^6 design on six binarised factors, plus four predictors added as single
# additive terms using their original (non-binarised) columns.
df_selected_col = [
    'OverallQual_bin',
    'GrLivArea_bin',
    'GarageCars_bin',
    'GarageArea_bin',
    'TotalBsmtSF_bin',
    'FirstFlr_bin',
]
additive_terms = ["ExterQual", "BsmtQual", "KitchenQual", "FullBath"]

# Factorial part: main effects and all interactions among the six factors.
design = 'SalePrice ~ ' + ' * '.join(df_selected_col)

# Complete formula: the factorial part followed by the additive terms.
formula_full = ' + '.join([design] + additive_terms)

# Independent copy restricted to the columns the formula refers to.
df_2k = df_selected[df_selected_col + additive_terms + ['SalePrice']].copy()

# Ordinary least squares fit, then print the regression table.
model_full = smf.ols(formula_full, data=df_2k).fit()
model_full_summary = model_full.summary()
print(model_full_summary)

                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.737
Model:                            OLS   Adj. R-squared:                  0.727
Method:                 Least Squares   F-statistic:                     77.21
Date:                Mon, 26 May 2025   Prob (F-statistic):               0.00
Time:                        23:33:30   Log-Likelihood:                -17570.
No. Observations:                1460   AIC:                         3.524e+04
Df Residuals:                    1408   BIC:                         3.552e+04
Df Model:                          51                                         
Covariance Type:            nonrobust                                         
                                                                                               coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------

In [None]:
# Rank the single-variable terms of model_full by absolute coefficient size,
# largest first. The intercept is left out of the ranking.
params_full = model_full.params.drop(labels=["Intercept"])
chosen_full = params_full.abs().sort_values(ascending=False)

# Interaction terms carry a ":" in their name; keep only the terms without one.
best_full = [term for term in chosen_full.index if ":" not in term]
best_full


['OverallQual_bin',
 'GrLivArea_bin',
 'TotalBsmtSF_bin',
 'GarageArea_bin',
 'GarageCars_bin',
 'FullBath',
 'BsmtQual',
 'ExterQual',
 'KitchenQual',
 'FirstFlr_bin']