In [4]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Read the raw training data; the path is relative to the notebook's location.
df = pd.read_csv("../data/raw/new_train.csv")  # Adjust path if needed

# Numeric predictors used throughout this analysis (per the author's note:
# the six numeric columns most correlated with SalePrice).
top_features = [
    "OverallQual",
    "GrLivArea",
    "GarageCars",
    "GarageArea",
    "TotalBsmtSF",
    "1stFlrSF",
]

# Independent copy holding only the predictors plus the target, so later
# column additions never touch the raw frame.
df_selected = df.loc[:, top_features + ["SalePrice"]].copy()


In [5]:
# Binarise each selected feature with a median split: the indicator is 1 when
# the value is strictly above that column's median and 0 otherwise.
#
# Each indicator is written straight to its final, formula-safe column name
# instead of being created as "<col>_bin" and renamed afterwards. The
# create-then-rename approach is not idempotent: re-running the cell re-creates
# "1stFlrSF_bin" and renames it onto the already existing "FirstFlr_bin",
# leaving two columns with the same label. Direct assignment simply overwrites
# the existing column on a re-run.

# Indicator names that deviate from the "<col>_bin" pattern. "1stFlrSF" starts
# with a digit, which is not allowed as a bare name in a model formula.
bin_name_overrides = {"1stFlrSF": "FirstFlr_bin"}

# Final list of binary features, derived from top_features (same order) so the
# two lists cannot drift apart.
binary_features = [bin_name_overrides.get(col, col + "_bin") for col in top_features]

for col, bin_col in zip(top_features, binary_features):
    median = df_selected[col].median()
    # NOTE: a missing value compares False against the median, so it is coded 0.
    df_selected[bin_col] = (df_selected[col] > median).astype(int)

In [7]:
# Full 2^6 factorial specification: joining the six indicators with "*" asks
# for every main effect together with all of their interactions.
formula = f"SalePrice ~ {' * '.join(binary_features)}"

# Ordinary least squares fit of that specification
model = smf.ols(formula=formula, data=df_selected).fit()

# ANOVA table for the fitted model (typ=2)
anova_table = sm.stats.anova_lm(model, typ=2)

# Order the rows by ascending p-value so the most significant terms come first
anova_sorted = anova_table.sort_values(by="PR(>F)")

# Lift pandas' display limits so the printed table is not truncated
for display_option in (
    "display.max_rows",
    "display.max_columns",
    "display.width",
    "display.max_colwidth",
):
    pd.set_option(display_option, None)

# Print the complete, sorted ANOVA table
print("\nANOVA Results (Full 2^6 Factorial Design):\n")
print(anova_sorted)


ANOVA Results (Full 2^6 Factorial Design):

                                                                                                sum_sq  \
OverallQual_bin                                                                           5.268407e+11   
GrLivArea_bin                                                                             4.512192e+11   
GarageCars_bin                                                                            2.614531e+11   
TotalBsmtSF_bin                                                                           6.803582e+10   
GarageArea_bin                                                                            6.245784e+10   
FirstFlr_bin                                                                              3.714935e+10   
OverallQual_bin:FirstFlr_bin                                                              1.886390e+10   
GarageArea_bin:FirstFlr_bin                                                               1.045874e+10   
O