# All necessary libraries

In [110]:
import statsmodels.api as sm
import statsmodels.formula.api as smf
import pandas as pd
import numpy as np

# Loading the data and cleaning


In [111]:
# Read the raw training data from disk.
df = pd.read_csv('../data/raw/train.csv')

# Keep only the rows in which all four listed columns are present
# (three predictors plus the SalePrice target).
required_columns = ['Neighborhood', 'OverallQual', 'GrLivArea', 'SalePrice']
df_clean = df.dropna(subset=required_columns)


# Finding the numeric features most correlated with house prices

In [112]:
# Candidate predictors: every numeric column except the target ('SalePrice')
# and the row identifier ('Id'), which are dropped to avoid misleading
# correlations. include='number' selects all numeric dtypes rather than only
# int64/float64, so no numeric column is skipped because of its exact dtype.
numeric_features = df.select_dtypes(include='number').drop(columns=['SalePrice', 'Id'])

# Absolute correlation of each numeric feature with SalePrice, strongest first.
correlations = numeric_features.corrwith(df['SalePrice']).abs().sort_values(ascending=False)

# Display the top 10 most correlated numerical features
top_numeric_features = correlations.head(10)
print(top_numeric_features)


OverallQual     0.790982
GrLivArea       0.708624
GarageCars      0.640409
GarageArea      0.623431
TotalBsmtSF     0.613581
1stFlrSF        0.605852
FullBath        0.560664
TotRmsAbvGrd    0.533723
YearBuilt       0.522897
YearRemodAdd    0.507101
dtype: float64


# First model using Neighborhood

In [113]:
# Regress SalePrice on Neighborhood alone, treated as a categorical factor,
# then build the Type II ANOVA table for the fitted model.
one_way_formula = 'SalePrice ~ C(Neighborhood)'
model_one_way = smf.ols(formula=one_way_formula, data=df_clean).fit()
anova_one_way = sm.stats.anova_lm(model_one_way, typ=2)

# Report, one item per line: heading, regression summary, ANOVA table.
print("ANOVA for Neighborhood:", model_one_way.summary(), anova_one_way, sep="\n")


ANOVA for Neighborhood:
                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.546
Model:                            OLS   Adj. R-squared:                  0.538
Method:                 Least Squares   F-statistic:                     71.78
Date:                Fri, 23 May 2025   Prob (F-statistic):          1.56e-225
Time:                        14:36:52   Log-Likelihood:                -17968.
No. Observations:                1460   AIC:                         3.599e+04
Df Residuals:                    1435   BIC:                         3.612e+04
Df Model:                          24                                         
Covariance Type:            nonrobust                                         
                                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------------

# Second model: two-way ANOVA with Neighborhood and OverallQual (including their interaction)

In [114]:
# Two categorical factors, Neighborhood and OverallQual; the `*` operator in
# the formula expands to both main effects plus their interaction term.
two_way_formula = 'SalePrice ~ C(Neighborhood) * C(OverallQual)'
model_two_way = smf.ols(formula=two_way_formula, data=df_clean).fit()

# Type II ANOVA table for the fitted two-way model.
anova_table = sm.stats.anova_lm(model_two_way, typ=2)

# Report, one item per line: heading, regression summary, ANOVA table.
print("Two-way ANOVA with interaction:", model_two_way.summary(), anova_table, sep="\n")


Two-way ANOVA with interaction:
                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.801
Model:                            OLS   Adj. R-squared:                  0.784
Method:                 Least Squares   F-statistic:                     47.87
Date:                Fri, 23 May 2025   Prob (F-statistic):               0.00
Time:                        14:36:52   Log-Likelihood:                -17366.
No. Observations:                1460   AIC:                         3.496e+04
Df Residuals:                    1346   BIC:                         3.556e+04
Df Model:                         113                                         
Covariance Type:            nonrobust                                         
                                                      coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------



# ANCOVA model: Neighborhood and OverallQual factors with GrLivArea as a continuous covariate


In [115]:
# ANCOVA: the Neighborhood x OverallQual factorial design (main effects plus
# interaction) with GrLivArea added as a continuous covariate.
model_three_way = smf.ols('SalePrice ~ C(Neighborhood) * C(OverallQual) + GrLivArea', data=df_clean).fit()

# Type II ANOVA table for the fitted model.
anova_three_way = sm.stats.anova_lm(model_three_way, typ=2)

# The model has two categorical factors and one continuous covariate, so it
# is an ANCOVA rather than a three-way ANOVA; the printed label says so.
# The variable names are left unchanged because the submission cell uses
# `model_three_way`.
print("ANCOVA (Neighborhood * OverallQual + GrLivArea):")
print(model_three_way.summary())
print(anova_three_way)


Three-way ANOVA:
                            OLS Regression Results                            
Dep. Variable:              SalePrice   R-squared:                       0.864
Model:                            OLS   Adj. R-squared:                  0.852
Method:                 Least Squares   F-statistic:                     74.85
Date:                Fri, 23 May 2025   Prob (F-statistic):               0.00
Time:                        14:36:53   Log-Likelihood:                -17088.
No. Observations:                1460   AIC:                         3.441e+04
Df Residuals:                    1345   BIC:                         3.501e+04
Df Model:                         114                                         
Covariance Type:            nonrobust                                         
                                                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------



# Creating submission.csv with the third model

In [116]:
df_test = pd.read_csv("../data/raw/test.csv")

# Rows are deliberately NOT dropped: removing test rows with a missing
# predictor would silently omit their Ids from submission.csv. Instead, flag
# which rows have all three model predictors available.
predictor_columns = ['Neighborhood', 'OverallQual', 'GrLivArea']
has_predictors = df_test[predictor_columns].notna().all(axis=1)

# Rows that cannot be scored by the model fall back to the median training
# SalePrice, so every Id still receives a prediction.
fallback_price = float(df_clean['SalePrice'].median())
df_test['SalePrice'] = fallback_price

# Score only the complete rows; np.asarray makes the assignment positional,
# so it does not depend on the index of the object returned by predict().
if has_predictors.any():
    predicted_prices = np.asarray(model_three_way.predict(df_test.loc[has_predictors]))
    df_test.loc[has_predictors, 'SalePrice'] = predicted_prices

# Make any use of the fallback visible instead of silent.
n_fallback = int((~has_predictors).sum())
if n_fallback:
    print(f"{n_fallback} test row(s) had a missing predictor; used the training median SalePrice.")

# Checking
print(df_test[['Id', 'SalePrice']].head())

# Saving
df_test[['Id', 'SalePrice']].to_csv("submission.csv", index=False)


     Id      SalePrice
0  1461  118359.012807
1  1462  150919.758402
2  1463  162458.595052
3  1464  181651.647132
4  1465  254550.338713
