# Model Building

In [3]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import pickle

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.api import OLS, add_constant
from statsmodels.tools.eval_measures import aic, bic

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [4]:
with open('../Data/after_feature_engineering.pkl', 'rb') as f:
    df_train = pickle.load(f)
    df_valid = pickle.load(f)
    df_test = pickle.load(f)
    df_train_encoded = pickle.load(f)
    df_valid_encoded = pickle.load(f)
    df_test_encoded = pickle.load(f)

In [5]:
with open('../Data/selected_features.pkl', 'rb') as f:
    selected_features = pickle.load(f)

In [32]:
# df_train.head()

In [33]:
# df_train_encoded.head()

In [34]:
# df_train[selected_features].head()

In [35]:
# df_train_encoded.select_dtypes(include = 'object').head()

In [19]:
selected_features

['market_volatility',
 'dim_m2',
 'n_rooms',
 'estimated_maintenance_cost',
 'year_built',
 'n_poi',
 'dist_centre',
 'green_space_ratio',
 'infrastructure_quality',
 'obj_type',
 'build_mat',
 'has_park',
 'has_balcony',
 'has_lift',
 'has_sec',
 'has_store',
 'src_month']

In [21]:
df_train_encoded.columns.tolist()

['unit_id',
 'dim_m2',
 'n_rooms',
 'floor_no',
 'floor_max',
 'year_built',
 'dist_centre',
 'n_poi',
 'dist_sch',
 'dist_clinic',
 'dist_post',
 'dist_kind',
 'dist_rest',
 'dist_uni',
 'dist_pharma',
 'has_park',
 'has_balcony',
 'has_sec',
 'has_store',
 'price_z',
 'src_month',
 'loc_code',
 'market_volatility',
 'infrastructure_quality',
 'neighborhood_crime_rate',
 'popularity_index',
 'green_space_ratio',
 'estimated_maintenance_cost',
 'global_economic_index',
 'obj_type_0d6c4dfc',
 'obj_type_2a6d5c01',
 'obj_type_Unknown',
 'build_mat_7f8c00f9',
 'build_mat_Unknown',
 'has_lift_no',
 'has_lift_yes']

In [23]:
selected_features = ['dim_m2',
 'n_rooms', 
 'year_built',
 'dist_centre',
 'n_poi',
 'has_park',
 'has_balcony',
 'has_sec',
 'has_store',
 'price_z',
 'src_month',
 'market_volatility',
 'infrastructure_quality',
 'green_space_ratio',
 'estimated_maintenance_cost',
 'obj_type_0d6c4dfc',
 'obj_type_2a6d5c01',
 'obj_type_Unknown',
 'build_mat_7f8c00f9',
 'build_mat_Unknown',
 'has_lift_no',
 'has_lift_yes']

In [24]:
selected_features

['dim_m2',
 'n_rooms',
 'year_built',
 'dist_centre',
 'n_poi',
 'has_park',
 'has_balcony',
 'has_sec',
 'has_store',
 'price_z',
 'src_month',
 'market_volatility',
 'infrastructure_quality',
 'green_space_ratio',
 'estimated_maintenance_cost',
 'obj_type_0d6c4dfc',
 'obj_type_2a6d5c01',
 'obj_type_Unknown',
 'build_mat_7f8c00f9',
 'build_mat_Unknown',
 'has_lift_no',
 'has_lift_yes']

In [28]:
df_train_encoded[selected_features].select_dtypes(include='object').head()

Unnamed: 0,has_park,has_balcony,has_sec,has_store,src_month
45366,no,yes,no,yes,2023-12
86630,no,yes,yes,no,2023-12
139489,no,yes,no,no,2024-06
24126,yes,no,yes,no,2024-05
151982,yes,yes,no,yes,2023-10


In [31]:
obj_cols = df_train_encoded[selected_features].select_dtypes(include='object').columns
obj_cols

Index(['has_park', 'has_balcony', 'has_sec', 'has_store', 'src_month'], dtype='object')

In [36]:
bool_cols = ['has_park', 'has_balcony', 'has_sec', 'has_store']

df_train_encoded[bool_cols] = df_train_encoded[bool_cols].replace({'yes': 1, 'no': 0})


  df_train_encoded[bool_cols] = df_train_encoded[bool_cols].replace({'yes': 1, 'no': 0})


In [37]:
df_train_encoded['src_month'] = pd.to_datetime(df_train_encoded['src_month'])
df_train_encoded['src_month'] = df_train_encoded['src_month'].map(lambda x: x.toordinal())

In [39]:
df_train_encoded[selected_features] = df_train_encoded[selected_features].apply(pd.to_numeric)

In [41]:
df_train_encoded[selected_features].head()

Unnamed: 0,dim_m2,n_rooms,year_built,dist_centre,n_poi,has_park,has_balcony,has_sec,has_store,price_z,...,infrastructure_quality,green_space_ratio,estimated_maintenance_cost,obj_type_0d6c4dfc,obj_type_2a6d5c01,obj_type_Unknown,build_mat_7f8c00f9,build_mat_Unknown,has_lift_no,has_lift_yes
45366,59.39,3.0,1980.0,2.578,19.0,0,1,0,1,12.834406,...,50.29,1.0,17.98,0,0,1,0,1,1,0
86630,52.95,2.0,2023.0,8.413,13.0,0,1,1,0,13.570913,...,1.13,0.999,24.08,1,0,0,0,1,0,1
139489,24.55,2.0,1993.0,4.479,2.0,0,1,0,0,13.362486,...,21.12,1.0,9.25,1,0,0,0,1,1,0
24126,57.58,3.0,2017.0,13.367,0.0,1,0,1,0,14.072345,...,8.08,0.999,8.48,0,0,1,0,1,0,1
151982,73.71,3.0,2016.0,9.365,0.0,1,1,0,1,13.705193,...,9.38,0.999,24.94,0,1,0,0,0,0,1


In [42]:
df_train_encoded[selected_features].info()

<class 'pandas.core.frame.DataFrame'>
Index: 93872 entries, 45366 to 15725
Data columns (total 22 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   dim_m2                      93872 non-null  float64
 1   n_rooms                     93872 non-null  float64
 2   year_built                  93872 non-null  float64
 3   dist_centre                 93872 non-null  float64
 4   n_poi                       93872 non-null  float64
 5   has_park                    93872 non-null  int64  
 6   has_balcony                 93872 non-null  int64  
 7   has_sec                     93872 non-null  int64  
 8   has_store                   93872 non-null  int64  
 9   price_z                     93872 non-null  float64
 10  src_month                   93872 non-null  int64  
 11  market_volatility           93872 non-null  float64
 12  infrastructure_quality      93872 non-null  float64
 13  green_space_ratio           9387

In [43]:
X_train = df_train_encoded[selected_features].drop('price_z', axis=1)

y_train = df_train_encoded['price_z']

In [44]:
model_ols = sm.OLS(y_train, X_train).fit()
print(model_ols.summary())

                                 OLS Regression Results                                
Dep. Variable:                price_z   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          2.088e+07
Date:                Mon, 10 Nov 2025   Prob (F-statistic):                        0.00
Time:                        16:21:19   Log-Likelihood:                          19378.
No. Observations:               93872   AIC:                                 -3.871e+04
Df Residuals:                   93851   BIC:                                 -3.852e+04
Df Model:                          21                                                  
Covariance Type:            nonrobust                                                  
                                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------

In [49]:
import sys
sys.path.append('../src')  # adjust path if needed

from backward_elimination import backward_elimination_aic_bic

In [52]:
model_ols_aic = backward_elimination_aic_bic(X_train, y_train, criterion = 'AIC')

print(model_ols_aic.summary())

                                 OLS Regression Results                                
Dep. Variable:                price_z   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          2.308e+07
Date:                Mon, 10 Nov 2025   Prob (F-statistic):                        0.00
Time:                        16:26:27   Log-Likelihood:                          19378.
No. Observations:               93872   AIC:                                 -3.872e+04
Df Residuals:                   93853   BIC:                                 -3.854e+04
Df Model:                          19                                                  
Covariance Type:            nonrobust                                                  
                                 coef    std err          t      P>|t|      [0.025      0.975]
-------------------------

In [53]:
model_ols_bic = backward_elimination_aic_bic(X_train, y_train, criterion = 'BIC')

print(model_ols_bic.summary())

                                 OLS Regression Results                                
Dep. Variable:                price_z   R-squared (uncentered):                   1.000
Model:                            OLS   Adj. R-squared (uncentered):              1.000
Method:                 Least Squares   F-statistic:                          2.740e+07
Date:                Mon, 10 Nov 2025   Prob (F-statistic):                        0.00
Time:                        16:27:18   Log-Likelihood:                          19368.
No. Observations:               93872   AIC:                                 -3.870e+04
Df Residuals:                   93856   BIC:                                 -3.855e+04
Df Model:                          16                                                  
Covariance Type:            nonrobust                                                  
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [56]:
model_linear = LinearRegression()

model_linear.fit(X_train, y_train)

0,1,2
,fit_intercept,True
,copy_X,True
,tol,1e-06
,n_jobs,
,positive,False


---
For OLS we have:

model_ols\
model_ols_aic\
model_ols_bic\
model_linear

---

In [62]:
cv5 = KFold(
    n_splits = 5,
    shuffle = True,
    random_state = 123
)

polynomial_pipeline = Pipeline(
    [
        ('generator', PolynomialFeatures()),
        ('model', LinearRegression())
    ]
)

# degrees = list(range(1, 5))
degrees = [1, 2, 3, 4]

# we need to define a grid of hyperparameters for the PolynomailFeatures() transformer
# it has to be a dictionary with the name of the step and the name of the hyperparameter
# as defined in the pipeline and algorithm documentation

# ex: here we use the name "generator__degree" because
# our pipeline step is named "generator" and the hyperparameter of the 
# PolynomialFeatures() transformer is named "degree"
polynomial_grid = {'generator__degree': degrees}

# define a GridSearchCV object to find the optimal degree
polynomial_grid_search = GridSearchCV(
    polynomial_pipeline, # model or pipeline to tune
    param_grid = polynomial_grid, # dictionary with hyperparameters
    cv = cv5, # cross-validation 
    # python assumes the higher the better, hence we use negative RMSE
    scoring = 'neg_root_mean_squared_error', # evaluation metric
    n_jobs = -1
)

In [None]:
polynomial_grid_search.fit(X_train, y_train)

# adding "_best" to GridSearchCV result
model_polynomial_best = polynomial_grid_search.best_estimator_

print('Best degree:', polynomial_grid_search.best_params_)
print('Best -RMSE:', polynomial_grid_search.best_score_)

python(18996) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(18997) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(18998) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(18999) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(19000) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(19001) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(19002) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(19003) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(19004) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
python(19005) MallocStackLogging: can't turn off malloc stack logging because it was not enabled.
