# Model Validation

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import pickle

import statsmodels.api as sm
import statsmodels.formula.api as smf
from statsmodels.api import OLS, add_constant
from statsmodels.tools.eval_measures import aic, bic

from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import median_absolute_error, r2_score
from sklearn.metrics import confusion_matrix, roc_curve, auc, classification_report

from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler

from sklearn.linear_model import Ridge, Lasso, ElasticNet

In [6]:
import warnings
warnings.filterwarnings('ignore')

In [7]:
# Restore the fitted models and the selected feature list (presumably written by
# the model-building step -- confirm against that notebook). The objects were
# pickled one after another into a single file, so they have to be read back
# with the same number of pickle.load calls, in the same order.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('../Data/after_model_building.pkl', 'rb') as f:
    (
        model_ols,
        model_ols_aic,
        model_ols_bic,
        model_linear,
        model_polynomial_best,
        model_ridge_best,
        model_lasso_best,
        model_elastic_best,
        model_svr,
        selected_features,
    ) = (pickle.load(f) for _ in range(10))

selected_features

['dim_m2',
 'n_rooms',
 'year_built',
 'dist_centre',
 'n_poi',
 'has_park',
 'has_balcony',
 'has_sec',
 'has_store',
 'price_z',
 'src_month',
 'market_volatility',
 'infrastructure_quality',
 'green_space_ratio',
 'estimated_maintenance_cost',
 'obj_type_0d6c4dfc',
 'obj_type_2a6d5c01',
 'obj_type_Unknown',
 'build_mat_7f8c00f9',
 'build_mat_Unknown',
 'has_lift_no',
 'has_lift_yes']

In [8]:
# Read back the raw and encoded train/validation/test frames. They are stored as
# six consecutive pickles, so the unpacking order below must match the order in
# which they were dumped.
# NOTE: pickle.load can execute arbitrary code; only open files you trust.
with open('../Data/after_feature_engineering.pkl', 'rb') as f:
    (
        df_train,
        df_valid,
        df_test,
        df_train_encoded,
        df_valid_encoded,
        df_test_encoded,
    ) = (pickle.load(f) for _ in range(6))

In [9]:
df_valid_encoded[selected_features].head()

Unnamed: 0,dim_m2,n_rooms,year_built,dist_centre,n_poi,has_park,has_balcony,has_sec,has_store,price_z,...,infrastructure_quality,green_space_ratio,estimated_maintenance_cost,obj_type_0d6c4dfc,obj_type_2a6d5c01,obj_type_Unknown,build_mat_7f8c00f9,build_mat_Unknown,has_lift_no,has_lift_yes
9765,79.5,4.0,1977.268414,5.063,4.0,no,no,yes,no,13.67346,...,54.911698,0.999,20.33,0,0,1,0,0,0,1
29996,33.28,2.0,2023.0,5.236,1.0,no,yes,no,no,13.052183,...,1.48,0.999,6.43,1,0,0,0,1,0,1
93432,51.33,2.0,1950.0,1.157,42.0,yes,no,no,yes,13.542354,...,37.53,1.0,20.04,1,0,0,0,0,1,0
119975,53.57,2.0,1950.0,1.263,47.0,no,yes,yes,yes,13.122335,...,96.69,1.0,17.97,0,0,0,0,0,1,0
35660,54.13,3.0,2019.0,6.871,6.0,no,yes,no,no,13.76016,...,5.88,0.999,14.11,0,0,1,0,1,0,1


In [12]:
# model_ols
# model_ols_aic
# model_ols_bic
# model_linear
# model_polynomial_best
# model_ridge_best
# model_lasso_best
# model_elastic_best
# model_svr

In [17]:
# y_valid_true = df_valid_encoded['price_z']

# X_valid = df_valid_encoded[selected_features].drop('price_z', axis=1)

In [21]:
# # model ols
# y_pred_model_ols = model_ols.predict(X_valid)

## Converting all non-numeric X columns to numeric (pd.to_numeric)...

### df_valid_encoded

In [19]:
df_valid_encoded[selected_features].select_dtypes(include = 'object').head()

Unnamed: 0,unit_id,has_park,has_balcony,has_sec,has_store,src_month,loc_code
9765,c5102cb29f3905ed,no,no,yes,no,2023-12,693f303c
29996,1536c29f037a1d59,no,yes,no,no,2023-08,378f340c
93432,c6223ed977ebae57,yes,no,no,yes,2023-08,378f340c
119975,dbc11749ecc469d7,no,yes,yes,yes,2023-10,6900ba06
35660,e9b6659066fae87f,no,yes,no,no,2023-10,0ab06839


In [20]:
obj_cols = df_valid_encoded[selected_features].select_dtypes(include='object').columns
obj_cols

Index(['has_park', 'has_balcony', 'has_sec', 'has_store', 'src_month'], dtype='object')

In [22]:
# The amenity flags are stored as 'yes'/'no' strings; recode them as 1/0.
# Any other value is left untouched by replace().
bool_cols = ['has_park', 'has_balcony', 'has_sec', 'has_store']

df_valid_encoded[bool_cols] = df_valid_encoded[bool_cols].replace(
    to_replace={'yes': 1, 'no': 0}
)


In [23]:
# Encode the source month as a proleptic Gregorian ordinal (an integer day
# number) so that it can be used as a numeric feature.
#
# The dtype guard makes this cell idempotent: once the column already holds
# integer ordinals, passing it through pd.to_datetime again would interpret
# those integers as nanoseconds since the Unix epoch and silently collapse
# every row to the ordinal of 1970-01-01.
if not pd.api.types.is_numeric_dtype(df_valid_encoded['src_month']):
    df_valid_encoded['src_month'] = (
        pd.to_datetime(df_valid_encoded['src_month'])
        .map(lambda ts: ts.toordinal())
    )

In [25]:
# Coerce every selected feature column to a numeric dtype, column by column.
# errors='raise' (the default) makes any value that cannot be parsed fail loudly.
df_valid_encoded[selected_features] = (
    df_valid_encoded[selected_features].apply(pd.to_numeric, errors='raise')
)

In [29]:
df_valid_encoded[selected_features].select_dtypes(include = 'object').head()

9765
29996
93432
119975
35660


### now... df_test_encoded

In [30]:
df_test_encoded[selected_features].select_dtypes(include = 'object').head()

Unnamed: 0,has_park,has_balcony,has_sec,has_store,src_month
17069,no,yes,no,yes,2024-02
149288,no,no,yes,yes,2023-10
79538,yes,yes,no,yes,2024-03
114059,no,yes,no,no,2023-10
74338,no,no,no,yes,2023-09


In [31]:
obj_cols = df_test_encoded[selected_features].select_dtypes(include='object').columns
obj_cols

Index(['has_park', 'has_balcony', 'has_sec', 'has_store', 'src_month'], dtype='object')

In [32]:
# The amenity flags are stored as 'yes'/'no' strings; recode them as 1/0.
# Any other value is left untouched by replace().
bool_cols = ['has_park', 'has_balcony', 'has_sec', 'has_store']

df_test_encoded[bool_cols] = df_test_encoded[bool_cols].replace(
    to_replace={'yes': 1, 'no': 0}
)


In [33]:
# Encode the source month as a proleptic Gregorian ordinal (an integer day
# number) so that it can be used as a numeric feature.
#
# The dtype guard makes this cell idempotent: once the column already holds
# integer ordinals, passing it through pd.to_datetime again would interpret
# those integers as nanoseconds since the Unix epoch and silently collapse
# every row to the ordinal of 1970-01-01.
if not pd.api.types.is_numeric_dtype(df_test_encoded['src_month']):
    df_test_encoded['src_month'] = (
        pd.to_datetime(df_test_encoded['src_month'])
        .map(lambda ts: ts.toordinal())
    )

In [34]:
# Coerce every selected feature column to a numeric dtype, column by column.
# errors='raise' (the default) makes any value that cannot be parsed fail loudly.
df_test_encoded[selected_features] = (
    df_test_encoded[selected_features].apply(pd.to_numeric, errors='raise')
)

In [35]:
df_test_encoded[selected_features].select_dtypes(include = 'object').head()

17069
149288
79538
114059
74338


In [37]:
# ================================================================


# the selected feature columns of df_valid_encoded and df_test_encoded now contain only numeric data


# ================================================================

In [38]:
# Split the validation frame into a feature matrix and a target vector.
# 'price_z' is the target but is also listed in selected_features, so it is
# dropped from the feature matrix.
X_valid = df_valid_encoded[selected_features].drop(columns='price_z')

y_valid_true = df_valid_encoded['price_z']

In [39]:
# model ols
y_pred_model_ols = model_ols.predict(X_valid)

In [43]:
y_pred_model_linear = model_linear.predict(X_valid)

In [45]:
y_pred_model_polynomial_best = model_polynomial_best.predict(X_valid)

In [50]:
y_pred_model_ridge_best = model_ridge_best.predict(X_valid)

In [51]:
y_pred_model_lasso_best = model_lasso_best.predict(X_valid)

In [54]:
y_pred_model_elastic_best = model_elastic_best.predict(X_valid)

In [55]:
y_pred_model_svr = model_svr.predict(X_valid)

In [57]:
def regression_metrics(y_true, y_pred):
    """Compute a set of regression error metrics for one set of predictions.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, given in the same row order as ``y_true``.

    Returns
    -------
    dict
        Plain Python floats under the keys ``'RMSE'``, ``'MAE'``, ``'MedAE'``,
        ``'MAPE'`` (expressed in percent) and ``'R2'``.

    Notes
    -----
    MAPE divides by ``y_true``; if the target contains zeros the result is
    infinite or NaN.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # MAPE is computed on flattened NumPy arrays so that rows are paired by
    # position. Arithmetic on the raw inputs would align two pandas Series by
    # index label instead, and would broadcast an (n, 1) prediction array
    # against an (n,) target into an (n, n) matrix.
    true_arr = np.asarray(y_true, dtype=float).ravel()
    pred_arr = np.asarray(y_pred, dtype=float).ravel()
    mape = np.mean(np.abs((true_arr - pred_arr) / true_arr)) * 100

    # Cast everything to built-in float so the dictionary has one value type.
    return {
        'RMSE': float(rmse),
        'MAE': float(mae),
        'MedAE': float(medae),
        'MAPE': float(mape),
        'R2': float(r2)
    }

In [59]:
# model ols
regression_metrics(y_valid_true, y_pred_model_ols)

{'RMSE': np.float64(0.19643557868469855),
 'MAE': 0.14692169571155744,
 'MedAE': 0.11545041337067019,
 'MAPE': np.float64(1.0986125325979323),
 'R2': 0.8462161169562312}

In [60]:
# model linear
regression_metrics(y_valid_true, y_pred_model_linear)

{'RMSE': np.float64(0.19617471529878844),
 'MAE': 0.14677124792279372,
 'MedAE': 0.11549723567341807,
 'MAPE': np.float64(1.097476901851629),
 'R2': 0.8466242909499476}

In [61]:
# model polynomial best
regression_metrics(y_valid_true, y_pred_model_polynomial_best)

{'RMSE': np.float64(0.13277501129094613),
 'MAE': 0.1068901542756209,
 'MedAE': 0.09497581614432526,
 'MAPE': np.float64(0.7974763197904432),
 'R2': 0.9297407975619006}

In [62]:
# model ridge best
regression_metrics(y_valid_true, y_pred_model_ridge_best)

{'RMSE': np.float64(0.1961814252711732),
 'MAE': 0.14678378647994106,
 'MedAE': 0.11561709117110297,
 'MAPE': np.float64(1.0975698708891986),
 'R2': 0.8466137986255811}

In [63]:
# model lasso best
regression_metrics(y_valid_true, y_pred_model_lasso_best)

{'RMSE': np.float64(0.19641770390050148),
 'MAE': 0.1466728359705461,
 'MedAE': 0.11509926609887167,
 'MAPE': np.float64(1.0971956420844946),
 'R2': 0.8462441030132778}

In [64]:
# model elastic net best
regression_metrics(y_valid_true, y_pred_model_elastic_best)

{'RMSE': np.float64(0.19627351896721676),
 'MAE': 0.14642309083173377,
 'MedAE': 0.11517381155655215,
 'MAPE': np.float64(1.094902740310499),
 'R2': 0.8464697562652277}

In [65]:
# model SVR
regression_metrics(y_valid_true, y_pred_model_svr)

{'RMSE': np.float64(0.578834087170959),
 'MAE': 0.5276865705080468,
 'MedAE': 0.5604401441058471,
 'MAPE': np.float64(3.873285602904818),
 'R2': -0.33529959877088333}

# Polynomial Regression Model Performance

The polynomial regression model was tuned and evaluated using GridSearchCV and a validation dataset. The results are summarized below:

## GridSearchCV Tuning Results
- **Best Degree:** 2  
  The optimal polynomial degree selected by GridSearchCV is 2, indicating that a quadratic relationship fits the data best.
- **Best -RMSE (cross-validation):** -0.132  
  This corresponds to an RMSE of approximately 0.132, averaged over the held-out folds during cross-validation.

## Validation Dataset Performance
The tuned model was then tested on the separate validation dataset:

- **Root Mean Squared Error (RMSE):** 0.133  
- **Mean Absolute Error (MAE):** 0.107  
- **Median Absolute Error (MedAE):** 0.095  
- **Mean Absolute Percentage Error (MAPE):** 0.797%  
- **R-squared (R²):** 0.930  

---
The polynomial regression model with degree 2 demonstrates strong predictive performance. The cross-validation results show the model generalizes well, and the validation dataset confirms high accuracy and explanatory power. Compared to other models tested, this tuned polynomial model achieves the best balance of error and variance explained.


In [66]:
obj_cols = df_train_encoded[selected_features].select_dtypes(include='object').columns
obj_cols

Index(['has_park', 'has_balcony', 'has_sec', 'has_store', 'src_month'], dtype='object')

In [67]:
# The amenity flags are stored as 'yes'/'no' strings; recode them as 1/0.
# Any other value is left untouched by replace().
bool_cols = ['has_park', 'has_balcony', 'has_sec', 'has_store']

df_train_encoded[bool_cols] = df_train_encoded[bool_cols].replace(
    to_replace={'yes': 1, 'no': 0}
)


In [68]:
# Encode the source month as a proleptic Gregorian ordinal (an integer day
# number) so that it can be used as a numeric feature.
#
# The dtype guard makes this cell idempotent: once the column already holds
# integer ordinals, passing it through pd.to_datetime again would interpret
# those integers as nanoseconds since the Unix epoch and silently collapse
# every row to the ordinal of 1970-01-01.
if not pd.api.types.is_numeric_dtype(df_train_encoded['src_month']):
    df_train_encoded['src_month'] = (
        pd.to_datetime(df_train_encoded['src_month'])
        .map(lambda ts: ts.toordinal())
    )

In [69]:
# Coerce every selected feature column to a numeric dtype, column by column.
# errors='raise' (the default) makes any value that cannot be parsed fail loudly.
df_train_encoded[selected_features] = (
    df_train_encoded[selected_features].apply(pd.to_numeric, errors='raise')
)

In [70]:
# Split the training frame into a feature matrix and a target vector.
# 'price_z' is the target but is also listed in selected_features, so it is
# dropped from the feature matrix.
X_train = df_train_encoded[selected_features].drop(columns='price_z')

y_train_true = df_train_encoded['price_z']

In [71]:
y_pred_train = model_polynomial_best.predict(X_train)

In [72]:
# model polynomial best on train dataset
regression_metrics(y_train_true, y_pred_train)

{'RMSE': np.float64(0.13226351977282463),
 'MAE': 0.10699620564795165,
 'MedAE': 0.09521399659478291,
 'MAPE': np.float64(0.7982877075153567),
 'R2': 0.9306360287596201}

In [74]:
# Persist the selected model, the numerically encoded datasets and the feature
# list. The objects are written one after another into a single file, so a
# reader must call pickle.load five times, in this same order.
with open('../Data/after_model_validation.pkl', 'wb') as f:
    for _artifact in (
        model_polynomial_best,
        df_train_encoded,
        df_valid_encoded,
        df_test_encoded,
        selected_features,
    ):
        pickle.dump(_artifact, f)