## Model Validation

In [None]:
# Colab only: mount Google Drive at /content/drive, the root of every model and
# data path used in the cells below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import joblib

from statsmodels.tools.eval_measures import aic, bic

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import median_absolute_error, r2_score

from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

import warnings

In [None]:
# NOTE(review): this installs an 'ignore' filter for every warning category, so
# warnings raised in later cells (model loading, prediction, metrics) are hidden.
# Consider filtering only the specific categories that are known to be noise.
warnings.filterwarnings('ignore')

In [None]:
# Directory on the mounted Drive that holds the persisted models loaded below.
# Kept in one place so the location can be changed without editing every line.
MODELS_DIR = '/content/drive/My Drive/Colab Notebooks/Dubai-Houses/models'

# SECURITY: joblib.load unpickles arbitrary Python objects, so only load files
# that come from a trusted source.
# model_adaboost_best = joblib.load(f'{MODELS_DIR}/model_adaboost_best.pkl')
model_bagging_best = joblib.load(f'{MODELS_DIR}/model_bagging_best.pkl')
model_dt_best2 = joblib.load(f'{MODELS_DIR}/model_dt_best2.pkl')
model_gbr_best = joblib.load(f'{MODELS_DIR}/model_gbr_best.pkl')
model_rf_best = joblib.load(f'{MODELS_DIR}/model_rf_best.pkl')
model_svr_best = joblib.load(f'{MODELS_DIR}/model_svr_best.pkl')
model_xgb_best = joblib.load(f'{MODELS_DIR}/model_xgb_best.pkl')
model_stacking_best = joblib.load(f'{MODELS_DIR}/model_stacking_best.pkl')

In [None]:
# Directory on the mounted Drive that holds the processed train/validation/test CSVs.
PROCESSED_DIR = '/content/drive/My Drive/Colab Notebooks/Dubai-Houses/Data/processed'

# sep=',' is the read_csv default, so it is not passed explicitly.
df_train = pd.read_csv(f'{PROCESSED_DIR}/df_train.csv')
df_valid = pd.read_csv(f'{PROCESSED_DIR}/df_valid.csv')
df_test = pd.read_csv(f'{PROCESSED_DIR}/df_test.csv')

In [None]:
# Preview the first rows of the validation split to check the loaded columns.
df_valid.head()

Unnamed: 0,price,price_category,beds,baths,average_rent,year_of_completion,total_parking_spaces,total_floors,total_building_area_sqft,elevators,...,area_name_Dubai Harbour,area_name_Dubai Hills Estate,area_name_Dubai Marina,area_name_Jumeirah Lake Towers (JLT),area_name_Jumeirah Village Circle (JVC),area_name_Other,area_name_Palm Jumeirah,area_name_Saadiyat Island,area_name_Sobha Hartland,area_name_Yas Island
0,699000,0,1,2,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,2250000,1,2,3,130238,2017,294,7,296,8,...,0,0,0,0,0,1,0,0,0,0
2,2899000,1,2,2,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
3,2000000,1,5,8,171945,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
4,2000000,1,3,4,120026,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0


In [None]:
# Ground truth for validation: the 'price_y' column of the validation split.
y_valid_actual = df_valid['price_y']

# Feature matrix: every remaining column once both price columns are removed.
# NOTE(review): 'price_category' stays in X_valid; if it was derived from price it
# leaks the target into the features — confirm in the feature-engineering notebook.
X_valid = df_valid.drop(columns=['price', 'price_y'])


In [None]:
# Bagging: predictions for the validation features (scored against y_valid_actual below).
y_pred_model_bagging = model_bagging_best.predict(X_valid)

In [None]:
# Decision Tree Regressor: predictions for the validation features.
y_pred_model_dt = model_dt_best2.predict(X_valid)

In [None]:
# Gradient Boosting Regressor: predictions for the validation features.
y_pred_model_gbr = model_gbr_best.predict(X_valid)

In [None]:
# Random Forest: predictions for the validation features.
y_pred_model_rf = model_rf_best.predict(X_valid)

In [None]:
# Support Vector Regressor: predictions for the validation features.
y_pred_model_svr = model_svr_best.predict(X_valid)

In [None]:
# Extreme Gradient Boosting (XGBoost): predictions for the validation features.
y_pred_model_xgb = model_xgb_best.predict(X_valid)

In [None]:
# Ensemble Learning - Stacking: predictions for the validation features.
y_pred_model_stacking = model_stacking_best.predict(X_valid)

In [None]:
def regression_metrics(y_true, y_pred):
    """Compute a standard set of regression error metrics.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    dict
        Keys ``'RMSE'``, ``'MAE'``, ``'MedAE'``, ``'MAPE'`` (in percent) and ``'R2'``.
        ``'MAPE'`` is averaged only over samples whose true value is non-zero,
        because the relative error is undefined at zero; it is ``nan`` when no
        such sample exists.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)

    # Use positional float arrays so a pandas index on either input cannot
    # re-align the values before the element-wise division.
    true_values = np.asarray(y_true, dtype=float).ravel()
    pred_values = np.asarray(y_pred, dtype=float).ravel()

    # A zero target turns the relative error into inf (or NaN on an exact hit),
    # which makes the whole average meaningless; keep only the defined terms.
    defined = true_values != 0
    if defined.any():
        relative_errors = np.abs(
            (true_values[defined] - pred_values[defined]) / true_values[defined]
        )
        mape = np.mean(relative_errors) * 100
    else:
        mape = np.nan

    # return results in a form of a dictionary
    return {
        'RMSE': rmse,
        'MAE': mae,
        'MedAE': medae,
        'MAPE': mape,
        'R2': r2
    }

In [None]:
# Bagging model: validation metrics against y_valid_actual (the 'price_y' column).
regression_metrics(y_valid_actual, y_pred_model_bagging)

{'RMSE': np.float64(0.5451358482836873),
 'MAE': 0.3144321836855454,
 'MedAE': np.float64(0.18586211836443312),
 'MAPE': np.float64(inf),
 'R2': 0.9191583413083679}

In [None]:
# Decision Tree model: validation metrics against y_valid_actual (the 'price_y' column).
regression_metrics(y_valid_actual, y_pred_model_dt)

{'RMSE': np.float64(0.5965315302806106),
 'MAE': 0.34974374156597493,
 'MedAE': np.float64(0.2026284818490076),
 'MAPE': np.float64(1.6275200058063564),
 'R2': 0.9031961704065148}

In [None]:
# Gradient Boosting model: validation metrics against y_valid_actual (the 'price_y' column).
regression_metrics(y_valid_actual, y_pred_model_gbr)

{'RMSE': np.float64(0.5632810298756779),
 'MAE': 0.3573782208689117,
 'MedAE': np.float64(0.2520548527508719),
 'MAPE': np.float64(inf),
 'R2': 0.9136870456954532}

In [None]:
# Random Forest model: validation metrics against y_valid_actual (the 'price_y' column).
regression_metrics(y_valid_actual, y_pred_model_rf)

{'RMSE': np.float64(0.5395441216115144),
 'MAE': 0.31527224146927835,
 'MedAE': np.float64(0.18884341919188152),
 'MAPE': np.float64(inf),
 'R2': 0.920808300815908}

In [None]:
# SVR model: validation metrics against y_valid_actual (the 'price_y' column).
regression_metrics(y_valid_actual, y_pred_model_svr)

{'RMSE': np.float64(0.6288818677740685),
 'MAE': 0.3671432074551087,
 'MedAE': np.float64(0.23648961586670758),
 'MAPE': np.float64(inf),
 'R2': 0.8924119893346389}

In [None]:
# XGB model: validation metrics against y_valid_actual (the 'price_y' column).
regression_metrics(y_valid_actual, y_pred_model_xgb)

{'RMSE': np.float64(0.5287429199147801),
 'MAE': 0.31878209626328713,
 'MedAE': np.float64(0.20450104467344765),
 'MAPE': np.float64(inf),
 'R2': 0.9239472607144741}

In [None]:
# Stacking ensemble: validation metrics against y_valid_actual (the 'price_y' column).
regression_metrics(y_valid_actual, y_pred_model_stacking)

{'RMSE': np.float64(0.5170459031161408),
 'MAE': 0.3106873813628409,
 'MedAE': np.float64(0.1949349072008939),
 'MAPE': np.float64(inf),
 'R2': 0.9272749658786664}

# Target Transformation & Inverse-Scaling Instability

## 1. **What Issue Arose**
After applying the inverse Yeo–Johnson transformation to the model predictions, the real-scale error metrics (RMSE, MAE, MAPE) became extremely large and R² became unstable.

## 2. **Why the Issue Arose**
The inverse Yeo–Johnson transformation amplified prediction errors because the model’s predictions did not cover the full range of the transformed target distribution.  
This mismatch was caused by **extreme outliers in the original price variable**, which distorted the Yeo–Johnson transformation and made the inverse mapping highly sensitive, especially in the lower tail of the distribution.

## 3. **What the Actual Case Was**
- The target variable (`price`) contained **very large outliers** typical of real-estate markets.
- The target was transformed using **Yeo–Johnson**, and the model was trained on this transformed target.
- Predictions stayed within a narrow band of the transformed space and **missed the lower tail**, causing the inverse transform to explode.
- One-hot encoding and StandardScaler were applied only to features (X), so they were **not** the cause of the issue.

## 4. **How the Issue Can Be Resolved**
- Remove or cap extreme outliers in the target variable before applying any power transformation.
- Recompute the Yeo–Johnson (or log) transformation on the cleaned target.
- Retrain the model using the newly transformed target.
- Perform inverse transformation only after ensuring predictions fall within the valid transformed range.
- Optionally clip predictions to the valid Yeo–Johnson domain before inversion to stabilize metrics.

## 5. **Next Steps**
I will:
- Return to the feature-engineering stage.
- **Remove or cap outliers from the target variable**.
- **Reapply a power transformation (Yeo–Johnson or log1p)** to the cleaned target.
- Rebuild the models from scratch using the corrected target.
- Re-evaluate and validate the models on the new target distribution.

## 6. **Additional Consideration**
If necessary, I may apply **target encoding** to categorical variables to improve the performance of tree-based models, especially when dealing with high-cardinality features.
