In [1]:
# Mount Google Drive at /content/drive (Colab-only step); the model and data
# paths used further down in this notebook all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

import pickle
import joblib

from statsmodels.tools.eval_measures import aic, bic

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.metrics import median_absolute_error, r2_score

from sklearn.model_selection import KFold, RepeatedKFold, LeaveOneOut, GridSearchCV
from sklearn.model_selection import RandomizedSearchCV, cross_val_score

import warnings

In [3]:
# Suppress every warning for the rest of the session. NOTE(review): this is a
# blanket filter, so library deprecation/version warnings are hidden as well.
warnings.filterwarnings('ignore')

In [4]:
# Directory on the mounted Drive that holds the serialized models loaded below.
MODELS_DIR = '/content/drive/My Drive/Colab Notebooks/Dubai-Houses/models/3_models'

# NOTE: joblib.load unpickles arbitrary Python objects, so only load files that
# you created yourself or otherwise trust.
model_adaboost_best = joblib.load(f'{MODELS_DIR}/model_adaboost_best.pkl')
model_bagging_best = joblib.load(f'{MODELS_DIR}/model_bagging_best.pkl')
model_dt_best2 = joblib.load(f'{MODELS_DIR}/model_dt_best2.pkl')
model_gbr_best = joblib.load(f'{MODELS_DIR}/model_gbr_best.pkl')
model_rf_best = joblib.load(f'{MODELS_DIR}/model_rf_best.pkl')
model_xgb_best = joblib.load(f'{MODELS_DIR}/model_xgb_best.pkl')
model_stacking_best = joblib.load(f'{MODELS_DIR}/model_stacking_best.pkl')

In [18]:
# Directory on the mounted Drive that holds the processed validation/test splits.
DATA_DIR = '/content/drive/My Drive/Colab Notebooks/Dubai-Houses/Data/processed/target-encoded-with-outliers'

# ',' is read_csv's default separator, so it is not passed explicitly.
df_valid = pd.read_csv(f'{DATA_DIR}/df_valid.csv')
df_test = pd.read_csv(f'{DATA_DIR}/df_test.csv')

In [19]:
# Preview the first rows of the validation frame to check the columns that were loaded.
df_valid.head()

Unnamed: 0,price,beds,baths,average_rent,year_of_completion,total_parking_spaces,total_floors,total_building_area_sqft,elevators,price_y,year,month,quarter,price_category_te,type_te,furnishing_te,completion_status_te,building_name_te,area_name_te,city_te
0,699000,1,2,0,0,0,0,0,0,18.729317,2024,4,2,18.976823,20.408979,20.928019,21.03227,21.042112,19.562551,20.416234
1,2250000,2,3,130238,2017,294,7,296,8,20.981548,2023,12,4,20.758554,20.408979,20.928019,20.808694,20.158027,21.052694,21.054796
2,2899000,2,2,0,0,0,0,0,0,21.486238,2024,2,1,20.758554,20.408979,20.928019,21.03227,21.042112,20.852511,21.054796
3,2000000,5,8,171945,0,0,0,0,0,20.749021,2024,4,2,20.758554,22.499987,20.928019,20.808694,21.042112,20.73274,19.228296
4,2000000,3,4,120026,0,0,0,0,0,20.749021,2024,3,1,20.758554,21.280747,20.928019,20.808694,21.042112,20.099493,21.054796


In [20]:
# Target used for evaluation: the 'price_y' column.
y_valid_actual = df_valid['price_y']

# Feature matrix: every remaining column except the two price columns
# ('price' and the target 'price_y').
X_valid = df_valid.drop(columns=['price', 'price_y'])

In [21]:
# Bagging: predictions for the validation features
y_pred_model_bagging = model_bagging_best.predict(X_valid)

In [22]:
# Decision Tree Regressor: predictions for the validation features
y_pred_model_dt = model_dt_best2.predict(X_valid)

In [23]:
# Gradient Boosting Regressor: predictions for the validation features
y_pred_model_gbr = model_gbr_best.predict(X_valid)

In [24]:
# Random Forest: predictions for the validation features
y_pred_model_rf = model_rf_best.predict(X_valid)

In [25]:
# AdaBoost: predictions for the validation features
y_pred_model_adaboost = model_adaboost_best.predict(X_valid)

In [26]:
# Extreme Gradient Boosting (XGB): predictions for the validation features
y_pred_model_xgb = model_xgb_best.predict(X_valid)

In [27]:
# Ensemble Learning - Stacking: predictions for the validation features
y_pred_model_stacking = model_stacking_best.predict(X_valid)

In [36]:
# def regression_metrics(y_true, y_pred):
#     rmse = np.sqrt(mean_squared_error(y_true, y_pred))
#     mae = mean_absolute_error(y_true, y_pred)
#     medae = median_absolute_error(y_true, y_pred)
#     mape = np.mean(np.abs((y_true - y_pred) / y_true)) * 100
#     r2 = r2_score(y_true, y_pred)

#     # return results in a form of a dictionary
#     return {
#         'RMSE': rmse,
#         'MAE': mae,
#         'MedAE': medae,
#         'MAPE': mape,
#         'R2': r2
#     }

def regression_metrics(y_true, y_pred):
    """Compute a standard set of regression error metrics.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted values, in the same order as ``y_true``.

    Returns
    -------
    dict
        Keys ``'RMSE'``, ``'MAE'``, ``'MedAE'``, ``'MAPE'`` (in percent) and
        ``'R2'``. ``'MAPE'`` is computed only over samples whose true value is
        non-zero, and is ``nan`` when there is no such sample.
    """
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    mae = mean_absolute_error(y_true, y_pred)
    medae = median_absolute_error(y_true, y_pred)

    # Use flat NumPy arrays for MAPE so the comparison is positional, like the
    # scikit-learn metrics above. Operating on the raw inputs would align two
    # pandas Series by index label (silently skipping non-matching rows), fail
    # on plain lists, and fail to mask an (n, 1) prediction array.
    true_arr = np.asarray(y_true, dtype=float).ravel()
    pred_arr = np.asarray(y_pred, dtype=float).ravel()

    # Zero targets are excluded because the percentage error is undefined there.
    nonzero = true_arr != 0
    if nonzero.any():
        relative_error = (true_arr[nonzero] - pred_arr[nonzero]) / true_arr[nonzero]
        mape = np.mean(np.abs(relative_error)) * 100
    else:
        # No non-zero target: report nan explicitly instead of averaging an empty array.
        mape = np.float64(np.nan)

    r2 = r2_score(y_true, y_pred)

    return {
        'RMSE': rmse,
        'MAE': mae,
        'MedAE': medae,
        'MAPE': mape,
        'R2': r2
    }


In [37]:
# Bagging model: metrics on the validation set
regression_metrics(y_valid_actual, y_pred_model_bagging)

{'RMSE': np.float64(0.49328166566198955),
 'MAE': 0.27728753604074124,
 'MedAE': np.float64(0.1741992891825177),
 'MAPE': np.float64(1.28405806239252),
 'R2': 0.9338064491382392}

In [38]:
# Decision Tree model: metrics on the validation set
regression_metrics(y_valid_actual, y_pred_model_dt)

{'RMSE': np.float64(0.5746360929653943),
 'MAE': 0.3209132002902746,
 'MedAE': np.float64(0.18562552259178133),
 'MAPE': np.float64(1.4855679902714067),
 'R2': 0.9101720409540133}

In [39]:
# Gradient Boosting model: metrics on the validation set
regression_metrics(y_valid_actual, y_pred_model_gbr)

{'RMSE': np.float64(0.5051130240852285),
 'MAE': 0.3291629201756945,
 'MedAE': np.float64(0.23903477622144642),
 'MAPE': np.float64(1.5374427003729776),
 'R2': 0.9305930653022498}

In [40]:
# Random Forest model: metrics on the validation set
regression_metrics(y_valid_actual, y_pred_model_rf)

{'RMSE': np.float64(0.49116855596909187),
 'MAE': 0.2866478867478794,
 'MedAE': np.float64(0.18803251676770394),
 'MAPE': np.float64(1.3285881800139767),
 'R2': 0.9343723515363572}

In [41]:
# AdaBoost model: metrics on the validation set
regression_metrics(y_valid_actual, y_pred_model_adaboost)

{'RMSE': np.float64(0.8081969293676021),
 'MAE': 0.5412515991117177,
 'MedAE': np.float64(0.4161253718394313),
 'MAPE': np.float64(2.5419350337838083),
 'R2': 0.8223111718375876}

In [42]:
# XGB model: metrics on the validation set
regression_metrics(y_valid_actual, y_pred_model_xgb)

{'RMSE': np.float64(0.4973852237133773),
 'MAE': 0.29453611707509,
 'MedAE': np.float64(0.19471052749214302),
 'MAPE': np.float64(1.3739389142632725),
 'R2': 0.9327005539697112}

In [43]:
# Stacking ensemble model: metrics on the validation set
regression_metrics(y_valid_actual, y_pred_model_stacking)

{'RMSE': np.float64(0.4666304402377157),
 'MAE': 0.2781288756222673,
 'MedAE': np.float64(0.1799876827854856),
 'MAPE': np.float64(1.2940214654110365),
 'R2': 0.9407658905426071}

---
In this third approach, I finally made the corrections. It was hard work, but it paid off: the model improved. The best method was to use the data with outliers and apply target encoding.

---

### Model Performance on Validation Set

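The stacking ensemble shows strong predictive accuracy across all major regression metrics: it has the lowest RMSE and the highest R² of the seven models evaluated above, while the bagging model is marginally better on MAE, median AE and MAPE.  
Below is a summary of the stacking model's results and what each metric indicates about performance. All metrics are computed on the `price_y` target column, so the error values are in its units, not in raw `price`.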

#### Validation Metrics
- **RMSE: 0.4666**  
  Measures the square-root of the average squared error. Lower values indicate tighter predictions.  
  This value suggests the model’s errors are small and well‑controlled.

- **MAE: 0.2781**  
  Represents the average absolute difference between predictions and true values.  
  A low MAE confirms that the model is consistently close to the target.

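- **Median AE: 0.1800**  
  Half of the predictions are within 0.18 of the true value. The median absolute error is lower than the MAE, which means the error distribution is right-skewed: most predictions are very accurate, and a smaller number of larger errors pull the mean up.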

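- **MAPE: 1.29%**  
  Shows the average percentage error, computed after excluding zero targets. The value is very low, indicating strong relative accuracy. Note that it is a percentage of the `price_y` target used for evaluation, not a percentage error on the raw `price` column.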

- **R²: 0.9408**  
  Indicates that the model explains ~94% of the variance in the validation data.  
  This reflects a strong fit without signs of underfitting.

Overall, the model performs reliably on the validation sample, with low error magnitudes and high explanatory power.
