In [216]:
MLFLOW_TRACKING_URI = '../models/mlruns/'
MLFLOW_EXPERIMENT_NAME = "real-estate-price-prediction"
LOG_PATH = "../models/temp/"
X_TRAIN_PATH = "../dataset/X_train.pkl"
X_TEST_PATH = "../dataset/X_test.pkl"
Y_TRAIN_PATH = "../dataset/y_train.pkl"
Y_TEST_PATH = "../dataset/y_test.pkl"


In [217]:
import os
from pathlib import Path
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_squared_error, r2_score ,mean_absolute_error, mean_squared_log_error
from sklearn.model_selection import GridSearchCV
import joblib
import mlflow
import mlflow.sklearn
from mlflow.models import infer_signature

In [218]:
Path(MLFLOW_TRACKING_URI).mkdir(parents=True, exist_ok=True)
Path(LOG_PATH).mkdir(parents=True, exist_ok=True)

In [219]:
X_train = joblib.load(X_TRAIN_PATH)
X_test = joblib.load(X_TEST_PATH)
y_train = joblib.load(Y_TRAIN_PATH)
y_test = joblib.load(Y_TEST_PATH)

In [220]:
X_train.shape,X_test.shape,y_train.shape,y_test.shape

((14839, 2328), (3710, 2328), (14839,), (3710,))

In [221]:
X_train.head()

Unnamed: 0,size_sqm,bedrooms_num,has_maid_room,bathrooms_num,type_Cabin,type_Chalet,type_Duplex,type_Full Floor,type_Hotel Apartment,type_Palace,...,compound_Zed Towers,compound_Zeid Ibn Sabet St.,compound_Zezenia,compound_Zizinia Al Mostakbal,compound_Zizinia St.,compound_Zoheira Abdeen St.,compound_Zomoroda,compound_Zomra East,compound_Zoya,compound_بوابة النعيم
10282,-1.074575,-1.687612,False,-1.508434,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7738,0.64136,0.907964,True,0.054473,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
10653,0.453424,0.907964,True,0.835926,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
3315,0.085724,0.042772,False,-0.72698,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
6741,-0.780415,-0.82242,False,-0.72698,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [222]:
xgb = xgb.XGBRegressor()
xgb.fit(X_train,y_train)

0,1,2
,objective,'reg:squarederror'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


In [223]:
y_pred = xgb.predict(X_test)

In [224]:
r2 = r2_score(y_test,y_pred)
print(f"R2 score : {r2}")

R2 score : 0.6716229523913617


In [225]:
r2

0.6716229523913617

In [226]:
X_train.shape[0] + X_test.shape[0] 

18549

In [227]:
n = int(X_train.shape[0] + X_test.shape[0]) 
p = int(X_train.shape[1] + X_test.shape[1] )

In [228]:
print(f"n = {n} \np = {p}")

n = 18549 
p = 4656


In [229]:
adj_r2 = 1-(1 - r2)*(n-1)/(n-p-1)

In [230]:
print(adj_r2)

0.5615651109239114


In [231]:
max(y_test) , min(y_test)

(400000000.0, 233766.0)

In [232]:
rmae_loss = mean_squared_log_error(y_test,y_pred)
print(f"rmae_loss score : {rmae_loss*100} %")


rmae_loss score : 23.406551836416405 %


In [233]:
models = {
    "LinearRegression": LinearRegression(),
    "RandomForestRegressor": RandomForestRegressor(),
    "DecisionTreeRegressor": DecisionTreeRegressor(),
    "XGBRegressor":XGBRegressor(),
    "KNeighborsRegressor": KNeighborsRegressor()
}

params_grid = {
    "LinearRegression": {},
    "RandomForestRegressor": {},
    "DecisionTreeRegressor": {},
    "XGBRegressor": {},
    "KNeighborsRegressor": {}
}


In [None]:
mlflow.set_tracking_uri(MLFLOW_TRACKING_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT_NAME)

best_models = {}

for model_name, model in models.items():
    with mlflow.start_run(run_name=f"{model_name}_gridsearch"):
        # Grid Search
        grid_search = GridSearchCV(
            estimator=model,
            param_grid=params_grid[model_name],
            cv=5,
            scoring="r2",
            n_jobs=-1,
            verbose=1
        )
        grid_search.fit(X_train, y_train)

        # Predictions & Metrics
        y_pred = grid_search.predict(X_test)
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mse)

        # Log parameters and metrics
        mlflow.log_params(grid_search.best_params_)
        mlflow.log_metrics({"mse": mse, "r2": r2, "rmse": rmse})
        signature = infer_signature(X_test,y_pred)


        # Log model to MLflow
        mlflow.sklearn.log_model(grid_search.best_estimator_, name=f"model_{model_name}", signature=signature)

        # Store best model info
        best_models[model_name] = {
            "model": grid_search.best_estimator_,
            "mse": mse,
            "rmse": rmse,
            "r2": r2,
            "params": grid_search.best_params_
        }

# Select and log best model overall
best_model_name = max(best_models, key=lambda name: best_models[name]["r2"])
best_model_info = best_models[best_model_name]

with mlflow.start_run(run_name="best_model_summary"):
    mlflow.log_param("best_model_name", best_model_name)
    mlflow.log_params(best_model_info["params"])
    mlflow.log_metric("best_model_r2", best_model_info["r2"])
    mlflow.sklearn.log_model(best_model_info["model"], name="best_model")

print(f"🏆 Best model: {best_model_name}")
print(f"R²: {best_model_info['r2']:.4f}")
print(f"Params: {best_model_info['params']}")

2025/10/18 15:25:01 INFO mlflow.tracking.fluent: Experiment with name 'real-estate-price-prediction' does not exist. Creating a new experiment.


Fitting 5 folds for each of 1 candidates, totalling 5 fits


In [None]:
best_model_info

{'model': RandomForestRegressor(),
 'mse': 126289758703629.7,
 'rmse': np.float64(11237871.626942074),
 'r2': 0.7438485377220517,
 'params': {}}

In [None]:
pred = best_model_info["model"].predict(X_test)

In [None]:
# ✅ Get experiment
experiment = mlflow.get_experiment_by_name(MLFLOW_EXPERIMENT_NAME)
runs = mlflow.search_runs(experiment_ids=[experiment.experiment_id])

# ✅ Clean and filter only real model runs (not summaries)
valid_runs = runs[
    runs["tags.mlflow.runName"].str.endswith("_gridsearch", na=False)
].copy()

# ✅ Select only important columns (safe for all MLflow versions)
cols = [
    "run_id",
    "tags.mlflow.runName",
    "metrics.mse",
    "metrics.rmse",
    "metrics.r2",
] + [c for c in runs.columns if c.startswith("params.")]
cols.append("artifact_uri")

# ✅ Create a clean DataFrame
valid_runs = valid_runs[cols]

# ✅ Drop rows with missing R² and sort best first
valid_runs = valid_runs.dropna(subset=["metrics.r2"])
valid_runs = valid_runs.sort_values("metrics.r2", ascending=False)

# ✅ Show top 10 models
print(valid_runs.head(10).to_string(index=False))

                          run_id              tags.mlflow.runName  metrics.mse  metrics.rmse  metrics.r2 params.best_model_name params.weights params.p params.n_neighbors params.n_estimators params.min_samples_split params.max_depth                                                                                                   artifact_uri
55bc3038edc343c286951b6896262a23      LinearRegression_gridsearch 3.833678e-14  1.957978e-07    1.000000                   None           None     None               None                None                     None             None file:///d:/Real-Estate-Analysis/notebooks/mlruns/389890397536557068/55bc3038edc343c286951b6896262a23/artifacts
dd7449ab23704ce6b0c5a02db2a98dc8      LinearRegression_gridsearch 3.833678e-14  1.957978e-07    1.000000                   None           None     None               None                None                     None             None file:///d:/Real-Estate-Analysis/notebooks/mlruns/389890397536557068/dd7449ab237

In [None]:
summary = runs_df[['tags.mlflow.runName', 'metrics.mse', 'metrics.rmse', 'metrics.r2']].dropna()
summary = summary.drop_duplicates()
summary = summary.rename(columns={
    'tags.mlflow.runName': 'Model',
    'metrics.mse': 'MSE',
    'metrics.rmse': 'RMSE',
    'metrics.r2': 'R2'
})
summary = summary.sort_values("R2", ascending=False)
print(summary)


                               Model           MSE          RMSE        R2
5        LinearRegression_gridsearch  3.833678e-14  1.957978e-07  1.000000
8   DecisionTreeRegressor_gridsearch  2.648800e+11  5.146650e+05  0.999463
4   RandomForestRegressor_gridsearch  3.397572e+11  5.828869e+05  0.999311
9   RandomForestRegressor_gridsearch  5.813551e+11  7.624664e+05  0.998821
3   DecisionTreeRegressor_gridsearch  1.190648e+12  1.091168e+06  0.997585
6     KNeighborsRegressor_gridsearch  2.186060e+13  4.675532e+06  0.955661
1     KNeighborsRegressor_gridsearch  2.355865e+13  4.853726e+06  0.952216
2            XGBRegressor_gridsearch  2.652248e+13  5.149998e+06  0.946205
13           XGBRegressor_gridsearch  9.275971e+12  3.045648e+06  0.783235
26  RandomForestRegressor_gridsearch  3.676722e+13  6.063598e+06  0.140809
21  RandomForestRegressor_gridsearch  3.776959e+13  6.145697e+06  0.117385
15  RandomForestRegressor_gridsearch  4.017719e+13  6.338548e+06  0.061123
31  RandomForestRegressor

In [None]:
pred = best_model_info["model"].predict(X_test)
r2 = r2_score(y_test ,pred)
adj_r2 = 1 - (1-r2)*(n-1)/(n-p-1)
rmae_loss = mean_squared_log_error(y_test ,pred)
print(f"best_model_r2 =  {r2} \nadj_r2 = {adj_r2}\nrmse = {rmae_loss*100} %")


best_model_r2 =  0.7438485377220517 
adj_r2 = 0.6579976013294425
rmse = 19.188478987964043 %


In [None]:
joblib.dump(best_model_info["model"], "../models/best_model.pkl")

['../models/best_model.pkl']