# Importing the Libraries

In [1]:
import pandas as pd
import numpy as np
from sqlalchemy import create_engine
import psycopg2
import os
import logging
from pycaret.regression import *
import mlflow
import mlflow.sklearn  # For tracking the final model

# Initializing the Logging

In [2]:
# Configure the root logger once for the whole notebook:
# INFO and above, each record rendered as "<timestamp> - <LEVEL> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Extracting The Data

In [3]:
def extract_data(db_url: str) -> pd.DataFrame:
    """
    Extracts the non-land property listings from a PostgreSQL database.

    Runs one fixed query that LEFT JOINs ``fact_listing`` to ``dim_location``,
    ``dim_property`` and ``dim_property_details`` and keeps the rows whose
    ``subcategory`` is not ``'Lands for Sale'``. The SQLAlchemy engine created
    for the call is always disposed before returning, so no pooled connections
    are left open after the data has been read (or after a failure).

    Args:
        db_url (str): The connection string for the PostgreSQL database.

    Returns:
        pd.DataFrame: A pandas DataFrame with the columns selected by the query
        (long, lat, city, neighborhood, area, subcategory, facade, bedrooms,
        bathrooms, furnished, floor, building_age, price).

    Raises:
        Exception: Any error raised while connecting or querying is logged and
        then re-raised unchanged.
    """
    logging.info('Connecting to the database...')

    # Defined before the try so the finally clause can tell whether an engine
    # was actually created (create_engine itself may raise on a bad URL).
    engine = None
    try:
        # Create a SQLAlchemy engine using the database URL
        engine = create_engine(db_url)

        # Open a connection to the database and execute the SQL query
        with engine.connect() as connection:
            # Note: in SQL, `!=` is never true for NULL, so rows with a NULL
            # subcategory are excluded by the WHERE clause as well.
            query = """                                                                
                SELECT long, lat, city, neighborhood, area, subcategory, facade,
                       bedrooms, bathrooms, furnished, floor,
                       building_age, price
                FROM fact_listing FL
                LEFT JOIN dim_location DL ON DL.location_id = FL.location_id
                LEFT JOIN dim_property DP ON DP.property_id = FL.property_id
                LEFT JOIN dim_property_details DPD ON DPD.details_id = DP.details_id
                WHERE subcategory != 'Lands for Sale'
            """
            logging.info('Executing query...')

            # Read the query result into a pandas DataFrame
            data = pd.read_sql_query(query, con=connection)

        logging.info('Data extracted successfully.')
        return data

    except Exception as e:
        # Log any errors that occur during the data extraction process
        logging.error(f"Error extracting data: {e}")
        raise

    finally:
        # Release the engine's connection pool; the engine is local to this
        # call, so nothing else can reuse it.
        if engine is not None:
            engine.dispose()


# Building The Model

In [4]:
def build_model(data: pd.DataFrame, target_column: str, save_path: str):
    """
    Sets up the PyCaret environment, trains a regression model, logs metrics, and saves the model.

    Steps, in order: start an MLflow run, call PyCaret ``setup`` (normalized
    features, experiment logging enabled, fixed ``session_id``), pick the best
    model with ``compare_models``, tune it, finalize it, save it to
    ``save_path``, log it to MLflow and log MAE/MSE/RMSE/R2 from the score grid
    returned by ``pull()``.

    Args:
        data (pd.DataFrame): The input dataset for model training.
        target_column (str): The column in the dataset to predict (i.e., the target variable).
        save_path (str): The file path to save the trained model. It is made
            absolute, and its parent directory is created if it does not exist.

    Returns:
        model: The trained (finalized) machine learning model.

    Raises:
        Exception: Any error raised during setup, training, saving or logging is
        logged, the MLflow run is ended with status ``FAILED``, and the error
        is re-raised unchanged.
    """
    logging.info('Setting up PyCaret environment...')

    try:

        # Initialize MLflow tracking
        mlflow.start_run()

        # Initialize the PyCaret regression setup with the provided dataset and target column
        setup(data, target=target_column, normalize=True, log_experiment=True, experiment_name='Data without land',
              session_id=123)

        # Compare multiple models and select the best one
        model = compare_models()

        # Tune the best model
        model = tune_model(model)

        # Finalize the tuned model
        model = finalize_model(model)

        logging.info(f'Model training completed. Best model: {model}')

        # Generate the absolute file path for saving the model
        save_path = os.path.abspath(save_path)

        # Make sure the target directory exists; otherwise saving fails with a
        # missing-directory error on a fresh checkout.
        os.makedirs(os.path.dirname(save_path), exist_ok=True)

        # Save the trained model to the specified path
        save_model(model, save_path)

        logging.info(f'Model saved at {save_path}')

        # Log the model to MLflow using sklearn since PyCaret's model is compatible
        mlflow.sklearn.log_model(model, artifact_path='model')

        # Retrieve the most recent score grid produced by PyCaret
        metrics = pull()

        # A cross-validation score grid has one row per fold plus a 'Mean'
        # summary row; log the mean rather than whatever happens to be the
        # first fold. Fall back to the first row for grids without a summary.
        # NOTE(review): which grid pull() returns after finalize_model should
        # be confirmed against the installed PyCaret version.
        if "Mean" in metrics.index:
            summary = metrics.loc["Mean"]
        else:
            summary = metrics.iloc[0]

        # Log specific metrics to MLflow
        for metric_name in ("MAE", "MSE", "RMSE", "R2"):
            mlflow.log_metric(metric_name, float(summary[metric_name]))

        mlflow.end_run()

        return model

    except Exception as e:
        # Log any errors that occur during model building or saving
        logging.error(f"Error in building or saving model: {e}")
        mlflow.end_run(status='FAILED')
        raise


In [5]:
# PostgreSQL connection URL. Set the HOUSES_DB_URL environment variable to
# point the notebook at another database or to keep credentials out of the
# notebook; when it is unset, the original local-development URL is used.
# NOTE(review): the fallback still embeds a password — move it out of the
# notebook once every environment provides HOUSES_DB_URL.
db_url = os.environ.get('HOUSES_DB_URL', 'postgresql://postgres:2003@localhost:5432/houses')

# Extract data from the database
data = extract_data(db_url)

# Print the first few rows of the data
print("Data preview:")
print(data.head())

# Define the file path for saving the trained model
model_save_path = 'saved model/Prop-model'

# Build the regression model using the extracted data and save it
model = build_model(data, target_column='price', save_path=model_save_path)

# Load and display the saved model for verification.
# This check is best-effort: a load failure is logged, not raised, so the
# trained `model` above stays available in the kernel.
try:
    loaded_model = load_model(model_save_path)
    logging.info(f'Loaded model: {loaded_model}')

except Exception as e:
    # Log any errors that occur during model loading
    logging.error(f"Error loading model: {e}")



2025-04-19 15:25:48,202 - INFO - Connecting to the database...
2025-04-19 15:25:48,403 - INFO - Executing query...
2025-04-19 15:25:48,490 - INFO - Data extracted successfully.
2025-04-19 15:25:48,498 - INFO - Setting up PyCaret environment...


Data preview:
        long        lat   city          neighborhood  area  \
0  35.881947  32.018876  Amman               Jubaiha   160   
1  36.022293  31.961647  Amman               Jubaiha   115   
2  35.903079  31.973569  Amman           Sports City   150   
3  36.103447  32.091877  Zarqa  Al Zarqa Al Jadeedeh   135   
4  35.919193  31.875235  Amman          Khirbet Sooq   112   

           subcategory     facade bedrooms bathrooms    furnished  \
0  Apartments for Sale   Northern        3         3  Unfurnished   
1  Apartments for Sale  Southeast        3         3  Unfurnished   
2  Apartments for Sale  Southwest        3         3  Unfurnished   
3  Apartments for Sale  Northwest        3         3  Unfurnished   
4  Apartments for Sale    Eastern        3         2  Unfurnished   

          floor   building_age    price  
0   Third Floor  0 - 11 months  96000.0  
1  Ground Floor  0 - 11 months  63000.0  
2  Second Floor    1 - 5 years  76000.0  
3   Third Floor  0 - 11 months

Unnamed: 0,Description,Value
0,Session id,123
1,Target,price
2,Target type,Regression
3,Original data shape,"(7918, 13)"
4,Transformed data shape,"(7918, 82)"
5,Transformed train set shape,"(5542, 82)"
6,Transformed test set shape,"(2376, 82)"
7,Numeric features,3
8,Categorical features,9
9,Preprocess,True


2025/04/19 15:25:52 INFO mlflow.tracking.fluent: Experiment with name 'Data without land' does not exist. Creating a new experiment.


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
catboost,CatBoost Regressor,14239.1015,424424731.8552,20589.861,0.7289,0.3,0.2286,1.071
xgboost,Extreme Gradient Boosting,14108.7096,437532912.0,20901.448,0.7204,0.3137,0.2291,0.117
rf,Random Forest Regressor,13770.8371,442333622.4542,21025.1063,0.7172,0.3074,0.2294,0.565
et,Extra Trees Regressor,12928.1355,448853078.1469,21155.2931,0.7139,0.3031,0.2129,0.518
lightgbm,Light Gradient Boosting Machine,14693.5315,458515434.8543,21403.4502,0.7068,0.3115,0.2395,0.168
gbr,Gradient Boosting Regressor,16894.6423,542892327.3094,23289.9512,0.6531,0.3332,0.2703,0.196
dt,Decision Tree Regressor,16576.7475,757598704.3944,27505.4503,0.5157,0.3873,0.261,0.07
par,Passive Aggressive Regressor,20431.4613,773061749.6458,27780.0434,0.5064,0.3883,0.3043,0.104
huber,Huber Regressor,20465.3055,802349971.1615,28215.6222,0.4892,0.4009,0.3096,0.25
br,Bayesian Ridge,20702.6505,821388974.3999,28424.636,0.4783,0.3931,0.327,0.076




Unnamed: 0_level_0,MAE,MSE,RMSE,R2,RMSLE,MAPE
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,15321.0724,484802524.9385,22018.2316,0.6916,0.3113,0.2431
1,14312.5268,438866093.7015,20949.1311,0.6882,0.3088,0.2361
2,14429.95,430904398.6775,20758.2369,0.7328,0.3139,0.2492
3,15283.6252,484360837.5789,22008.1993,0.71,0.3128,0.2302
4,15026.2722,449763272.4903,21207.623,0.7127,0.299,0.232
5,15299.2363,465941947.4464,21585.6885,0.6996,0.3115,0.2485
6,13705.4458,450309547.8959,21220.4983,0.7154,0.3162,0.2485
7,14696.8567,444075530.3798,21073.0997,0.719,0.3063,0.2309
8,15284.1301,464302433.7267,21547.6782,0.7167,0.2867,0.2224
9,14198.4541,414744916.4761,20365.287,0.7198,0.3229,0.2437


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


2025-04-19 15:27:41,691 - INFO - Model training completed. Best model: Pipeline(memory=Memory(location=None),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['long', 'lat', 'area'],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=['city', 'neighborhood',
                                             'subcategory', 'facade',
                                             'bedrooms', 'bathrooms',
                                             'furnished', 'floor',
                                             'building_age'],
                                    transformer=SimpleImputer(strategy='most_frequent'...
                                                                    'building_age'],
                                                              handle_missing='return_nan',
                                                              use_cat_na

Transformation Pipeline and Model Successfully Saved


2025-04-19 15:27:47,829 - INFO - Loaded model: Pipeline(memory=FastMemory(location=C:\Users\4t4\AppData\Local\Temp\joblib),
         steps=[('numerical_imputer',
                 TransformerWrapper(include=['long', 'lat', 'area'],
                                    transformer=SimpleImputer())),
                ('categorical_imputer',
                 TransformerWrapper(include=['city', 'neighborhood',
                                             'subcategory', 'facade',
                                             'bedrooms', 'bathrooms',
                                             'furnished', 'floor',
                                             'building_age'],
                                    transformer=...
                                                                    'building_age'],
                                                              handle_missing='return_nan',
                                                              use_cat_names=True))),
           

Transformation Pipeline and Model Successfully Loaded


# Launching the MLflow UI

In [6]:
# Start the MLflow tracking UI through an IPython shell escape.
# NOTE(review): the saved output of this cell is "^C", so it appears to keep
# running until the kernel is interrupted — confirm before relying on Run All.
!mlflow ui
# Note: The MLflow UI will be available at http://localhost:5000 after running the above command.

^C
