In [25]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
import pipeline_0

In [41]:
# Regression target column. Per the original author's note it is built
# upstream from the advertised price (among other fields).
TARGET = "computedPricePerSquareMeter"

# Columns that must all be non-null for a row to be kept.
REQUIRED = [
    "transactionType",            # keeps sale and rent rows distinguishable
    "computedPostalCode",         # location
    "computedSurfaceArea",        # fundamentals (the core price drivers)
    "propertyType",
    "roomsQuantity",
    "computedBedroomsQuantity",
    TARGET,                       # the target itself must be known
]

# Numeric features.
NUM_COLS = [
    "blurInfo.centroid.lat",
    "blurInfo.centroid.lon",
    "computedSurfaceArea",
    "roomsQuantity",
    "computedBedroomsQuantity",
    "floor",
    "computedBathroomsQuantity",
    "computedShowerRoomsQuantity",
    "computedToiletQuantity",
    "computedEnergyValue",
    "computedMinEnergyConsumption",
    "computedMaxEnergyConsumption",
    "greenhouseGazValue",
]

# Boolean flag features.
BOOL_COLS = [
    "computedNewProperty",
    "computedHasElevator",
    "computedHasTerrace",
]

# Categorical features.
CAT_COLS = [
    "computedPostalCode",
    "transactionType",
    "city",
    "propertyType",
    "heating",
    "exposition",
    "computedEnergyClassification",
    "greenhouseGazClassification",
]

# Canonical feature order: categorical, then boolean, then numeric.
FEATURES = [*CAT_COLS, *BOOL_COLS, *NUM_COLS]

# Raw column name -> normalized snake_case name (identity entries are kept
# so that every feature and the target appear in the map).
RENAME_MAP = {
    # transaction / location
    "transactionType": "transaction_type",
    "computedPostalCode": "postal_code",
    "city": "city",
    "blurInfo.centroid.lat": "lat",
    "blurInfo.centroid.lon": "lon",
    # property fundamentals
    "propertyType": "property_type",
    "computedSurfaceArea": "surface_area",
    "roomsQuantity": "rooms",
    "computedBedroomsQuantity": "bedrooms",
    "floor": "floor",
    # flags
    "computedNewProperty": "is_new",
    "computedHasElevator": "has_elevator",
    "computedHasTerrace": "has_terrace",
    # comfort / layout
    "heating": "heating",
    "exposition": "exposition",
    "computedBathroomsQuantity": "bathrooms",
    "computedShowerRoomsQuantity": "shower_rooms",
    "computedToiletQuantity": "toilets",
    # energy / greenhouse gas
    "computedEnergyClassification": "energy_class",
    "computedEnergyValue": "energy_value",
    "computedMinEnergyConsumption": "energy_min",
    "computedMaxEnergyConsumption": "energy_max",
    "greenhouseGazClassification": "ghg_class",
    "greenhouseGazValue": "ghg_value",
    # target
    "computedPricePerSquareMeter": "price_per_sqm",
}
def return_REQUIRED() -> list:
    """Return the module-level ``REQUIRED`` list (the same object, not a copy)."""
    return REQUIRED


def return_TARGET() -> str:
    """Return the module-level ``TARGET`` column name."""
    return TARGET


def return_FEATURES() -> list:
    """Return the module-level ``FEATURES`` list (the same object, not a copy)."""
    return FEATURES


def return_RENAME_MAP() -> dict:
    """Return the module-level ``RENAME_MAP`` dict (the same object, not a copy)."""
    return RENAME_MAP


def return_CAT() -> list:
    """Return the module-level ``CAT_COLS`` list (the same object, not a copy)."""
    return CAT_COLS


def return_NUM() -> list:
    """Return the module-level ``NUM_COLS`` list (the same object, not a copy)."""
    return NUM_COLS


def return_BOOL() -> list:
    """Return the module-level ``BOOL_COLS`` list (the same object, not a copy)."""
    return BOOL_COLS

In [42]:
# Load the raw CSV into a DataFrame. The path is relative, so it resolves
# against the kernel's current working directory.
# NOTE(review): the preview of this frame shows an "Unnamed: 0" column, which
# usually means a saved index — consider index_col=0 unless pipeline_0 already
# deals with it.
df = pd.read_csv("ressources/data/bienIci_2458.csv")

In [43]:
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,transactionType,rentWithoutCharges,charges,energyPerformanceDiagnosticDate,title,heating,district.name,hasTerrace,greenhouseGazClassification,greenhouseGazValue,...,bedroomsQuantity,descriptionTextLength,postalCodeForSearchFilters,price,city,blurInfo.type,status.onTheMarket,district.libelle,blurInfo.centroid.lon,hasElevator
0,rent,1771.0,,2023-09-19,Exclusivité- Appartement meublé- Paris XV,électricité individuel,Paris 15e Arrondissement - Cambronne - Garibaldi,,B,6.0,...,1.0,667,75015,1771.0,Paris 15e,disk,True,Cambronne - Garibaldi,2.303488,
1,rent,1500.0,450.0,2024-06-25,2 pièces avec terrasse - Paris 15,radiateur gaz collectif,Paris 15e Arrondissement - Alleray - Procession,True,C,29.0,...,1.0,474,75015,1950.0,Paris 15e,disk,True,Alleray - Procession,2.304538,True
2,rent,13000.0,,2025-07-06,Appartement exceptionnel- Paris VIII,,Paris 8e Arrondissement - Hoche Friedland,True,D,33.0,...,4.0,850,75008,13000.0,Paris 8e,disk,True,Hoche Friedland,2.29956,
3,rent,3800.0,,2025-12-02,Appartement meublé - Paris 14ème,,Paris 14e Arrondissement - Montsouris - Dareau,,D,34.0,...,2.0,701,75014,3800.0,Paris 14e,disk,True,Montsouris - Dareau,2.33551,
4,rent,866.0,,2023-02-20,STUDIO-ILES SAINT LOUIS,,Paris 4e Arrondissement - Les Iles,,B,11.0,...,0.0,311,75004,866.0,Paris 4e,disk,True,Les Iles,2.354162,


In [44]:
# Run the project-local pipeline_0 over the raw frame. Its steps are not
# visible in this notebook; presumably it derives the computed* columns used
# in FEATURES / REQUIRED — TODO confirm against pipeline_0.
df_p0 = pipeline_0.run_pipeline(df)

In [45]:
# Inspect the dtypes of the feature columns as produced by pipeline_0
# (before prepareX casts them).
df_p0[FEATURES].dtypes

computedPostalCode                       int64
transactionType                         object
city                                    object
propertyType                            object
heating                                 object
exposition                              object
computedEnergyClassification    string[python]
greenhouseGazClassification             object
computedNewProperty                    boolean
computedHasElevator                    boolean
computedHasTerrace                     boolean
blurInfo.centroid.lat                  float64
blurInfo.centroid.lon                  float64
computedSurfaceArea                    float64
roomsQuantity                          float64
computedBedroomsQuantity               float64
floor                                  float64
computedBathroomsQuantity              float64
computedShowerRoomsQuantity            float64
computedToiletQuantity                 float64
computedEnergyValue                    Float64
computedMinEn

In [51]:
def prepareX(X:pd.DataFrame,num_cols:list = NUM_COLS,
             cat_cols:list = CAT_COLS, bool_cols:list = BOOL_COLS):
    """Return a cleaned copy of the feature columns of ``X``.

    The result holds exactly ``num_cols + cat_cols + bool_cols``, in that
    order. The input frame is never modified: the previous version added the
    placeholder columns to the caller's DataFrame in place before copying.

    Parameters
    ----------
    X : pd.DataFrame
        Frame holding (some of) the feature columns; extra columns are ignored.
    num_cols, cat_cols, bool_cols : list
        Names of the numeric, categorical and boolean feature columns.

    Returns
    -------
    pd.DataFrame
        New frame, same index as ``X``, where
        * boolean columns use the nullable ``"boolean"`` dtype, missing -> False;
        * numeric columns went through ``pd.to_numeric(errors="coerce")``
          (unparseable values become NaN);
        * categorical columns use the ``"string"`` dtype, missing ->
          ``"__UNKNOWN__"``.
        Columns absent from ``X`` are created with a placeholder: NaN
        (numeric), ``"__UNKNOWN__"`` (categorical) or False (boolean).
    """
    default_x_cols = num_cols + cat_cols + bool_cols

    # Copy only the feature columns that exist, so every write below lands on
    # our own frame and never on the caller's.
    missing = [c for c in default_x_cols if c not in X.columns]
    out = X[[c for c in default_x_cols if c in X.columns]].copy()

    # Same precedence as before: numeric, then categorical, then boolean.
    for c in missing:
        if c in num_cols:
            out[c] = np.nan
        elif c in cat_cols:
            out[c] = "__UNKNOWN__"
        elif c in bool_cols:
            out[c] = False

    # Put the columns in canonical order; copy so the result is a standalone
    # frame (no SettingWithCopyWarning for the assignments below or for callers).
    out = out[default_x_cols].copy()

    for c in bool_cols:
        out[c] = out[c].astype("boolean").fillna(False)
    for c in num_cols:
        out[c] = pd.to_numeric(out[c], errors="coerce")
    for c in cat_cols:
        out[c] = out[c].astype("string").fillna("__UNKNOWN__")
    return out
def prepareY(y:pd.Series):
    """Return a numeric version of the target Series.

    Works on a copy of ``y``; values that cannot be parsed as numbers are
    coerced to NaN. Index and name are preserved.
    """
    target = y.copy()
    numeric_target = pd.to_numeric(target, errors="coerce")
    return numeric_target

In [47]:
def prepareDataset(df: pd.DataFrame, num_cols:list = NUM_COLS,
             cat_cols:list = CAT_COLS, bool_cols:list = BOOL_COLS,
                   TARGET: str = TARGET, REQUIRED: list = REQUIRED,
                  RENAME_MAP : dict = RENAME_MAP):
    """Build the model inputs ``(X, y)`` from a pipeline output frame.

    Steps:
    1. keep only the rows where every ``REQUIRED`` column is non-null;
    2. cast the features with ``prepareX`` and the target with ``prepareY``;
    3. rename columns through ``RENAME_MAP`` (names not in the map are kept).

    Parameters
    ----------
    df : pd.DataFrame
        Must contain every column in ``REQUIRED``, ``TARGET`` and all feature
        columns (a missing one raises ``KeyError`` from the column selection).
    num_cols, cat_cols, bool_cols : list
        Feature column names by kind.
    TARGET : str
        Name of the target column in ``df``.
    REQUIRED : list
        Columns that must be non-null for a row to be kept.
    RENAME_MAP : dict
        Raw column name -> normalized name.

    Returns
    -------
    (pd.DataFrame, pd.Series)
        ``X`` with renamed columns ordered categorical, boolean, numeric, and
        ``y`` (numeric, named after the renamed target), sharing the same index.
    """
    # keep only rows where REQUIRED are non-NA
    m_required = df[REQUIRED].notna().all(axis=1)
    kept = df.loc[m_required]

    default_x_cols = num_cols + cat_cols + bool_cols

    # casting
    X = prepareX(X = kept[default_x_cols],
                 num_cols = num_cols,
                 cat_cols = cat_cols,
                 bool_cols = bool_cols)
    y = prepareY(y = kept[TARGET])

    # rename (normalize column names)
    # The output order is derived from the column lists that were passed in.
    # The previous version read the global FEATURES here, so calling with
    # custom column lists raised KeyError or silently selected other columns.
    # With the default lists this equals FEATURES (categorical, boolean, numeric).
    feature_order = cat_cols + bool_cols + num_cols
    renamed_features = [RENAME_MAP.get(c, c) for c in feature_order]
    renamed_target = RENAME_MAP.get(TARGET, TARGET)

    X = X.rename(columns=RENAME_MAP)[renamed_features].copy()
    y = y.rename(renamed_target)
    return X, y

In [48]:
# Build the feature matrix and target from the pipeline_0 output, using the
# default column lists, target, required columns and rename map.
X,y = prepareDataset(df = df_p0)

In [49]:
# Join features and target column-wise (one row per sample) for inspection.
# Without axis=1, concat stacks y's rows underneath X's rows, which doubles
# the row count and fills the non-matching columns with missing values.
test_df = pd.concat([X, y], axis=1)

In [50]:
# Check the dtypes of the prepared features and target after casting/renaming.
test_df.dtypes

postal_code         string[python]
transaction_type    string[python]
city                string[python]
property_type       string[python]
heating             string[python]
exposition          string[python]
energy_class        string[python]
ghg_class           string[python]
is_new                     boolean
has_elevator               boolean
has_terrace                boolean
lat                        float64
lon                        float64
surface_area               float64
rooms                      float64
bedrooms                   float64
floor                      float64
bathrooms                  float64
shower_rooms               float64
toilets                    float64
energy_value               Float64
energy_min                 Float64
energy_max                 Float64
ghg_value                  float64
price_per_sqm              float64
dtype: object

In [None]:
def set_seed(seed : int):
    """Seed NumPy's global random number generator.

    Parameters
    ----------
    seed : int
        Value passed to ``np.random.seed``. The previous version ignored this
        argument and always seeded with 42.

    Only NumPy's legacy global generator is seeded here; other libraries
    (e.g. CatBoost via ``random_seed``) need their own seed.
    """
    np.random.seed(seed)


In [None]:
Here’s a complete, runnable Python example using CatBoostRegressor for regression tasks, including data preparation, training, prediction, and evaluation.
It also includes robust input validation and handling of categorical features (CatBoost’s specialty).

# Install CatBoost if not already installed:
# pip install catboost scikit-learn pandas

import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

# NOTE(review): this demo rebinds the notebook-level names `X` and `y`
# (sections 1-2 below), replacing the real dataset returned by
# prepareDataset. Re-run the prepareDataset cell afterwards if you still
# need the real data.

# -----------------------------
# 1. Generate Example Dataset
# -----------------------------
# For demonstration, we'll create a synthetic dataset
np.random.seed(42)
n_samples = 200

data = pd.DataFrame({
    'feature_num1': np.random.rand(n_samples) * 10,          # numerical
    'feature_num2': np.random.randint(0, 100, n_samples),    # numerical
    'feature_cat': np.random.choice(['A', 'B', 'C'], n_samples),  # categorical
    'target': np.random.rand(n_samples) * 50 + 10             # target variable
})

# -----------------------------
# 2. Split into Train/Test
# -----------------------------
X = data.drop(columns=['target'])
y = data['target']

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# -----------------------------
# 3. Identify Categorical Features
# -----------------------------
cat_features = ['feature_cat']  # column names of categorical features

# -----------------------------
# 4. Create CatBoost Pool
# -----------------------------
train_pool = Pool(X_train, y_train, cat_features=cat_features)
test_pool = Pool(X_test, y_test, cat_features=cat_features)

# -----------------------------
# 5. Initialize and Train Model
# -----------------------------
model = CatBoostRegressor(
    iterations=500,           # number of boosting iterations
    learning_rate=0.05,        # step size
    depth=6,                   # tree depth
    loss_function='RMSE',      # regression loss
    random_seed=42,
    verbose=False              # suppress training output
)

model.fit(train_pool)

# -----------------------------
# 6. Make Predictions
# -----------------------------
y_pred = model.predict(test_pool)

# -----------------------------
# 7. Evaluate Model
# -----------------------------
# `mean_squared_error(..., squared=False)` was deprecated in scikit-learn 1.4
# and removed in 1.6; taking the square root of the MSE works on every version.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.4f}")
print(f"R² Score: {r2:.4f}")

# -----------------------------
# 8. Save and Load Model
# -----------------------------
# Writes the model file into the current working directory.
model.save_model("catboost_regressor_model.cbm")
loaded_model = CatBoostRegressor()
loaded_model.load_model("catboost_regressor_model.cbm")

# Verify loaded model works
print("Prediction from loaded model:", loaded_model.predict(X_test.iloc[:1]))


Key Points

- CatBoostRegressor is well suited to datasets with categorical features — it handles them natively, without manual encoding.
- Pool objects let you specify the categorical features explicitly.
- Hyperparameters:
  - iterations: number of boosting rounds.
  - learning_rate: step size; smaller values usually generalize better but require more iterations.
  - depth: maximum tree depth, which controls tree complexity.
  - loss_function: 'RMSE' is the common choice for regression.
- Evaluation: RMSE and R² are standard metrics for regression.
- Model persistence: use .save_model() and .load_model() to reuse a trained model.


If you want, I can also give you an optimized CatBoostRegressor pipeline with hyperparameter tuning using GridSearchCV so you can get the best model automatically.
Do you want me to prepare that?
