In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('services/data/processed/immobilier.csv')

df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
0,2017-01,Ang Mo Kio,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,232000.0
1,2017-01,Ang Mo Kio,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,250000.0
2,2017-01,Ang Mo Kio,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,262000.0
3,2017-01,Ang Mo Kio,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,265000.0
4,2017-01,Ang Mo Kio,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,265000.0


In [3]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

# Charger les données
df = pd.read_csv('services/data/processed/immobilier.csv').drop(columns=['block'])

# Convertir la colonne 'month' en datetime
df['month'] = pd.to_datetime(df['month'])
df['year'] = df['month'].dt.year
df['month'] = df['month'].dt.month

# Fonction robuste pour extraire les années et mois
def parse_remaining_lease(lease):
    if isinstance(lease, str):
        parts = lease.split(' years ')
        years = 0
        months = 0
        
        # Vérifier si on a bien une partie "years"
        if len(parts) > 1:
            years = int(parts[0])
            months_part = parts[1]
        else:
            months_part = parts[0]
        
        # Vérifier si la partie "months" est présente
        if 'months' in months_part:
            months = int(months_part.split(' months')[0])
        
        return years + months / 12
    return np.nan

df['remaining_lease_years'] = df['remaining_lease'].apply(parse_remaining_lease)

# Sélectionner les colonnes utiles
features = ['month', 'year', 'town', 'flat_type', 'street_name',
            'storey_range', 'floor_area_sqm', 'flat_model', 'lease_commence_date',
            'remaining_lease_years']

# Définir les variables catégorielles pour CatBoost
cat_features = ['town', 'flat_type', 'street_name', 'storey_range', 'flat_model']

# Supprimer les valeurs manquantes
df = df.dropna()

# Séparation des données
X = df[features]
y = df['resale_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Création des Pool pour CatBoost
train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(X_test, label=y_test, cat_features=cat_features)

# Entraînement du modèle
model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.09,
    depth=10,
    loss_function='RMSE',
    eval_metric='RMSE',
    cat_features=cat_features,
    verbose=100
)

model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50, use_best_model=True)

# Prédictions
y_pred = model.predict(X_test)




0:	learn: 165623.5501009	test: 166222.6555979	best: 166222.6555979 (0)	total: 355ms	remaining: 8m 52s
100:	learn: 36732.1578962	test: 36043.3448837	best: 36043.3448837 (100)	total: 17.1s	remaining: 3m 56s
200:	learn: 31417.1342968	test: 31172.9752794	best: 31172.9752794 (200)	total: 33.6s	remaining: 3m 36s
300:	learn: 29022.8446598	test: 29218.3696532	best: 29218.3696532 (300)	total: 49.7s	remaining: 3m 17s
400:	learn: 27530.2574509	test: 28120.4143248	best: 28120.4143248 (400)	total: 1m 6s	remaining: 3m 1s
500:	learn: 26462.4033184	test: 27463.5042986	best: 27463.5042986 (500)	total: 1m 22s	remaining: 2m 44s
600:	learn: 25683.3905493	test: 27092.0498052	best: 27092.0498052 (600)	total: 1m 39s	remaining: 2m 28s
700:	learn: 25006.5328895	test: 26812.0127082	best: 26812.0127082 (700)	total: 1m 55s	remaining: 2m 12s
800:	learn: 24409.1680859	test: 26600.0643875	best: 26600.0643875 (800)	total: 2m 12s	remaining: 1m 55s
900:	learn: 23861.0869066	test: 26416.8234122	best: 26416.8234122 (900)

In [18]:
import pickle

# Sauvegarde avec pickle
with open("catboost_model.pkl", "wb") as f:
    pickle.dump(model, f)


In [7]:
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error

rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'RMSE: {rmse:.2f}')
print(f'MAE: {mae:.2f}')
print(f'R² Score: {r2:.4f}')

RMSE: 25978.68
MAE: 18615.08
R² Score: 0.9789


In [8]:
# on regarde les features importante du modele catboost

feature_importances = model.get_feature_importance(train_pool)

feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print(f'{name}: {score:.2f}')

year: 20.63
town: 17.22
floor_area_sqm: 13.55
lease_commence_date: 13.17
flat_type: 12.91
flat_model: 8.50
street_name: 7.47
storey_range: 4.94
month: 1.08
remaining_lease_years: 0.53


In [9]:
import optuna

def objective(trial):
    params = {
        "iterations": trial.suggest_int("iterations", 1000, 3000, step=500),  # Entre 1000 et 3000
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),  # Entre 0.01 et 0.3
        "depth": trial.suggest_int("depth", 6, 12),  # Entre 6 et 12
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1e-3, 10, log=True),  # Régularisation
        "random_strength": trial.suggest_float("random_strength", 1e-3, 10, log=True),  # Stabilité
        "loss_function": "RMSE",
        "eval_metric": "RMSE",
        "cat_features": cat_features,
        "verbose": 0
    }
    
    model = CatBoostRegressor(**params)
    model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50, use_best_model=True, verbose=0)
    
    # Récupérer la meilleure RMSE sur le test set
    return model.best_score_["validation"]["RMSE"]


In [10]:
study = optuna.create_study(direction="minimize")  # Minimiser la RMSE
study.optimize(objective, n_trials=20)  # Faire 20 essais

print("Best Hyperparameters:", study.best_params)


[I 2025-02-16 20:06:54,572] A new study created in memory with name: no-name-c0d6a057-b12c-4167-a116-3268b0f2140e
[I 2025-02-16 20:14:44,923] Trial 0 finished with value: 25885.844958707625 and parameters: {'iterations': 3000, 'learning_rate': 0.09577902968981677, 'depth': 10, 'l2_leaf_reg': 0.04548450617561394, 'random_strength': 0.013330508384703675}. Best is trial 0 with value: 25885.844958707625.
[I 2025-02-16 20:20:22,015] Trial 1 finished with value: 26483.976514315676 and parameters: {'iterations': 3000, 'learning_rate': 0.16055312892045245, 'depth': 12, 'l2_leaf_reg': 1.5088894489655977, 'random_strength': 0.036404383660262905}. Best is trial 0 with value: 25885.844958707625.
[I 2025-02-16 20:28:08,052] Trial 2 finished with value: 25936.497468380738 and parameters: {'iterations': 2000, 'learning_rate': 0.12619571543502905, 'depth': 8, 'l2_leaf_reg': 0.0014551371956676832, 'random_strength': 0.029391528168139225}. Best is trial 0 with value: 25885.844958707625.
[I 2025-02-16 20

Best Hyperparameters: {'iterations': 2500, 'learning_rate': 0.041514698583052935, 'depth': 11, 'l2_leaf_reg': 0.08247401875925693, 'random_strength': 0.4823931328271286}


In [4]:
best_params = {'iterations': 2500, 'learning_rate': 0.041514698583052935, 'depth': 11, 'l2_leaf_reg': 0.08247401875925693, 'random_strength': 0.4823931328271286}
best_model = CatBoostRegressor(**best_params, cat_features=cat_features, verbose=100)
best_model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50, use_best_model=True)

# Évaluer le modèle
y_pred_best = best_model.predict(X_test)



0:	learn: 172280.4186665	test: 172777.6162065	best: 172777.6162065 (0)	total: 202ms	remaining: 8m 25s
100:	learn: 41577.7978029	test: 41164.7221619	best: 41164.7221619 (100)	total: 19s	remaining: 7m 30s
200:	learn: 34876.3076557	test: 34820.7236878	best: 34820.7236878 (200)	total: 38.3s	remaining: 7m 18s
300:	learn: 31436.9962440	test: 31797.0622759	best: 31797.0622759 (300)	total: 57.9s	remaining: 7m 2s
400:	learn: 29150.8783652	test: 29914.1745542	best: 29914.1745542 (400)	total: 1m 17s	remaining: 6m 45s
500:	learn: 27597.3010662	test: 28763.5465257	best: 28763.5465257 (500)	total: 1m 37s	remaining: 6m 27s
600:	learn: 26532.4411416	test: 28043.8935903	best: 28043.8935903 (600)	total: 1m 57s	remaining: 6m 10s
700:	learn: 25714.3258206	test: 27567.8168088	best: 27567.8168088 (700)	total: 2m 16s	remaining: 5m 51s
800:	learn: 24955.4522702	test: 27158.6464934	best: 27158.6464934 (800)	total: 2m 36s	remaining: 5m 31s
900:	learn: 24333.5385932	test: 26909.2877840	best: 26909.2877840 (900)	

In [6]:
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error

rmse_best = root_mean_squared_error(y_test, y_pred_best)
r2_best = r2_score(y_test, y_pred_best)
mae = mean_absolute_error(y_test, y_pred_best)
print(f"Optimized MAE: {mae:.2f}")
print(f"Optimized RMSE: {rmse_best:.2f}")
print(f"Optimized R² Score: {r2_best:.4f}")


Optimized MAE: 18474.76
Optimized RMSE: 25872.16
Optimized R² Score: 0.9791


In [7]:
import pickle

with open("catboost_model_entraine.pkl", "wb") as f:
    pickle.dump(best_model, f)


In [None]:
# On fait de la recherche d'hyperparametres avec quelques epochs pour voir si on peut ameliorer le modele
# Je te laisse le faire avec plus d'iterations


# from sklearn.model_selection import GridSearchCV

# param_grid = {
#     'learning_rate': [0.1],
#     'depth': [8, 10],
#     'l2_leaf_reg': [5, 10],
# }

# model = CatBoostRegressor(iterations=100, loss_function='RMSE', eval_metric='RMSE', cat_features=cat_features, verbose=0)
# grid_search = GridSearchCV(model, param_grid, cv=None, n_jobs=-1, scoring='neg_mean_squared_error')
# grid_search.fit(X_train, y_train)

# print(f'Best parameters: {grid_search.best_params_}')
# print(f'Best score: {grid_search.best_score_}')

Best parameters: {'depth': 10, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
Best score: -1607875981.532204
