In [1]:
import pandas as pd

In [2]:
df = pd.read_csv('services/data/processed/immobilier.csv')

df.head()

Unnamed: 0,month,town,flat_type,block,street_name,storey_range,floor_area_sqm,flat_model,lease_commence_date,remaining_lease,resale_price
0,2017-01,Ang Mo Kio,2 ROOM,406,ANG MO KIO AVE 10,10 TO 12,44.0,Improved,1979,61 years 04 months,232000.0
1,2017-01,Ang Mo Kio,3 ROOM,108,ANG MO KIO AVE 4,01 TO 03,67.0,New Generation,1978,60 years 07 months,250000.0
2,2017-01,Ang Mo Kio,3 ROOM,602,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,262000.0
3,2017-01,Ang Mo Kio,3 ROOM,465,ANG MO KIO AVE 10,04 TO 06,68.0,New Generation,1980,62 years 01 month,265000.0
4,2017-01,Ang Mo Kio,3 ROOM,601,ANG MO KIO AVE 5,01 TO 03,67.0,New Generation,1980,62 years 05 months,265000.0


In [None]:
import pandas as pd
import numpy as np
from catboost import CatBoostRegressor, Pool
from sklearn.model_selection import train_test_split

# Charger les données
df = pd.read_csv('services/data/processed/immobilier.csv').drop(columns=['block'])

# Convertir la colonne 'month' en datetime
df['month'] = pd.to_datetime(df['month'])
df['year'] = df['month'].dt.year
df['month'] = df['month'].dt.month

# Fonction robuste pour extraire les années et mois
def parse_remaining_lease(lease):
    if isinstance(lease, str):
        parts = lease.split(' years ')
        years = 0
        months = 0
        
        # Vérifier si on a bien une partie "years"
        if len(parts) > 1:
            years = int(parts[0])
            months_part = parts[1]
        else:
            months_part = parts[0]
        
        # Vérifier si la partie "months" est présente
        if 'months' in months_part:
            months = int(months_part.split(' months')[0])
        
        return years + months / 12
    return np.nan

df['remaining_lease_years'] = df['remaining_lease'].apply(parse_remaining_lease)

# Sélectionner les colonnes utiles
features = ['month', 'year', 'town', 'flat_type', 'street_name',
            'storey_range', 'floor_area_sqm', 'flat_model', 'lease_commence_date',
            'remaining_lease_years']

# Définir les variables catégorielles pour CatBoost
cat_features = ['town', 'flat_type', 'street_name', 'storey_range', 'flat_model']

# Supprimer les valeurs manquantes
df = df.dropna()

# Séparation des données
X = df[features]
y = df['resale_price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Création des Pool pour CatBoost
train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
test_pool = Pool(X_test, label=y_test, cat_features=cat_features)

# Entraînement du modèle
model = CatBoostRegressor(
    iterations=1500,
    learning_rate=0.09,
    depth=10,
    loss_function='RMSE',
    eval_metric='RMSE',
    cat_features=cat_features,
    verbose=100
)

model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=50, use_best_model=True)

# Prédictions
y_pred = model.predict(X_test)




0:	learn: 165623.5501009	test: 166222.6555979	best: 166222.6555979 (0)	total: 186ms	remaining: 4m 39s
100:	learn: 36732.1578962	test: 36043.3448837	best: 36043.3448837 (100)	total: 13.2s	remaining: 3m 2s
200:	learn: 31417.1342968	test: 31172.9752794	best: 31172.9752794 (200)	total: 29.2s	remaining: 3m 8s
300:	learn: 29022.8446598	test: 29218.3696532	best: 29218.3696532 (300)	total: 45.4s	remaining: 3m
400:	learn: 27530.2574509	test: 28120.4143248	best: 28120.4143248 (400)	total: 1m 2s	remaining: 2m 50s
500:	learn: 26462.4033184	test: 27463.5042986	best: 27463.5042986 (500)	total: 1m 18s	remaining: 2m 36s
600:	learn: 25683.3905493	test: 27092.0498052	best: 27092.0498052 (600)	total: 1m 35s	remaining: 2m 22s
700:	learn: 25006.5328895	test: 26812.0127082	best: 26812.0127082 (700)	total: 1m 52s	remaining: 2m 8s
800:	learn: 24409.1680859	test: 26600.0643875	best: 26600.0643875 (800)	total: 2m 8s	remaining: 1m 52s
900:	learn: 23861.0869066	test: 26416.8234122	best: 26416.8234122 (900)	total:

UnicodeDecodeError: 'utf-8' codec can't decode byte 0x92 in position 21: invalid start byte

In [18]:
import pickle

# Sauvegarde avec pickle
with open("catboost_model.pkl", "wb") as f:
    pickle.dump(model, f)


In [11]:
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error

rmse = root_mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(f'RMSE: {rmse:.2f}')
print(f'MAE: {mae:.2f}')
print(f'R² Score: {r2:.4f}')

RMSE: 27484.82
MAE: 19890.03
R² Score: 0.9764


In [12]:
# on regarde les features importante du modele catboost

feature_importances = model.get_feature_importance(train_pool)

feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print(f'{name}: {score:.2f}')

year: 21.58
town: 15.90
floor_area_sqm: 15.43
flat_type: 13.89
lease_commence_date: 13.56
flat_model: 8.26
street_name: 6.34
storey_range: 4.08
month: 0.77
remaining_lease_years: 0.19


In [None]:
# On fait de la recherche d'hyperparametres avec quelques epochs pour voir si on peut ameliorer le modele
# Je te laisse le faire avec plus d'iterations


from sklearn.model_selection import GridSearchCV

param_grid = {
    'learning_rate': [0.1],
    'depth': [8, 10],
    'l2_leaf_reg': [5, 10],
}

model = CatBoostRegressor(iterations=100, loss_function='RMSE', eval_metric='RMSE', cat_features=cat_features, verbose=0)
grid_search = GridSearchCV(model, param_grid, cv=None, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

print(f'Best parameters: {grid_search.best_params_}')
print(f'Best score: {grid_search.best_score_}')

Best parameters: {'depth': 10, 'l2_leaf_reg': 5, 'learning_rate': 0.1}
Best score: -1607875981.532204


In [None]:
#On enregistre le meilleur modele 

#best_model = grid_search.best_estimator_
#best_model.save_model('services/models/housing_model.cbm')