# Data cleaning

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import chi2_contingency


from scipy import stats
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RepeatedKFold
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.metrics import mean_squared_error, mean_absolute_error

from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor, StackingRegressor
from sklearn.neighbors import KNeighborsRegressor  # Utilisez KNeighborsClassifier pour la classification

import xgboost as xgb
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
import lightgbm as lgb

pd.options.display.max_columns = 45            #just extending the number  of columns that can be seen
pd.options.display.max_colwidth = 120  

In [2]:
# load json as a dataframe with pandas
data = pd.read_json("data/final_dataset.json")

In [3]:
data.columns

Index(['Url', 'BathroomCount', 'BedroomCount', 'ConstructionYear', 'Country',
       'District', 'Fireplace', 'FloodingZone', 'Furnished', 'Garden',
       'GardenArea', 'Kitchen', 'LivingArea', 'Locality', 'MonthlyCharges',
       'NumberOfFacades', 'PEB', 'PostalCode', 'Price', 'PropertyId',
       'Province', 'Region', 'RoomCount', 'ShowerCount', 'StateOfBuilding',
       'SubtypeOfProperty', 'SurfaceOfPlot', 'SwimmingPool', 'Terrace',
       'ToiletCount', 'TypeOfProperty', 'TypeOfSale'],
      dtype='object')

In [4]:
data.head()

Unnamed: 0,Url,BathroomCount,BedroomCount,ConstructionYear,Country,District,Fireplace,FloodingZone,Furnished,Garden,GardenArea,Kitchen,LivingArea,Locality,MonthlyCharges,NumberOfFacades,PEB,PostalCode,Price,PropertyId,Province,Region,RoomCount,ShowerCount,StateOfBuilding,SubtypeOfProperty,SurfaceOfPlot,SwimmingPool,Terrace,ToiletCount,TypeOfProperty,TypeOfSale
2,https://www.immoweb.be/en/classified/apartment/for-sale/zeebrugge/8380/10957010,1.0,1,1969.0,Belgium,Brugge,,,,,,,29.0,Zeebrugge,,,,8380,99000,10957010,West Flanders,Flanders,1.0,0.0,GOOD,flat_studio,,,1.0,1.0,2,residential_sale
6,https://www.immoweb.be/en/classified/house/for-sale/tournai/7500/10956841,6.0,13,1920.0,Belgium,Tournai,,,0.0,,,,391.0,Tournai,,3.0,D,7500,765000,10956841,Hainaut,Wallonie,31.0,,GOOD,apartment_block,130.0,,,5.0,1,residential_sale
8,https://www.immoweb.be/en/classified/house/for-sale/blankenberge/8370/10956807,2.0,4,2008.0,Belgium,Brugge,,NON_FLOOD_ZONE,1.0,,,INSTALLED,111.0,BLANKENBERGE,,2.0,B,8370,399000,10956807,West Flanders,Flanders,,0.0,GOOD,house,0.0,,,2.0,1,residential_sale
10,https://www.immoweb.be/en/classified/house/for-sale/de-panne/8660/10956664,1.0,4,,Belgium,Veurne,,,,1.0,1.0,,,De Panne,,2.0,F,8660,230000,10956664,West Flanders,Flanders,,,TO_BE_DONE_UP,house,170.0,0.0,1.0,2.0,1,residential_sale
11,https://www.immoweb.be/en/classified/apartment/for-sale/hasselt/3500/10956668,0.0,2,1972.0,Belgium,Hasselt,,NON_FLOOD_ZONE,,,,,92.0,Hasselt,,,B,3500,198000,10956668,Limburg,Flanders,1.0,0.0,AS_NEW,apartment,,,1.0,1.0,2,residential_sale


In [5]:
postal_codes_to_keep = pd.read_csv('data/postalcode_be.csv')
postal_codes_to_keep_list = postal_codes_to_keep['Code postal'].tolist()
data = data[data['PostalCode'].isin(postal_codes_to_keep_list)]

In [6]:
data.drop_duplicates("PropertyId",inplace=True)
data.drop(data[data.ConstructionYear > 2033].index,inplace=True)
data.drop(data[data.PostalCode < 1000].index,inplace=True)
data.drop(data[data.GardenArea > 200000].index,inplace=True)
data.drop(data[data.LivingArea > 4500].index, inplace=True)
data.drop(data[data.Price > 4000000].index,inplace=True)
data.drop(data[data.NumberOfFacades > 8].index,inplace=True)
data.drop(data[data.ShowerCount > 58].index,inplace=True)
data.drop(data[data.SurfaceOfPlot > 200000].index,inplace=True)
data.drop(data[data.ToiletCount > 25].index,inplace=True)
data.drop(data[data.District == None].index,inplace=True)
data.drop(data[data.Province == None].index,inplace=True)
data.drop(data[data.Region == None].index,inplace=True)
data.drop(data[data.Locality == None].index,inplace=True)
data.drop(data[data.TypeOfSale == "annuity_monthly_amount"].index,inplace=True)
data.drop(data[data.TypeOfSale == "annuity_without_lump_sum"].index,inplace=True)
data.drop(data[data.TypeOfSale == "annuity_lump_sum"].index,inplace=True)
data.drop(data[data.TypeOfSale == "homes_to_build"].index,inplace=True)

In [7]:
data_sales = data[data.TypeOfSale == "residential_sale"]
data_rent = data[data.TypeOfSale == "residential_monthly_rent"]

In [8]:
data_sales.drop(['MonthlyCharges'], axis=1, inplace=True)
data_sales['Fireplace'].fillna(0, inplace=True)
data_sales['Garden'].fillna(0, inplace=True)
data_sales['GardenArea'].fillna(data_sales['GardenArea'].median(), inplace=True)
data_sales['Furnished'].fillna(0, inplace=True)
data_sales['SwimmingPool'].fillna(0, inplace=True)
data_sales['ShowerCount'].fillna(0, inplace=True)
data_sales['FloodingZone'].fillna('NON_FLOOD_ZONE', inplace=True)
data_sales['SurfaceOfPlot'].fillna(data_sales['SurfaceOfPlot'].median(), inplace=True)
data_sales['Kitchen'].fillna('NOT_INSTALLED', inplace=True)
data_sales['Terrace'].fillna(0, inplace=True)
data_sales['NumberOfFacades'].fillna(2, inplace=True)
data_sales['ToiletCount'].fillna(0, inplace=True)
data_sales['BathroomCount'].fillna(data_sales['BathroomCount'].median(), inplace=True)
data_sales['StateOfBuilding'].fillna('GOOD', inplace=True)
data_sales['PEB'].fillna('Unknown', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sales.drop(['MonthlyCharges'], axis=1, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sales['Fireplace'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sales['Garden'].fillna(0, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sales['Garde

In [9]:
#correlation_matrix = data_sales.corr()
#plt.figure(figsize=(20,12))
#sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f")
#plt.show()

In [10]:
keep_PEB = ['A++', 'A+', 'B', 'C', 'D', 'E', 'F', 'G']
data_sales = data_sales[data_sales['PEB'].isin(keep_PEB)]
# order of the PEB
peb_order = ['A++', 'A+', 'B', 'C', 'D', 'E', 'F', 'G', 'Unknown']
ordinal_encoder = OrdinalEncoder(categories=[peb_order])
data_sales.loc[:, 'PEB_Encoded'] = ordinal_encoder.fit_transform(data_sales[['PEB']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sales.loc[:, 'PEB_Encoded'] = ordinal_encoder.fit_transform(data_sales[['PEB']])


In [11]:
state_order = ['AS_NEW', 'JUST_RENOVATED', 'GOOD', 'TO_BE_DONE_UP', 'TO_RENOVATE', 'TO_RESTORE', 'Unknown']
ordinal_encoder = OrdinalEncoder(categories=[state_order])
data_sales.loc[:, 'State_Encoded'] = ordinal_encoder.fit_transform(data_sales[['StateOfBuilding']])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sales.loc[:, 'State_Encoded'] = ordinal_encoder.fit_transform(data_sales[['StateOfBuilding']])


In [12]:
def transform_flooding_zone(zone):
    if zone == 'NON_FLOOD_ZONE':
        return 0
    else:
        return 1

In [13]:
data_sales['FloodingZone_Encoded'] = data_sales['FloodingZone'].apply(transform_flooding_zone)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sales['FloodingZone_Encoded'] = data_sales['FloodingZone'].apply(transform_flooding_zone)


In [14]:
data_sales['LivingArea'].fillna(data_sales['LivingArea'].median(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_sales['LivingArea'].fillna(data_sales['LivingArea'].median(), inplace=True)


In [15]:
data_sales = data_sales.drop(columns=['Url', 'Country', 'TypeOfSale', 'PropertyId', 'TypeOfProperty', 'PostalCode'])

# Pipeline of Machine Learning

In [22]:
data_sales.columns

Index(['BathroomCount', 'BedroomCount', 'ConstructionYear', 'District',
       'Fireplace', 'FloodingZone', 'Furnished', 'Garden', 'GardenArea',
       'Kitchen', 'LivingArea', 'Locality', 'NumberOfFacades', 'PEB', 'Price',
       'Province', 'Region', 'RoomCount', 'ShowerCount', 'StateOfBuilding',
       'SubtypeOfProperty', 'SurfaceOfPlot', 'SwimmingPool', 'Terrace',
       'ToiletCount', 'PEB_Encoded', 'State_Encoded', 'FloodingZone_Encoded'],
      dtype='object')

In [23]:
columns_to_keep = [
    'BathroomCount', 'BedroomCount', 'LivingArea', 'SurfaceOfPlot', 'ToiletCount', 'PEB', 'State_Encoded',
    'District', 'FloodingZone', 'Kitchen', 'Province', 'Region', 'SubtypeOfProperty', 'Garden',
    'SwimmingPool', 'Price'
]

In [24]:
data_sales = data_sales[columns_to_keep]

In [25]:
ode_cols = ['BathroomCount', 'BedroomCount', 'LivingArea', 'SurfaceOfPlot', 'ToiletCount', 'State_Encoded']
ohe_cols = ['District', 'FloodingZone', 'Kitchen', 'Province', 'Region', 'SubtypeOfProperty', 'PEB']
num_cols = ['Garden', 'SwimmingPool'] 

In [26]:
num_pipeline = Pipeline(steps=[
    ('impute', KNNImputer()),
    ('scaler', StandardScaler())
])
ode_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('scaler', StandardScaler()) 
])
ohe_pipeline = Pipeline(steps=[
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('ohe', OneHotEncoder(handle_unknown='ignore', sparse_output=False))
])

In [27]:
col_trans = ColumnTransformer(transformers=[
    ('num_p', num_pipeline, num_cols),
    ('ode_p', ode_pipeline, ode_cols),
    ('ohe_p', ohe_pipeline, ohe_cols)
], remainder='passthrough', n_jobs=-1)

In [28]:
# Preprocessing pipeline
pipeline = Pipeline(steps=[
    ('preprocessing', col_trans),
    ('model', CatBoostRegressor())
])

In [29]:
X = data_sales.drop('Price', axis=1)
y = data_sales['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=25)

In [30]:
param_grid = {
    'preprocessing__num_p__impute__n_neighbors': [3, 5, 7, 9, 11]
}
grid_search = GridSearchCV(pipeline, 
                            param_grid,
                            scoring = 'accuracy', 
                            cv = 5, # Use 5 folds
                            verbose = 0,
                            n_jobs = -1 #Use all but one CPU core
                          )

grid_search.fit(X_train, y_train)
best_k = grid_search.best_params_['preprocessing__num_p__impute__n_neighbors']
# print("Best k for KNNImputer:", best_k)

Learning rate set to 0.073011
0:	learn: 365483.6430039	total: 48.9ms	remaining: 48.8s
1:	learn: 353603.1741320	total: 51.2ms	remaining: 25.5s
2:	learn: 342568.8592857	total: 53.3ms	remaining: 17.7s
3:	learn: 332791.8899623	total: 55.2ms	remaining: 13.7s
4:	learn: 323519.8194804	total: 67.7ms	remaining: 13.5s
5:	learn: 315303.1543038	total: 85.8ms	remaining: 14.2s
6:	learn: 308047.2231193	total: 100ms	remaining: 14.2s
7:	learn: 300805.5986990	total: 107ms	remaining: 13.3s
8:	learn: 294561.5509217	total: 118ms	remaining: 13s
9:	learn: 289056.9655786	total: 126ms	remaining: 12.5s
10:	learn: 283643.2752478	total: 166ms	remaining: 14.9s
11:	learn: 278724.7268182	total: 182ms	remaining: 15s
12:	learn: 274243.6361237	total: 195ms	remaining: 14.8s
13:	learn: 270019.2509255	total: 211ms	remaining: 14.8s
14:	learn: 266273.3411755	total: 231ms	remaining: 15.2s
15:	learn: 262728.4455740	total: 239ms	remaining: 14.7s
16:	learn: 259428.7756742	total: 254ms	remaining: 14.7s
17:	learn: 256397.1290290	

Traceback (most recent call last):
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y

721:	learn: 157038.2631986	total: 11.2s	remaining: 4.3s
722:	learn: 156986.0080338	total: 11.2s	remaining: 4.28s
723:	learn: 156968.8770709	total: 11.2s	remaining: 4.26s
724:	learn: 156950.8136034	total: 11.2s	remaining: 4.25s
725:	learn: 156876.6790129	total: 11.2s	remaining: 4.24s
726:	learn: 156832.4100512	total: 11.2s	remaining: 4.22s
727:	learn: 156800.8712919	total: 11.2s	remaining: 4.2s
728:	learn: 156772.6729729	total: 11.2s	remaining: 4.18s
729:	learn: 156771.9340356	total: 11.2s	remaining: 4.16s
730:	learn: 156745.9731501	total: 11.3s	remaining: 4.15s
731:	learn: 156745.2711935	total: 11.3s	remaining: 4.13s
732:	learn: 156733.1936156	total: 11.3s	remaining: 4.11s
733:	learn: 156684.3332353	total: 11.3s	remaining: 4.09s
734:	learn: 156677.9060250	total: 11.3s	remaining: 4.07s
735:	learn: 156638.1128975	total: 11.3s	remaining: 4.06s
736:	learn: 156612.5218895	total: 11.3s	remaining: 4.04s
737:	learn: 156591.8647178	total: 11.3s	remaining: 4.02s
738:	learn: 156558.3353082	total:

Traceback (most recent call last):
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y

721:	learn: 157038.2631986	total: 11.5s	remaining: 4.43s
722:	learn: 156986.0080338	total: 11.7s	remaining: 4.47s
723:	learn: 156968.8770709	total: 11.7s	remaining: 4.45s
724:	learn: 156950.8136034	total: 11.7s	remaining: 4.43s
725:	learn: 156876.6790129	total: 11.7s	remaining: 4.41s
726:	learn: 156832.4100512	total: 11.7s	remaining: 4.4s
727:	learn: 156800.8712919	total: 11.7s	remaining: 4.38s
728:	learn: 156772.6729729	total: 11.7s	remaining: 4.36s
729:	learn: 156771.9340356	total: 11.7s	remaining: 4.34s
730:	learn: 156745.9731501	total: 11.7s	remaining: 4.32s
731:	learn: 156745.2711935	total: 11.7s	remaining: 4.3s
732:	learn: 156733.1936156	total: 11.7s	remaining: 4.28s
733:	learn: 156684.3332353	total: 11.8s	remaining: 4.29s
734:	learn: 156677.9060250	total: 11.8s	remaining: 4.27s
735:	learn: 156638.1128975	total: 11.9s	remaining: 4.26s
736:	learn: 156612.5218895	total: 11.9s	remaining: 4.24s
737:	learn: 156591.8647178	total: 11.9s	remaining: 4.22s
738:	learn: 156558.3353082	total:

Traceback (most recent call last):
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y

721:	learn: 157038.2631986	total: 11.8s	remaining: 4.55s
722:	learn: 156986.0080338	total: 11.8s	remaining: 4.53s
723:	learn: 156968.8770709	total: 11.8s	remaining: 4.51s
724:	learn: 156950.8136034	total: 11.8s	remaining: 4.49s
725:	learn: 156876.6790129	total: 11.8s	remaining: 4.47s
726:	learn: 156832.4100512	total: 11.9s	remaining: 4.45s
727:	learn: 156800.8712919	total: 11.9s	remaining: 4.44s
728:	learn: 156772.6729729	total: 11.9s	remaining: 4.42s
729:	learn: 156771.9340356	total: 12s	remaining: 4.46s
730:	learn: 156745.9731501	total: 12.1s	remaining: 4.46s
731:	learn: 156745.2711935	total: 12.1s	remaining: 4.44s
732:	learn: 156733.1936156	total: 12.2s	remaining: 4.43s
733:	learn: 156684.3332353	total: 12.2s	remaining: 4.42s
734:	learn: 156677.9060250	total: 12.2s	remaining: 4.39s
735:	learn: 156638.1128975	total: 12.2s	remaining: 4.39s
736:	learn: 156612.5218895	total: 12.2s	remaining: 4.37s
737:	learn: 156591.8647178	total: 12.3s	remaining: 4.36s
738:	learn: 156558.3353082	total:

Traceback (most recent call last):
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y

722:	learn: 156821.8979109	total: 12.4s	remaining: 4.75s
723:	learn: 156795.2873578	total: 12.4s	remaining: 4.74s
724:	learn: 156781.7787823	total: 12.5s	remaining: 4.73s
725:	learn: 156756.8835705	total: 12.5s	remaining: 4.71s
726:	learn: 156729.6435994	total: 12.5s	remaining: 4.7s
727:	learn: 156702.7967823	total: 12.5s	remaining: 4.68s
728:	learn: 156682.2285762	total: 12.5s	remaining: 4.66s
729:	learn: 156660.8265160	total: 12.5s	remaining: 4.63s
730:	learn: 156629.6864220	total: 12.5s	remaining: 4.61s
731:	learn: 156555.1827373	total: 12.5s	remaining: 4.59s
732:	learn: 156542.2955212	total: 12.5s	remaining: 4.57s
733:	learn: 156510.6827618	total: 12.5s	remaining: 4.54s
734:	learn: 156482.5531362	total: 12.5s	remaining: 4.52s
735:	learn: 156463.0230798	total: 12.5s	remaining: 4.5s
736:	learn: 156439.8570477	total: 12.6s	remaining: 4.48s
737:	learn: 156410.5379034	total: 12.6s	remaining: 4.46s
738:	learn: 156375.4230795	total: 12.6s	remaining: 4.44s
739:	learn: 156356.7158241	total:

Traceback (most recent call last):
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y

722:	learn: 154259.9388236	total: 12.4s	remaining: 4.77s
723:	learn: 154256.8577243	total: 12.5s	remaining: 4.75s
724:	learn: 154236.4694276	total: 12.6s	remaining: 4.76s
725:	learn: 154235.0397548	total: 12.6s	remaining: 4.74s
726:	learn: 154205.2488975	total: 12.7s	remaining: 4.75s
727:	learn: 154185.2015014	total: 12.7s	remaining: 4.74s
728:	learn: 154143.3438372	total: 12.7s	remaining: 4.72s
729:	learn: 154098.2085782	total: 12.7s	remaining: 4.7s
730:	learn: 154041.0608058	total: 12.7s	remaining: 4.68s
731:	learn: 154005.7096230	total: 12.7s	remaining: 4.66s
732:	learn: 153865.1057147	total: 12.8s	remaining: 4.66s
733:	learn: 153859.5078950	total: 12.8s	remaining: 4.64s
734:	learn: 153821.6037699	total: 12.8s	remaining: 4.62s
735:	learn: 153788.4890208	total: 12.8s	remaining: 4.6s
736:	learn: 153778.4609347	total: 12.8s	remaining: 4.58s
737:	learn: 153660.3434012	total: 12.8s	remaining: 4.55s
738:	learn: 153597.1505610	total: 12.8s	remaining: 4.53s
739:	learn: 153554.6522891	total:

Traceback (most recent call last):
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y

721:	learn: 153982.1232859	total: 13.1s	remaining: 5.06s
722:	learn: 153967.9027307	total: 13.2s	remaining: 5.05s
723:	learn: 153932.7158393	total: 13.2s	remaining: 5.03s
724:	learn: 153912.0032659	total: 13.2s	remaining: 5.01s
725:	learn: 153898.1721212	total: 13.2s	remaining: 4.99s
726:	learn: 153863.9059502	total: 13.2s	remaining: 4.96s
727:	learn: 153854.5580021	total: 13.2s	remaining: 4.94s
728:	learn: 153845.8764575	total: 13.2s	remaining: 4.92s
729:	learn: 153837.8054722	total: 13.2s	remaining: 4.89s
730:	learn: 153798.9708321	total: 13.2s	remaining: 4.87s
731:	learn: 153738.9880265	total: 13.2s	remaining: 4.85s
732:	learn: 153632.4637364	total: 13.2s	remaining: 4.82s
733:	learn: 153596.1054961	total: 13.2s	remaining: 4.8s
734:	learn: 153576.2389763	total: 13.3s	remaining: 4.78s
735:	learn: 153568.7146747	total: 13.3s	remaining: 4.75s
736:	learn: 153526.5060586	total: 13.3s	remaining: 4.73s
737:	learn: 153505.5916587	total: 13.3s	remaining: 4.71s
738:	learn: 153486.3542142	total

Traceback (most recent call last):
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y

153:	learn: 191422.4727545	total: 1.54s	remaining: 8.48s
154:	learn: 191250.5987271	total: 1.56s	remaining: 8.51s
155:	learn: 191140.4396835	total: 1.56s	remaining: 8.46s
156:	learn: 190835.6575657	total: 1.57s	remaining: 8.41s
157:	learn: 190712.7907464	total: 1.57s	remaining: 8.36s
158:	learn: 190579.1303940	total: 1.57s	remaining: 8.31s
159:	learn: 190437.9602371	total: 1.57s	remaining: 8.26s
160:	learn: 190229.1949268	total: 1.58s	remaining: 8.21s
161:	learn: 190046.3786022	total: 1.58s	remaining: 8.19s
162:	learn: 189902.0785599	total: 1.59s	remaining: 8.16s
163:	learn: 189749.4464416	total: 1.6s	remaining: 8.14s
164:	learn: 189644.8866047	total: 1.6s	remaining: 8.11s
165:	learn: 189529.2196152	total: 1.62s	remaining: 8.13s
166:	learn: 189391.4340686	total: 1.62s	remaining: 8.09s
167:	learn: 189220.9405533	total: 1.63s	remaining: 8.07s
168:	learn: 189145.2071720	total: 1.64s	remaining: 8.05s
169:	learn: 189011.9347267	total: 1.64s	remaining: 8.02s
170:	learn: 188910.0767702	total:

Traceback (most recent call last):
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y

297:	learn: 176183.1483052	total: 2.54s	remaining: 5.99s
298:	learn: 176139.4296324	total: 2.54s	remaining: 5.97s
299:	learn: 175994.0105235	total: 2.55s	remaining: 5.95s
300:	learn: 175930.8323395	total: 2.55s	remaining: 5.93s
301:	learn: 175876.0080100	total: 2.56s	remaining: 5.92s
302:	learn: 175815.3609267	total: 2.58s	remaining: 5.93s
303:	learn: 175752.6740728	total: 2.59s	remaining: 5.93s
304:	learn: 175703.5755679	total: 2.59s	remaining: 5.91s
305:	learn: 175651.8378227	total: 2.61s	remaining: 5.91s
306:	learn: 175498.8866515	total: 2.61s	remaining: 5.89s
307:	learn: 175430.8693163	total: 2.61s	remaining: 5.87s
308:	learn: 175370.0897233	total: 2.62s	remaining: 5.85s
309:	learn: 175328.8200717	total: 2.62s	remaining: 5.83s
310:	learn: 175275.6890205	total: 2.62s	remaining: 5.81s
311:	learn: 175228.5983368	total: 2.63s	remaining: 5.79s
312:	learn: 175140.6028457	total: 2.63s	remaining: 5.77s
313:	learn: 175090.3613442	total: 2.64s	remaining: 5.76s
314:	learn: 174978.7553626	tota

Traceback (most recent call last):
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y

585:	learn: 161711.5506768	total: 3.05s	remaining: 2.15s
586:	learn: 161672.2226150	total: 3.05s	remaining: 2.15s
587:	learn: 161588.6380713	total: 3.05s	remaining: 2.14s
588:	learn: 161549.5112350	total: 3.05s	remaining: 2.13s
589:	learn: 161520.2076977	total: 3.05s	remaining: 2.12s
590:	learn: 161387.7732316	total: 3.05s	remaining: 2.11s
591:	learn: 161342.4029270	total: 3.05s	remaining: 2.1s
592:	learn: 161325.9960708	total: 3.06s	remaining: 2.1s
593:	learn: 161274.1595843	total: 3.06s	remaining: 2.09s
594:	learn: 161227.7405899	total: 3.06s	remaining: 2.08s
595:	learn: 161187.6294430	total: 3.06s	remaining: 2.07s
596:	learn: 161158.9582399	total: 3.06s	remaining: 2.06s
597:	learn: 161154.1250431	total: 3.06s	remaining: 2.06s
598:	learn: 161116.5734373	total: 3.06s	remaining: 2.05s
599:	learn: 161025.0114248	total: 3.06s	remaining: 2.04s
600:	learn: 160988.0844828	total: 3.06s	remaining: 2.03s
601:	learn: 160939.9765230	total: 3.06s	remaining: 2.02s
602:	learn: 160914.5535331	total:

Traceback (most recent call last):
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 971, in _score
    scores = scorer(estimator, X_test, y_test, **score_params)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 279, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/utils/_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "/home/databroma/miniconda3/envs/ml_env/lib/python3.10/site-packages/sklearn/metrics/_classification.py", line 231, in accuracy_score
    y_type, y_true, y_pred = _check_targets(y

Learning rate set to 0.075631
0:	learn: 364603.5044118	total: 48ms	remaining: 47.9s
1:	learn: 351701.1337919	total: 49.1ms	remaining: 24.5s
2:	learn: 340266.0498344	total: 50.1ms	remaining: 16.6s
3:	learn: 329972.7673114	total: 51ms	remaining: 12.7s
4:	learn: 320698.9862793	total: 52ms	remaining: 10.3s
5:	learn: 312421.6158208	total: 53ms	remaining: 8.78s
6:	learn: 304951.3192007	total: 54ms	remaining: 7.65s
7:	learn: 298084.4266384	total: 54.9ms	remaining: 6.81s
8:	learn: 291814.2362264	total: 55.9ms	remaining: 6.15s
9:	learn: 286265.5850116	total: 56.9ms	remaining: 5.63s
10:	learn: 281337.8218006	total: 58ms	remaining: 5.21s
11:	learn: 276571.9530858	total: 59.1ms	remaining: 4.86s
12:	learn: 272550.3694239	total: 60ms	remaining: 4.56s
13:	learn: 268282.0556861	total: 61.1ms	remaining: 4.3s
14:	learn: 264698.4433195	total: 62.1ms	remaining: 4.08s
15:	learn: 261243.0287226	total: 63.2ms	remaining: 3.88s
16:	learn: 258129.7390716	total: 64.2ms	remaining: 3.71s
17:	learn: 254764.6382748	

In [31]:
pipeline.set_params(preprocessing__num_p__impute__n_neighbors=best_k)
pipeline.fit(X_train, y_train)
y_pred_lr = pipeline.predict(X_test)

Learning rate set to 0.075631
0:	learn: 364603.5044118	total: 1.55ms	remaining: 1.55s
1:	learn: 351701.1337919	total: 2.68ms	remaining: 1.34s
2:	learn: 340266.0498344	total: 3.77ms	remaining: 1.25s
3:	learn: 329972.7673114	total: 4.77ms	remaining: 1.19s
4:	learn: 320698.9862793	total: 5.79ms	remaining: 1.15s
5:	learn: 312421.6158208	total: 6.78ms	remaining: 1.12s
6:	learn: 304951.3192007	total: 7.79ms	remaining: 1.1s
7:	learn: 298084.4266384	total: 8.77ms	remaining: 1.09s
8:	learn: 291814.2362264	total: 9.76ms	remaining: 1.07s
9:	learn: 286265.5850116	total: 10.8ms	remaining: 1.06s
10:	learn: 281337.8218006	total: 11.8ms	remaining: 1.06s
11:	learn: 276571.9530858	total: 12.9ms	remaining: 1.06s
12:	learn: 272550.3694239	total: 13.9ms	remaining: 1.06s
13:	learn: 268282.0556861	total: 15ms	remaining: 1.05s
14:	learn: 264698.4433195	total: 16ms	remaining: 1.05s
15:	learn: 261243.0287226	total: 17ms	remaining: 1.05s
16:	learn: 258129.7390716	total: 18.1ms	remaining: 1.04s
17:	learn: 254764.

In [32]:
# Calcul des scores
train_score = pipeline.score(X_train, y_train)
test_score = pipeline.score(X_test, y_test)
mae = mean_absolute_error(y_test, y_pred_lr)

print("Train Score: ", train_score)
print("Test Score: ", test_score)
print("Mean Absolute Error: ", mae)

Train Score:  0.8432126206539872
Test Score:  0.7667510061691452
Mean Absolute Error:  94131.36902736657
866:	learn: 150705.4255348	total: 15.4s	remaining: 2.35s
867:	learn: 150691.8559404	total: 15.4s	remaining: 2.34s
868:	learn: 150669.8828615	total: 15.4s	remaining: 2.32s
869:	learn: 150621.8535357	total: 15.4s	remaining: 2.3s
870:	learn: 150599.1199823	total: 15.4s	remaining: 2.28s
871:	learn: 150578.9891455	total: 15.4s	remaining: 2.27s
872:	learn: 150570.2437772	total: 15.5s	remaining: 2.25s
873:	learn: 150525.7308110	total: 15.5s	remaining: 2.23s
874:	learn: 150492.5795631	total: 15.5s	remaining: 2.21s
875:	learn: 150396.8774322	total: 15.5s	remaining: 2.2s
876:	learn: 150374.7426645	total: 15.5s	remaining: 2.18s
877:	learn: 150333.8099171	total: 15.5s	remaining: 2.16s
878:	learn: 150315.6608739	total: 15.6s	remaining: 2.15s
879:	learn: 150297.2128906	total: 15.6s	remaining: 2.13s
880:	learn: 150279.7066816	total: 15.6s	remaining: 2.11s
881:	learn: 150259.4225632	total: 15.6s	re