In [1]:
# Imports 

import pandas as pd
import os
import requests
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [2]:
# Paths 

DATA_DIR_PATH = 'data'

In [3]:
# Import DVF data for Saint-Malo (https://app.dvf.etalab.gouv.fr/) 

csv_file_list = [file.name for file in os.scandir(DATA_DIR_PATH) if file.name.endswith('.csv')]

df = pd.DataFrame()

transac_count = 0

for file in csv_file_list:
    new_file = pd.read_csv(os.path.join(DATA_DIR_PATH, file), sep=';')
    new_file['section']=file[9:11]
    transac_count += len(new_file)
    df = pd.concat([df,new_file], ignore_index=True)

#transac_count == len(df)

In [4]:
df.set_index('id_mutation', inplace=True)

In [5]:
df.date_mutation = pd.to_datetime(df.date_mutation)

In [6]:
df.surface_terrain = df.surface_terrain.fillna(0)
df.nombre_pieces_principales = df.nombre_pieces_principales.fillna(0)

In [7]:
#df['nombre_lots'].isna().sum()
#df['type_local'].isna().sum()
#df['surface_reelle_bati'].isna().sum()
#df['nombre_pieces_principales'].isna().sum()

In [8]:
df.section_prefixe = df.section_prefixe.apply(lambda x: x.replace('0', ''))

In [9]:
# drop unnecessary data

df.drop(columns=['numero_disposition', 'code_commune', 'code_departement', 'ancien_code_commune', 'ancien_nom_commune', 
                 'ancien_id_parcelle', 'adresse_suffixe','adresse_code_voie', 'lot1_numero', 'lot2_numero', 'lot3_numero', 
                 'lot4_numero', 'lot5_numero', 'lot1_surface_carrez', 'lot2_surface_carrez', 'lot3_surface_carrez', 'section_prefixe',
                 'lot4_surface_carrez', 'lot5_surface_carrez', 'nom_commune', 'id_parcelle', 'ancien_id_parcelle', 'numero_volume', 
                 'code_type_local', 'nature_culture', 'code_nature_culture', 'code_nature_culture_speciale', 'nature_culture_speciale']
        , inplace=True)

In [10]:
# Add inondation risk information (info from https://georisques.gouv.fr/)

quartier_risque_df = pd.read_csv('section-inondable.csv', sep=';')
df = pd.merge(df, quartier_risque_df, how='left', left_on='section',right_on='section_prefixe')

In [11]:
# Filter according to transaction type, date, and reshuffle

df_filtered = df[(df.nature_mutation.isin(['Vente', "Vente en l'état futur d'achèvement"])) &
         (df.type_local.isin(['Appartement', 'Maison'])) &
         (df.date_mutation > '2023-01-01')]

df_filtered = df_filtered.sample(frac=1, random_state=42).reset_index(False)

In [12]:
df_filtered['maison'] = (df_filtered['type_local'] == "Maison").astype(int)

In [13]:
df.drop(columns=['type_local'], inplace=True)

In [14]:
# Create X and y

X = df_filtered[['nombre_lots', 'maison', 'surface_reelle_bati', 'nombre_pieces_principales', 'surface_terrain', 'latitude', 'longitude','risque']]
y = df_filtered['valeur_fonciere']

In [15]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 842 entries, 0 to 841
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   nombre_lots                842 non-null    int64  
 1   maison                     842 non-null    int64  
 2   surface_reelle_bati        842 non-null    float64
 3   nombre_pieces_principales  842 non-null    float64
 4   surface_terrain            842 non-null    float64
 5   latitude                   839 non-null    float64
 6   longitude                  839 non-null    float64
 7   risque                     842 non-null    int64  
dtypes: float64(5), int64(3)
memory usage: 52.8 KB


In [16]:
# Create train and test sets

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [17]:
regressor=XGBRegressor(eval_metric='mse')

In [18]:
from sklearn.model_selection import cross_val_score

In [19]:
cv_scores = cross_val_score(regressor, X_train, y_train, cv=5, scoring='r2')

ValueError: 
All the 5 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
4 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/sklearn.py", line 1055, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/sklearn.py", line 521, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/sklearn.py", line 958, in _create_dmatrix
    return QuantileDMatrix(
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 1529, in __init__
    self._init(
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 1588, in _init
    it.reraise()
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 576, in reraise
    raise exc  # pylint: disable=raising-bad-type
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 557, in _handle_exception
    return fn()
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 641, in <lambda>
    return self._handle_exception(lambda: self.next(input_data), 0)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/data.py", line 1280, in next
    input_data(**self.kwargs)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 633, in input_data
    self.proxy.set_info(
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 932, in set_info
    self.set_label(label)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 1070, in set_label
    dispatch_meta_backend(self, label, "label", "float")
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/data.py", line 1225, in dispatch_meta_backend
    _meta_from_pandas_series(data, name, dtype, handle)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/data.py", line 545, in _meta_from_pandas_series
    _meta_from_numpy(data, name, dtype, handle)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/data.py", line 1159, in _meta_from_numpy
    _check_call(_LIB.XGDMatrixSetInfoFromInterface(handle, c_str(field), interface_str))
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 282, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [14:13:49] /Users/runner/work/xgboost/xgboost/src/data/data.cc:507: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000123990355 dmlc::LogMessageFatal::~LogMessageFatal() + 117
  [bt] (1) 2   libxgboost.dylib                    0x0000000123a66db9 xgboost::MetaInfo::SetInfoFromHost(xgboost::Context const&, xgboost::StringView, xgboost::Json) + 3801
  [bt] (2) 3   libxgboost.dylib                    0x0000000123a65df2 xgboost::MetaInfo::SetInfo(xgboost::Context const&, xgboost::StringView, xgboost::StringView) + 146
  [bt] (3) 4   libxgboost.dylib                    0x00000001239a8729 XGDMatrixSetInfoFromInterface + 233
  [bt] (4) 5   libffi.dylib                        0x00007ff816a97882 ffi_call_unix64 + 82
  [bt] (5) 6   ???                                 0x00007ff7b557c120 0x0 + 140701876076832



--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/sklearn.py", line 1090, in fit
    self._Booster = train(
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 730, in inner_f
    return func(**kwargs)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/training.py", line 181, in train
    bst.update(dtrain, i, obj)
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 2050, in update
    _check_call(
  File "/Users/marionchaffaut/.pyenv/versions/3.10.6/envs/personal-projects/lib/python3.10/site-packages/xgboost/core.py", line 282, in _check_call
    raise XGBoostError(py_str(_LIB.XGBGetLastError()))
xgboost.core.XGBoostError: [14:13:49] /Users/runner/work/xgboost/xgboost/src/metric/metric.cc:49: Unknown metric function mse
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000123990355 dmlc::LogMessageFatal::~LogMessageFatal() + 117
  [bt] (1) 2   libxgboost.dylib                    0x0000000123b41e1b xgboost::Metric::Create(std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > const&, xgboost::Context const*) + 139
  [bt] (2) 3   libxgboost.dylib                    0x0000000123b1230c xgboost::LearnerConfiguration::ConfigureMetrics(std::__1::vector<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > >, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> >, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char> > > > > const&) + 268
  [bt] (3) 4   libxgboost.dylib                    0x0000000123b02c0d xgboost::LearnerConfiguration::Configure() + 1245
  [bt] (4) 5   libxgboost.dylib                    0x0000000123b02f1a xgboost::LearnerImpl::UpdateOneIter(int, std::__1::shared_ptr<xgboost::DMatrix>) + 106
  [bt] (5) 6   libxgboost.dylib                    0x00000001239b238f XGBoosterUpdateOneIter + 143
  [bt] (6) 7   libffi.dylib                        0x00007ff816a97882 ffi_call_unix64 + 82
  [bt] (7) 8   ???                                 0x00007ff7b557ee50 0x0 + 140701876088400




In [20]:
regressor.fit(X_train, y_train)

XGBoostError: [14:14:32] /Users/runner/work/xgboost/xgboost/src/data/data.cc:507: Check failed: valid: Label contains NaN, infinity or a value too large.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000123990355 dmlc::LogMessageFatal::~LogMessageFatal() + 117
  [bt] (1) 2   libxgboost.dylib                    0x0000000123a66db9 xgboost::MetaInfo::SetInfoFromHost(xgboost::Context const&, xgboost::StringView, xgboost::Json) + 3801
  [bt] (2) 3   libxgboost.dylib                    0x0000000123a65df2 xgboost::MetaInfo::SetInfo(xgboost::Context const&, xgboost::StringView, xgboost::StringView) + 146
  [bt] (3) 4   libxgboost.dylib                    0x00000001239a8729 XGDMatrixSetInfoFromInterface + 233
  [bt] (4) 5   libffi.dylib                        0x00007ff816a97882 ffi_call_unix64 + 82
  [bt] (5) 6   ???                                 0x00007ff7b557da20 0x0 + 140701876083232

