In [32]:
import pandas as pd
import numpy as np

from sklearn.model_selection import cross_validate
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_selector

from category_encoders.cat_boost import CatBoostEncoder

from xgboost import XGBRegressor

import sklearn
sklearn.set_config(transform_output = "pandas")

In [27]:
data = pd.read_csv('datasets/clean/neoauto_clean.csv')
data.head()

Unnamed: 0,Brands,Models,Version,Currency,Price,Urlpic,Year,KM,Fuel_type,Transmission,Location,Color,Cilinder,Upholstery,Engine,Age
0,SUZUKI,S-PRESSO,4x2,USD,9800.0,https://cde.neoauto.pe/autos_usados/360x240/66...,2023,16684,gasolina,mecanica,Lima,Rojo,3.0,tela,1000.0,1
1,DFSK,GLORY 500,delantera,USD,12000.0,https://cde.neoauto.pe/autos_usados/360x240/70...,2023,11054,gas_glp,mecanica,Lima,Blanco,4.0,tela,1500.0,1
2,TOYOTA,YARIS,delantera,USD,15500.0,https://cde.neoauto.pe/autos_usados/360x240/70...,2023,15500,gas_gnv,mecanica,Lima,Rojo,4.0,tela,1300.0,1
3,FORD,RAPTOR,4x4,USD,75900.0,https://cde.neoauto.pe/autos_usados/360x240/30...,2020,12500,gasolina,automatica_secuencial,Lima,Blanco,6.0,tela,3500.0,4
4,KIA,SPORTAGE,4x2,USD,27900.0,https://cde.neoauto.pe/autos_usados/360x240/70...,2023,52130,gasolina,mecanica,Lima,Gris,4.0,cuero,1999.0,1


In [29]:
cols_to_remove = ['Urlpic', 'Year', 'Currency']
data.drop(columns = cols_to_remove, inplace = True)

In [33]:
X = data.drop(columns = 'Price').copy()
y = data['Price'].copy()

In [52]:
# create encoder
encoder = CatBoostEncoder(handle_missing = 'return_nan',
                          handle_unknown = 'return_nan')

transformer = ColumnTransformer([('enc', encoder, make_column_selector(dtype_include = ['object', 'category']))],
                                remainder = 'passthrough',
                                verbose_feature_names_out = False)

estimator = XGBRegressor(eval_metric = mean_squared_error,
                         n_estimators = 100,
                         max_depth = 6,
                         min_child_weight = 0.01,
                         eta = 0.05,
                         reg_lambda = 0.5,
                         alpha = 5,
                         gamma = 5,
                         subsample = 0.5,
                         random_state = 123)

# create pipeline
pipe = Pipeline([('trans', transformer), ('est', estimator)])

# declare cross valite
cross_pipe = cross_validate(estimator = pipe,
                            X = X,
                            y = y,
                            cv = 5,
                            scoring = 'r2',
                            return_estimator = True,
                            return_train_score = True,
                            verbose = 2,
                            error_score = 'raise')


[CV] END .................................................... total time=   0.6s
[CV] END .................................................... total time=   4.8s
[CV] END .................................................... total time=   0.4s
[CV] END .................................................... total time=   0.5s
[CV] END .................................................... total time=   0.3s


In [53]:
cross_pipe

{'fit_time': array([0.57366729, 4.80483031, 0.40733314, 0.4406414 , 0.25274515]),
 'score_time': array([0.03383136, 0.04231381, 0.02638841, 0.02770662, 0.02542305]),
 'estimator': [Pipeline(steps=[('trans',
                   ColumnTransformer(remainder='passthrough',
                                     transformers=[('enc',
                                                    CatBoostEncoder(handle_missing='return_nan',
                                                                    handle_unknown='return_nan'),
                                                    <sklearn.compose._column_transformer.make_column_selector object at 0x7f5440dd84d0>)],
                                     verbose_feature_names_out=False)),
                  ('est',
                   XGBRegressor(alpha=5, base_score=None, booster=None,
                                callbacks=None,...
                                eval_metric=<function mean_squared_error at 0x7f545a33a660>,
                        