In [1]:
import pandas as pd
import numpy as np

from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
df = pd.read_csv('../../../data/full_data.csv')
df

Unnamed: 0.1,Unnamed: 0,sell_type,car_type,year,price,km
0,0,Nuevo,Model S,2024,99600.0,503.0
1,1,Nuevo,Model S,2024,102270.0,99.0
2,2,Nuevo,Model S,2024,102270.0,0.0
3,3,Nuevo,Model S,2024,90340.0,577.0
4,4,Nuevo,Model S,2024,101050.0,3652.0
...,...,...,...,...,...,...
1839,1854,segunda_mano,Model 3,2021,24700.0,41.0
1840,1855,segunda_mano,Model 3,2021,31500.0,49.0
1841,1856,segunda_mano,Model 3,2019,22900.0,128.0
1842,1858,segunda_mano,Model X,2019,42999.0,110.0


In [3]:
df.isna().sum()

Unnamed: 0    0
sell_type     0
car_type      0
year          0
price         0
km            0
dtype: int64

In [4]:
df.dtypes

Unnamed: 0      int64
sell_type      object
car_type       object
year            int64
price         float64
km            float64
dtype: object

In [5]:
for column in df.columns:
    print(df[column].unique())
    print('=' * 50)

[   0    1    2 ... 1856 1858 1859]
['Nuevo' 'Ocasion' 'segunda_mano']
['Model S' 'Model 3' 'Model X' 'Model Y']
[2024 2023 2018 2019 2022 2020 2021 2017 2014 2016 2015 2013 2012]
[9.96000e+04 1.02270e+05 9.03400e+04 1.01050e+05 1.02320e+05 8.88900e+04
 9.00500e+04 1.15240e+05 9.07500e+04 9.16100e+04 1.02330e+05 1.16660e+05
 9.75000e+04 1.11430e+05 3.91200e+04 4.26000e+04 4.35600e+04 4.54700e+04
 4.65700e+04 4.90700e+04 3.87700e+04 4.37500e+04 4.43700e+04 4.52200e+04
 5.45800e+04 5.53200e+04 5.97700e+04 6.08200e+04 3.61900e+04 3.63900e+04
 3.77700e+04 4.57000e+04 4.65300e+04 4.66000e+04 4.67700e+04 3.82800e+04
 4.81200e+04 4.83700e+04 4.94200e+04 5.48600e+04 5.59500e+04 3.69900e+04
 3.80800e+04 3.84700e+04 4.37300e+04 4.39200e+04 4.52800e+04 4.56500e+04
 4.58900e+04 4.69900e+04 4.74700e+04 3.85200e+04 4.43200e+04 9.56700e+04
 9.99600e+04 1.00720e+05 1.02640e+05 1.04370e+05 1.06050e+05 1.06620e+05
 1.10270e+05 1.11440e+05 1.15670e+05 1.18070e+05 1.00340e+05 1.02020e+05
 1.09710e+05 1.16

In [6]:
df.drop(columns=["Unnamed: 0"], inplace=True)
df.drop_duplicates(inplace=True)

df = pd.get_dummies(data=df, columns=['sell_type', 'car_type'], drop_first=True)

In [7]:
df.dtypes

year                        int64
price                     float64
km                        float64
sell_type_Ocasion            bool
sell_type_segunda_mano       bool
car_type_Model S             bool
car_type_Model X             bool
car_type_Model Y             bool
dtype: object

In [8]:
df

Unnamed: 0,year,price,km,sell_type_Ocasion,sell_type_segunda_mano,car_type_Model S,car_type_Model X,car_type_Model Y
0,2024,99600.0,503.0,False,False,True,False,False
1,2024,102270.0,99.0,False,False,True,False,False
2,2024,102270.0,0.0,False,False,True,False,False
3,2024,90340.0,577.0,False,False,True,False,False
4,2024,101050.0,3652.0,False,False,True,False,False
...,...,...,...,...,...,...,...,...
1826,2021,25500.0,240.0,False,True,False,False,False
1832,2016,43000.0,64.0,False,True,False,True,False
1835,2023,26000.0,20.0,False,True,False,False,False
1841,2019,22900.0,128.0,False,True,False,False,False


In [9]:
df.describe()

Unnamed: 0,year,price,km
count,1300.0,1300.0,1300.0
mean,2021.838462,46974.966923,358.195385
std,2.425182,22503.673269,1251.754225
min,2012.0,50.0,0.0
25%,2020.0,32000.0,1.0
50%,2023.0,42410.0,49.0
75%,2024.0,52485.0,103.25
max,2024.0,133390.0,9500.0


In [10]:
X = df.drop(columns=['price'])  # Elimina la columna objetivo del conjunto de características
y = df['price']  # Define la variable objetivo
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1337)

TESTS

In [11]:
criterios = ['squared_error', 'absolute_error', 'friedman_mse', 'poisson']
profundidades = [2, 3, 5, 10, 20, 100]

mejor_clf = {'profundidad':'',
              'criterio':'',
              'mae':'',
              'mse':'',
              'rmse':'',
              'r2':''}
mejor_rmse = 999999999999999

In [12]:
for criterio in criterios:
    for profundidad in profundidades:
        # Crear el modelo de RandomForest
        dtr = DecisionTreeRegressor(criterion=criterio, max_depth=profundidad, random_state=1337)
        
        # Entrenar el modelo
        dtr.fit(X_train, y_train)
        
        # Realizar las predicciones
        predicciones = dtr.predict(X_test)
        
        # Calcular las métricas
        mae = mean_absolute_error(y_test, predicciones)
        mse = mean_squared_error(y_test, predicciones)
        rmse = np.sqrt(mse)
        r2 = r2_score(y_test, predicciones)
        
        if rmse <= mejor_rmse:
            mejor_rmse = rmse
            mejor_clf['profundidad'] = profundidad
            mejor_clf['criterio'] = criterio
            mejor_clf['mae'] = mae
            mejor_clf['mse'] = mse
            mejor_clf['rmse'] = rmse
            mejor_clf['r2'] = r2

In [13]:
mejor_clf

{'profundidad': 5,
 'criterio': 'poisson',
 'mae': 5172.663252117422,
 'mse': 46927160.44107371,
 'rmse': np.float64(6850.34016973418),
 'r2': 0.9115639127304334}

In [14]:
dtr = DecisionTreeRegressor(criterion=mejor_clf['criterio'], max_depth=mejor_clf['profundidad'], random_state=1337)
dtr

In [15]:
dtr.fit(X_train, y_train)
predicciones = dtr.predict(X_test)

In [16]:
X_test['predicted_price'] = predicciones
X_test['real_price'] = df.loc[X_test.index, 'price']
X_test

Unnamed: 0,year,km,sell_type_Ocasion,sell_type_segunda_mano,car_type_Model S,car_type_Model X,car_type_Model Y,predicted_price,real_price
851,2021,79.0,True,False,False,False,False,31218.433526,33100.0
1250,2017,188.0,False,True,True,False,False,28892.090909,33900.0
193,2023,11.0,True,False,False,True,False,89900.000000,82500.0
49,2024,0.0,False,False,False,False,False,47276.118881,38470.0
273,2024,0.0,False,False,False,False,True,51147.903226,52575.0
...,...,...,...,...,...,...,...,...,...
1508,2024,100.0,False,True,False,False,False,35920.000000,29000.0
632,2023,10.0,True,False,False,False,True,41657.773585,44300.0
1138,2019,87.0,False,True,False,False,False,31218.433526,25490.0
511,2024,0.0,False,False,False,False,True,51147.903226,54440.0


In [36]:
# FIN DecisionTreeRegressor - Marcos