In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm

In [2]:
df = pd.read_csv("../dataframe/unido_final.csv")
df.head(10)

Unnamed: 0.1,Unnamed: 0,modelo,km,color,precio,year
0,0,Model 3,63000.0,azul,25990.0,2022
1,1,Model S,94000.0,negro,51990.0,2020
2,2,Model 3,138000.0,gris,26990.0,2020
3,3,Model X,170000.0,blanco,44800.0,2019
4,4,Model 3,61046.0,rojo,29900.0,2019
5,5,Model 3,104900.0,negro,24900.0,2019
6,6,Model S,143350.0,negro,38990.0,2018
7,7,Model S,143350.0,negro,38990.0,2018
8,8,Model S,49000.0,negro,45900.0,2019
9,9,Model 3,69855.0,rojo,31999.0,2019


In [3]:
data_encoded = pd.get_dummies(df, columns=['modelo', 'color'], drop_first=True)

In [4]:
X = data_encoded.drop(columns=['precio','Unnamed: 0'])
y = data_encoded['precio']

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
param_grid = {
    'max_depth': [3, 5, 10, None],  # Sin límite y límites específicos
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}
resultados = []
lista_dataframes = []

In [7]:
for max_depth in tqdm(param_grid['max_depth']):
    for min_samples_split in param_grid['min_samples_split']:
        for min_samples_leaf in param_grid['min_samples_leaf']:
            # Crear el modelo con los hiperparámetros actuales
            tree = DecisionTreeRegressor(
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                random_state=42
            )
            
            # Entrenar el modelo
            tree.fit(X_train, y_train)
            
            # Predecir en el conjunto de prueba
            y_pred = tree.predict(X_test)
            
            # Calcular métricas
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_test, y_pred)
            
            # Guardar los resultados
            resultados.append({
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'Mean Squared Error': mse,
                'Root Mean Squared Error': rmse,
                'Coeficiente de determinación': r2
            })
            comparacion = pd.DataFrame({
                'Valor Real': y_test,
                'Valor Predicho': y_pred
            })
            comparacion.reset_index(drop=True, inplace=True)
            print(f"max_depth: {max_depth}, min_samples_split: {min_samples_split}, min_samples_leaf: {min_samples_leaf} mse: {mse}, rmse:{rmse}, r2:{r2}")
            print(comparacion)
            print("**********************************")
            lista_dataframes.append(comparacion)

100%|██████████| 4/4 [00:00<00:00, 24.84it/s]

max_depth: 3, min_samples_split: 2, min_samples_leaf: 1 mse: 121505390.1481526, rmse:11022.948341898033, r2:0.7589384050734659
     Valor Real  Valor Predicho
0       47040.0    49475.338542
1       79900.0    35313.707865
2       52360.0    49475.338542
3       37900.0    35313.707865
4       48870.0    49475.338542
..          ...             ...
104     28490.0    35313.707865
105     52110.0    49475.338542
106     79990.0    35313.707865
107     39900.0    35313.707865
108     39900.0    35313.707865

[109 rows x 2 columns]
**********************************
max_depth: 3, min_samples_split: 2, min_samples_leaf: 2 mse: 122833690.6578366, rmse:11083.03616604388, r2:0.756303112606064
     Valor Real  Valor Predicho
0       47040.0    49475.338542
1       79900.0    35313.707865
2       52360.0    49475.338542
3       37900.0    35313.707865
4       48870.0    49475.338542
..          ...             ...
104     28490.0    35313.707865
105     52110.0    49475.338542
106     79990.0  




In [8]:
min_rmse = 99999999999999999999
elemento_min = resultados[0]
for elemento in resultados:
    if elemento['Root Mean Squared Error'] < min_rmse:
        min_rmse = elemento['Root Mean Squared Error']
        elemento_min = elemento
print("El resultado con menor rmse es: ")
print(elemento_min)

El resultado con menor rmse es: 
{'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'Mean Squared Error': 44354878.56177139, 'Root Mean Squared Error': 6659.945837750589, 'Coeficiente de determinación': 0.9120017823420329}


In [12]:
############RANDOM FOREST REGRESSOR
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': [ 'sqrt', 'log2', None]
}
resultados_rf = []
lista_dataframes_random_forest = []

In [13]:
for n_estimators in tqdm(param_grid_rf['n_estimators']):
    for max_depth in param_grid_rf['max_depth']:
        for min_samples_split in param_grid_rf['min_samples_split']:
            for min_samples_leaf in param_grid_rf['min_samples_leaf']:
                for max_features in param_grid_rf['max_features']:
                    # Crear el modelo Random Forest con los hiperparámetros actuales
                    rf = RandomForestRegressor(
                        n_estimators=n_estimators,
                        max_depth=max_depth,
                        min_samples_split=min_samples_split,
                        min_samples_leaf=min_samples_leaf,
                        max_features=max_features,
                        random_state=42
                    )
                    
                    # Entrenar el modelo
                    rf.fit(X_train, y_train)
                    
                    # Predecir en el conjunto de prueba
                    y_rf_pred = rf.predict(X_test)
                    
                    # Calcular métricas
                    mse = mean_squared_error(y_test, y_rf_pred)
                    rmse = np.sqrt(mse)
                    r2 = r2_score(y_test, y_rf_pred)
                    
                    # Guardar los resultados
                    resultados_rf.append({
                        'n_estimators': n_estimators,
                        'max_depth': max_depth,
                        'min_samples_split': min_samples_split,
                        'min_samples_leaf': min_samples_leaf,
                        'max_features': max_features,
                        'MSE': mse,
                        'RMSE': rmse,
                        'R²': r2
                    })
                    comparacion = pd.DataFrame({
                        'Valor Real': y_test,
                        'Valor Predicho': y_pred
                    })
                    comparacion.reset_index(drop=True, inplace=True)
                    print(f"max_depth: {max_depth}, min_samples_split: {min_samples_split}, min_samples_leaf: {min_samples_leaf} mse: {mse}, rmse:{rmse}, r2:{r2}")
                    print(comparacion)
                    print("**********************************")
                    lista_dataframes_random_forest.append(comparacion)

  0%|          | 0/3 [00:00<?, ?it/s]

max_depth: 3, min_samples_split: 2, min_samples_leaf: 1 mse: 176277638.29673225, rmse:13276.95892502241, r2:0.6502725633333637
     Valor Real  Valor Predicho
0       47040.0    49878.333333
1       79900.0    46966.666667
2       52360.0    48642.727273
3       37900.0    56873.625000
4       48870.0    47353.888889
..          ...             ...
104     28490.0    28534.000000
105     52110.0    52374.285714
106     79990.0    56873.625000
107     39900.0    35137.500000
108     39900.0    32925.000000

[109 rows x 2 columns]
**********************************
max_depth: 3, min_samples_split: 2, min_samples_leaf: 1 mse: 176277638.29673225, rmse:13276.95892502241, r2:0.6502725633333637
     Valor Real  Valor Predicho
0       47040.0    49878.333333
1       79900.0    46966.666667
2       52360.0    48642.727273
3       37900.0    56873.625000
4       48870.0    47353.888889
..          ...             ...
104     28490.0    28534.000000
105     52110.0    52374.285714
106     79990.0

 33%|███▎      | 1/3 [00:04<00:09,  4.98s/it]

max_depth: None, min_samples_split: 10, min_samples_leaf: 5 mse: 79179318.56621882, rmse:8898.276156999109, r2:0.8429115548248866
     Valor Real  Valor Predicho
0       47040.0    49878.333333
1       79900.0    46966.666667
2       52360.0    48642.727273
3       37900.0    56873.625000
4       48870.0    47353.888889
..          ...             ...
104     28490.0    28534.000000
105     52110.0    52374.285714
106     79990.0    56873.625000
107     39900.0    35137.500000
108     39900.0    32925.000000

[109 rows x 2 columns]
**********************************
max_depth: 3, min_samples_split: 2, min_samples_leaf: 1 mse: 174257354.68258482, rmse:13200.657357972172, r2:0.6542807212400747
     Valor Real  Valor Predicho
0       47040.0    49878.333333
1       79900.0    46966.666667
2       52360.0    48642.727273
3       37900.0    56873.625000
4       48870.0    47353.888889
..          ...             ...
104     28490.0    28534.000000
105     52110.0    52374.285714
106     799

 67%|██████▋   | 2/3 [00:14<00:07,  7.77s/it]

max_depth: None, min_samples_split: 10, min_samples_leaf: 5 mse: 78180422.24468696, rmse:8841.969364609162, r2:0.8448933232068581
     Valor Real  Valor Predicho
0       47040.0    49878.333333
1       79900.0    46966.666667
2       52360.0    48642.727273
3       37900.0    56873.625000
4       48870.0    47353.888889
..          ...             ...
104     28490.0    28534.000000
105     52110.0    52374.285714
106     79990.0    56873.625000
107     39900.0    35137.500000
108     39900.0    32925.000000

[109 rows x 2 columns]
**********************************
max_depth: 3, min_samples_split: 2, min_samples_leaf: 1 mse: 172871904.58098304, rmse:13148.076079068871, r2:0.6570293960994866
     Valor Real  Valor Predicho
0       47040.0    49878.333333
1       79900.0    46966.666667
2       52360.0    48642.727273
3       37900.0    56873.625000
4       48870.0    47353.888889
..          ...             ...
104     28490.0    28534.000000
105     52110.0    52374.285714
106     799

100%|██████████| 3/3 [00:32<00:00, 10.93s/it]


In [15]:
min_rmse = 99999999999999999999
elemento_min = resultados_rf[0]
for elemento in resultados_rf:
    if elemento['MSE'] < min_rmse:
        min_rmse = elemento['MSE']
        elemento_min = elemento
print("El resultado con menor rmse es: ")
print(elemento_min)

El resultado con menor rmse es: 
{'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': None, 'MSE': 40061783.865482114, 'RMSE': 6329.4378791076, 'R²': 0.9205191020543212}
