In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score
from tqdm import tqdm

In [2]:
df = pd.read_csv("../dataframe/unido_final.csv")
df.head(10)

Unnamed: 0.1,Unnamed: 0,modelo,km,color,precio,year
0,0,Model 3,63000.0,azul,25990.0,2022
1,1,Model S,94000.0,negro,51990.0,2020
2,2,Model 3,138000.0,gris,26990.0,2020
3,3,Model X,170000.0,blanco,44800.0,2019
4,4,Model 3,61046.0,rojo,29900.0,2019
5,5,Model 3,104900.0,negro,24900.0,2019
6,6,Model S,143350.0,negro,38990.0,2018
7,7,Model S,143350.0,negro,38990.0,2018
8,8,Model S,49000.0,negro,45900.0,2019
9,9,Model 3,69855.0,rojo,31999.0,2019


In [3]:
df.dtypes

Unnamed: 0      int64
modelo         object
km            float64
color          object
precio        float64
year            int64
dtype: object

In [4]:
data_encoded = pd.get_dummies(df, columns=['modelo', 'color'], drop_first=True)

In [5]:
X = data_encoded.drop(columns=['precio','Unnamed: 0'])
y = data_encoded['precio']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [7]:
param_grid = {
    'max_depth': [3, 5, 10, None],  # Sin límite y límites específicos
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5]
}
resultados = []
lista_dataframes = []

In [8]:
for max_depth in tqdm(param_grid['max_depth']):
    for min_samples_split in param_grid['min_samples_split']:
        for min_samples_leaf in param_grid['min_samples_leaf']:
            # Crear el modelo con los hiperparámetros actuales
            tree = DecisionTreeRegressor(
                max_depth=max_depth,
                min_samples_split=min_samples_split,
                min_samples_leaf=min_samples_leaf,
                random_state=42
            )
            
            # Entrenar el modelo
            tree.fit(X_train, y_train)
            
            # Predecir en el conjunto de prueba
            y_pred = tree.predict(X_test)
            
            # Calcular métricas
            mse = mean_squared_error(y_test, y_pred)
            rmse = np.sqrt(mse)
            r2 = r2_score(y_test, y_pred)
            
            # Guardar los resultados
            resultados.append({
                'max_depth': max_depth,
                'min_samples_split': min_samples_split,
                'min_samples_leaf': min_samples_leaf,
                'Mean Squared Error': mse,
                'Root Mean Squared Error': rmse,
                'Coeficiente de determinación': r2
            })
            comparacion = pd.DataFrame({
                'Valor Real': y_test,
                'Valor Predicho': y_pred
            })
            comparacion.reset_index(drop=True, inplace=True)
            print(f"max_depth: {max_depth}, min_samples_split: {min_samples_split}, min_samples_leaf: {min_samples_leaf} mse: {mse}, rmse:{rmse}, r2:{r2}")
            print(comparacion)
            print("**********************************")
            lista_dataframes.append(comparacion)

100%|██████████| 4/4 [00:00<00:00, 25.06it/s]

max_depth: 3, min_samples_split: 2, min_samples_leaf: 1 mse: 146475989.75478992, rmse:12102.72654218007, r2:0.6538123295226461
     Valor Real  Valor Predicho
0       27500.0    30030.447602
1       31364.0    30030.447602
2       49800.0    50510.645418
3       24900.0    30030.447602
4       48760.0    50510.645418
..          ...             ...
225     36860.0    50510.645418
226     28200.0    30030.447602
227     34400.0    30030.447602
228     40970.0    50510.645418
229     26990.0    30030.447602

[230 rows x 2 columns]
**********************************
max_depth: 3, min_samples_split: 2, min_samples_leaf: 2 mse: 146475989.75478992, rmse:12102.72654218007, r2:0.6538123295226461
     Valor Real  Valor Predicho
0       27500.0    30030.447602
1       31364.0    30030.447602
2       49800.0    50510.645418
3       24900.0    30030.447602
4       48760.0    50510.645418
..          ...             ...
225     36860.0    50510.645418
226     28200.0    30030.447602
227     34400.0




In [9]:
min_rmse = 99999999999999999999
elemento_min = resultados[0]
for elemento in resultados:
    if elemento['Root Mean Squared Error'] < min_rmse:
        min_rmse = elemento['Root Mean Squared Error']
        elemento_min = elemento
print("El resultado con menor rmse es: ")
print(elemento_min)

El resultado con menor rmse es: 
{'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'Mean Squared Error': 40397427.61608545, 'Root Mean Squared Error': 6355.897074063224, 'Coeficiente de determinación': 0.9045229775671626}


In [10]:
############RANDOM FOREST REGRESSOR
param_grid_rf = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 5],
    'max_features': [ 'sqrt', 'log2', None]
}
resultados_rf = []
lista_dataframes_random_forest = []

In [11]:
for n_estimators in tqdm(param_grid_rf['n_estimators']):
    for max_depth in param_grid_rf['max_depth']:
        for min_samples_split in param_grid_rf['min_samples_split']:
            for min_samples_leaf in param_grid_rf['min_samples_leaf']:
                for max_features in param_grid_rf['max_features']:
                    # Crear el modelo Random Forest con los hiperparámetros actuales
                    rf = RandomForestRegressor(
                        n_estimators=n_estimators,
                        max_depth=max_depth,
                        min_samples_split=min_samples_split,
                        min_samples_leaf=min_samples_leaf,
                        max_features=max_features,
                        random_state=42
                    )
                    
                    # Entrenar el modelo
                    rf.fit(X_train, y_train)
                    
                    # Predecir en el conjunto de prueba
                    y_rf_pred = rf.predict(X_test)
                    
                    # Calcular métricas
                    mse = mean_squared_error(y_test, y_rf_pred)
                    rmse = np.sqrt(mse)
                    r2 = r2_score(y_test, y_rf_pred)
                    
                    # Guardar los resultados
                    resultados_rf.append({
                        'n_estimators': n_estimators,
                        'max_depth': max_depth,
                        'min_samples_split': min_samples_split,
                        'min_samples_leaf': min_samples_leaf,
                        'max_features': max_features,
                        'MSE': mse,
                        'RMSE': rmse,
                        'R²': r2
                    })
                    comparacion = pd.DataFrame({
                        'Valor Real': y_test,
                        'Valor Predicho': y_pred
                    })
                    comparacion.reset_index(drop=True, inplace=True)
                    print(f"max_depth: {max_depth}, min_samples_split: {min_samples_split}, min_samples_leaf: {min_samples_leaf} mse: {mse}, rmse:{rmse}, r2:{r2}")
                    print(comparacion)
                    print("**********************************")
                    lista_dataframes_random_forest.append(comparacion)

  0%|          | 0/3 [00:00<?, ?it/s]

max_depth: 3, min_samples_split: 2, min_samples_leaf: 1 mse: 144902429.7872889, rmse:12037.542514454057, r2:0.6575313490043887
     Valor Real  Valor Predicho
0       27500.0    24048.333333
1       31364.0    31706.000000
2       49800.0    49624.285714
3       24900.0    26407.142857
4       48760.0    42943.888889
..          ...             ...
225     36860.0    45262.857143
226     28200.0    27000.000000
227     34400.0    32308.333333
228     40970.0    49228.750000
229     26990.0    29065.272727

[230 rows x 2 columns]
**********************************
max_depth: 3, min_samples_split: 2, min_samples_leaf: 1 mse: 144902429.7872889, rmse:12037.542514454057, r2:0.6575313490043887
     Valor Real  Valor Predicho
0       27500.0    24048.333333
1       31364.0    31706.000000
2       49800.0    49624.285714
3       24900.0    26407.142857
4       48760.0    42943.888889
..          ...             ...
225     36860.0    45262.857143
226     28200.0    27000.000000
227     34400.0

 33%|███▎      | 1/3 [00:05<00:11,  5.53s/it]

max_depth: None, min_samples_split: 10, min_samples_leaf: 5 mse: 85385387.73647444, rmse:9240.421404701976, r2:0.7981964926621765
     Valor Real  Valor Predicho
0       27500.0    24048.333333
1       31364.0    31706.000000
2       49800.0    49624.285714
3       24900.0    26407.142857
4       48760.0    42943.888889
..          ...             ...
225     36860.0    45262.857143
226     28200.0    27000.000000
227     34400.0    32308.333333
228     40970.0    49228.750000
229     26990.0    29065.272727

[230 rows x 2 columns]
**********************************
max_depth: 3, min_samples_split: 2, min_samples_leaf: 1 mse: 143715627.4510796, rmse:11988.14528820366, r2:0.6603362888226973
     Valor Real  Valor Predicho
0       27500.0    24048.333333
1       31364.0    31706.000000
2       49800.0    49624.285714
3       24900.0    26407.142857
4       48760.0    42943.888889
..          ...             ...
225     36860.0    45262.857143
226     28200.0    27000.000000
227     34400

 67%|██████▋   | 2/3 [00:16<00:08,  8.61s/it]

max_depth: None, min_samples_split: 10, min_samples_leaf: 5 mse: 86015353.66418232, rmse:9274.446272645193, r2:0.796707604023442
     Valor Real  Valor Predicho
0       27500.0    24048.333333
1       31364.0    31706.000000
2       49800.0    49624.285714
3       24900.0    26407.142857
4       48760.0    42943.888889
..          ...             ...
225     36860.0    45262.857143
226     28200.0    27000.000000
227     34400.0    32308.333333
228     40970.0    49228.750000
229     26990.0    29065.272727

[230 rows x 2 columns]
**********************************
max_depth: 3, min_samples_split: 2, min_samples_leaf: 1 mse: 141378061.34576508, rmse:11890.250684731802, r2:0.6658609933557773
     Valor Real  Valor Predicho
0       27500.0    24048.333333
1       31364.0    31706.000000
2       49800.0    49624.285714
3       24900.0    26407.142857
4       48760.0    42943.888889
..          ...             ...
225     36860.0    45262.857143
226     28200.0    27000.000000
227     3440

100%|██████████| 3/3 [00:37<00:00, 12.40s/it]

max_depth: None, min_samples_split: 10, min_samples_leaf: 5 mse: 85788422.62780695, rmse:9262.20398327563, r2:0.7972439426204593
     Valor Real  Valor Predicho
0       27500.0    24048.333333
1       31364.0    31706.000000
2       49800.0    49624.285714
3       24900.0    26407.142857
4       48760.0    42943.888889
..          ...             ...
225     36860.0    45262.857143
226     28200.0    27000.000000
227     34400.0    32308.333333
228     40970.0    49228.750000
229     26990.0    29065.272727

[230 rows x 2 columns]
**********************************





In [12]:
min_rmse = 99999999999999999999
elemento_min = resultados_rf[0]
for elemento in resultados_rf:
    if elemento['MSE'] < min_rmse:
        min_rmse = elemento['MSE']
        elemento_min = elemento
print("El resultado con menor rmse es: ")
print(elemento_min)

El resultado con menor rmse es: 
{'n_estimators': 50, 'max_depth': None, 'min_samples_split': 2, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'MSE': 36661374.55027225, 'RMSE': 6054.863710297058, 'R²': 0.9133529264877934}
