In [26]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, mean_absolute_percentage_error

# eliminar id 
df_cars=pd.read_csv(r"C:\Users\Usuario\Documents\Proyecto ML\src\USA_cars_datasets.csv")

In [27]:
df_cars.drop(columns=['Unnamed: 0','vin', 'lot'],inplace=True)

In [28]:
# Transformar de millas a kilómetros
df_cars['km'] = df_cars['mileage'] * 1.60934

df_cars.drop(columns=['mileage'], inplace=True)

In [29]:
df_cars = df_cars[df_cars['title_status'] != 'salvage insurance']

In [30]:
from datetime import datetime
# Obtener el año actual
current_year = datetime.now().year

# Calcular la edad del vehículo
df_cars['year'] = current_year - df_cars['year']

In [31]:
df_cars.head()

Unnamed: 0,price,brand,model,year,title_status,color,state,country,condition,km
0,6300,toyota,cruiser,16,clean vehicle,black,new jersey,usa,10 days left,441147.45278
1,2899,ford,se,13,clean vehicle,silver,tennessee,usa,6 days left,306662.95568
2,5350,dodge,mpv,6,clean vehicle,silver,georgia,usa,2 days left,63713.7706
3,25000,ford,door,10,clean vehicle,blue,virginia,usa,22 hours left,103232.72364
4,27700,chevrolet,1500,6,clean vehicle,red,florida,usa,22 hours left,10708.54836


In [32]:
X = df_cars.drop(columns=['price'])  # Características
y = df_cars['price']


# Convertir variables categóricas a numéricas
X = pd.get_dummies(X, drop_first=True)

In [33]:
# Dividir los datos en conjunto de entrenamiento y prueba
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Definir hiperparámetros para GridSearchCV
param_grid = {
    'Decision Tree': {
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
    },
    'Random Forest': {
        'n_estimators': [50, 100, 200],
        'max_depth': [3, 5, 10],
        'min_samples_split': [2, 5, 10],
    },
    'Gradient Boosting': {
        'n_estimators': [50, 100, 200],
        'learning_rate': [0.01, 0.1, 0.2],
        'max_depth': [3, 5, 10],
    }
}

In [34]:
#best params_

In [35]:
# convierte  a numpy array
y_test = np.array(y_test)

In [36]:
# Modelos base
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(random_state=42)
}

# Evaluar modelos con hiperparámetros
results = {}
for name, model in models.items():
    if name in param_grid:
        print(f"Optimizando hiperparámetros para {name}...")
        grid_search = GridSearchCV(model, param_grid[name], cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
    else:
        best_model = model.fit(X_train, y_train)

    y_pred = best_model.predict(X_test)

    # Calcular métricas
    mae = mean_absolute_error(y_test, y_pred)
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2 = r2_score(y_test, y_pred)
    mape=mean_absolute_percentage_error(y_test, y_pred)

    results[name] = {'MAE': mae, 'RMSE': rmse, 'R2': r2, 'MAPE':mape}

# Mostrar los resultados
results_df = pd.DataFrame(results).T
print("\nResultados de los modelos:")
print(results_df)

# Identificar el mejor modelo
best_model_name = results_df['R2'].idxmax()
best_model = models[best_model_name]
print(f"\nEl mejor modelo es: {best_model_name} con MAPE = {results_df.loc[best_model_name, 'MAPE']:.4f}")

'''
# Guardar el modelo final
best_model.fit(X, y)
import joblib
joblib.dump(best_model, 'best_model.pkl')
print("\nEl mejor modelo ha sido guardado como 'best_model.pkl'.")
'''

Optimizando hiperparámetros para Decision Tree...
Optimizando hiperparámetros para Random Forest...
Optimizando hiperparámetros para Gradient Boosting...

Resultados de los modelos:
                           MAE         RMSE        R2      MAPE
Linear Regression  4686.896806  6716.249697  0.657255  0.481207
Decision Tree      5104.836023  8099.655651  0.501517  0.472487
Random Forest      5560.555865  7625.846786  0.558131  0.533184
Gradient Boosting  4162.742632  6386.081059  0.690125  0.395629

El mejor modelo es: Gradient Boosting con MAPE = 0.3956


'\n# Guardar el modelo final\nbest_model.fit(X, y)\nimport joblib\njoblib.dump(best_model, \'best_model.pkl\')\nprint("\nEl mejor modelo ha sido guardado como \'best_model.pkl\'.")\n'

In [37]:
best_params = grid_search.best_params_
print("Mejores hiperparámetros para Gradient Boosting:")
print(best_params)

Mejores hiperparámetros para Gradient Boosting:
{'learning_rate': 0.1, 'max_depth': 10, 'n_estimators': 200}


In [38]:
best_gb = GradientBoostingRegressor(random_state=42, **best_params)
best_gb.fit(X_train, y_train)

In [39]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Estandarizar los datos
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Aplicar PCA
pca = PCA()
X_train_pca = pca.fit_transform(X_train_scaled)

# Calcular la varianza explicada acumulada
cumulative_variance_ratio = np.cumsum(pca.explained_variance_ratio_)

# Encontrar el número de componentes que explican el 95% de la varianza
n_components = np.argmax(cumulative_variance_ratio >= 0.95) + 1

print(f"Número de componentes que explican el 95% de la varianza: {n_components}")

# Crear un nuevo PCA con el número de componentes seleccionado
pca_95 = PCA(n_components=n_components)
X_train_pca_95 = pca_95.fit_transform(X_train_scaled)
X_test_pca_95 = pca_95.transform(X_test_scaled)

# Entrenar el modelo Gradient Boosting con los datos PCA
best_gb_pca = GradientBoostingRegressor(random_state=42, **best_params)
best_gb_pca.fit(X_train_pca_95, y_train)

# Evaluar el modelo
y_pred_pca = best_gb_pca.predict(X_test_pca_95)
mape_pca = mean_absolute_percentage_error(y_test, y_pred_pca)

print(f"MAPE del modelo Gradient Boosting con PCA: {mape_pca:.4f}")


Número de componentes que explican el 95% de la varianza: 204
MAPE del modelo Gradient Boosting con PCA: 0.5129


In [40]:
mape=mean_absolute_percentage_error(y_test, y_pred)
mape

np.float64(0.3956291732595808)

In [41]:
print(type(y_test))
print(y_test.shape)



<class 'numpy.ndarray'>
(468,)


In [42]:
y_pred

array([ 8095.64240873, 25950.15457683, 26981.40249145, 11882.65584841,
        5832.30042914, 21998.24113283, 43756.60607769, 46544.53547336,
       24333.0248031 ,  5787.72545767, 15684.1372709 , 19234.15168871,
       16679.70053493,  6952.13181969, 33727.45684827, 16400.55815304,
       30203.22645432, 28344.04474616, 18706.25920596, 30725.09940857,
       28344.04474616, 15121.03985799, 30307.73852714, 26527.44686622,
       29007.71738882,  9485.77735742, 27780.90788372, 23502.28720715,
       24509.57549357, 41949.2135269 , 27155.74369822, 11396.42582135,
       11907.4692972 , 25020.82818554,  8445.97345319, 17123.84032029,
       18229.7235334 , 17606.0516093 , 24271.92359293, 20919.25695122,
        6148.49813153, 19121.72151207,  9582.296894  , 11237.97224709,
       20421.21900572, 42611.04486129, 43322.84329192, 15037.58908654,
       13050.02509868,  5894.26375851, 18413.18476592,  9497.72903716,
       47166.41737173,  5180.475753  ,  9374.39848136, 21299.23776827,
      