# In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Ridge
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# 1. Load the prepared vehicle-price dataset.
df = pd.read_csv("./data/Vehicle_Price_Cleaned_prepared.csv")

# 2. Separate the target column ("price") from the feature matrix.
y = df["price"]
X = df.drop(columns="price")

# 3. Group feature columns by dtype: object columns are handled as
#    categorical, int64/float64 columns as numeric.
#    NOTE(review): a column of any other dtype (e.g. bool, int32, category)
#    lands in neither list -- confirm the CSV contains none.
cat_cols = list(X.select_dtypes(include="object").columns)
num_cols = list(X.select_dtypes(include=["int64", "float64"]).columns)

# 4. Preprocessing: standardize the numeric columns and one-hot encode the
#    categorical ones. Categories not seen during fit are ignored at
#    transform time instead of raising.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", StandardScaler(), num_cols),
        (
            "cat",
            OneHotEncoder(sparse_output=False, handle_unknown="ignore"),
            cat_cols,
        ),
    ]
)

# 5. Base pipeline: preprocessing followed by a Ridge regressor.
pipeline = Pipeline(
    steps=[
        ("preprocess", preprocessor),
        ("model", Ridge()),
    ]
)

# 6. Candidate values for the Ridge regularization strength; the "model__"
#    prefix routes the parameter to the pipeline step named "model".
param_grid = {"model__alpha": [0.01, 0.1, 1.0, 10.0, 100.0]}

# 7. Configure a 5-fold grid search over the alpha candidates. Scoring is the
#    negated RMSE, measured on whatever target the search is fitted with
#    (the log1p-transformed price, see the fit call below).
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring="neg_root_mean_squared_error",
    cv=5,
    verbose=1,
)

# 8. Apply log1p to the target and hold out 20% of the rows for testing;
#    the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    np.log1p(y),
    test_size=0.2,
    random_state=42,
)

# 9. Run the search on the training portion.
grid_search.fit(X_train, y_train)

# 10. Predict on the held-out rows and undo the log1p transform so that both
#     predictions and ground truth are back on the original price scale.
y_pred_log = grid_search.predict(X_test)
y_pred = np.expm1(y_pred_log)
y_test_real = np.expm1(y_test)

# 11. Evaluate on the original scale.
#     RMSE is computed as sqrt(MSE) instead of passing `squared=False`:
#     that keyword was deprecated in scikit-learn 1.4 and removed in 1.6,
#     where it raises a TypeError. sqrt(MSE) works on every version.
rmse = np.sqrt(mean_squared_error(y_test_real, y_pred))
mae = mean_absolute_error(y_test_real, y_pred)
r2 = r2_score(y_test_real, y_pred)

# Report the selected hyperparameter and the test-set metrics.
print(f"✅ Mejor alpha: {grid_search.best_params_['model__alpha']}")
print(f"📉 RMSE real: {rmse:.2f}")
print(f"📉 MAE real: {mae:.2f}")
print(f"📈 R² real: {r2:.4f}")