In [None]:
# 📦 Importación de librerías
import os
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# 📂 Load data
# The data directory is taken from the DATA_DIR environment variable so the
# notebook is not tied to one machine; when the variable is unset, the
# original location is used as the default.
DATA_DIR = Path(os.environ.get("DATA_DIR", r"C:\Users\juan2\OneDrive\Escritorio"))
train = pd.read_csv(DATA_DIR / "train.csv")
test = pd.read_csv(DATA_DIR / "test.csv")

# Split the training frame into features (X) and the target column (y).
# NOTE(review): if train also contains an "ID" column it remains in X and is
# fed to the model as a feature — confirm this is intended.
X = train.drop(columns=["RENDIMIENTO_GLOBAL"])
y = train["RENDIMIENTO_GLOBAL"]
# Keep the test identifiers; they are written to the submission file later.
test_ids = test["ID"]

In [None]:
# 🔍 Identify categorical columns
# Every feature stored with an "object" or "category" dtype is treated as
# categorical and routed through the encoder in the preprocessing step.
cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)

In [None]:
# ⚙️ Preprocessing
# Categorical columns: fill missing values with the most frequent category,
# then map categories to integer codes. Categories not seen during fit are
# encoded as -1 instead of raising. All other columns pass through untouched.
preprocessor = ColumnTransformer(
    transformers=[
        (
            "cat",
            Pipeline(
                steps=[
                    ("imputer", SimpleImputer(strategy="most_frequent")),
                    (
                        "encoder",
                        OrdinalEncoder(
                            handle_unknown="use_encoded_value",
                            unknown_value=-1,
                        ),
                    ),
                ]
            ),
            cat_cols,
        ),
    ],
    remainder="passthrough",
)

In [None]:
# 🔗 Build the modelling pipeline
# Step "pre" applies the column preprocessing; step "clf" is the gradient
# boosting classifier (seeded for reproducibility). The step names are the
# prefixes used by the hyperparameter search space ("clf__...").
pipe = Pipeline(
    steps=[
        ("pre", preprocessor),
        ("clf", HistGradientBoostingClassifier(random_state=42)),
    ]
)

In [None]:
# 🔧 Hyperparameter search space
# Keys use the "<step>__<parameter>" convention so they address the "clf"
# step of the pipeline; each value lists the candidates to sample from.
param_dist = dict(
    clf__learning_rate=[0.01, 0.05, 0.1],
    clf__max_iter=[100, 200, 300, 400],
    clf__max_depth=[5, 10, 15],
    clf__min_samples_leaf=[10, 20, 50],
)

In [None]:
# 🔍 Randomized search with cross-validation
# Samples 20 parameter combinations from param_dist and scores each one by
# accuracy over 3 stratified folds, using all available CPU cores. The
# random_state fixes which combinations are sampled.
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=20,
    scoring="accuracy",
    cv=StratifiedKFold(n_splits=3),
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

In [None]:
# 🧪 Train / validation split
# Hold out 20% of the rows for validation, stratified on the target so both
# parts keep the same class proportions; the seed makes the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

In [None]:
# 🚀 Train the model with hyperparameter search
# Only the training part is used here, so the validation rows stay unseen.
search.fit(X=X_train, y=y_train)

# Report the best parameter combination found by the search.
print("\n📌 Mejor combinación encontrada:", search.best_params_, sep="\n")

In [None]:
# 📊 Evaluate on the validation set
# search.predict delegates to the best estimator found by the search.
y_pred = search.predict(X=X_val)
acc = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"\n📊 Accuracy en validación: {acc:.5f}")

In [None]:
# 🏁 Final training on the full training set
# This refits search.best_estimator_ in place on all of X / y (validation
# rows included). Re-running the validation cell after this one would score
# a model that has already seen those rows.
search.best_estimator_.fit(X=X, y=y)

In [None]:
# 📈 Predict on the test set
# Uses the best pipeline as currently fitted; the whole test frame is passed
# in, and the pipeline applies its own preprocessing before classifying.
y_test_pred = search.best_estimator_.predict(X=test)

In [None]:
# 💾 Save predictions to a CSV file
# One row per test identifier with its predicted target value.
submission = pd.DataFrame({
    "ID": test_ids,
    "RENDIMIENTO_GLOBAL": y_test_pred
})

# The output directory is taken from the DATA_DIR environment variable so the
# notebook is not tied to one machine; when the variable is unset, the
# original location is used as the default.
output_dir = Path(os.environ.get("DATA_DIR", r"C:\Users\juan2\OneDrive\Escritorio"))
submission_path = output_dir / "submission.csv"
submission.to_csv(submission_path, index=False)

print("\n✅ submission.csv generado con el modelo ajustado.")