<a href="https://colab.research.google.com/github/JuanOspina3/Proyecto-UDEA-ai4eng-20251---Pruebas-Saber-Pro-Colombia/blob/main/99%20-%20modelo%20soluci%C3%B3n.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.metrics import accuracy_score

# ===============================
# 1. DATA LOADING
# ===============================
# The labelled training set and the unlabelled test set are read from local
# CSV files.
# NOTE(review): both paths are absolute locations on one Windows machine; the
# notebook will not run elsewhere (e.g. on Colab) until they are adjusted.
train = pd.read_csv(
    filepath_or_buffer=r"C:\Users\juan2\OneDrive\Escritorio\train.csv"
)
test = pd.read_csv(
    filepath_or_buffer=r"C:\Users\juan2\OneDrive\Escritorio\test.csv"
)

# Features are every column except the target; the target is kept separately.
# NOTE(review): only the target is removed, so any identifier column present
# in train stays in X as a feature -- confirm that is intended.
target_col = "RENDIMIENTO_GLOBAL"
X = train.drop(columns=target_col)
y = train[target_col]

# Test-set identifiers are saved now so they can label the final predictions.
test_ids = test["ID"]

# ===============================
# 2. PREPROCESSING
# ===============================
# Columns with object/category dtype are treated as categorical features.
cat_cols = list(X.select_dtypes(include=["object", "category"]).columns)

# Categorical columns: fill missing values with the most frequent category,
# then map categories to integer codes. Categories not seen during fit are
# encoded as -1 instead of raising.
categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    (
        "encoder",
        OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1),
    ),
])

# Every column not listed in cat_cols is passed through untouched.
preprocessor = ColumnTransformer(
    transformers=[("cat", categorical_steps, cat_cols)],
    remainder="passthrough",
)

# ===============================
# 3. MODEL AND PIPELINE
# ===============================
# Gradient-boosting classifier with a fixed seed for reproducible fits.
classifier = HistGradientBoostingClassifier(random_state=42)

# Preprocessing and classifier are chained so both are fitted together; the
# step names "pre" and "clf" are the prefixes used by the search space below.
pipe = Pipeline(steps=[("pre", preprocessor), ("clf", classifier)])

# ===============================
# 4. HYPERPARAMETER SEARCH
# ===============================
# Candidate values for the classifier step; the "clf__" prefix routes each
# entry to the "clf" step of the pipeline.
param_dist = {
    "clf__learning_rate": [0.01, 0.05, 0.1],
    "clf__max_iter": [100, 200, 300, 400],
    "clf__max_depth": [5, 10, 15],
    "clf__min_samples_leaf": [10, 20, 50],
}

# Three stratified folds for cross-validation.
cv_splitter = StratifiedKFold(n_splits=3)

# Randomized search: 20 sampled combinations scored by accuracy, run on all
# available cores, with a fixed seed so the sampled combinations repeat.
search = RandomizedSearchCV(
    estimator=pipe,
    param_distributions=param_dist,
    n_iter=20,
    scoring="accuracy",
    cv=cv_splitter,
    n_jobs=-1,
    random_state=42,
    verbose=2,
)

# ===============================
# 5. VALIDATION AND TRAINING
# ===============================
# Hold out 20% of the labelled data, stratified on the target, so the tuned
# model can be scored on rows the search never saw.
split = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = split

# Run the hyperparameter search on the training portion only.
search.fit(X_train, y_train)
print("\n\U0001F4CC Mejor combinación encontrada:", search.best_params_, sep="\n")

# Score the search's predictions on the hold-out portion.
y_pred = search.predict(X_val)
acc = accuracy_score(y_true=y_val, y_pred=y_pred)
print(f"\n\U0001F4CA Accuracy en validación: {acc:.5f}")

# ===============================
# 6. FINAL SUBMISSION
# ===============================
# Refit the best pipeline found by the search on all labelled data (X, y)
# before predicting the test set. This refits search.best_estimator_ in place.
final_model = search.best_estimator_
final_model.fit(X, y)
y_test_pred = final_model.predict(test)

# Pair each test ID with its predicted label and write the submission file.
submission = pd.DataFrame(
    {
        "ID": test_ids,
        "RENDIMIENTO_GLOBAL": y_test_pred,
    }
)
submission.to_csv(
    r"C:\Users\juan2\OneDrive\Escritorio\submission.csv",
    index=False,
)
print("\n✅ submission.csv generado con el modelo ajustado.")
