MODELO SOLUCIÓN

In [None]:
# ===============================================
# 1. Importación de librerías
# ===============================================
!pip install catboost
!pip install optuna

import pandas as pd
import numpy as np


# ===============================================
# 2. Función de limpieza y preprocesado
# ===============================================
def limpiar(df):
    """Clean and preprocess a raw train/test DataFrame.

    The input is never modified; all work happens on a copy. Steps:

    1. Drop the ``ID`` column when present.
    2. Impute missing *feature* values: ``"NA"`` for object columns (which
       are also cast to ``str``) and the column median for int64/float64
       columns.
    3. Encode the ``RENDIMIENTO_GLOBAL`` target as integers 0-3 when the
       column is present.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. ``ID`` and ``RENDIMIENTO_GLOBAL`` are both optional.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``.

    Notes
    -----
    Medians are computed from ``df`` itself, so every frame passed in is
    imputed with its own statistics.
    """
    df = df.copy()
    target = "RENDIMIENTO_GLOBAL"

    # ---------------------------
    # 2.1 Drop irrelevant columns
    # ---------------------------
    if "ID" in df.columns:
        df = df.drop(columns=["ID"])

    # ---------------------------
    # 2.2 Identify column types
    # ---------------------------
    # The target is excluded on purpose: it is an object column, so it would
    # otherwise be imputed with "NA" and gain a bogus extra class.
    feature_cols = [col for col in df.columns if col != target]
    cat_cols = df[feature_cols].select_dtypes(include=["object"]).columns
    num_cols = df[feature_cols].select_dtypes(include=["int64", "float64"]).columns

    # ---------------------------
    # 2.3 Missing-value handling
    # ---------------------------

    # Categorical -> "NA" placeholder, stored as plain strings
    if len(cat_cols) > 0:
        df[cat_cols] = df[cat_cols].fillna("NA").astype(str)

    # Numeric -> median of the column
    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # ---------------------------
    # 2.4 Encode the target as integers
    # ---------------------------
    if target in df.columns:
        mapa = {
            "bajo": 0,
            "medio-bajo": 1,
            "medio-alto": 2,
            "alto": 3
        }
        # Element-wise lookup instead of Series.replace(): avoids the
        # deprecated silent downcasting in replace(). Values that are not
        # keys of `mapa` (including missing labels) pass through unchanged.
        df[target] = df[target].map(lambda etiqueta: mapa.get(etiqueta, etiqueta))

    return df


# ===============================================
# 3. Cargar datos y aplicar limpieza
# ===============================================
# Read the raw training set and pass it straight through the cleaning step.
train = limpiar(pd.read_csv("train.csv"))

# Target vector first, then the feature matrix (everything but the target).
y = train["RENDIMIENTO_GLOBAL"]
X = train.drop(columns=["RENDIMIENTO_GLOBAL"])

# Names of the object-typed feature columns; passed to CatBoost as cat_features.
cat_features = list(X.select_dtypes(include=["object"]).columns)

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Stratified 80/20 hold-out split with a fixed seed so it can be reproduced.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Collecting optuna
  Downloading optuna-4.6.0-py3-none-any.whl.metadata (17 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.10.1-py3-none-any.whl.metadata (11 kB)
Downloading optuna-4.6.0-py3-none-any.whl (404 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m404.7/404.7 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading colorlog-6.10.1-py3-none-any.whl (11 kB)
Installing collected packages: colorlog, optuna
Successfully installed colorlog-6.10.1 optuna-4.6.0


In [None]:
import pandas as pd
import numpy as np

In [None]:
def limpiar(df):
    """Return a cleaned copy of a raw train/test DataFrame.

    Drops ``ID`` when present, imputes missing feature values (``"NA"`` for
    object columns, the column median for int64/float64 columns) and, when
    ``RENDIMIENTO_GLOBAL`` is present, encodes it as integers 0-3. The
    caller's frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. ``ID`` and ``RENDIMIENTO_GLOBAL`` are both optional.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of ``df``. Medians come from ``df`` itself.
    """
    df = df.copy()
    target = "RENDIMIENTO_GLOBAL"

    # Drop ID
    if "ID" in df.columns:
        df = df.drop(columns=["ID"])

    # Identify feature columns by dtype. The target is left out so that
    # missing labels are not imputed with the "NA" placeholder.
    feature_cols = [col for col in df.columns if col != target]
    cat_cols = df[feature_cols].select_dtypes(include=["object"]).columns
    num_cols = df[feature_cols].select_dtypes(include=["int64", "float64"]).columns

    # 1. Missing categoricals -> "NA"
    if len(cat_cols) > 0:
        df[cat_cols] = df[cat_cols].fillna("NA").astype(str)

    # 2. Missing numerics -> column median
    if len(num_cols) > 0:
        df[num_cols] = df[num_cols].fillna(df[num_cols].median())

    # 3. Encode the target
    if target in df.columns:
        mapa = {
            "bajo": 0,
            "medio-bajo": 1,
            "medio-alto": 2,
            "alto": 3
        }
        # Per-element lookup rather than Series.replace(), which relies on
        # deprecated silent downcasting. Values outside `mapa` (including
        # missing labels) are passed through unchanged.
        df[target] = df[target].map(lambda etiqueta: mapa.get(etiqueta, etiqueta))

    return df


In [None]:
# Load the training CSV and clean it in one step.
train = limpiar(pd.read_csv("train.csv"))

# Split the cleaned frame into target (y) and features (X).
y = train["RENDIMIENTO_GLOBAL"]
X = train.drop(columns=["RENDIMIENTO_GLOBAL"])

# Object-typed feature columns, later passed to CatBoost as cat_features.
cat_features = list(X.select_dtypes(include=["object"]).columns)

  df["RENDIMIENTO_GLOBAL"] = df["RENDIMIENTO_GLOBAL"].replace(mapa)


In [None]:
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Hold out 20% of the rows for validation, stratified on the target and
# seeded so the split is reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    stratify=y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# CatBoost multi-class classifier. Keyword arguments are grouped by role.
# NOTE(review): the recorded training log shows roughly half a minute per
# iteration with these settings, so a full fit takes several hours.
model = CatBoostClassifier(
    # Objective and the metric reported on the eval set
    loss_function='MultiClass',
    eval_metric='Accuracy',
    auto_class_weights='Balanced',
    # Boosting schedule and tree size
    iterations=900,
    learning_rate=0.03,
    depth=10,
    early_stopping_rounds=80,
    # Regularisation / randomisation
    l2_leaf_reg=6,
    random_strength=2,
    bagging_temperature=0.5,
    border_count=254,
    # Reproducibility and logging (one log line every 50 iterations)
    random_seed=42,
    verbose=50,
)


In [None]:
# Train on the training split, monitoring the hold-out split as eval set;
# use_best_model=True keeps the iteration that scored best on that set.
model.fit(
    X_train,
    y_train,
    cat_features=cat_features,
    eval_set=(X_val, y_val),
    use_best_model=True,
)

# Accuracy of the fitted model on the hold-out split.
pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, pred))

0:	learn: 0.4104767	test: 0.4067965	best: 0.4067965 (0)	total: 34.8s	remaining: 8h 41m 42s
50:	learn: 0.4309658	test: 0.4275199	best: 0.4275199 (50)	total: 24m 4s	remaining: 6h 40m 40s
100:	learn: 0.4401183	test: 0.4349463	best: 0.4349815 (98)	total: 50m 20s	remaining: 6h 38m 16s
150:	learn: 0.4440702	test: 0.4380629	best: 0.4380629 (150)	total: 1h 16m 46s	remaining: 6h 20m 51s
200:	learn: 0.4467148	test: 0.4399737	best: 0.4399737 (200)	total: 1h 40m 12s	remaining: 5h 48m 30s
250:	learn: 0.4489721	test: 0.4412241	best: 0.4412241 (250)	total: 2h 5m 50s	remaining: 5h 25m 21s
300:	learn: 0.4511720	test: 0.4421314	best: 0.4421314 (300)	total: 2h 29m 25s	remaining: 4h 57m 22s
350:	learn: 0.4527165	test: 0.4423079	best: 0.4424515 (321)	total: 2h 53m 59s	remaining: 4h 32m 8s
400:	learn: 0.4542072	test: 0.4426813	best: 0.4428241 (381)	total: 3h 17m 44s	remaining: 4h 6m 4s
450:	learn: 0.4562625	test: 0.4427999	best: 0.4429959 (443)	total: 3h 41m 58s	remaining: 3h 40m 59s
500:	learn: 0.4587454	t

In [None]:
# Load the test set, keeping the IDs aside because limpiar() drops that column.
test = pd.read_csv("test.csv")
ids = test["ID"].copy()
# NOTE(review): limpiar() imputes with the medians of the frame it receives,
# so the test set is filled with its own medians, not the training ones.
test = limpiar(test)

# Integer class -> original label, the inverse of the encoding in limpiar().
reverse_map = {0: "bajo", 1: "medio-bajo", 2: "medio-alto", 3: "alto"}

# Predict, flatten the prediction array to 1-D and decode back to labels.
test_pred = model.predict(test)
test_pred = pd.Series(test_pred.flatten()).map(reverse_map)

# Assemble the submission table and write it without the index column.
submission = pd.DataFrame({"ID": ids, "RENDIMIENTO_GLOBAL": test_pred})
submission.to_csv("submission.csv", index=False)
print("Archivo submission.csv generado correctamente!")

Archivo submission.csv generado correctamente!
