In [1]:
!pip uninstall -y pandas
!pip install pandas==2.2.2


Found existing installation: pandas 2.2.2
Uninstalling pandas-2.2.2:
  Successfully uninstalled pandas-2.2.2
Collecting pandas==2.2.2
  Downloading pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (19 kB)
Downloading pandas-2.2.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (12.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.7/12.7 MB[0m [31m70.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pandas
Successfully installed pandas-2.2.2


In [2]:
# ===============================================
# 03 - MODELO CON PREPROCESADO SIMPLE + CATBOOST
# ===============================================

!pip install catboost

import pandas as pd
import numpy as np

# ---------------------------
# Función de limpieza simple
# ---------------------------
def limpiar_simple(df):
    df = df.copy()

    # Quitar ID si existe
    if "ID" in df.columns:
        df.drop(columns=["ID"], inplace=True)

    # NA categóricos -> modo
    cat_cols = df.select_dtypes(include=["object"]).columns
    for c in cat_cols:
        df[c] = df[c].fillna(df[c].mode()[0])

    # NA numéricos -> media
    num_cols = df.select_dtypes(include=["int64", "float64"]).columns
    for c in num_cols:
        df[c] = df[c].fillna(df[c].mean())

    # Mapear target
    if "RENDIMIENTO_GLOBAL" in df.columns:
        mapa = {"bajo":0, "medio-bajo":1, "medio-alto":2, "alto":3}
        df["RENDIMIENTO_GLOBAL"] = df["RENDIMIENTO_GLOBAL"].replace(mapa)

    return df


# ---------------------------
# Cargar datos
# ---------------------------
train = pd.read_csv("train.csv")
train = limpiar_simple(train)

X = train.drop(columns=["RENDIMIENTO_GLOBAL"])
y = train["RENDIMIENTO_GLOBAL"]

cat_features = X.select_dtypes(include=["object"]).columns.tolist()

from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# ---------------------------
# Modelo simple
# ---------------------------
model = CatBoostClassifier(
    iterations=400,
    depth=8,
    learning_rate=0.05,
    loss_function="MultiClass",
    eval_metric="Accuracy",
    verbose=50
)

model.fit(
    X_train, y_train,
    eval_set=(X_val, y_val),
    cat_features=cat_features,
    use_best_model=True
)

pred = model.predict(X_val)
print("Accuracy:", accuracy_score(y_val, pred))


# ---------------------------
# Predicción final
# ---------------------------
test = pd.read_csv("test.csv")
ids = test["ID"].copy()
test = limpiar_simple(test)

test_pred = model.predict(test)

reverse_map = {0:"bajo",1:"medio-bajo",2:"medio-alto",3:"alto"}
test_pred = pd.Series(test_pred.flatten()).map(reverse_map)

submission = pd.DataFrame({
    "ID": ids,
    "RENDIMIENTO_GLOBAL": test_pred
})

submission.to_csv("submission_03.csv", index=False)
print("Archivo submission_03.csv generado!")


Collecting catboost
  Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp312-cp312-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


  df["RENDIMIENTO_GLOBAL"] = df["RENDIMIENTO_GLOBAL"].replace(mapa)


0:	learn: 0.4058368	test: 0.4053044	best: 0.4053044 (0)	total: 8.7s	remaining: 57m 51s
50:	learn: 0.4353632	test: 0.4331585	best: 0.4331585 (50)	total: 6m 16s	remaining: 42m 54s
100:	learn: 0.4422944	test: 0.4389772	best: 0.4389772 (97)	total: 13m 1s	remaining: 38m 33s
150:	learn: 0.4463185	test: 0.4408644	best: 0.4408829 (143)	total: 20m 2s	remaining: 33m 2s
200:	learn: 0.4487677	test: 0.4420392	best: 0.4422058 (199)	total: 26m 50s	remaining: 26m 34s
250:	learn: 0.4512284	test: 0.4434454	best: 0.4434454 (250)	total: 33m 57s	remaining: 20m 9s
300:	learn: 0.4536868	test: 0.4431493	best: 0.4434454 (250)	total: 40m 53s	remaining: 13m 27s
350:	learn: 0.4561036	test: 0.4436396	best: 0.4438709 (338)	total: 47m 51s	remaining: 6m 40s
399:	learn: 0.4577618	test: 0.4438801	best: 0.4441207 (396)	total: 54m 42s	remaining: 0us

bestTest = 0.4441206672
bestIteration = 396

Shrink model to first 397 iterations.
Accuracy: 0.44412066716620874
Archivo submission_03.csv generado!
