Imports

In [25]:
import os
import time
import random
from dataclasses import dataclass
from typing import Dict, List, Tuple, Any

import numpy as np
import pandas as pd

import requests
from tqdm.auto import tqdm

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier

Configuración

In [None]:
# =========================
# CONFIGURACIÓN GENERAL
# =========================

RUTA_TRAIN = "../Detoxis_train_kaggle.csv"
RUTA_TEST  = "../Detoxis_test_kaggle.csv"

COL_ID = "id"
COL_TEXTO_TRAIN = "text"
COL_TEXTO_TEST  = "comment"
COL_Y = "label"

PERSPECTIVE_API_KEY = os.getenv("PERSPECTIVE_API_KEY", "")

ATRIBUTOS_PERSPECTIVE = [
    "TOXICITY",
    "INSULT",
    "THREAT",
    "PROFANITY",
    "IDENTITY_ATTACK",
]
IDIOMAS = ["es"]

SEGUNDOS_MIN_ENTRE_LLAMADAS = 0.9
MAX_REINTENTOS = 6
TIMEOUT = 30

CSV_SCORES_TRAIN = "../train_perspective_scores.csv"
CSV_SCORES_TEST  = "../test_perspective_scores.csv"

CSV_SUBMISSION = "submission_Perspective_RF_Gate.csv"

SEMILLA = 42
N_SPLITS = 5
PASO_UMBRALES = 0.02

# Puerta lógica
USAR_PUERTA = True
DELTA = 0.05  

Cargar datos + limpieza

In [27]:
train_df = pd.read_csv(RUTA_TRAIN)
test_df  = pd.read_csv(RUTA_TEST)

assert COL_ID in train_df.columns
assert COL_TEXTO_TRAIN in train_df.columns
assert COL_Y in train_df.columns

assert COL_ID in test_df.columns
assert COL_TEXTO_TEST in test_df.columns

train_df[COL_TEXTO_TRAIN] = train_df[COL_TEXTO_TRAIN].fillna("").astype(str)
test_df[COL_TEXTO_TEST]   = test_df[COL_TEXTO_TEST].fillna("").astype(str)
train_df[COL_Y]           = train_df[COL_Y].astype(int)

train_df[COL_ID] = train_df[COL_ID].astype(str)
test_df[COL_ID]  = test_df[COL_ID].astype(str)

print("Train:", train_df.shape, "Test:", test_df.shape)
print(train_df[COL_Y].value_counts().sort_index())
train_df.head()

Train: (3463, 3) Test: (891, 2)
label
0    2316
1     809
2     269
3      69
Name: count, dtype: int64


Unnamed: 0,id,text,label
0,0,Pensó: Zumo para restar.,1
1,1,Como les gusta el afeitado en seco a esta gente.,1
2,2,"asi me gusta, que se maten entre ellos y en al...",2
3,3,"Loss mas valientes, los que mejor cortan nuest...",1
4,4,Costumbres...,1


Cliente Perspective + caché

In [28]:
@dataclass
class ClientePerspective:
    api_key: str
    atributos: List[str]
    idiomas: List[str]
    segundos_min_entre_llamadas: float = 1.0
    timeout: int = 30
    max_reintentos: int = 6
    max_chars_texto: int = 3000

    def __post_init__(self):
        self._ultimo_t = 0.0
        self._url = f"https://commentanalyzer.googleapis.com/v1alpha1/comments:analyze?key={self.api_key}"

    def _rate_limit(self):
        transcurrido = time.time() - self._ultimo_t
        if transcurrido < self.segundos_min_entre_llamadas:
            time.sleep(self.segundos_min_entre_llamadas - transcurrido)
        self._ultimo_t = time.time()

    def analizar(self, texto: str) -> Dict[str, float]:
        if not self.api_key:
            raise ValueError("Falta PERSPECTIVE_API_KEY en entorno.")

        texto = (texto or "")
        texto = " ".join(texto.split())
        texto = texto[: self.max_chars_texto]

        payload = {
            "comment": {"text": texto},
            "languages": self.idiomas,
            "requestedAttributes": {a: {} for a in self.atributos},
        }

        for intento in range(1, self.max_reintentos + 1):
            try:
                self._rate_limit()
                r = requests.post(self._url, json=payload, timeout=self.timeout)

                if r.status_code in (429, 500, 502, 503, 504):
                    raise RuntimeError(f"HTTP {r.status_code}: {r.text[:200]}")

                r.raise_for_status()
                js = r.json()

                out = {}
                attr_scores = js.get("attributeScores", {})
                for a in self.atributos:
                    v = attr_scores.get(a, {}).get("summaryScore", {}).get("value", np.nan)
                    out[a] = float(v) if v is not None else np.nan
                return out

            except Exception:
                if intento == self.max_reintentos:
                    return {a: np.nan for a in self.atributos}
                time.sleep(min(2 ** intento, 30) + random.random())

        return {a: np.nan for a in self.atributos}


def crear_o_cargar_scores_perspective(df, ruta_csv, cliente, col_id, col_texto):
    if os.path.exists(ruta_csv):
        cache = pd.read_csv(ruta_csv)
        if col_id not in cache.columns:
            raise ValueError(f"Cache {ruta_csv} sin columna {col_id}")
        cache[col_id] = cache[col_id].astype(str)
        ids_cache = set(cache[col_id].tolist())
    else:
        cache = pd.DataFrame(columns=[col_id] + cliente.atributos)
        ids_cache = set()

    df = df.copy()
    df[col_id] = df[col_id].astype(str)

    ids_df = df[col_id].tolist()
    por_consultar = df.loc[~df[col_id].isin(ids_cache), [col_id, col_texto]].copy()

    print(f"[Caché] {ruta_csv}: {len(ids_cache)} ya guardados.")
    print(f"[Consulta] Faltan: {len(por_consultar)} filas.")

    nuevas_filas = []
    for _, row in tqdm(por_consultar.iterrows(), total=len(por_consultar), desc=f"Perspective -> {os.path.basename(ruta_csv)}"):
        rid = str(row[col_id])
        texto = str(row[col_texto])
        scores = cliente.analizar(texto)
        nuevas_filas.append({col_id: rid, **scores})

        if len(nuevas_filas) % 50 == 0:
            tmp = pd.DataFrame(nuevas_filas)
            cache = pd.concat([cache, tmp], ignore_index=True)
            cache[col_id] = cache[col_id].astype(str)
            cache = cache.drop_duplicates(subset=[col_id], keep="last")
            cache.to_csv(ruta_csv, index=False)
            nuevas_filas = []

    if nuevas_filas:
        tmp = pd.DataFrame(nuevas_filas)
        cache = pd.concat([cache, tmp], ignore_index=True)
        cache[col_id] = cache[col_id].astype(str)
        cache = cache.drop_duplicates(subset=[col_id], keep="last")
        cache.to_csv(ruta_csv, index=False)

    cache[col_id] = cache[col_id].astype(str)
    out = cache.set_index(col_id).loc[ids_df].reset_index()
    return out

Obtener scores (train + test)

In [29]:
cliente = ClientePerspective(
    api_key=PERSPECTIVE_API_KEY,
    atributos=ATRIBUTOS_PERSPECTIVE,
    idiomas=IDIOMAS,
    segundos_min_entre_llamadas=SEGUNDOS_MIN_ENTRE_LLAMADAS,
    timeout=TIMEOUT,
    max_reintentos=MAX_REINTENTOS
)

scores_train_df = crear_o_cargar_scores_perspective(
    train_df[[COL_ID, COL_TEXTO_TRAIN]],
    CSV_SCORES_TRAIN,
    cliente,
    col_id=COL_ID,
    col_texto=COL_TEXTO_TRAIN
)

scores_test_df = crear_o_cargar_scores_perspective(
    test_df[[COL_ID, COL_TEXTO_TEST]],
    CSV_SCORES_TEST,
    cliente,
    col_id=COL_ID,
    col_texto=COL_TEXTO_TEST
)

scores_train_df.head()

[Caché] ../train_perspective_scores.csv: 3463 ya guardados.
[Consulta] Faltan: 0 filas.


Perspective -> train_perspective_scores.csv: 0it [00:00, ?it/s]

[Caché] ../test_perspective_scores.csv: 891 ya guardados.
[Consulta] Faltan: 0 filas.


Perspective -> test_perspective_scores.csv: 0it [00:00, ?it/s]

Unnamed: 0,id,TOXICITY,INSULT,THREAT,PROFANITY,IDENTITY_ATTACK
0,0,0.000628,0.004869,0.005272,0.007792,0.000134
1,1,0.040926,0.023029,0.00591,0.023311,0.005254
2,2,0.659969,0.513094,0.673875,0.235672,0.51358
3,3,0.377512,0.402434,0.348043,0.088724,0.277209
4,4,0.009739,0.010515,0.005309,0.010809,0.00061


Features + imputación

In [30]:
train_aug = train_df.merge(scores_train_df, on=COL_ID, how="left")
test_aug  = test_df.merge(scores_test_df,  on=COL_ID, how="left")

def anadir_features_basicas(df: pd.DataFrame, col_texto: str) -> pd.DataFrame:
    out = df.copy()
    txt = out[col_texto].fillna("").astype(str)
    out["long_car"] = txt.str.len().astype(float)
    out["num_pal"]  = txt.str.split().str.len().astype(float)
    out["exclam"]   = txt.str.count("!").astype(float)
    out["interrog"] = txt.str.count(r"\?").astype(float)
    out["ratio_mayus"] = txt.apply(lambda s: (sum(c.isupper() for c in s) / max(len(s), 1))).astype(float)
    out["num_links"] = txt.str.count(r"http|www").astype(float)
    return out

def anadir_features_perspective_derivadas(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    A = ATRIBUTOS_PERSPECTIVE
    out["persp_max"]  = out[A].max(axis=1)
    out["persp_mean"] = out[A].mean(axis=1)
    out["persp_std"]  = out[A].std(axis=1)
    out["persp_num_hi_07"] = (out[A] > 0.7).sum(axis=1).astype(float)
    if "TOXICITY" in out.columns and "PROFANITY" in out.columns:
        out["tox_x_prof"] = (out["TOXICITY"] * out["PROFANITY"]).astype(float)
    return out

train_aug = anadir_features_basicas(train_aug, COL_TEXTO_TRAIN)
test_aug  = anadir_features_basicas(test_aug,  COL_TEXTO_TEST)

COLS_BASE = ATRIBUTOS_PERSPECTIVE + ["long_car", "num_pal", "exclam", "interrog", "ratio_mayus", "num_links"]

for df_ in [train_aug, test_aug]:
    df_[COLS_BASE] = df_[COLS_BASE].astype(float)
    med = df_[COLS_BASE].median(numeric_only=True)
    df_[COLS_BASE] = df_[COLS_BASE].fillna(med)

train_aug = anadir_features_perspective_derivadas(train_aug)
test_aug  = anadir_features_perspective_derivadas(test_aug)

COLS_FEATURES = COLS_BASE + ["persp_max", "persp_mean", "persp_std", "persp_num_hi_07", "tox_x_prof"]

for df_ in [train_aug, test_aug]:
    df_[COLS_FEATURES] = df_[COLS_FEATURES].astype(float)
    med = df_[COLS_FEATURES].median(numeric_only=True)
    df_[COLS_FEATURES] = df_[COLS_FEATURES].fillna(med)

train_aug[COLS_FEATURES].head()


Unnamed: 0,TOXICITY,INSULT,THREAT,PROFANITY,IDENTITY_ATTACK,long_car,num_pal,exclam,interrog,ratio_mayus,num_links,persp_max,persp_mean,persp_std,persp_num_hi_07,tox_x_prof
0,0.000628,0.004869,0.005272,0.007792,0.000134,24.0,4.0,0.0,0.0,0.083333,0.0,0.007792,0.003739,0.003268,0.0,5e-06
1,0.040926,0.023029,0.00591,0.023311,0.005254,48.0,10.0,0.0,0.0,0.020833,0.0,0.040926,0.019686,0.014778,0.0,0.000954
2,0.659969,0.513094,0.673875,0.235672,0.51358,82.0,16.0,0.0,0.0,0.012195,0.0,0.673875,0.519238,0.176208,0.0,0.155536
3,0.377512,0.402434,0.348043,0.088724,0.277209,117.0,16.0,0.0,0.0,0.017094,0.0,0.402434,0.298784,0.12645,0.0,0.033494
4,0.009739,0.010515,0.005309,0.010809,0.00061,13.0,1.0,0.0,0.0,0.076923,0.0,0.010809,0.007396,0.004395,0.0,0.000105


Umbrales + búsqueda de best_thr (OOF) con tus params fijos

In [31]:
def aplicar_umbrales(scores: np.ndarray, t1: float, t2: float, t3: float) -> np.ndarray:
    return np.where(scores < t1, 0,
           np.where(scores < t2, 1,
           np.where(scores < t3, 2, 3))).astype(int)

def buscar_mejores_umbrales_macro_f1(scores: np.ndarray, y_true: np.ndarray, paso: float = 0.02):
    mejor_thr = (1.0, 2.0, 2.6)
    mejor_f1 = -1.0
    grid = np.arange(0.0, 3.0 + 1e-9, paso)

    for t1 in grid:
        for t2 in grid:
            if t2 <= t1:
                continue
            for t3 in grid:
                if t3 <= t2:
                    continue
                y_pred = aplicar_umbrales(scores, t1, t2, t3)
                f1 = f1_score(y_true, y_pred, average="macro")
                if f1 > mejor_f1:
                    mejor_f1 = float(f1)
                    mejor_thr = (float(t1), float(t2), float(t3))
    return mejor_thr, mejor_f1


# Tus hiperparámetros fijos:
BEST_PARAMS = {
    "n_estimators": 1200,
    "max_depth": None,
    "min_samples_leaf": 10,
    "min_samples_split": 13,
    "max_features": "sqrt",
    "bootstrap": True,
}

def obtener_oof_scores_con_params_fijos(df_train_aug, cols_features, col_y, params, n_splits=5, random_state=42):
    X = df_train_aug[cols_features]
    y = df_train_aug[col_y].astype(int).values

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    oof = np.zeros(len(X), dtype=float)

    for fold, (idx_tr, idx_va) in enumerate(skf.split(X, y), 1):
        X_tr, X_va = X.iloc[idx_tr], X.iloc[idx_va]
        y_tr = y[idx_tr].astype(float)

        model = RandomForestRegressor(
            **params,
            random_state=random_state,
            n_jobs=-1
        )
        model.fit(X_tr, y_tr)

        s_va = np.clip(model.predict(X_va), 0.0, 3.0)
        oof[idx_va] = s_va
        print(f"Fold {fold}/{n_splits} listo | min/max={s_va.min():.3f}/{s_va.max():.3f}")

    return oof

oof_scores = obtener_oof_scores_con_params_fijos(
    train_aug, COLS_FEATURES, COL_Y, BEST_PARAMS,
    n_splits=N_SPLITS, random_state=SEMILLA
)

best_thr, best_f1 = buscar_mejores_umbrales_macro_f1(oof_scores, train_aug[COL_Y].values, paso=PASO_UMBRALES)

print("\n✅ Umbrales OOF (best_thr):", best_thr)
print("✅ Macro-F1 OOF:", best_f1)

Fold 1/5 listo | min/max=0.013/2.415
Fold 2/5 listo | min/max=0.014/2.373
Fold 3/5 listo | min/max=0.030/2.464
Fold 4/5 listo | min/max=0.018/2.396
Fold 5/5 listo | min/max=0.022/2.451

✅ Umbrales OOF (best_thr): (0.44, 1.18, 1.82)
✅ Macro-F1 OOF: 0.49197335007532544


Reporte OOF con best_thr

In [32]:
pred_oof = aplicar_umbrales(oof_scores, *best_thr)
y_true = train_aug[COL_Y].values

print("Reporte OOF:")
print(classification_report(y_true, pred_oof, digits=4))

print("Matriz confusión OOF:")
print(confusion_matrix(y_true, pred_oof))

print("Distribución clases predichas (OOF):")
print(pd.Series(pred_oof).value_counts().sort_index())

Reporte OOF:
              precision    recall  f1-score   support

           0     0.8125    0.7897    0.8010      2316
           1     0.3814    0.4512    0.4134       809
           2     0.3978    0.2677    0.3200       269
           3     0.4189    0.4493    0.4336        69

    accuracy                         0.6633      3463
   macro avg     0.5027    0.4895    0.4920      3463
weighted avg     0.6718    0.6633    0.6657      3463

Matriz confusión OOF:
[[1829  470   15    2]
 [ 355  365   74   15]
 [  61  110   72   26]
 [   6   12   20   31]]
Distribución clases predichas (OOF):
0    2251
1     957
2     181
3      74
Name: count, dtype: int64


Puerta lógica (expertos binarios)

In [33]:
def entrenar_expertos_binarios(train_aug, cols_features, col_y, random_state=42):
    expertos = {}
    for a, b in [(0,1), (1,2), (2,3)]:
        df_sub = train_aug[train_aug[col_y].isin([a,b])].copy()
        Xb = df_sub[cols_features]
        yb = (df_sub[col_y].values == b).astype(int)

        clf = RandomForestClassifier(
            n_estimators=1200,
            max_depth=None,
            min_samples_leaf=5,
            min_samples_split=10,
            max_features="sqrt",
            bootstrap=True,
            random_state=random_state,
            n_jobs=-1,
            class_weight="balanced_subsample"
        )
        clf.fit(Xb, yb)
        expertos[(a,b)] = clf
        print(f"Experto ({a} vs {b}) entrenado | muestras={len(df_sub)} | %b={yb.mean()*100:.2f}%")
    return expertos


def aplicar_umbrales_con_puerta(scores, thr, X_features, expertos, delta=0.03):
    t1, t2, t3 = thr
    pred = aplicar_umbrales(scores, t1, t2, t3).copy()

    idx_01 = np.where(np.abs(scores - t1) <= delta)[0]
    idx_12 = np.where(np.abs(scores - t2) <= delta)[0]
    idx_23 = np.where(np.abs(scores - t3) <= delta)[0]

    if len(idx_01) > 0:
        proba_1 = expertos[(0,1)].predict_proba(X_features.iloc[idx_01])[:, 1]
        pred[idx_01] = (proba_1 >= 0.5).astype(int)

    if len(idx_12) > 0:
        proba_2 = expertos[(1,2)].predict_proba(X_features.iloc[idx_12])[:, 1]
        pred[idx_12] = np.where(proba_2 >= 0.5, 2, 1)

    if len(idx_23) > 0:
        proba_3 = expertos[(2,3)].predict_proba(X_features.iloc[idx_23])[:, 1]
        pred[idx_23] = np.where(proba_3 >= 0.5, 3, 2)

    return pred

Entrenar FINAL con tus params + aplicar best_thr (OOF) + puerta opcional

In [None]:
# Entrenar modelo final con TODO el train usando tus params fijos
X_full = train_aug[COLS_FEATURES]
y_full = train_aug[COL_Y].astype(float).values

modelo_final = RandomForestRegressor(
    **BEST_PARAMS,
    random_state=SEMILLA,
    n_jobs=-1
)
modelo_final.fit(X_full, y_full)

# Scores train/test
scores_train = np.clip(modelo_final.predict(X_full), 0.0, 3.0)
scores_test  = np.clip(modelo_final.predict(test_aug[COLS_FEATURES]), 0.0, 3.0)

# Predicciones con umbrales OOF best_thr 
if USAR_PUERTA:
    expertos = entrenar_expertos_binarios(train_aug, COLS_FEATURES, COL_Y, random_state=SEMILLA)
    pred_train = aplicar_umbrales_con_puerta(scores_train, best_thr, X_full, expertos, delta=DELTA)
    pred_test  = aplicar_umbrales_con_puerta(scores_test,  best_thr, test_aug[COLS_FEATURES], expertos, delta=DELTA)
else:
    pred_train = aplicar_umbrales(scores_train, *best_thr)
    pred_test  = aplicar_umbrales(scores_test,  *best_thr)

def oof_pred_con_puerta(
    df_train_aug,
    cols_features,
    col_y,
    base_params,
    thr,
    n_splits=5,
    random_state=42,
    delta=0.03
):
    X = df_train_aug[cols_features]
    y = df_train_aug[col_y].astype(int).values

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    pred_oof_gate = np.zeros(len(X), dtype=int)

    for fold, (idx_tr, idx_va) in enumerate(skf.split(X, y), 1):
        X_tr, X_va = X.iloc[idx_tr], X.iloc[idx_va]
        y_tr = y[idx_tr]

        # ---- modelo base del fold ----
        base = RandomForestRegressor(
            **base_params,
            random_state=random_state,
            n_jobs=-1
        )
        base.fit(X_tr, y_tr.astype(float))
        scores_va = np.clip(base.predict(X_va), 0.0, 3.0)

        # ---- expertos entrenados SOLO con train-fold ----
        expertos = {}
        for a, b in [(0,1), (1,2), (2,3)]:
            mask = np.isin(y_tr, [a, b])
            Xb = X_tr.iloc[mask]
            yb = (y_tr[mask] == b).astype(int)

            clf = RandomForestClassifier(
                n_estimators=base_params["n_estimators"],
                max_depth=base_params["max_depth"],
                min_samples_leaf=base_params["min_samples_leaf"],
                min_samples_split=base_params["min_samples_split"],
                max_features=base_params["max_features"],
                bootstrap=base_params["bootstrap"],
                random_state=random_state,
                n_jobs=-1,
                class_weight="balanced_subsample"
            )
            clf.fit(Xb, yb)
            expertos[(a, b)] = clf

        # ---- aplicar puerta SOLO en val ----
        pred_va = aplicar_umbrales_con_puerta(
            scores=scores_va,
            thr=thr,
            X_features=X_va,
            expertos=expertos,
            delta=delta
        )

        pred_oof_gate[idx_va] = pred_va
        print(f"Fold {fold}/{n_splits} listo (puerta en VAL)")

    return pred_oof_gate

pred_oof_gate = oof_pred_con_puerta(
    train_aug,
    COLS_FEATURES,
    COL_Y,
    BEST_PARAMS,
    best_thr,
    n_splits=N_SPLITS,
    random_state=SEMILLA,
    delta=DELTA
)

print("\nVAL / OOF (con puerta lógica)")
print(classification_report(train_aug[COL_Y].values, pred_oof_gate, digits=4))
print("Confusión VAL / OOF (puerta):")
print(confusion_matrix(train_aug[COL_Y].values, pred_oof_gate))

print("----------------------------------------------------------------------")

print("\nTEST (distribución)")
print(pd.Series(pred_test).value_counts().sort_index())
print("Rango scores_test:", float(scores_test.min()), float(scores_test.max()))

Experto (0 vs 1) entrenado | muestras=3125 | %b=25.89%
Experto (1 vs 2) entrenado | muestras=1078 | %b=24.95%
Experto (2 vs 3) entrenado | muestras=338 | %b=20.41%
Fold 1/5 listo (puerta en VAL)
Fold 2/5 listo (puerta en VAL)
Fold 3/5 listo (puerta en VAL)
Fold 4/5 listo (puerta en VAL)
Fold 5/5 listo (puerta en VAL)

VAL / OOF (con puerta lógica)
              precision    recall  f1-score   support

           0     0.8113    0.7910    0.8010      2316
           1     0.3784    0.4425    0.4080       809
           2     0.3807    0.2491    0.3011       269
           3     0.3735    0.4493    0.4079        69

    accuracy                         0.6607      3463
   macro avg     0.4860    0.4830    0.4795      3463
weighted avg     0.6680    0.6607    0.6626      3463

Confusión VAL / OOF (puerta):
[[1832  465   17    2]
 [ 361  358   71   19]
 [  59  112   67   31]
 [   6   11   21   31]]
----------------------------------------------------------------------

TEST (distribución)


Submission

In [35]:
submission = pd.DataFrame({
    "id": test_aug[COL_ID].values,
    "label": pred_test.astype(int)
})

submission.to_csv(CSV_SUBMISSION, index=False)
print("Guardado:", CSV_SUBMISSION)
submission.head()


Guardado: submission_Perspective_RF_Gate.csv


Unnamed: 0,id,label
0,0,1
1,1,1
2,2,0
3,3,1
4,4,0
