# **Reentrenamiento del modelo SVM+TF-IDF con datos actualizados**

Cómo usar este notebook en la rutina de revisión del Boletín Oficial:

1. Inferencia diaria (BO_SVM.ipynb) → _svm_preds.csv.

2. Revisión en la interfaz Gradio (BO_SVM_feedback.ipynb) → actualiza train_feedback_master.csv.

Una vez que se haya juntado una cantidad suficiente de nuevos registros etiquetados, se corre este notebook, que reentrena el SVM con TRAIN + FEEDBACK, usa VAL para recalcular los umbrales y evalúa en TEST.

Guarda un nuevo svm_tfidf_pipeline.joblib e imprime el nuevo thr_f2, el cual debe copiarse a THRESHOLD_SVM en el notebook de inferencia (BO_SVM.ipynb, celda 4).

## **Celda 1 – Montar Drive, imports y rutas**

In [None]:
from google.colab import drive
import os
import pandas as pd
import numpy as np

# Mount Google Drive so the project folder is reachable under /content/drive.
drive.mount("/content/drive")

# Project root on Drive; every other path below is derived from it.
BASE = "/content/drive/MyDrive/IA/Proyectos/AnÃ¡lisis BoletÃ­n Oficial/boletin-ml"

# Labelled splits, plus the feedback accumulated from the Gradio interface.
LABELS_DIR = os.path.join(BASE, "data", "labels")
CSV_TRAIN = os.path.join(LABELS_DIR, "dataset_train_final.csv")
CSV_VAL = os.path.join(LABELS_DIR, "dataset_val_final.csv")
CSV_TEST = os.path.join(LABELS_DIR, "dataset_test_final.csv")
CSV_FEEDBACK = os.path.join(LABELS_DIR, "train_feedback_master.csv")

# Where the SVM+TF-IDF model is stored (the same file the demo uses).
MODEL_DIR = os.path.join(BASE, "models", "demo_svm")
MODEL_PATH = os.path.join(MODEL_DIR, "svm_tfidf_pipeline.joblib")
os.makedirs(MODEL_DIR, exist_ok=True)

# Echo the resolved paths so a wrong mount or folder name is spotted early.
for tag, location in (
    ("BASE:", BASE),
    ("TRAIN:", CSV_TRAIN),
    ("VAL:", CSV_VAL),
    ("TEST:", CSV_TEST),
    ("FEEDBACK:", CSV_FEEDBACK),
    ("MODEL_PATH:", MODEL_PATH),
):
    print(tag, location)

## **Celda 2 – Funciones para cargar y unificar datasets**

In [None]:
def read_csv_semicolon(path):
    """Read a semicolon-separated CSV with every column kept as text.

    The file is decoded as ``utf-8-sig``, all columns are loaded as
    ``str``, and pandas' default NA detection is switched off so that
    empty cells stay empty strings instead of becoming NaN.

    Args:
        path: Location of the CSV file.

    Returns:
        A ``pandas.DataFrame`` whose columns all hold strings.
    """
    read_options = {
        "sep": ";",
        "encoding": "utf-8-sig",
        "dtype": str,
        "keep_default_na": False,
    }
    return pd.read_csv(path, **read_options)

def coerce_label(series):
    """Convert a label column to integer 0/1, rejecting anything else.

    Values are cast to text and stripped of surrounding whitespace; the
    exact strings ``"True"`` and ``"False"`` are mapped to ``"1"`` and
    ``"0"`` before validation.

    Args:
        series: ``pandas.Series`` holding the raw labels.

    Returns:
        A ``pandas.Series`` of integers containing only 0 and 1.

    Raises:
        ValueError: If any value is not ``"0"`` or ``"1"`` after
            normalisation.
    """
    normalized = (
        series.astype(str)
        .str.strip()
        .replace({"True": "1", "False": "0"})
    )
    observed = normalized.unique()
    # Any value outside {"0", "1"} means the file is not cleanly labelled.
    if set(observed) - {"0", "1"}:
        raise ValueError(f"Valores no binarios en 'label': {observed}")
    return normalized.astype(int)

def load_split(path, name=""):
    """Load one labelled split from a semicolon-separated CSV and clean it.

    The text column ``contexto`` is stripped of surrounding whitespace,
    ``label`` is converted to integer 0/1, and rows whose text is empty
    are dropped. Labels are validated before the empty-text rows are
    removed, so an invalid label raises even on a row that would have
    been discarded. A short summary is printed.

    Args:
        path: Location of the CSV file.
        name: Tag used as a prefix in printed and error messages.

    Returns:
        The cleaned ``pandas.DataFrame`` (original row index preserved).

    Raises:
        ValueError: If ``contexto`` or ``label`` is missing, or if the
            labels are not binary.
    """
    frame = read_csv_semicolon(path)

    # Check the required columns in a fixed order: text first, then label.
    required = (
        ("contexto", f"{name}: falta columna 'contexto'"),
        ("label", f"{name}: falta columna 'label' (0/1)"),
    )
    for column, message in required:
        if column not in frame.columns:
            raise ValueError(message)

    frame["contexto"] = frame["contexto"].astype(str).str.strip()
    frame["label"] = coerce_label(frame["label"])

    # Drop rows without text; the column is already stripped at this point.
    total_rows = len(frame)
    has_text = frame["contexto"].ne("")
    frame = frame[has_text]
    kept_rows = len(frame)

    print(f"{name}: {total_rows} filas, {kept_rows} con texto no vacÃ­o.")
    class_counts = frame["label"].value_counts().sort_index()
    print(f"{name}: distribuciÃ³n de clases:\n{class_counts}\n")
    return frame

# Load the three base splits.
train_df = load_split(CSV_TRAIN, "TRAIN")
val_df = load_split(CSV_VAL, "VAL")
test_df = load_split(CSV_TEST, "TEST")

# Feedback is optional: it only exists once the Gradio review has been used.
if os.path.exists(CSV_FEEDBACK):
    fb_df = load_split(CSV_FEEDBACK, "FEEDBACK")
    print("FEEDBACK cargado.")
else:
    # Use a zero-row slice of TRAIN instead of pd.DataFrame(columns=...):
    # the slice keeps every column's dtype (notably the integer 'label'),
    # whereas an empty frame built from column names alone is all-object
    # and can turn 'label' into an object column when concatenated.
    fb_df = train_df.iloc[0:0].copy()
    print("No se encontrÃ³ feedback, se usarÃ¡ solo TRAIN base.")

# Production retraining set = base TRAIN + FEEDBACK.
train_full = pd.concat([train_df, fb_df], ignore_index=True)
# Keep labels as integers no matter how concat resolved the dtype, so the
# classifier always receives a numeric binary target.
train_full["label"] = train_full["label"].astype(int)

print("=== RESUMEN ===")
print("train_full:", len(train_full))
print("val:", len(val_df))
print("test:", len(test_df))
print("DistribuciÃ³n train_full:\n", train_full["label"].value_counts().sort_index())

## **Celda 3 – Definir SVM+TF-IDF y GridSearchCV**

In [None]:
!pip install scikit-learn > /dev/null

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV

from sklearn.metrics import (
    average_precision_score,
    roc_auc_score,
)

TEXT_COL = "contexto"

# Training inputs: raw text and binary labels taken from train_full.
X_train = train_full[TEXT_COL].values
y_train = train_full["label"].values

# TF-IDF over lowercased unigrams and bigrams; accents are left untouched.
# Adjust n-grams, min_df, etc. here if needed.
tfidf_step = TfidfVectorizer(
    lowercase=True,
    strip_accents=None,
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
)
svm_pipe = Pipeline(steps=[("tfidf", tfidf_step), ("clf", LinearSVC())])

# Only the SVM regularisation strength is tuned (same range as the baseline).
param_grid = {"clf__C": [0.2, 1.0, 5.0]}

# Stratified 5-fold CV with a fixed seed so the folds are reproducible.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Both metrics are recorded; "ap" (AUC-PR) is the one used to pick the model.
scoring = {"ap": "average_precision", "roc": "roc_auc"}

grid = GridSearchCV(
    estimator=svm_pipe,
    param_grid=param_grid,
    scoring=scoring,
    refit="ap",
    cv=cv,
    n_jobs=-1,
    verbose=2,
)

print("Entrenando GridSearchCV sobre train_full...")
grid.fit(X_train, y_train)

print("Mejores hiperparÃ¡metros:", grid.best_params_)
print("Mejor AUC-PR en CV:", grid.best_score_)

## **Celda 4 – Buscar umbral t_F2 usando el set de validación**

In [None]:
from sklearn.metrics import precision_recall_fscore_support

# Best pipeline found by the grid search.
best_svm = grid.best_estimator_

X_val, y_val = val_df[TEXT_COL].values, val_df["label"].values

# LinearSVC has no predict_proba, so the signed decision_function value
# is used as the ranking score.
scores_val = best_svm.decision_function(X_val)

def eval_at_threshold(y_true, scores, thr, beta=2.0):
    """Compute confusion counts, precision, recall and F-beta at a threshold.

    A sample is predicted positive when its score is ``>= thr``.

    Args:
        y_true: Binary ground-truth labels (0/1). Any array-like is
            accepted (list, NumPy array, pandas Series).
        scores: Decision scores, same length as ``y_true``; any array-like.
        thr: Decision threshold applied to ``scores``.
        beta: Weight of recall relative to precision in the F-beta score.

    Returns:
        dict with keys ``thr``, ``precision``, ``recall``, ``f_beta``,
        ``tp``, ``fn``, ``fp`` and ``tn``. Precision, recall and F-beta
        are 0.0 whenever their denominator would be zero.
    """
    # Coerce to arrays: with a plain list, ``y_true == 1`` is a single
    # bool (all counts silently 0) and ``scores >= thr`` raises TypeError.
    y_true = np.asarray(y_true)
    scores = np.asarray(scores)

    y_pred = (scores >= thr).astype(int)
    actual_pos = y_true == 1
    actual_neg = y_true == 0
    pred_pos = y_pred == 1
    pred_neg = y_pred == 0

    tp = int((actual_pos & pred_pos).sum())
    fn = int((actual_pos & pred_neg).sum())
    fp = int((actual_neg & pred_pos).sum())
    tn = int((actual_neg & pred_neg).sum())

    prec = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    rec = tp / (tp + fn) if (tp + fn) > 0 else 0.0

    beta_sq = beta ** 2
    if prec + rec > 0:
        f_beta = (1 + beta_sq) * prec * rec / (beta_sq * prec + rec)
    else:
        f_beta = 0.0

    return dict(
        thr=thr,
        precision=prec,
        recall=rec,
        f_beta=f_beta,
        tp=tp, fn=fn, fp=fp, tn=tn,
    )

# Rather than trying every score as a threshold, evaluate a grid of 50
# quantiles (5% to 95%) taken over the distinct validation scores.
unique_scores = np.unique(scores_val)
qs = np.linspace(0.05, 0.95, 50)
thr_candidates = np.quantile(unique_scores, qs)

# One metrics row per candidate threshold (F-beta with the default beta=2).
rows = [
    eval_at_threshold(y_val, scores_val, candidate)
    for candidate in thr_candidates
]
thr_df = pd.DataFrame(rows)

# The threshold with the highest F2 on validation is the one to deploy.
ranked_by_f2 = thr_df.sort_values("f_beta", ascending=False)
best_f2_row = ranked_by_f2.iloc[0]
thr_f2 = float(best_f2_row["thr"])

print("=== Mejor umbral t_F2 en VALIDACIÃ“N ===")
print(best_f2_row)

# Threshold-independent ranking quality on validation.
auc_roc_val = roc_auc_score(y_val, scores_val)
auc_pr_val = average_precision_score(y_val, scores_val)
print(f"AUC-ROC (val): {auc_roc_val:.3f}")
print(f"AUC-PR  (val): {auc_pr_val:.3f}")

## **Celda 5 – Evaluar en TEST con t_F2**

In [None]:
from sklearn.metrics import classification_report, confusion_matrix

# Held-out test texts and labels, scored with the selected pipeline.
X_test, y_test = test_df[TEXT_COL].values, test_df["label"].values

scores_test = best_svm.decision_function(X_test)

def eval_on_test(y_true, scores, thr):
    """Evaluate scores against binary labels at a fixed threshold.

    The confusion counts, precision, recall and F2 come from
    ``eval_at_threshold`` (defined with the threshold search), so the
    test evaluation uses exactly the same rule as the threshold search:
    a sample is positive when its score is ``>= thr``.

    Args:
        y_true: Binary ground-truth labels (0/1); any array-like.
        scores: Decision scores, same length as ``y_true``; any array-like.
        thr: Decision threshold applied to ``scores``.

    Returns:
        dict with keys ``precision``, ``recall``, ``f1``, ``f2``, ``tp``,
        ``fn``, ``fp``, ``tn``, ``auc_roc`` and ``auc_pr``. Precision,
        recall, F1 and F2 are 0.0 whenever their denominator would be zero.
    """
    # Coerce once so list inputs behave like arrays in every step below.
    y_true = np.asarray(y_true)
    scores = np.asarray(scores)

    # Reuse the shared threshold metrics instead of duplicating the formulas.
    at_thr = eval_at_threshold(y_true, scores, thr, beta=2.0)
    prec = at_thr["precision"]
    rec = at_thr["recall"]
    f1 = (2 * prec * rec / (prec + rec)) if (prec + rec) > 0 else 0.0

    return {
        "precision": prec,
        "recall": rec,
        "f1": f1,
        "f2": at_thr["f_beta"],
        "tp": at_thr["tp"],
        "fn": at_thr["fn"],
        "fp": at_thr["fp"],
        "tn": at_thr["tn"],
        # Threshold-independent ranking metrics computed on the raw scores.
        "auc_roc": roc_auc_score(y_true, scores),
        "auc_pr": average_precision_score(y_true, scores),
    }

test_metrics = eval_on_test(y_test, scores_test, thr_f2)

# Floats are shown with three decimals; counts are printed as they are.
print("=== MÃ©tricas en TEST (usando t_F2) ===")
for metric_name, metric_value in test_metrics.items():
    if isinstance(metric_value, float):
        shown = f"{metric_value:.3f}"
    else:
        shown = metric_value
    print(f"{metric_name}: {shown}")

# Hard predictions at the selected threshold, shared by both reports below.
y_pred_test = (scores_test >= thr_f2).astype(int)

print("\nMatriz de confusiÃ³n (TEST):")
print(confusion_matrix(y_test, y_pred_test))

print("\nReporte de clasificaciÃ³n (TEST):")
print(classification_report(y_test, y_pred_test))

## **Celda 6 – Guardar el modelo y (opcional) los umbrales**

In [None]:
from joblib import dump
import json

# Persist the retrained pipeline at MODEL_PATH.
dump(best_svm, MODEL_PATH)
print("Modelo guardado en:", MODEL_PATH)
print("CopiÃ¡ este valor a THRESHOLD_SVM en el notebook de inferencia:")
print("thr_f2 =", thr_f2)

# Optional: also store the thresholds as JSON next to the model.
# thr_rtarget is not set by the visible cells, so it is written as null
# unless the session defined it elsewhere.
if "thr_rtarget" in locals():
    thr_rtarget_value = float(thr_rtarget)
else:
    thr_rtarget_value = None

thr_info = {
    "thr_f2": float(thr_f2),
    "thr_rtarget": thr_rtarget_value,
}
THR_JSON = os.path.join(MODEL_DIR, "svm_thresholds.json")
with open(THR_JSON, "w", encoding="utf-8") as f:
    f.write(json.dumps(thr_info, indent=2, ensure_ascii=False))

print("Umbrales guardados en:", THR_JSON)