In [3]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Load the Spanish spaCy pipeline `es_core_news_sm`; its vocab is shared by
# every matcher created below.
nlp = spacy.load("es_core_news_sm")

# ============================================================================
# CONFIGURACIÓN DE TRIGGERS Y TÉRMINOS
# ============================================================================
# Load the labelled dataset: a header-less CSV whose three columns are read as
# strings and named (type, content, source). Rows with any missing field are
# dropped immediately.
df = pd.read_csv(
    "datasets/unified_dataset.csv",
    header=None,
    names=["type", "content", "source"],
    dtype=str
).dropna()

# Normalise the text columns BEFORE deriving anything from them. The binary
# target used to be computed from the raw `type` value, so a label such as
# " Ninguno" ended up as binary="pattern" while its normalised type was
# "ninguno".
df["type"] = df["type"].str.strip().str.lower()
df["content"] = df["content"].astype(str).str.strip()
df["source"] = df["source"].astype(str).str.strip()

# Keep only rows with a known label and non-empty content/source.
valid_types = {"fake_urgency", "fake_scarcity", "shaming", "ninguno"}
df = df[df["type"].isin(valid_types)]
df = df[(df["content"] != "") & (df["source"] != "")].reset_index(drop=True)

# Stage-1 target: "ninguno" vs. any dark pattern, derived from the normalised
# label so it always agrees with `type`.
df["binary"] = np.where(df["type"] == "ninguno", "ninguno", "pattern")

# Words/phrases that signal urgency or scarcity. Accented and unaccented
# spellings are listed side by side; order is kept stable.
URGENCY_TRIGGERS = [
    "apurate", "apúrate",
    "ya",
    "no te lo pierdas",
    "última", "ultima",
    "oportunidad",
    "comprá", "compra",
    "reservá", "reserva",
    "oferta",
    "últimas", "ultimas",
    "flash",
    "sale",
    "relámpago", "relampago",
    "aprovecha",
    "ahora o nunca",
    "por tiempo limitado",
    "ultimo día", "último día",
    "última oportunidad", "ultima oportunidad",
    "solo hoy", "sólo hoy",
    "solo ahora", "sólo ahora",
    "termina en",
    "finaliza en",
    "quedan", "queda",
    "últimos", "ultimos",
    "stock bajo",
    "casi agotado",
]

# Nouns for technical objects (sessions, devices, connections, ...).
TECH_NOUNS = [
    "sesión", "sesion",
    "batería", "bateria",
    "dispositivo",
    "equipo",
    "sistema",
    "conexión", "conexion",
    "proceso",
    "operación", "operacion",
    "pantalla",
    "aplicación", "aplicacion",
    "instancia",
    "entorno",
    "pedido",
    "token",
]

# Verbs describing something ending, expiring or shutting down.
END_VERBS = [
    "expira", "expirar",
    "caduca", "caducar",
    "vence", "vencer",
    "cierra", "cerrar",
    "finaliza", "finalizar",
    "termina", "terminar",
    "se agotará", "se agotara",
    "agotarse",
    "apaga", "apagarse",
    "desconecta", "desconectarse",
    "bloquea", "bloquearse",
]

# Scheduled-event vocabulary and the verbs that announce an event starting.
EVENT_TERMS = [
    "clase",
    "evento",
    "live",
    "streaming",
    "stream",
    "partido",
    "examen",
]
EVENT_START_VERBS = [
    "empieza",
    "comienza",
    "inicia",
    "arranca",
    "comenzar",
    "empezar",
    "iniciar",
]

# ============================================================================
# REGEXES FOR DURATIONS AND CLOCK TIMES ONLY (hard to express with matchers)
# ============================================================================

# A number followed (optionally after whitespace) by a time unit: seconds,
# minutes, hours ("horas", "hs", "h"), the single letters "m"/"s", or days.
# Case-insensitive.
# NOTE(review): the trailing `d[ií]a` alternative is already covered by
# `d[ií]as?`, so it never adds a match.
re_in_time = re.compile(r"\b\d+\s*(?:segundos?|minutos?|horas?|hs|h|m|s|d[ií]as?|d[ií]a)\b", re.IGNORECASE)
# Clock-like values such as "9:30" or "12:05:59" (H:MM or HH:MM, optional :SS).
re_clock = re.compile(r"\b\d{1,2}:\d{2}(?::\d{2})?\b")

# ============================================================================
# MATCHER SETUP
# ============================================================================

# Token-pattern matcher for metadata / neutral CTA / launch rules.
matcher = Matcher(nlp.vocab)
# Dedicated token-pattern matcher for shaming cues.
shaming_matcher = Matcher(nlp.vocab)
# Phrase matcher comparing on the lower-cased token text.
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")

# ============================================================================
# SHAMING RULES (they take priority: a hit means "do NOT discard")
# ============================================================================


def _first_person_singular():
    """Return a fresh MORPH constraint requiring Person=1 and Number=Sing."""
    return {"IS_SUPERSET": ["Person=1", "Number=Sing"]}


# label -> list of token patterns, registered in this order.
_SHAMING_PATTERNS = {
    # First person: verb / copula conjugated in first person singular,
    # or a first-person pronoun directly followed by a verb.
    "FP_VERB": [[
        {"POS": "VERB", "MORPH": _first_person_singular()},
    ]],
    "FP_COPULA": [[
        {"DEP": "cop", "POS": "AUX", "MORPH": _first_person_singular()},
    ]],
    "FP_ME_VERB": [[
        {"POS": "PRON", "MORPH": _first_person_singular()},
        {"POS": "VERB"},
    ]],
    # Periphrasis ("voy a" + verb).
    "FP_PERIFRASIS_VOY_A": [[
        {"DEP": "aux", "POS": "AUX", "MORPH": _first_person_singular()},
        {"DEP": "mark", "POS": "ADP"},
        {"POS": "VERB"},
    ]],
    # "... es lo mío" constructions.
    "FP_ES_LO_MIO": [[
        {"LEMMA": {"IN": ["seguir", "ignorar", "ser", "hacer"]}, "POS": {"IN": ["VERB", "AUX"]}},
        {"OP": "+", "POS": {"NOT_IN": ["PUNCT"]}},
        {"LEMMA": "ser", "POS": {"IN": ["AUX", "VERB"]}},
        {"LOWER": "lo"},
        {"LOWER": {"IN": ["mío", "mio"]}},
    ]],
    # Irony.
    "IRONIA_PREFIERO_NO": [[
        {"LEMMA": "preferir", "POS": "VERB"},
        {"LOWER": "no"},
        {"POS": "VERB"},
    ]],
    "IRONIA_QUIEN_NECESITA": [[
        {"LOWER": {"IN": ["quién", "quien"]}},
        {"LEMMA": "necesitar", "POS": "VERB"},
    ]],
    "IRONIA_PORQUE_HABRIA_DE": [[
        {"LOWER": "por"},
        {"LOWER": {"IN": ["qué", "que"]}},
        {"LEMMA": "haber", "POS": "AUX"},
        {"LOWER": "de"},
        {"POS": "VERB"},
    ]],
    # Metaphors built around "es mi ...".
    "META_VERBOS_ES_MI": [[
        {"LEMMA": {"IN": ["ignorar", "vivir", "ser", "estar", "perder", "arruinar", "hacer", "rechazar", "fracasar", "seguir"]}},
        {"OP": "*"},
        {"LOWER": "es"},
        {"LOWER": "mi"},
        {"OP": "+"},
    ]],
}

for _label, _patterns in _SHAMING_PATTERNS.items():
    shaming_matcher.add(_label, _patterns)

# ============================================================================
# TOKEN MATCHERS: METADATA, NEUTRAL CTAs AND LAUNCH INFO
# ============================================================================
# Patterns labelled *_FULL are written to span the whole text (optional
# whitespace tokens at both ends); the *_START / open-ended ones only describe
# how the text begins and therefore carry no trailing whitespace token.


def _ws():
    """Return a fresh token spec for zero or more whitespace tokens."""
    return {"IS_SPACE": True, "OP": "*"}


def _num():
    """Return a fresh token spec for a number-like token."""
    return {"LIKE_NUM": True}


def _tok(spec):
    """Turn a shorthand into a token spec.

    str -> exact LOWER match; list/tuple of str -> LOWER "IN" match;
    dict -> shallow copy used as-is.
    """
    if isinstance(spec, dict):
        return dict(spec)
    if isinstance(spec, str):
        return {"LOWER": spec}
    return {"LOWER": {"IN": list(spec)}}


def _spaced(*specs, trailing=True):
    """Build a pattern: leading whitespace, the given tokens separated by
    optional whitespace, and (unless trailing=False) closing whitespace."""
    pattern = [_ws()]
    for position, spec in enumerate(specs):
        if position:
            pattern.append(_ws())
        pattern.append(_tok(spec))
    if trailing:
        pattern.append(_ws())
    return pattern


_UNIT_WORDS = ["colores", "color", "tamaños", "tamaño", "talles", "talle", "piezas", "pieza", "unidades", "unidad"]
_CART_WORDS = ["carritos", "carrito"]
_MORE_WORDS = ["más", "mas"]

# label -> list of token patterns, registered in this order.
_TOKEN_PATTERNS = {
    # ------------------------------------------------------------------
    # Metadata (must be the WHOLE text)
    # ------------------------------------------------------------------
    # Only "N colores/tamaños/talles/piezas/unidades".
    "METADATA_UNITS_FULL": [_spaced(_num(), _UNIT_WORDS)],
    # Only "N ventas".
    "METADATA_SALES_FULL": [_spaced(_num(), "ventas")],
    # "en N carritos" or "N carritos".
    "METADATA_CARTS_FULL": [
        _spaced("en", _num(), _CART_WORDS),
        _spaced(_num(), _CART_WORDS),
    ],
    # "(N disponibles)" or "Stock disponible: (N disponibles)".
    # Written out by hand: there is no whitespace slot next to the brackets
    # or before the colon.
    "METADATA_AVAILABLE_FULL": [
        [
            _ws(),
            {"TEXT": "("},
            _num(),
            _ws(),
            {"LOWER": {"IN": ["disponibles", "disponible", "restantes", "restante"]}},
            {"TEXT": ")"},
            _ws(),
        ],
        [
            _ws(),
            {"LOWER": "stock"},
            _ws(),
            {"LOWER": "disponible"},
            {"TEXT": ":"},
            _ws(),
            {"TEXT": "("},
            _num(),
            _ws(),
            {"LOWER": {"IN": ["disponibles", "disponible"]}},
            {"TEXT": ")"},
            _ws(),
        ],
    ],
    # "disponible en" (nothing else).
    "METADATA_AVAILABLE_IN": [_spaced("disponible", "en")],

    # ------------------------------------------------------------------
    # Neutral CTAs (must be the WHOLE text)
    # ------------------------------------------------------------------
    # "no gracias" (nothing else).
    "NEUTRAL_NO_THANKS_FULL": [_spaced("no", "gracias")],
    # "quiero saber más", "quiero más info", etc. The generic "quiero" rule
    # was removed to avoid clashing with shaming detection.
    "NEUTRAL_WANT_INFO_FULL": [
        _spaced("quiero", "saber", _MORE_WORDS),
        _spaced("quiero", _MORE_WORDS, "info"),
        _spaced("quiero", "aprender", "seo"),
        _spaced("quiero", "mi", ["código", "codigo"]),
        _spaced("quiero", "una", ["consultoría", "consultoria"], "digital"),
    ],
    # "descubrir ahora", "descubrirlo ahora".
    "NEUTRAL_DISCOVER_FULL": [_spaced(["descubrir", "descubrirlo"], "ahora")],
    # "descargar ahora".
    "NEUTRAL_DOWNLOAD_FULL": [_spaced("descargar", "ahora")],
    # "cancelar mi suscripción" was REMOVED to avoid clashing with the
    # first-person rules (if it is shaming, FP_VERB / FP_ME_VERB catch it).
    #
    # "sí, por favor". The first-person variants ("si lo quiero", ...) were
    # removed. No whitespace slot between "sí" and the optional punctuation.
    "NEUTRAL_YES_FULL": [
        [
            _ws(),
            {"LOWER": {"IN": ["si", "sí"]}},
            {"IS_PUNCT": True, "OP": "?"},
            _ws(),
            {"LOWER": "por"},
            _ws(),
            {"LOWER": "favor"},
            _ws(),
        ],
    ],
    # "suscribite/anotate + alerta/aviso/lanzamiento" ("recibí" was removed).
    "NEUTRAL_ALERTS_FULL": [
        _spaced("suscribite", "al", "aviso", "de", "lanzamiento"),
        _spaced("anotate", "para", "recibir", "una", "alerta", "cuando", "lancemos"),
    ],
    # "mostrame N recetas..." (no trailing whitespace token).
    "NEUTRAL_SHOW_RECIPES": [_spaced("mostrame", _num(), "recetas", trailing=False)],
    # "participa ya".
    "NEUTRAL_PARTICIPATE": [_spaced("participa", "ya")],

    # ------------------------------------------------------------------
    # Launch info (whole text, or just the way the text starts)
    # ------------------------------------------------------------------
    # "muy pronto".
    "LAUNCH_SOON_FULL": [_spaced("muy", "pronto")],
    # "disponible próximamente".
    "LAUNCH_AVAILABLE_SOON": [_spaced("disponible", ["próximamente", "proximamente"])],
    # "lanzamiento..." or "drop..." (start of text).
    "LAUNCH_TERMS_START": [_spaced(["lanzamiento", "drop"], trailing=False)],
    # "cuenta regresiva...", "tiempo restante...", "tu contador regresivo..."
    # (start of text).
    "LAUNCH_COUNTDOWN_START": [
        _spaced("cuenta", "regresiva", trailing=False),
        _spaced("tiempo", "restante", trailing=False),
        _spaced("tu", "contador", "regresivo", trailing=False),
    ],
    # "en breve..." (start of text).
    "LAUNCH_BRIEF_START": [_spaced("en", "breve", trailing=False)],
}

for _label, _patterns in _TOKEN_PATTERNS.items():
    matcher.add(_label, _patterns)

# ============================================================================
# PHRASE MATCHERS FOR TERM DETECTION (anywhere in the text, not whole-text)
# ============================================================================
# Each term list is tokenised with nlp.make_doc (tokenizer only) and
# registered under its own label.

# Urgency triggers
urgency_patterns = list(map(nlp.make_doc, URGENCY_TRIGGERS))
phrase_matcher.add("URGENCY_TRIGGERS", urgency_patterns)

# Technical nouns
tech_noun_patterns = list(map(nlp.make_doc, TECH_NOUNS))
phrase_matcher.add("TECH_NOUNS", tech_noun_patterns)

# Ending / expiry verbs
end_verb_patterns = list(map(nlp.make_doc, END_VERBS))
phrase_matcher.add("END_VERBS", end_verb_patterns)

# Event terms
event_patterns = list(map(nlp.make_doc, EVENT_TERMS))
phrase_matcher.add("EVENT_TERMS", event_patterns)

# Event start verbs
event_start_patterns = list(map(nlp.make_doc, EVENT_START_VERBS))
phrase_matcher.add("EVENT_START", event_start_patterns)

# ============================================================================
# FUNCIONES AUXILIARES
# ============================================================================

def has_shaming_pattern(doc):
    """Return True when at least one shaming rule matches `doc`.

    Shaming hits take priority: callers use a True result to keep the text
    from being discarded.
    """
    return bool(shaming_matcher(doc))

def has_urgency_trigger(doc):
    """Return True if an urgency trigger phrase occurs ANYWHERE in `doc`."""
    for match_id, _start, _end in phrase_matcher(doc, as_spans=False):
        if nlp.vocab.strings[match_id] == "URGENCY_TRIGGERS":
            return True
    return False

def check_full_text_match(doc, label_prefixes):
    """Tell whether some rule match covers the WHOLE text, whitespace aside.

    Args:
        doc: parsed text to run the token matcher on.
        label_prefixes: list of label prefixes to consider
            (e.g. ["METADATA_", "NEUTRAL_"]).

    Returns:
        True if a match whose label starts with one of the prefixes spans
        from the first to the last non-whitespace token; False otherwise,
        including when the text has no non-whitespace tokens.
    """
    found = matcher(doc)

    content_positions = [idx for idx, tok in enumerate(doc) if not tok.is_space]
    if not content_positions:
        return False

    first_content = content_positions[0]
    last_content = content_positions[-1]
    wanted = tuple(label_prefixes)

    for match_id, start, end in found:
        if not nlp.vocab.strings[match_id].startswith(wanted):
            continue
        # `end` is exclusive, so it must lie past the last content token.
        if start <= first_content and end > last_content:
            return True

    return False

def check_start_match(doc, label_prefixes):
    """Tell whether a rule match begins at (or right next to) the text start.

    Used for open-ended patterns such as "lanzamiento...",
    "cuenta regresiva...", etc.

    Args:
        doc: parsed text to run the token matcher on.
        label_prefixes: list of label prefixes to consider.

    Returns:
        True if a match with one of the prefixes starts no later than one
        token after the first non-whitespace token; False otherwise.
    """
    found = matcher(doc)

    first_content = next(
        (idx for idx, tok in enumerate(doc) if not tok.is_space), None
    )
    if first_content is None:
        return False

    wanted = tuple(label_prefixes)
    # One token of slack after the first content token.
    latest_start = first_content + 1

    for match_id, start, _end in found:
        if nlp.vocab.strings[match_id].startswith(wanted) and start <= latest_start:
            return True

    return False

# ============================================================================
# FUNCIONES PRINCIPALES
# ============================================================================

def is_safe_non_pattern(text: str) -> bool:
    """Decide whether the text is purely metadata, a neutral CTA or launch info.

    The text is lower-cased and parsed; any urgency trigger vetoes the result.
    Metadata and neutral-CTA rules must cover the whole text, launch rules may
    either cover it or just open it.
    """
    doc = nlp(text.lower())

    if has_urgency_trigger(doc):
        return False

    return (
        check_full_text_match(doc, ["METADATA_"])
        or check_full_text_match(doc, ["NEUTRAL_"])
        or check_full_text_match(doc, ["LAUNCH_"])
        or check_start_match(doc, ["LAUNCH_"])
    )

def is_anti_dark_fp(text: str) -> bool:
    """Detect likely false positives: scheduled events, technical sessions, etc.

    Args:
        text: raw text; it is lower-cased before parsing and regex checks.

    Returns:
        False as soon as any urgency trigger phrase is present. Otherwise True
        when one of these holds:
          * an event term and an event-start verb both occur;
          * an event-start verb occurs together with the word "en" and a
            duration or clock time;
          * a technical noun and an ending verb occur together with a
            duration, a clock time or the word "en".
    """
    text_lower = text.lower()
    doc = nlp(text_lower)

    # Run the phrase matcher once and keep only the set of labels that fired.
    labels = {
        nlp.vocab.strings[match_id]
        for match_id, _start, _end in phrase_matcher(doc, as_spans=False)
    }

    if "URGENCY_TRIGGERS" in labels:
        return False

    has_start = "EVENT_START" in labels

    # Event that starts.
    if "EVENT_TERMS" in labels and has_start:
        return True

    has_time = bool(re_in_time.search(text_lower) or re_clock.search(text_lower))
    # Word-boundary search instead of the substring " en ": the preposition
    # is also recognised at the very start/end of the text and next to
    # newlines, tabs or punctuation.
    has_en = re.search(r"\ben\b", text_lower) is not None

    # Something that starts "en <time>".
    if has_start and has_en and has_time:
        return True

    # Technical noun + ending verb + some time reference.
    if "TECH_NOUNS" in labels and "END_VERBS" in labels and (has_time or has_en):
        return True

    return False

def prefilter_to_none(text: str) -> bool:
    """Main pre-filter: True means the text should be labelled "ninguno".

    Shaming cues are checked first and always win: a text that triggers any
    shaming rule is never discarded here and goes on to the model. Only
    otherwise are the false-positive and safe-text filters consulted.
    """
    # Priority 1: shaming cue present -> do not discard.
    if has_shaming_pattern(nlp(text.lower())):
        return False

    # Priority 2: regular filters.
    if is_anti_dark_fp(text):
        return True
    return is_safe_non_pattern(text)

# Stage-1 inputs: text, binary target and the grouping key for the CV split.
X, y, groups = df["content"], df["binary"], df["source"]

# Stage 1: pattern vs. ninguno.
pipeline_stage1 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(ngram_range=(1, 3), min_df=1, max_df=0.95)),
    ("clf", LogisticRegression(max_iter=3000, class_weight="balanced")),
])

# Stage 2: which pattern type.
pipeline_stage2 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(ngram_range=(1, 3), min_df=1, max_df=0.95)),
    ("clf", LinearSVC(class_weight="balanced")),
])

# Out-of-fold buffers: raw model predictions, predictions after the
# pre-filter override, and which rows the pre-filter discarded.
gkf1 = GroupKFold(n_splits=5)
oof_model_stage1 = np.empty(len(df), dtype=object)
oof_system_stage1 = np.empty(len(df), dtype=object)
discarded = np.zeros(len(df), dtype=bool)

for train_idx, test_idx in gkf1.split(X, y, groups):
    pipeline_stage1.fit(X.iloc[train_idx], y.iloc[train_idx])

    fold_texts = X.iloc[test_idx]
    fold_preds = pipeline_stage1.predict(fold_texts)
    fold_discarded = fold_texts.apply(prefilter_to_none).values

    oof_model_stage1[test_idx] = fold_preds
    discarded[test_idx] = fold_discarded
    # System prediction: the pre-filter forces "ninguno" on discarded rows.
    oof_system_stage1[test_idx] = np.where(fold_discarded, "ninguno", fold_preds)

# Classification reports for the bare model and for pre-filter + model.
for _banner, _oof in (
    ("=== STAGE 1: MODEL ONLY ===", oof_model_stage1),
    ("\n=== STAGE 1: SYSTEM (PREFILTER + MODEL) ===", oof_system_stage1),
):
    print(_banner)
    print(classification_report(y, _oof, digits=3, zero_division=0))

# Rows the pre-filter forced to "ninguno".
print("\nDescartados por prefilter:", int(discarded.sum()))
print(df.loc[discarded, ["type", "source", "content"]].to_string(index=False))

# Attach truth and both prediction variants for error inspection.
df_err1 = df.assign(
    y_true=y.values,
    y_pred_model=oof_model_stage1,
    y_pred_system=oof_system_stage1,
)

err_model = df_err1[df_err1["y_true"] != df_err1["y_pred_model"]]
err_sys = df_err1[df_err1["y_true"] != df_err1["y_pred_system"]]


def _print_stage1_errors(errors, pred_col):
    """Print the error count, then false negatives and false positives."""
    print(f"Total errores: {len(errors)}")
    sections = (
        ("\n--- FALSOS NEGATIVOS (pattern → ninguno) ---", "pattern", "ninguno"),
        ("\n--- FALSOS POSITIVOS (ninguno → pattern) ---", "ninguno", "pattern"),
    )
    for heading, truth, predicted in sections:
        print(heading)
        subset = errors[(errors["y_true"] == truth) & (errors[pred_col] == predicted)]
        for _, r in subset.iterrows():
            print(f"[{r['type']} | {r['source']}] {r['content']}")


print("\n=== ERRORES STAGE 1 (MODEL ONLY) ===")
_print_stage1_errors(err_model, "y_pred_model")

print("\n=== ERRORES STAGE 1 (SYSTEM = PREFILTER + MODEL) ===")
_print_stage1_errors(err_sys, "y_pred_system")

# Stage 2 only sees rows that carry an actual pattern label.
df_p = df[df["type"] != "ninguno"].reset_index(drop=True)

X2, y2, groups2 = df_p["content"], df_p["type"], df_p["source"]

gkf2 = GroupKFold(n_splits=5)
oof_stage2 = np.empty(len(df_p), dtype=object)

# Grouped cross-validation: collect out-of-fold predictions per row.
for train_idx, test_idx in gkf2.split(X2, y2, groups2):
    pipeline_stage2.fit(X2.iloc[train_idx], y2.iloc[train_idx])
    fold_preds = pipeline_stage2.predict(X2.iloc[test_idx])
    oof_stage2[test_idx] = fold_preds

print("\n=== STAGE 2: MODEL ONLY (patterns) ===")
print(classification_report(y2, oof_stage2, digits=3, zero_division=0))

# Keep only the misclassified rows for inspection.
df_err2 = df_p.assign(y_true=y2.values, y_pred=oof_stage2)
df_err2 = df_err2[df_err2["y_true"] != df_err2["y_pred"]]

print("\n=== ERRORES STAGE 2 ===")
print(f"Total errores: {len(df_err2)}")
for _, row in df_err2.iterrows():
    print(f"[{row['source']}] true={row['y_true']} pred={row['y_pred']} | {row['content']}")

def fit_final_models():
    """Refit both stage pipelines on all available rows.

    Stage 1 is fitted on every row of `df` against the binary target; stage 2
    on `df_p` against the pattern type. The module-level pipeline objects are
    fitted in place and returned as (stage1, stage2).
    """
    jobs = (
        (pipeline_stage1, df, "binary"),
        (pipeline_stage2, df_p, "type"),
    )
    for model, frame, target in jobs:
        model.fit(frame["content"], frame[target])
    return pipeline_stage1, pipeline_stage2


final_stage1, final_stage2 = fit_final_models()




=== STAGE 1: MODEL ONLY ===
              precision    recall  f1-score   support

     ninguno      0.500     0.395     0.441        81
     pattern      0.729     0.805     0.765       164

    accuracy                          0.669       245
   macro avg      0.615     0.600     0.603       245
weighted avg      0.653     0.669     0.658       245


=== STAGE 1: SYSTEM (PREFILTER + MODEL) ===
              precision    recall  f1-score   support

     ninguno      0.593     0.630     0.611        81
     pattern      0.811     0.787     0.799       164

    accuracy                          0.735       245
   macro avg      0.702     0.708     0.705       245
weighted avg      0.739     0.735     0.737       245


Descartados por prefilter: 27
         type      source                                         content
fake_scarcity      bidcom                                 (6 disponibles)
fake_scarcity        boca              Stock disponible: (19 disponibles)
fake_scarcity       

In [4]:
import spacy
from spacy.matcher import Matcher, PhraseMatcher
import re
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report

# Load the Spanish spaCy pipeline `es_core_news_sm`; its vocab is shared by
# every matcher created below.
nlp = spacy.load("es_core_news_sm")

# ============================================================================
# CONFIGURACIÓN DE TRIGGERS Y TÉRMINOS
# ============================================================================
# Load the labelled dataset: a header-less CSV whose three columns are read as
# strings and named (type, content, source). Rows with any missing field are
# dropped immediately.
df = pd.read_csv(
    "datasets/unified_dataset.csv",
    header=None,
    names=["type", "content", "source"],
    dtype=str
).dropna()

# Normalise the text columns BEFORE deriving anything from them. The binary
# target used to be computed from the raw `type` value, so a label such as
# " Ninguno" ended up as binary="pattern" while its normalised type was
# "ninguno".
df["type"] = df["type"].str.strip().str.lower()
df["content"] = df["content"].astype(str).str.strip()
df["source"] = df["source"].astype(str).str.strip()

# Keep only rows with a known label and non-empty content/source.
valid_types = {"fake_urgency", "fake_scarcity", "shaming", "ninguno"}
df = df[df["type"].isin(valid_types)]
df = df[(df["content"] != "") & (df["source"] != "")].reset_index(drop=True)

# Stage-1 target: "ninguno" vs. any dark pattern, derived from the normalised
# label so it always agrees with `type`.
df["binary"] = np.where(df["type"] == "ninguno", "ninguno", "pattern")

# Words/phrases that signal urgency or scarcity. Accented and unaccented
# spellings are listed side by side; order is kept stable.
URGENCY_TRIGGERS = [
    "apurate", "apúrate",
    "ya",
    "no te lo pierdas",
    "última", "ultima",
    "oportunidad",
    "comprá", "compra",
    "reservá", "reserva",
    "oferta",
    "últimas", "ultimas",
    "flash",
    "sale",
    "relámpago", "relampago",
    "aprovecha",
    "ahora o nunca",
    "por tiempo limitado",
    "ultimo día", "último día",
    "última oportunidad", "ultima oportunidad",
    "solo hoy", "sólo hoy",
    "solo ahora", "sólo ahora",
    "termina en",
    "finaliza en",
    "quedan", "queda",
    "últimos", "ultimos",
    "stock bajo",
    "casi agotado",
]

# Nouns for technical objects (sessions, devices, connections, ...).
TECH_NOUNS = [
    "sesión", "sesion",
    "batería", "bateria",
    "dispositivo",
    "equipo",
    "sistema",
    "conexión", "conexion",
    "proceso",
    "operación", "operacion",
    "pantalla",
    "aplicación", "aplicacion",
    "instancia",
    "entorno",
    "pedido",
    "token",
]

# Verbs describing something ending, expiring or shutting down.
END_VERBS = [
    "expira", "expirar",
    "caduca", "caducar",
    "vence", "vencer",
    "cierra", "cerrar",
    "finaliza", "finalizar",
    "termina", "terminar",
    "se agotará", "se agotara",
    "agotarse",
    "apaga", "apagarse",
    "desconecta", "desconectarse",
    "bloquea", "bloquearse",
]

# Scheduled-event vocabulary and the verbs that announce an event starting.
EVENT_TERMS = [
    "clase",
    "evento",
    "live",
    "streaming",
    "stream",
    "partido",
    "examen",
]
EVENT_START_VERBS = [
    "empieza",
    "comienza",
    "inicia",
    "arranca",
    "comenzar",
    "empezar",
    "iniciar",
]

# ============================================================================
# REGEXES FOR DURATIONS AND CLOCK TIMES ONLY (hard to express with matchers)
# ============================================================================

# A number followed (optionally after whitespace) by a time unit: seconds,
# minutes, hours ("horas", "hs", "h"), the single letters "m"/"s", or days.
# Case-insensitive.
# NOTE(review): the trailing `d[ií]a` alternative is already covered by
# `d[ií]as?`, so it never adds a match.
re_in_time = re.compile(r"\b\d+\s*(?:segundos?|minutos?|horas?|hs|h|m|s|d[ií]as?|d[ií]a)\b", re.IGNORECASE)
# Clock-like values such as "9:30" or "12:05:59" (H:MM or HH:MM, optional :SS).
re_clock = re.compile(r"\b\d{1,2}:\d{2}(?::\d{2})?\b")

# ============================================================================
# MATCHER SETUP
# ============================================================================

# Token-pattern matcher for the metadata / neutral CTA rules below.
matcher = Matcher(nlp.vocab)
# Phrase matcher comparing on the lower-cased token text.
phrase_matcher = PhraseMatcher(nlp.vocab, attr="LOWER")


def _ws():
    """Return a fresh token spec for zero or more whitespace tokens."""
    return {"IS_SPACE": True, "OP": "*"}


def _num():
    """Return a fresh token spec for a number-like token."""
    return {"LIKE_NUM": True}


def _tok(spec):
    """Turn a shorthand into a token spec.

    str -> exact LOWER match; list/tuple of str -> LOWER "IN" match;
    dict -> shallow copy used as-is.
    """
    if isinstance(spec, dict):
        return dict(spec)
    if isinstance(spec, str):
        return {"LOWER": spec}
    return {"LOWER": {"IN": list(spec)}}


def _spaced(*specs):
    """Build a whole-text pattern: optional whitespace before, between and
    after the given tokens."""
    pattern = [_ws()]
    for spec in specs:
        pattern.append(_tok(spec))
        pattern.append(_ws())
    return pattern


_UNIT_WORDS = ["colores", "color", "tamaños", "tamaño", "talles", "talle", "piezas", "pieza", "unidades", "unidad"]
_CART_WORDS = ["carritos", "carrito"]
_MORE_WORDS = ["más", "mas"]

# label -> list of token patterns, registered in this order.
_TOKEN_PATTERNS = {
    # ------------------------------------------------------------------
    # Metadata (must be the WHOLE text)
    # ------------------------------------------------------------------
    # Only "N colores/tamaños/talles/piezas/unidades".
    "METADATA_UNITS_FULL": [_spaced(_num(), _UNIT_WORDS)],
    # Only "N ventas".
    "METADATA_SALES_FULL": [_spaced(_num(), "ventas")],
    # "en N carritos" or "N carritos".
    "METADATA_CARTS_FULL": [
        _spaced("en", _num(), _CART_WORDS),
        _spaced(_num(), _CART_WORDS),
    ],
    # "(N disponibles)" or "Stock disponible: (N disponibles)".
    # Written out by hand: there is no whitespace slot next to the brackets
    # or before the colon.
    "METADATA_AVAILABLE_FULL": [
        [
            _ws(),
            {"TEXT": "("},
            _num(),
            _ws(),
            {"LOWER": {"IN": ["disponibles", "disponible", "restantes", "restante"]}},
            {"TEXT": ")"},
            _ws(),
        ],
        [
            _ws(),
            {"LOWER": "stock"},
            _ws(),
            {"LOWER": "disponible"},
            {"TEXT": ":"},
            _ws(),
            {"TEXT": "("},
            _num(),
            _ws(),
            {"LOWER": {"IN": ["disponibles", "disponible"]}},
            {"TEXT": ")"},
            _ws(),
        ],
    ],
    # "disponible en" (nothing else).
    "METADATA_AVAILABLE_IN": [_spaced("disponible", "en")],

    # ------------------------------------------------------------------
    # Neutral CTAs (must be the WHOLE text)
    # ------------------------------------------------------------------
    # "no gracias" (nothing else).
    "NEUTRAL_NO_THANKS_FULL": [_spaced("no", "gracias")],
    # "quiero saber más", "quiero más info", "quiero aprender seo", etc.
    "NEUTRAL_WANT_INFO_FULL": [
        _spaced("quiero", "saber", _MORE_WORDS),
        _spaced("quiero", _MORE_WORDS, "info"),
        _spaced("quiero", "aprender", "seo"),
        _spaced("quiero", "mi", ["código", "codigo"]),
        _spaced("quiero", "una", ["consultoría", "consultoria"], "digital"),
    ],
    # "descubrir ahora", "descubrirlo ahora".
    "NEUTRAL_DISCOVER_FULL": [_spaced(["descubrir", "descubrirlo"], "ahora")],
    # "descargar ahora".
    "NEUTRAL_DOWNLOAD_FULL": [_spaced("descargar", "ahora")],
    # "cancelar mi suscripción".
    "NEUTRAL_CANCEL_FULL": [_spaced("cancelar", "mi", ["suscripción", "suscripcion"])],
}

for _label, _patterns in _TOKEN_PATTERNS.items():
    matcher.add(_label, _patterns)

# "sí, por favor", "si lo quiero", etc.
matcher.add("NEUTRAL_YES_FULL", [
    [
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": {"IN": ["si", "sí"]}},
        {"IS_PUNCT": True, "OP": "?"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "por"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "favor"},
        {"IS_SPACE": True, "OP": "*"}
    ],
    [
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": {"IN": ["si", "sí"]}},
        {"IS_PUNCT": True, "OP": "?"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "lo"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "quiero"},
        {"IS_SPACE": True, "OP": "*"}
    ],
    [
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": {"IN": ["si", "sí"]}},
        {"IS_PUNCT": True, "OP": "?"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "me"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "quiero"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "registrar"},
        {"IS_SPACE": True, "OP": "*"}
    ],
    [
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": {"IN": ["si", "sí"]}},
        {"IS_PUNCT": True, "OP": "?"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "me"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "voy"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "a"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "quedar"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "el"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "producto"},
        {"IS_SPACE": True, "OP": "*"}
    ]
])

# "recibí/suscribite/anotate + alerta/aviso/lanzamiento"
matcher.add("NEUTRAL_ALERTS_FULL", [
    [
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": {"IN": ["recibí", "recibi"]}},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": {"IN": ["una", "alertas"]}, "OP": "?"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": {"IN": ["alerta", "alertas"]}, "OP": "?"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "cuando"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": {"IN": ["esté", "este"]}},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "disponible"},
        {"IS_SPACE": True, "OP": "*"}
    ],
    [
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "suscribite"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "al"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "aviso"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "de"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "lanzamiento"},
        {"IS_SPACE": True, "OP": "*"}
    ],
    [
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "anotate"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "para"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "recibir"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "una"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "alerta"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "cuando"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "lancemos"},
        {"IS_SPACE": True, "OP": "*"}
    ]
])

# "mostrame N recetas..."
matcher.add("NEUTRAL_SHOW_RECIPES", [
    [
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "mostrame"},
        {"IS_SPACE": True, "OP": "*"},
        {"LIKE_NUM": True},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "recetas"}
    ]
])

# "participa ya"
matcher.add("NEUTRAL_PARTICIPATE", [
    [
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "participa"},
        {"IS_SPACE": True, "OP": "*"},
        {"LOWER": "ya"},
        {"IS_SPACE": True, "OP": "*"}
    ]
])

# ============================================================================
# MATCHERS FOR LAUNCH ANNOUNCEMENTS (whole text, or text that starts this way)
# ============================================================================
# Each rule is (label, trailing whitespace spec?, phrases). A phrase is a
# sequence of words; a tuple inside a phrase means "any of these words".
# The *_START rules carry no trailing whitespace spec, so they end on their
# last word.
_LAUNCH_RULES = (
    # "muy pronto"
    ("LAUNCH_SOON_FULL", True, [["muy", "pronto"]]),
    # "disponible próximamente"
    ("LAUNCH_AVAILABLE_SOON", True, [
        ["disponible", ("próximamente", "proximamente")],
    ]),
    # "lanzamiento..." or "drop..." (start of the text)
    ("LAUNCH_TERMS_START", False, [[("lanzamiento", "drop")]]),
    # "cuenta regresiva...", "tiempo restante...", "tu contador regresivo..." (start)
    ("LAUNCH_COUNTDOWN_START", False, [
        ["cuenta", "regresiva"],
        ["tiempo", "restante"],
        ["tu", "contador", "regresivo"],
    ]),
    # "en breve..." (start)
    ("LAUNCH_BRIEF_START", False, [["en", "breve"]]),
)

for _label, _trailing_space, _phrases in _LAUNCH_RULES:
    _patterns = []
    for _phrase in _phrases:
        _pattern = []
        for _word in _phrase:
            # Optional whitespace tokens may precede every word.
            _pattern.append({"IS_SPACE": True, "OP": "*"})
            if isinstance(_word, tuple):
                _pattern.append({"LOWER": {"IN": list(_word)}})
            else:
                _pattern.append({"LOWER": _word})
        if _trailing_space:
            _pattern.append({"IS_SPACE": True, "OP": "*"})
        _patterns.append(_pattern)
    matcher.add(_label, _patterns)

# ============================================================================
# PHRASE MATCHERS FOR TERM DETECTION (anywhere in the text, not the whole text)
# ============================================================================

# Tokenize every term list once; nlp.make_doc only runs the tokenizer.
urgency_patterns = list(map(nlp.make_doc, URGENCY_TRIGGERS))      # urgency triggers
tech_noun_patterns = list(map(nlp.make_doc, TECH_NOUNS))          # technical nouns
end_verb_patterns = list(map(nlp.make_doc, END_VERBS))            # "finishing" verbs
event_patterns = list(map(nlp.make_doc, EVENT_TERMS))             # event terms
event_start_patterns = list(map(nlp.make_doc, EVENT_START_VERBS)) # event-start verbs

# Register each list under the label the helper functions look up.
for _label, _docs in (
    ("URGENCY_TRIGGERS", urgency_patterns),
    ("TECH_NOUNS", tech_noun_patterns),
    ("END_VERBS", end_verb_patterns),
    ("EVENT_TERMS", event_patterns),
    ("EVENT_START", event_start_patterns),
):
    phrase_matcher.add(_label, _docs)

# ============================================================================
# FUNCIONES AUXILIARES
# ============================================================================

def has_urgency_trigger(doc):
    """Return True if the phrase matcher finds an urgency trigger ANYWHERE in ``doc``.

    Runs the module-level ``phrase_matcher`` over ``doc`` and reports whether
    any hit carries the "URGENCY_TRIGGERS" label.
    """
    for match_id, _start, _end in phrase_matcher(doc, as_spans=False):
        if nlp.vocab.strings[match_id] == "URGENCY_TRIGGERS":
            return True
    return False

def check_full_text_match(doc, label_prefixes):
    """Return True if some Matcher hit spans the WHOLE text, whitespace aside.

    Args:
        doc: spaCy document to run the module-level ``matcher`` on.
        label_prefixes: prefixes of the rule labels to consider
            (e.g. ``["METADATA_", "NEUTRAL_"]``).

    Returns:
        True when a match whose label starts with one of the prefixes begins
        at or before the first non-whitespace token and ends after the last
        one; False otherwise, including when ``doc`` has no non-whitespace
        token.
    """
    found = matcher(doc)

    content_positions = [i for i, tok in enumerate(doc) if not tok.is_space]
    if not content_positions:
        return False
    first_content, last_content = content_positions[0], content_positions[-1]

    prefixes = tuple(label_prefixes)
    return any(
        # ``end`` is exclusive, so it must lie past the last content token.
        start <= first_content and end > last_content
        for match_id, start, end in found
        if nlp.vocab.strings[match_id].startswith(prefixes)
    )

def check_start_match(doc, label_prefixes):
    """Return True if a Matcher hit with a relevant label begins at the start of the text.

    Intended for rules such as "lanzamiento...", "cuenta regresiva...", where
    only the opening words are fixed.

    Args:
        doc: spaCy document to run the module-level ``matcher`` on.
        label_prefixes: prefixes of the rule labels to consider.

    Returns:
        True when a match whose label starts with one of the prefixes starts
        no later than one token after the first non-whitespace token; False
        otherwise, including when ``doc`` has no non-whitespace token.
    """
    found = matcher(doc)

    first_content = next(
        (i for i, tok in enumerate(doc) if not tok.is_space), None
    )
    if first_content is None:
        return False

    prefixes = tuple(label_prefixes)
    latest_start = first_content + 1  # one token of slack
    for match_id, start, _end in found:
        if start <= latest_start and nlp.vocab.strings[match_id].startswith(prefixes):
            return True
    return False

# ============================================================================
# FUNCIONES PRINCIPALES
# ============================================================================

def is_safe_non_pattern(text: str) -> bool:
    """Tell whether ``text`` is only metadata, a neutral CTA or launch info.

    The lower-cased text is parsed with spaCy and checked against the
    registered Matcher rules. Urgency triggers take precedence: a text that
    contains one is never considered safe.

    Returns:
        True when a METADATA_* or NEUTRAL_* rule covers the whole text, or
        when a LAUNCH_* rule covers the whole text or matches at its start.
    """
    doc = nlp(text.lower())

    if has_urgency_trigger(doc):
        return False

    return (
        # Metadata and neutral CTAs must account for the entire text.
        check_full_text_match(doc, ["METADATA_", "NEUTRAL_"])
        # Launch wording may be the entire text or just its opening.
        or check_full_text_match(doc, ["LAUNCH_"])
        or check_start_match(doc, ["LAUNCH_"])
    )

def is_anti_dark_fp(text: str) -> bool:
    """Detect likely false positives such as events or technical sessions.

    Returns True when, in the lower-cased text and with no urgency trigger
    present, one of these holds:

    * an event term and an event-start verb both occur;
    * an event-start verb occurs, the text contains " en ", and one of the
      module-level ``re_in_time`` / ``re_clock`` regexes matches;
    * a technical noun and an end verb both occur, and either one of those
      regexes matches or the text contains " en ".
    """
    lowered = text.lower()
    doc = nlp(lowered)

    if has_urgency_trigger(doc):
        return False

    # Labels of every phrase-matcher hit; only their presence matters.
    labels = {
        nlp.vocab.strings[match_id]
        for match_id, _start, _end in phrase_matcher(doc, as_spans=False)
    }
    starts_event = "EVENT_START" in labels

    def mentions_time() -> bool:
        # Evaluated lazily, only on the branches that need it.
        return bool(re_in_time.search(lowered) or re_clock.search(lowered))

    # An event together with a start verb.
    if starts_event and "EVENT_TERMS" in labels:
        return True

    # Something that starts "en <time>".
    if starts_event and " en " in lowered and mentions_time():
        return True

    # Technical noun + end verb + a time hint.
    if "TECH_NOUNS" in labels and "END_VERBS" in labels:
        return mentions_time() or " en " in lowered

    return False

def prefilter_to_none(text: str) -> bool:
    """Main prefilter entry point.

    Returns True when ``text`` should be classified as "ninguno" (not a dark
    pattern): either ``is_anti_dark_fp`` flags it as a likely false positive
    or ``is_safe_non_pattern`` recognizes it as safe content.
    """
    if is_anti_dark_fp(text):
        return True
    return is_safe_non_pattern(text)

# The "binary" column is first computed at load time, BEFORE "type" is
# stripped and lower-cased. A raw label such as " Ninguno" is therefore tagged
# "pattern" there, yet survives the valid-type filter as "ninguno".
# Re-derive the column from the normalized type so the stage-1 target (and
# the final fit, which reads df["binary"]) agrees with df["type"].
df["binary"] = np.where(df["type"] == "ninguno", "ninguno", "pattern")

X = df["content"]      # raw text fed to the vectorizer
y = df["binary"]       # stage-1 target: "ninguno" vs "pattern"
groups = df["source"]  # grouping key for GroupKFold

# Both stages share the same TF-IDF configuration (word 1- to 3-grams).
_tfidf_settings = {"ngram_range": (1, 3), "min_df": 1, "max_df": 0.95}

# Stage 1: fitted below on the binary "ninguno" / "pattern" target.
pipeline_stage1 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(**_tfidf_settings)),
    ("clf", LogisticRegression(class_weight="balanced", max_iter=3000)),
])

# Stage 2: fitted below on the pattern type, using pattern rows only.
pipeline_stage2 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(**_tfidf_settings)),
    ("clf", LinearSVC(class_weight="balanced")),
])

# Stage-1 cross-validation grouped by source. Out-of-fold predictions are
# collected for the bare model and for the system (prefilter + model).
_n_rows = len(df)
gkf1 = GroupKFold(n_splits=5)
oof_model_stage1 = np.empty(_n_rows, dtype=object)
oof_system_stage1 = np.empty(_n_rows, dtype=object)
discarded = np.zeros(_n_rows, dtype=bool)

for train_idx, test_idx in gkf1.split(X, y, groups):
    pipeline_stage1.fit(X.iloc[train_idx], y.iloc[train_idx])

    fold_texts = X.iloc[test_idx]
    fold_preds = pipeline_stage1.predict(fold_texts)
    # Rule-based prefilter decision for every held-out text.
    fold_discarded = fold_texts.apply(prefilter_to_none).values

    oof_model_stage1[test_idx] = fold_preds
    discarded[test_idx] = fold_discarded
    # System output: the prefilter overrides the model with "ninguno".
    oof_system_stage1[test_idx] = np.where(fold_discarded, "ninguno", fold_preds)

# Out-of-fold stage-1 reports: the bare classifier first, then the classifier
# combined with the rule-based prefilter.
for _heading, _oof_preds in (
    ("=== STAGE 1: MODEL ONLY ===", oof_model_stage1),
    ("\n=== STAGE 1: SYSTEM (PREFILTER + MODEL) ===", oof_system_stage1),
):
    print(_heading)
    print(classification_report(y, _oof_preds, digits=3, zero_division=0))

# Rows the prefilter discarded, listed for manual inspection.
_n_discarded = int(discarded.sum())
print("\nDescartados por prefilter:", _n_discarded)
_discarded_rows = df.loc[discarded, ["type", "source", "content"]]
print(_discarded_rows.to_string(index=False))

# Attach the true label and both out-of-fold predictions to a copy of the
# data, then keep the misclassified rows for each variant.
df_err1 = df.assign(
    y_true=y.values,
    y_pred_model=oof_model_stage1,
    y_pred_system=oof_system_stage1,
)
err_model = df_err1.loc[df_err1["y_true"] != df_err1["y_pred_model"]]
err_sys = df_err1.loc[df_err1["y_true"] != df_err1["y_pred_system"]]

# For each variant, print the error count, then false negatives followed by
# false positives.
for _title, _errors, _pred_col in (
    ("\n=== ERRORES STAGE 1 (MODEL ONLY) ===", err_model, "y_pred_model"),
    ("\n=== ERRORES STAGE 1 (SYSTEM = PREFILTER + MODEL) ===", err_sys, "y_pred_system"),
):
    print(_title)
    print(f"Total errores: {len(_errors)}")
    for _subtitle, _true_label, _pred_label in (
        ("\n--- FALSOS NEGATIVOS (pattern → ninguno) ---", "pattern", "ninguno"),
        ("\n--- FALSOS POSITIVOS (ninguno → pattern) ---", "ninguno", "pattern"),
    ):
        print(_subtitle)
        _mask = (_errors["y_true"] == _true_label) & (_errors[_pred_col] == _pred_label)
        _rows = _errors.loc[_mask, ["type", "source", "content"]]
        for _type, _source, _content in _rows.itertuples(index=False, name=None):
            print(f"[{_type} | {_source}] {_content}")

# Stage 2 only sees rows labelled as a pattern; the index is rebuilt so the
# positional fold indices line up with the out-of-fold array.
df_p = df.loc[df["type"] != "ninguno"].reset_index(drop=True)

X2, y2, groups2 = df_p["content"], df_p["type"], df_p["source"]

gkf2 = GroupKFold(n_splits=5)
oof_stage2 = np.empty(len(df_p), dtype=object)

for fit_idx, eval_idx in gkf2.split(X2, y2, groups2):
    # Pipeline.fit returns the fitted pipeline itself.
    fold_model = pipeline_stage2.fit(X2.iloc[fit_idx], y2.iloc[fit_idx])
    oof_stage2[eval_idx] = fold_model.predict(X2.iloc[eval_idx])

# Out-of-fold report for the pattern-type classifier.
print("\n=== STAGE 2: MODEL ONLY (patterns) ===")
print(classification_report(y2, oof_stage2, digits=3, zero_division=0))

# Keep only the rows where the predicted type differs from the true one.
df_err2 = df_p.assign(y_true=y2.values, y_pred=oof_stage2)
df_err2 = df_err2.loc[df_err2["y_true"] != df_err2["y_pred"]]

print("\n=== ERRORES STAGE 2 ===")
print(f"Total errores: {len(df_err2)}")
for _src, _truth, _guess, _body in zip(
    df_err2["source"], df_err2["y_true"], df_err2["y_pred"], df_err2["content"]
):
    print(f"[{_src}] true={_truth} pred={_guess} | {_body}")

def fit_final_models():
    """Refit both stage pipelines on all available rows and return them.

    Stage 1 is fitted on every row of ``df`` against the binary label; stage 2
    on the pattern-only rows of ``df_p`` against the pattern type. The
    module-level pipeline objects are fitted in place, so the returned pair is
    those same objects.
    """
    training_sets = (
        (pipeline_stage1, df["content"], df["binary"]),
        (pipeline_stage2, df_p["content"], df_p["type"]),
    )
    for pipeline, texts, labels in training_sets:
        pipeline.fit(texts, labels)
    return pipeline_stage1, pipeline_stage2


final_stage1, final_stage2 = fit_final_models()

=== STAGE 1: MODEL ONLY ===
              precision    recall  f1-score   support

     ninguno      0.500     0.395     0.441        81
     pattern      0.729     0.805     0.765       164

    accuracy                          0.669       245
   macro avg      0.615     0.600     0.603       245
weighted avg      0.653     0.669     0.658       245


=== STAGE 1: SYSTEM (PREFILTER + MODEL) ===
              precision    recall  f1-score   support

     ninguno      0.611     0.679     0.643        81
     pattern      0.832     0.787     0.809       164

    accuracy                          0.751       245
   macro avg      0.722     0.733     0.726       245
weighted avg      0.759     0.751     0.754       245


Descartados por prefilter: 37
         type      source                                         content
fake_scarcity      bidcom                                 (6 disponibles)
fake_scarcity        boca              Stock disponible: (19 disponibles)
fake_scarcity       