In [1]:
from spacy.matcher import Matcher
import spacy
NLP = spacy.load("es_core_news_lg")

from DarkPatternPredictor import DarkStrategy

# Two independent token matchers built on the loaded pipeline's vocabulary:
# `dark_matcher` receives the dark-pattern rules, `anti_dark_matcher` the
# rules used to discard benign texts.
dark_matcher, anti_dark_matcher = Matcher(NLP.vocab), Matcher(NLP.vocab)

# Lowercase fragments that mark a text as using urgency language.
# has_urgency_trigger() looks for each entry as a substring of the lowercased
# text, so entries must be lowercase.
URGENCY_TRIGGERS = [
    "apurate",
    "ya",
    "no te lo pierdas",
    # Fixed: the separating comma used to sit inside the quotes, so Python's
    # implicit string concatenation fused this entry and the next one into a
    # single trigger "última,oportunidad" that never matched real text.
    "última",
    "oportunidad",
    "comprá",
    "reservá",
    "oferta",
    "últimas",
    "flash",
    "sale",
    "relámpago",
    "aprovecha",
]

# Noun lemmas accepted as the subject slot of the TECH_TIMEOUT_ABSTRACT rule.
TECH_NOUNS = [
    "sesión",
    "batería",
    "dispositivo",
    "equipo",
    "sistema",
    "conexión",
    "proceso",
    "operación",
    "pantalla",
    "aplicación",
    "instancia",
    "entorno",
]

# Verb lemmas accepted as the "ends / shuts down" slot of the same rule.
END_VERBS = [
    "expirar",
    "caducar",
    "vencer",
    "cerrar",
    "finalizar",
    "terminar",
    "agotarse",
    "apagarse",
    "desconectarse",
    "bloquearse",
]

# Lowercased time-unit tokens (full words plus one-letter abbreviations).
TIME_UNITS = [
    "segundo",
    "segundos",
    "minuto",
    "minutos",
    "hora",
    "horas",
    "s",
    "m",
    "h",
]

def get_patterns():
    """Build the token-pattern rules for shaming-style copy.

    Returns:
        dict: rule name -> list of patterns, where each pattern is a list of
        per-token attribute dicts in the format accepted by ``Matcher.add``.
        A new dict is built on every call.
    """

    def first_person_singular():
        # Fresh dict per use so no two token specs share a mutable object.
        return {"IS_SUPERSET": ["Person=1", "Number=Sing"]}

    rules = {}

    # ---- Shaming strategies ----

    # First person: a verb, a copula, or a pronoun + verb in 1st person singular.
    rules["FP_VERB"] = [[
        {"POS": "VERB", "MORPH": first_person_singular()},
    ]]
    rules["FP_COPULA"] = [[
        {"DEP": "cop", "POS": "AUX", "MORPH": first_person_singular()},
    ]]
    rules["FP_ME_VERB"] = [[
        {"POS": "PRON", "MORPH": first_person_singular()},
        {"POS": "VERB"},
    ]]

    # Periphrasis: 1st-person auxiliary + adposition marker + verb ("voy a ...").
    rules["FP_PERIFRASIS_VOY_A"] = [[
        {"DEP": "aux", "POS": "AUX", "MORPH": first_person_singular()},
        {"DEP": "mark", "POS": "ADP"},
        {"POS": "VERB"},
    ]]

    # "<verb> ... es lo mío", e.g. "Ser desordenado es lo mío".
    rules["FP_ES_LO_MIO"] = [[
        {"LEMMA": {"IN": ["seguir", "ignorar", "ser", "hacer"]}, "POS": {"IN": ["VERB", "AUX"]}},
        {"OP": "+", "POS": {"NOT_IN": ["PUNCT"]}},  # one or more non-punctuation tokens
        {"LEMMA": "ser", "POS": {"IN": ["AUX", "VERB"]}},
        {"LOWER": "lo"},
        {"LOWER": "mío"},
    ]]

    # Irony.
    rules["IRONIA_PREFIERO_NO"] = [[
        {"LEMMA": "preferir", "POS": "VERB"},
        {"LOWER": "no"},
        {"POS": "VERB"},
    ]]
    rules["IRONIA_QUIEN_NECESITA"] = [[
        {"LOWER": "quién"},
        {"LEMMA": "necesitar", "POS": "VERB"},
    ]]
    rules["IRONIA_PORQUE_HABRIA_DE"] = [[
        {"LOWER": "por"},
        {"LOWER": "qué"},
        {"LEMMA": "haber", "POS": "AUX"},
        {"LOWER": "de"},
        {"POS": "VERB"},
    ]]

    # Metaphors: "<verb> ... es mi <something>".
    metaphor_lemmas = [
        "ignorar", "vivir", "ser", "estar", "perder",
        "arruinar", "hacer", "rechazar", "fracasar", "seguir",
    ]
    rules["META_VERBOS_ES_MI"] = [[
        {"LEMMA": {"IN": metaphor_lemmas}},
        {"OP": "*"},  # any number of arbitrary tokens
        {"LOWER": "es"},
        {"LOWER": "mi"},
        {"OP": "+"},  # at least one trailing token
    ]]

    return rules

def get_anti_patterns():
    """Build the token-pattern rules for benign, informational copy.

    Returns:
        dict: rule name -> list of patterns (each a list of per-token attribute
        dicts) in the format accepted by ``Matcher.add``. The
        TECH_TIMEOUT_ABSTRACT rule references the module-level TECH_NOUNS,
        END_VERBS and TIME_UNITS lists.
    """
    # "<start verb> en [<number>] minutos|horas|segundos"
    info_event = [
        {"LEMMA": {"IN": ["comenzar", "empezar", "iniciar"]}},
        {"LOWER": "en"},
        {"LIKE_NUM": True, "OP": "?"},
        {"LOWER": {"IN": ["minutos", "horas", "segundos"]}},
    ]

    # "<streaming|clase|evento|live> [empieza|comienza|inicia] [en] [<number>]"
    # NOTE(review): only the first token is mandatory, so this rule also fires
    # on the bare noun alone — confirm that breadth is intended.
    streaming = [
        {"LOWER": {"IN": ["streaming", "clase", "evento", "live"]}},
        {"LOWER": {"IN": ["empieza", "comienza", "inicia"]}, "OP": "?"},
        {"LOWER": "en", "OP": "?"},
        {"LIKE_NUM": True, "OP": "?"},
    ]

    # "[tu|su|la|el] <tech noun> <aux/verb>* <end verb> [en] [<number>] [<unit>]"
    tech_timeout = [
        {"LOWER": {"IN": ["tu", "su", "la", "el"]}, "OP": "?"},
        {"LEMMA": {"IN": TECH_NOUNS}},
        {"POS": {"IN": ["AUX", "VERB"]}, "OP": "*"},
        {"LEMMA": {"IN": END_VERBS}},
        {"LOWER": "en", "OP": "?"},
        {"LIKE_NUM": True, "OP": "?"},
        {"LOWER": {"IN": TIME_UNITS}, "OP": "?"},
    ]

    return {
        "INFO_EVENT": [info_event],
        "STREAMING": [streaming],
        "TECH_TIMEOUT_ABSTRACT": [tech_timeout],
    }

def init_matchers():
    """Register every rule set on its matcher.

    Rules from get_patterns() go to ``dark_matcher`` and rules from
    get_anti_patterns() go to ``anti_dark_matcher``.

    Safe to call more than once: per the spaCy documentation, ``Matcher.add``
    extends the pattern list of a key that already exists, so a second call
    used to register every rule twice. Any existing registration is now
    removed before the rule is added again.
    """
    registrations = (
        (dark_matcher, get_patterns()),            # dark patterns
        (anti_dark_matcher, get_anti_patterns()),  # anti-patterns (discard rules)
    )
    for matcher, rules in registrations:
        for name, patterns in rules.items():
            if name in matcher:
                matcher.remove(name)
            matcher.add(name, patterns)

init_matchers()

def has_urgency_trigger(doc):
    """Return True if the lowercased text of ``doc`` contains any URGENCY_TRIGGERS entry.

    Args:
        doc: object exposing the raw text as ``doc.text``.

    Returns:
        bool: True on the first trigger found as a substring, otherwise False.
    """
    # NOTE(review): this is a plain substring test, so short triggers such as
    # "ya" also match inside longer words (e.g. "playa") — confirm intended.
    lowered = doc.text.lower()
    for trigger in URGENCY_TRIGGERS:
        if trigger in lowered:
            return True
    return False

def is_anti_dark_fp(text):
    """Decide whether ``text`` should be discarded as a benign false positive.

    Args:
        text: raw string to analyse; it is parsed with ``NLP``.

    Returns:
        bool: True only when the text has no urgency trigger and at least one
        anti-pattern rule matches; False otherwise.
    """
    parsed = NLP(text)

    # Golden rule: urgency language always wins, so such texts are never
    # discarded. Short-circuiting means the matcher is not run in that case.
    return (not has_urgency_trigger(parsed)) and len(anti_dark_matcher(parsed)) > 0




In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GroupKFold
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
# Load the unified dataset and drop rows missing the label, the text, or the
# source, since all three are needed below.
df = pd.read_csv("datasets/unified_dataset.csv")
df = df.dropna(subset=["type", "content", "source"])

# Stage-1 target: the three dark-pattern types collapse into "pattern";
# every other `type` value becomes "ninguno".
is_pattern = df["type"].isin(["fake_urgency", "fake_scarcity", "shaming"])
df["binary"] = is_pattern.map({True: "pattern", False: "ninguno"})

# Features, target, and grouping key (`source`) for the grouped CV splits.
X, y, groups = df["content"], df["binary"], df["source"]


# ============================================================
# STAGE 1 — PATTERN vs NINGUNO (WITH NLP FILTER)
# ============================================================

# Stage-1 model: TF-IDF over 1- to 3-grams feeding a logistic regression in
# which the "ninguno" class carries twice the weight of "pattern".
tfidf_stage1 = TfidfVectorizer(ngram_range=(1, 3), min_df=1, max_df=0.95)
clf_stage1 = LogisticRegression(
    max_iter=3000,
    class_weight={"pattern": 1.0, "ninguno": 2.0},
)
pipeline_stage1 = Pipeline([("tfidf", tfidf_stage1), ("clf", clf_stage1)])

# Out-of-fold predictions with 5-fold GroupKFold, grouped by `groups`.
gkf = GroupKFold(n_splits=5)
oof_pred = np.empty(len(df), dtype=object)
discarded_by_nlp = []  # texts forced to "ninguno" by the rule-based filter

for train_pos, test_pos in gkf.split(X, y, groups):
    pipeline_stage1.fit(X.iloc[train_pos], y.iloc[train_pos])
    fold_labels = pipeline_stage1.predict(X.iloc[test_pos])

    for pos, model_label in zip(test_pos, fold_labels):
        sample = X.iloc[pos]

        # Keep the model's label unless the anti-pattern filter rejects the text.
        if not is_anti_dark_fp(sample):
            oof_pred[pos] = model_label
            continue

        oof_pred[pos] = "ninguno"
        discarded_by_nlp.append(sample)


print("=== STAGE 1: pattern vs ninguno ===")
print(classification_report(y, oof_pred, digits=3))
print(f"Descartados por NLP (stage 0): {len(discarded_by_nlp)}")


# ============================================================
# STAGE 2 — MULTICLASS (solo patterns reales)
# ============================================================

# Stage 2 sees only rows whose stage-1 target is "pattern". Filtering on the
# `binary` column (instead of `type != "ninguno"`) keeps both stages on the
# same definition: any `type` outside the three dark-pattern labels is treated
# as "ninguno" in stage 1 and must not show up here as an extra class.
df_p = df[df["binary"] == "pattern"]

X2 = df_p["content"]
y2 = df_p["type"]
groups2 = df_p["source"]

pipeline_stage2 = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1, 3),
        min_df=1,
        max_df=0.95
    )),
    # Fixed seed: LinearSVC's solver uses a random number generator, so
    # without it the reported metrics can differ slightly between runs.
    ("clf", LinearSVC(class_weight="balanced", random_state=42))
])

# Out-of-fold predictions with 5-fold GroupKFold, grouped by `groups2`.
gkf = GroupKFold(n_splits=5)
oof_pred2 = np.empty(len(df_p), dtype=object)

for tr, te in gkf.split(X2, y2, groups2):
    pipeline_stage2.fit(X2.iloc[tr], y2.iloc[tr])
    oof_pred2[te] = pipeline_stage2.predict(X2.iloc[te])

print("\n=== STAGE 2: urgency vs scarcity vs shaming ===")
print(classification_report(y2, oof_pred2, digits=3))


# ============================================================
# ERROR ANALYSIS — STAGE 1
# ============================================================

def _print_cases(rows):
    """Print one "- [source] content" line per row of ``rows``."""
    for source, content in zip(rows["source"], rows["content"]):
        print(f"- [{source}] {content}")


# Attach true and out-of-fold labels, then keep only the misclassified rows.
df_err = df.assign(y_true=y.values, y_pred=oof_pred)
df_err = df_err.loc[df_err["y_true"].ne(df_err["y_pred"])]

print(f"\nTotal errores Stage 1: {len(df_err)}")

# False negatives: real patterns predicted as "ninguno".
fn = df_err.loc[df_err["y_true"].eq("pattern") & df_err["y_pred"].eq("ninguno")]

print("\n=== FALSOS NEGATIVOS (pattern → ninguno) ===")
_print_cases(fn)

# False positives: "ninguno" texts predicted as a pattern.
fp = df_err.loc[df_err["y_true"].eq("ninguno") & df_err["y_pred"].eq("pattern")]

print("\n=== FALSOS POSITIVOS (ninguno → pattern) ===")
_print_cases(fp)

=== STAGE 1: pattern vs ninguno ===
              precision    recall  f1-score   support

     ninguno      0.800     0.211     0.333        38
     pattern      0.829     0.986     0.901       147

    accuracy                          0.827       185
   macro avg      0.814     0.598     0.617       185
weighted avg      0.823     0.827     0.784       185

Descartados por NLP (stage 0): 5

=== STAGE 2: urgency vs scarcity vs shaming ===
               precision    recall  f1-score   support

fake_scarcity      0.830     0.812     0.821        48
 fake_urgency      0.824     0.889     0.855        63
      shaming      1.000     0.889     0.941        36

     accuracy                          0.864       147
    macro avg      0.884     0.863     0.872       147
 weighted avg      0.869     0.864     0.865       147


Total errores Stage 1: 32

=== FALSOS NEGATIVOS (pattern → ninguno) ===
- [Desconocido] Olvídalo, quiero juegos geniales
- [NeilPatel] Si, quiero más tráfico

=== FAL