En este script obtenemos las respuestas de los clickbaits con contenido empleando el modelo basado en NER (usando Stanza). Buscamos la entidad esperada como respuesta en el titular y después extraemos la primera entidad del contenido que se corresponda con la esperada.

In [None]:
# Instalar y cargar librerías necesarias
!pip install stanza pandas
import pandas as pd
import stanza

# Fetch the Spanish Stanza models, then build a pipeline restricted to the
# tokenizer and the named-entity recognizer.
stanza.download(lang='es')
nlp = stanza.Pipeline(lang='es', processors='tokenize,ner')

# Read the clickbait dataset from a path relative to the working directory.
file_path = 'dataset_clickbait_QA_limpio.csv'
df = pd.read_csv(file_path)

# Funciones auxiliares
import re

# Headline cue patterns grouped by the entity label they suggest.
# Groups are checked in insertion order, so the first group with a matching
# cue wins (PER before LOC before DATE before NUM).
# Patterns are compiled once at import time instead of on every call.
_ENTITY_CUE_PATTERNS = {
    label: tuple(re.compile(pattern) for pattern in cues)
    for label, cues in {
        "PER": [
            r"\bquién(es)?\b", r"\bfamos[oa]s?\b",
            # Spanish plurals: persona(s), personaje(s), personalidad(es).
            r"\bpersonas?\b", r"\bpersonajes?\b", r"\bpersonalidad(es)?\b",
            r"\bnombre(s)?\b", r"\bcelebridad(es)?\b",
            r"\bfigura(s)? pública(s)?\b",
        ],
        "LOC": [
            r"\bdónde\b", r"\blugar(es)?\b", r"\bciudad(es)?\b",
            r"\bpaís(es)?\b",
            # Words ending in -ón drop the accent in the plural (-ones).
            r"\bubicaci(ón|ones)\b", r"\bregi(ón|ones)\b",
            r"\bprovincia(s)?\b", r"\bsitio(s)?\b",
        ],
        "DATE": [
            r"\bcuándo\b", r"\bfecha(s)?\b", r"\bd[íi]a(s)?\b",
            r"\bañ[oa]s?\b", r"\bépoca(s)?\b", r"\btemporada(s)?\b",
            r"\bmomento(s)?\b", r"\bhora(s)?\b",
        ],
        "NUM": [
            r"\bcu[aá]nt[ao]s?\b", r"\bcantidad(es)?\b", r"\bn[uú]mer[oa]s?\b",
            r"\bcifra(s)?\b", r"\bporcentaje(s)?\b", r"\bdinero\b",
            r"\bvalor(es)?\b", r"\btotal(es)?\b", r"\bingreso(s)?\b",
            # "pesos?" rather than a character class, so the plural matches.
            r"\bpesos?\b", r"\beuro(s)?\b",
        ],
    }.items()
}


def infer_expected_entity(title):
    """Guess which entity label a headline is asking for.

    The headline is lower-cased and searched for Spanish cue words
    (e.g. "quién", "dónde", "cuándo", "cuántos").

    Args:
        title: Headline text. Non-string values (such as the NaN that
            pandas uses for an empty cell, or None) are treated as having
            no cue.

    Returns:
        One of "PER", "LOC", "DATE" or "NUM" for the first group that has
        a matching cue, or None when nothing matches.
    """
    if not isinstance(title, str):
        return None

    lowered = title.lower()
    for entity, compiled_cues in _ENTITY_CUE_PATTERNS.items():
        if any(cue.search(lowered) for cue in compiled_cues):
            return entity

    return None


def extract_entity(text, expected_label):
    """Return the first entity in *text* whose NER type is *expected_label*.

    The text is run through the module-level Stanza pipeline ``nlp`` and
    its sentences are scanned in order.

    Args:
        text: Article body to analyse. Non-string or blank values yield
            None without invoking the pipeline.
        expected_label: Entity type to look for (compared to ``ent.type``).

    Returns:
        The matching entity's text, or None when no entity of that type is
        found.

    NOTE(review): the captured run loads the ``conll02`` NER package, which
    presumably emits only PER/LOC/ORG/MISC. If so, "DATE" and "NUM" labels
    can never match here -- confirm against the Stanza model documentation.
    """
    # Empty CSV cells are expected to arrive from pandas as NaN (a float);
    # there is nothing to analyse in that case or in whitespace-only text.
    if not isinstance(text, str) or not text.strip():
        return None

    doc = nlp(text)
    matches = (
        ent.text
        for sent in doc.sentences
        for ent in sent.ents
        if ent.type == expected_label
    )
    return next(matches, None)

# Build the answer column: infer the expected entity type from each headline
# and, when one is found, take the first matching entity from the body.
def _answer_for(record):
    label = infer_expected_entity(record['title'])
    return extract_entity(record['content'], label) if label else None


answers = [_answer_for(record) for _, record in df.iterrows()]
df['answer'] = answers

# Keep only the rows for which an answer was detected.
has_answer = df['answer'].notnull()
df = df[has_answer].reset_index(drop=True)
print(f"Número de filas tras filtrar respuestas detectadas: {len(df)}")

# Write the filtered dataset to a new CSV file.
output_file = "qa_output_ner_stanza.csv"
df.to_csv(output_file, index=False)
print(f"Archivo guardado en: {output_file}")

Collecting stanza
  Downloading stanza-1.10.1-py3-none-any.whl.metadata (13 kB)
Collecting emoji (from stanza)
  Downloading emoji-2.14.1-py3-none-any.whl.metadata (5.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.3.0->stanza)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.3.0->stanza)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.3.0->stanza)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata 

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Downloading default packages for language: es (Spanish) ...


Downloading https://huggingface.co/stanfordnlp/stanza-es/resolve/v1.10.0/models/default.zip:   0%|          | …

INFO:stanza:Downloaded file to /root/stanza_resources/es/default.zip
INFO:stanza:Finished downloading models and saved to /root/stanza_resources
INFO:stanza:Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json:   0%|  …

INFO:stanza:Downloaded file to /root/stanza_resources/resources.json
INFO:stanza:Loading these models for language: es (Spanish):
| Processor | Package  |
------------------------
| tokenize  | combined |
| mwt       | combined |
| ner       | conll02  |

INFO:stanza:Using device: cpu
INFO:stanza:Loading: tokenize
INFO:stanza:Loading: mwt
INFO:stanza:Loading: ner
INFO:stanza:Done loading processors!


Número de filas tras filtrar respuestas detectadas: 29
Archivo guardado en: dataset_con_respuestas_stanza.csv
