# Esto es el código correspondiente a todo el flujo de trabajo.

Está dividido en distintas celdas donde cada una contiene las funciones de cada parte del procesamiento.

1. **Celda 1**: Obtención de contexto. Procesamiento de preguntas, extracción de páginas de Wikipedia, generación de resumen (_ground truth_).
2. **Celda 2**: Llamada al LLM. A partir del contexto obtenido previamente se contestan las preguntas del dataset con GPT-4o-mini (OpenAI) o con Llama-3.2-3B-Instruct (_ground truth LLM_).
3. **Celda 3**: Comparación entre las respuestas del dataset de Shroom con las respuestas que nosotros estamos generando, y generación de DS final.
4. **Celda 4**: Evaluación (Script oficial de Mushroom)
5. **Próximamente**: Modificación de formato y evaluación. (Va a depender de que la celda 3 se divida entre comparación y ajuste de formato.)

In [1]:
import wikipedia
import pandas as pd
import spacy
import random
import torch
import numpy as np
import transformers 
import os
import re
import argparse as ap
import time

from FlagEmbedding import BGEM3FlagModel
from openai import OpenAI
from scipy.stats import spearmanr
from transformers import pipeline
from huggingface_hub import login

# Dense-embedding model used by generate_embeddings(); loaded with use_fp16=True.
model = BGEM3FlagModel('BAAI/bge-m3', use_fp16 = True)

# SECURITY: credentials must never be written into the notebook/source file.
# Any key or token that was previously committed here should be revoked.
# OpenAI() (see gen_answer) reads OPENAI_API_KEY from the environment, so the
# variable only has to be exported before the notebook is started.
if not os.environ.get("OPENAI_API_KEY"):
    print("WARNING: OPENAI_API_KEY is not set; gen_answer() will not be able to call OpenAI.")

# The Hugging Face token is likewise taken from the environment (HF_TOKEN).
hf_token = os.environ.get("HF_TOKEN")
if hf_token:
    login(token = hf_token)
else:
    print("WARNING: HF_TOKEN is not set; skipping Hugging Face login.")

# Instruction-tuned Llama checkpoint served through a text-generation pipeline
# (used by llama_gen).
model_instruct = "meta-llama/Llama-3.2-3B-Instruct"
pipe1 = pipeline(
    "text-generation",
    model = model_instruct,
    torch_dtype = torch.bfloat16,
    device_map = "auto",
)

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### CELDA 1: CONTEXTO

In [3]:
# CELDA 1: OBTENCIÓN DE CONTEXTO

def noun_list(a, lang):
    """
    Extract the relevant part-of-speech tokens from every question.

    a = list; Questions of the dataset.
    lang = 'es' or 'en'; Language to work with ('es' loads the Spanish
           spaCy model, anything else loads the English one).

    Returns a list with one sub-list per question holding the text of every
    NOUN / PROPN / NUM token, plus ADJ tokens whose first character is a digit.
    """
    model_name = "es_core_news_sm" if lang == 'es' else "en_core_web_sm"
    tagger = spacy.load(model_name)
    digits = set('0123456789')
    content_tags = ("NOUN", "PROPN", "NUM")

    per_question = []
    for question in a:
        kept = [
            token.text
            for token in tagger(question)
            if token.pos_ in content_tags
            or (token.pos_ == "ADJ" and token.text[0] in digits)
        ]
        per_question.append(kept)
    return per_question


def keyword_por_preg(n_list):
    """
    Turn each list of keywords into a single query string for the Wikipedia API.

    n_list = list; Output of noun_list() (one list of str per question).

    Returns a list of str; every keyword is followed by one space, so a
    non-empty query ends with a trailing space and an empty list gives ''.
    """
    return [''.join(word + ' ' for word in words) for words in n_list]


def get_wikipage(text, lang, page_total):
    """
    Return the titles of the Wikipedia pages that best match a query.

    text = str; Query built by keyword_por_preg().
    lang = 'es' or 'en'; Wikipedia language edition to search. Any other
           value leaves the currently configured language untouched.
    page_total = int; Maximum number of results to return.
    """
    if lang in ('es', 'en'):
        wikipedia.set_lang(lang)
    return wikipedia.search(text, results = page_total)


def wikipipeline(dataset, lang, page_count):
    """
    Build the Wikipedia context (concatenated page summaries) for each question.

    dataset = list; Questions to process, e.g. list(dataset["model_input"]).
    lang = 'es' or 'en'; Must match the language of the questions.
    page_count = int; Number of Wikipedia pages to retrieve per question.

    Returns (resumen_list, question_list): the context string of every
    question and the questions themselves, in the same order.
    """
    keywords_per_question = keyword_por_preg(noun_list(dataset, lang))

    resumen_list = []
    question_list = []
    # keyword_por_preg() yields one query per question, in order, so the two
    # sequences can be walked together instead of keeping a manual counter.
    for question, query in zip(dataset, keywords_per_question):
        # A question without any extracted keyword would send an empty search.
        titles = get_wikipage(query, lang, page_count) if query.strip() else []
        summaries = []
        for title in titles:
            try:
                summaries.append(wikipedia.WikipediaPage(title).summary)
            except (wikipedia.exceptions.DisambiguationError,
                    wikipedia.exceptions.PageError):
                # Ambiguous or missing page: report it and keep going so one
                # bad title does not abort the whole run.
                print(query, title)
        # Separate summaries with a space so the last sentence of one page is
        # not glued to the first word of the next.
        resumen_list.append(' '.join(summaries))
        question_list.append(question)
    return resumen_list, question_list


def generate_embeddings(sum_list, q, ruta, max_tokens = 8192):
    """
    Embed the context summaries and save them, with their questions, as a CSV.

    sum_list = list ; Summaries obtained previously (one str per question).
    q = list ; Questions obtained previously, aligned with sum_list.
    ruta = str ; Path of the CSV file to write.
    max_tokens = int ; Upper bound for the encoder's max_length
                 (assumed 8192 for BGE-M3 — confirm against the model card).

    Raises ValueError when sum_list is empty.
    """
    if len(sum_list) == 0:
        raise ValueError("sum_list is empty: there is nothing to embed.")

    # The longest summary (in characters) is used as the sequence-length
    # budget, capped so it never exceeds what the encoder accepts.
    longest = max(len(text) for text in sum_list)
    max_length = min(longest, max_tokens)

    embs = model.encode(
        sum_list,
        batch_size = 12,
        max_length = max_length,
    )['dense_vecs']

    # Store plain lists: the string form of a large numpy array is abbreviated
    # with "...", which would lose most of the vector when written to CSV.
    embs_as_lists = [vec.tolist() for vec in embs]
    dic = {'Embedding': embs_as_lists, 'Texto': sum_list, 'Keywords Pregunta': q}
    embs_df = pd.DataFrame(data=dic)
    embs_df.to_csv(ruta)
    print(f"Embedding guardados en la ruta {ruta} . Saludos")


def full_context_pipeline(dataset, lang, num, ruta):
    """
    Run the whole context stage: deduplicate questions, fetch Wikipedia
    summaries and save the embeddings.

    dataset = pd.DataFrame[_column_name_] ; Column with the questions.
    lang = 'en' or 'es'; Language to work with.
    num = int; Number of Wikipedia pages to collect per question.
    ruta = str; Output path (MUST include the file name).
    """
    # dict.fromkeys() removes duplicates while keeping first-seen order, so
    # the saved file has the same row order on every run (a set does not).
    conjunto = list(dict.fromkeys(dataset))
    sum_set, q_set = wikipipeline(conjunto, lang, num)
    generate_embeddings(sum_set, q_set, ruta)

### CELDA 2: OPENAI y LLAMA

In [10]:
def get_questions(path):
    """
    Read a JSON-lines dataset and return its questions as a list.

    path = str; Location of the .jsonl file (must have a "model_input" field).
    """
    return list(pd.read_json(path, lines = True)["model_input"])

def get_embs(path):
    """
    Load a saved context/embedding CSV.

    path = str; Location of the CSV file.

    Returns (texts, vectors): the "Texto" column as a list and the
    "Embedding" column as a pandas Series. The "Unnamed: 0" index column
    written by to_csv() is discarded.
    """
    frame = pd.read_csv(path).drop(columns = ["Unnamed: 0"])
    return list(frame["Texto"]), frame["Embedding"]


def llama_gen(ques, context):
    """
    Generates a Llama based answer given a question and a context window.

    ques = str; Question extracted from the task dataset.
    context = str; Context text retrieved for the question.

    Returns the text of the assistant's reply (str), mirroring gen_answer().
    """
    prompt = f"""
        You are a bot that answers trivia questions.
        Be brief, answer in short sentences highlighting important information.
        If the given text doesn't answer the question, answer as truthfully as you can with your own information.
        
        This is the trivia question you need to answer:
        {ques}
        
        This is the text that you should use:
        {context}
    """
    messages = [
        {"role": "system", "content": "You are a chatbot that responds to general knowledge questions with high fidelity based on a given text. If the text doesn't give the correct information, answer with your own information."},
        {"role": "user", "content": prompt}
    ]
    outputs = pipe1(
        messages,
        max_new_tokens = 256,
    )
    # With chat-style input, "generated_text" is the whole conversation; the
    # last entry is the assistant message ({"role": ..., "content": ...}).
    # Return only its text so the saved answers are plain strings rather than
    # the repr of a dict.
    return outputs[0]["generated_text"][-1]["content"]
    

def gen_answer(ques, retr):
    """
    Ask the OpenAI chat model to answer a question using the given context.

    ques = str; Question extracted from the task dataset.
    retr = str; Context retrieved for that question.

    Returns the text content of the first completion choice.
    """
    prompt = f"""
        You are a bot that answers trivia questions.
        Be brief, answer in short sentences highlighting important information.
        If the following text doesn't answer the question, answer as truthfully as you can.

        This is the trivia question you need to answer:
        {ques}.

        This is text that you should use to answer the question:
        {retr}.
    """
    conversation = [
        {"role": "system", "content": "You are a helpful assistant"},
        {"role": "user", "content": prompt},
    ]

    client = OpenAI()
    completion = client.chat.completions.create(
        model = "gpt-4o-mini",
        messages = conversation,
        temperature = 0.1,
    )
    reply = completion.choices[0].message
    return reply.content


def llm_full(emb_dataset, ruta, llm):
    """
    Answer every question with the chosen LLM and save the results as a CSV.

    emb_dataset = pd.DataFrame ; Must contain the columns "Keywords Pregunta"
                  (questions) and "Texto" (context of each question).
    ruta = str ; Path of the CSV file to write.
    llm = str ; 'gpt' or 'llama' -> LLM used to generate the answers.

    Raises ValueError when llm is not one of the supported values.
    """
    # Validate up front: otherwise an unknown value only fails inside the
    # loop with an unbound-variable error.
    if llm not in ('gpt', 'llama'):
        raise ValueError(f"Unknown llm {llm!r}; expected 'gpt' or 'llama'.")
    generate = gen_answer if llm == 'gpt' else llama_gen

    preguntas = emb_dataset["Keywords Pregunta"]
    contexto = emb_dataset["Texto"]
    # Iterate over the values themselves so the loop does not depend on the
    # frame having a default 0..n-1 index.
    llm_answer = [generate(pregunta, texto) for pregunta, texto in zip(preguntas, contexto)]

    dic = {'Pregunta': preguntas, 'Respuesta LLM': llm_answer, 'Contexto': contexto}
    full_ds = pd.DataFrame(data=dic)
    full_ds.to_csv(ruta)
    print(f"Se guardó el dataset de respuestas de Chapi en: {ruta}. Viva Leo Messi gigante dios")

### CELDA 3: COMPARACIÓN Y AJUSTE DE FORMATO

In [None]:
# 3.0 Beta
def get_ner(text, nlp):
    """Return the text of every token of `text` worth comparing.

    `nlp` is called on `text` and must yield tokens exposing `.pos_` and
    `.text`. A token is kept when its part-of-speech tag or its text appears
    in the accepted values: content tags (NOUN, PROPN, NUM, ADJ), single
    digits and the yes/no words in English and Spanish.
    """
    accepted = ["NOUN", "PROPN", "NUM", "ADJ",
                '0', '1', '2', '3', '4', '5', '6', '7', '8', '9',
                'yes', 'not', 'sí', 'no']
    return [
        token.text
        for token in nlp(text)
        if token.pos_ in accepted or token.text in accepted
    ]

def join_label(label, max_gap = 2):
    """Merge character spans that are at most `max_gap` characters apart.

    label = list of [start, end] pairs, in any order.
    max_gap = int; Largest distance between the end of one span and the
              start of the next for both to be fused (default 2).

    Returns a new list of non-overlapping [start, end] lists sorted by start.
    Chains of close spans collapse into a single span, and an empty input
    gives an empty list.
    """
    merged = []
    for start, end in sorted(label, key=lambda x: x[0]):
        if merged and start - merged[-1][1] <= max_gap:
            # Close enough to the previous span: extend it. max() covers a
            # span that is fully contained in the previous one.
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return merged

def _edit_distance(a, b):
    """Return the Levenshtein distance (unit-cost insert/delete/substitute) between two strings."""
    previous = list(range(len(b) + 1))
    for i, ch_a in enumerate(a, start=1):
        current = [i]
        for j, ch_b in enumerate(b, start=1):
            current.append(min(
                previous[j] + 1,                  # deletion
                current[j - 1] + 1,               # insertion
                previous[j - 1] + (ch_a != ch_b), # substitution
            ))
        previous = current
    return previous[-1]


def check_subword(diff, ner_e, claim):
    """Separate words that nearly match an evidence word from real differences.

    diff = list of str; Words present in the claim but not in the evidence.
    ner_e = iterable of str; Words extracted from the evidence.
    claim = str; Claim text the words of `diff` were taken from.

    A word that contains no digit, is not a yes/no keyword and is within an
    edit distance of 2 of some evidence word is treated as a near match: it
    is taken out of the differences and reported as a soft label whose
    probability is distance / 10.

    Returns (remaining, soft): the words still considered different (a new
    list; `diff` itself is not modified) and the list of soft-label dicts
    with keys 'start', 'prob' and 'end'.
    """
    keywords = ('yes', 'not', 'sí', 'no')
    remaining = []
    soft = []
    # Build a new list instead of removing from `diff` while looping over it,
    # which made the loop skip the element following every removal.
    for d in diff:
        near = None
        # Numbers and yes/no keywords must match exactly, never approximately.
        if not re.search(r'\d+', d) and d not in keywords:
            for word in ner_e:
                dist = _edit_distance(d, word)
                if dist < 3:
                    near = dist
                    break
        if near is None:
            remaining.append(d)
            continue
        start = claim.index(d)
        soft.append({'start': start,
                     'prob': near / 10,
                     'end': start + len(d)})
    return remaining, soft

def get_hard_labels(claims, evidences, lang):
    """Locate the words of each claim that are not supported by its evidence.

    claims = iterable of str; Texts to check (model outputs).
    evidences = iterable of str; Reference answers, aligned with `claims`.
    lang = 'es' or 'en'; Selects the spaCy model ('es' -> Spanish, anything
           else -> English).

    Both texts are lower-cased before comparison. Returns three lists aligned
    with the inputs:
      hard_labels - merged [start, end] character spans of unsupported words,
      ner_llm     - the evidence words joined by spaces,
      soft_labels - soft-label dicts for words that nearly match the evidence.
    """
    nlp = spacy.load("es_core_news_sm" if lang == 'es' else "en_core_web_sm")
    hard_labels = []
    ner_llm = []
    soft_labels = []
    for c, e in zip(claims, evidences):
        c = c.lower()
        e = e.lower()
        # get_ner() needs the loaded pipeline; the evidence is analysed once
        # and reused for both the set difference and the near-match check.
        ner_e = set(get_ner(e, nlp))
        diff = list(set(get_ner(c, nlp)) - ner_e)
        diff, soft = check_subword(diff, ner_e, c)
        label = []
        for word in diff:
            # NOTE: index() returns the first occurrence of the word in the claim.
            start = c.index(word)
            label.append([start, start + len(word)])
        hard_labels.append(join_label(label))
        ner_llm.append(' '.join(ner_e))
        soft_labels.append(soft)
    return hard_labels, ner_llm, soft_labels
    

def is_nounP(text, nlp):
    """Return True when `nlp(text)` contains at least one token tagged PROPN."""
    return any(token.pos_ == "PROPN" for token in nlp(text))

def get_prob(prob):
    """Binarise a similarity score into one of two fixed probabilities.

    prob = scalar object exposing `.item()` (e.g. a numpy scalar).
    Returns 1.0 when the value is strictly greater than 0.3, otherwise 0.666.
    """
    return 1.0 if prob.item() > 0.3 else 0.666

def get_soft_labels(data, lang):
  """Append one soft label per hard-label span to every row of `data`.

  data = pd.DataFrame; Rows must provide 'Respuesta GPT', 'hard_labels',
         'model_output_text' and 'soft_labels' (a list that is appended to).
  lang = 'es' or 'en'; Selects the spaCy model ('es' -> Spanish, anything
         else -> English).

  Nothing is returned; results are appended to each row's 'soft_labels' list.

  NOTE(review): `model_sentence` is not defined in the visible part of this
  file — presumably a sentence-embedding model exposing encode() and
  similarity(); confirm where it is created.
  NOTE(review): the column read here is 'Respuesta GPT', while llm_full()
  writes 'Respuesta LLM' — verify which name the caller's frame really has.
  """
  nlp = spacy.load("es_core_news_sm") if lang == 'es' else spacy.load("en_core_web_sm")
  # Results are accumulated on the rows themselves, not in a separate list.
  for i in range(len(data)):
    info = data.iloc[i]
    # Embedding of the reference answer, compared against each flagged word.
    gpt = model_sentence.encode(info['Respuesta GPT'])
    # The row's existing 'soft_labels' list is extended rather than replaced.
    for j in info['hard_labels']:
      # j is a [start, end] pair of character offsets into the model output.
      word = info['model_output_text'][j[0]:j[1]]
      # Spans containing a digit or a proper noun always get probability 1.0.
      if bool(re.search(r'\d', word)) or is_nounP(word, nlp):
        aux =  {'start': j[0], 'prob': 1.0, 'end': j[1]}
      else:
        # Otherwise the probability is derived from the similarity between
        # the span and the reference answer, binarised by get_prob().
        prob = model_sentence.similarity(model_sentence.encode(word), gpt)
        aux =  {'start': j[0], 'prob': get_prob(prob), 'end': j[1]}
      # Assumes the list held by `info` is the same object stored in `data`,
      # so this append is visible to the caller — TODO confirm.
      info['soft_labels'].append(aux)
    # No per-row result is collected here.
  # The function ends without returning a value.

In [69]:
def json_creation(base_ds, ruta, lang = 'en'):
    """
    Build and save a dataset in the format required by the evaluation.

    base_ds = pd.DataFrame ; train/test/val split to label. Must contain
              'model_output_text' and 'Respuesta LLM'.
    ruta = str; Path of the JSON-lines file to write.
    lang = 'es' or 'en'; Language forwarded to the labelling functions
           (default 'en').
    """
    # Drop whichever label columns already exist. The previous condition
    # (`"soft_labels" and 'hard_labels' in ...`) only tested 'hard_labels'.
    stale = [col for col in ("soft_labels", "hard_labels") if col in base_ds.columns]
    # drop() returns a new frame, so base_ds is never modified.
    new_ds = base_ds.drop(columns = stale)

    # Both labelling functions require the language argument.
    new_ds['hard_labels'], new_ds['ner_llm'], new_ds['soft_labels'] = get_hard_labels(
        new_ds['model_output_text'], new_ds['Respuesta LLM'], lang
    )
    get_soft_labels(new_ds, lang)
    new_ds.to_json(ruta, orient = "records", lines = True)
    print(f"Se guardó el archivo en {ruta}. Viva Messi.")

### CELDA 4: EVALUACIÓN

In [2]:
def recompute_hard_labels(soft_labels):
    """Infer hard labels from soft labels.

    Spans are visited in (start, end) order; only those with prob > 0.5 are
    kept, and a kept span that starts exactly where the previous kept span
    ended is fused into it. Returns a list of [start, end] lists.
    """
    ordered = sorted(soft_labels, key=lambda span: (span['start'], span['end']))

    hard_labels = []
    prev_end = -1
    for span in ordered:
        if not span['prob'] > 0.5:
            continue
        start, end = span['start'], span['end']
        if start != prev_end:
            hard_labels.append([start, end])
        else:
            hard_labels[-1][-1] = end
        prev_end = end
    return hard_labels


def infer_soft_labels(hard_labels):
    """Convert hard-label (start, end) pairs into soft labels with probability 1."""
    soft_labels = []
    for start, end in hard_labels:
        soft_labels.append(dict(start=start, end=end, prob=1.0))
    return soft_labels


def load_jsonl_file_to_records(filename, is_ref=True):
    """Load a JSONL file and return its rows as a list of dicts sorted by 'id'.

    For prediction files (is_ref=False) at least one of 'hard_labels' /
    'soft_labels' must be present; the missing one is computed from the
    other. For reference files a 'text_len' field holding the length of
    'model_output_text' is added. Only 'id', 'soft_labels', 'hard_labels'
    (and 'text_len' for references) are kept.
    """
    df = pd.read_json(filename, lines=True)
    has_hard = 'hard_labels' in df.columns
    has_soft = 'soft_labels' in df.columns

    if not is_ref:
        assert has_hard or has_soft, \
            f'File {filename} contains no predicted label!'
        if not has_hard:
            df['hard_labels'] = df.soft_labels.apply(recompute_hard_labels)
        elif not has_soft:
            df['soft_labels'] = df.hard_labels.apply(infer_soft_labels)

    keep = ['id', 'soft_labels', 'hard_labels']
    if is_ref:
        # reference rows also carry the output length, needed for scoring
        df['text_len'] = df.model_output_text.apply(len)
        keep.append('text_len')

    ordered = df[keep].sort_values('id')
    return ordered.to_dict(orient='records')

def score_iou(ref_dict, pred_dict):
    """Intersection-over-union of reference and predicted hard labels for one datapoint.

    inputs:
    - ref_dict: a gold reference datapoint,
    - pred_dict: a model's prediction
    returns:
    the IoU over character indices, or 1.0 when neither side marks anything
    """
    # the prediction must belong to this reference
    assert ref_dict['id'] == pred_dict['id']

    def covered(spans):
        # expand [start, end) spans into the set of indices they cover
        indices = set()
        for span in spans:
            indices.update(range(*span))
        return indices

    ref_indices = covered(ref_dict['hard_labels'])
    pred_indices = covered(pred_dict['hard_labels'])
    union = ref_indices | pred_indices
    # nothing marked on either side: perfect agreement, and no division by zero
    if not union:
        return 1.
    return len(ref_indices & pred_indices) / len(union)

def score_cor(ref_dict, pred_dict):
    """Spearman correlation between predicted and reference soft labels for one datapoint.

    inputs:
    - ref_dict: a gold reference datapoint (provides 'text_len'),
    - pred_dict: a model's prediction
    returns:
    the Spearman correlation, or a binarized match (0.0 or 1.0) when the
    reference or the prediction is a constant series
    """
    # the prediction must belong to this reference
    assert ref_dict['id'] == pred_dict['id']
    text_len = ref_dict['text_len']

    def to_vector(spans):
        # one probability per character; positions outside every span stay 0
        vec = [0.] * text_len
        for span in spans:
            for idx in range(span['start'], span['end']):
                vec[idx] = span['prob']
        return vec

    ref_vec = to_vector(ref_dict['soft_labels'])
    pred_vec = to_vector(pred_dict['soft_labels'])

    ref_levels = {round(flt, 8) for flt in ref_vec}
    pred_levels = {round(flt, 8) for flt in pred_vec}
    # a constant series has no defined correlation
    if len(pred_levels) == 1 or len(ref_levels) == 1:
        return float(len(ref_levels) == len(pred_levels))
    return spearmanr(ref_vec, pred_vec).correlation

def main(ref_dicts, pred_dicts, output_file=None):
    """Score predictions against references.

    Returns (ious, cors): numpy arrays with one IoU and one correlation per
    datapoint. When output_file is given, the two means are also written to it.
    """
    assert len(ref_dicts) == len(pred_dicts)
    pairs = list(zip(ref_dicts, pred_dicts))
    ious = np.array([score_iou(ref, pred) for ref, pred in pairs])
    cors = np.array([score_cor(ref, pred) for ref, pred in pairs])
    if output_file is None:
        return ious, cors
    with open(output_file, 'w') as ostr:
        for label, scores in (('IoU', ious), ('Cor', cors)):
            print(f'{label}: {scores.mean():.8f}', file=ostr)
    return ious, cors

## CASO DE USO

In [11]:
%%time
# Cell 1 (context): read the English test split and build the Wikipedia
# context for its "model_input" questions, requesting 2 pages per question.
# The result is saved to en_context.csv.
test_set_en = pd.read_json(r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\test_ds\v1\mushroom.en-tst.v1.jsonl', lines = True)
full_context_pipeline(test_set_en["model_input"], 'en', 2, r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_context.csv')

traits  Trait
techniques  Technique
Stahlberg  Stahlberg (disambiguation)
region France Vaux en Amiénois  Vaux
Pasteur crater  Pasteur (disambiguation)
Black Sabbath Eternal Idol  Eternal Idol
19521 Chaos  Chaos


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Embedding guardados en la ruta C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_context.csv . Saludos
CPU times: total: 13.3 s
Wall time: 4min 21s


In [11]:
%%time
# Cell 2 (LLM answers): reload the context CSV written by cell 1, drop the
# index column added by to_csv(), and answer every question with Llama.

en_embs = pd.read_csv(r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_context.csv')
en_embs = en_embs.drop(columns = ("Unnamed: 0"))
llm_full(en_embs, r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_llm.csv', 'llama')

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for

Se guardó el dataset de respuestas de Chapi en: C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_llm.csv. Viva Leo Messi gigante dios
CPU times: total: 24min 3s
Wall time: 17min


In [70]:
%%time

# Cell 3 (comparison): join the test split with the LLM answers on the
# question text, keep the columns needed for labelling and write the
# evaluation-ready JSONL file.
test_set_en = pd.read_json(r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\test_ds\v1\mushroom.en-tst.v1.jsonl', lines = True)

llm_answers = pd.read_csv(r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_llm.csv')
# The right-hand frame is the one loaded just above; the name previously
# used here was never defined and raised NameError.
merge_data = pd.merge(test_set_en, llm_answers, left_on="model_input", right_on="Pregunta")
merge_data = merge_data[['id','model_input', 'model_output_text', 'Respuesta LLM']]

json_creation(merge_data, r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_test_dataset.jsonl')

Index(['id', 'lang', 'model_input', 'model_output_text', 'model_id',
       'model_output_tokens', 'model_output_logits'],
      dtype='object')
---
Index(['id', 'lang', 'model_input', 'model_output_text', 'model_id',
       'model_output_tokens', 'model_output_logits', 'soft_labels'],
      dtype='object')
---
Index(['id', 'lang', 'model_input', 'model_output_text', 'model_id',
       'model_output_tokens', 'soft_labels', 'hard_labels',
       'model_output_logits'],
      dtype='object')
---
Se guardó el archivo en C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_test_dataset.jsonl. Viva Messi.
CPU times: total: 15.6 ms
Wall time: 10 ms


In [30]:
# Quick check using the task's baseline predictions: `test` holds the
# predictions (is_ref = False) and `val` the validation references.
test = load_jsonl_file_to_records(r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Notebooks\en-pred.jsonl', is_ref = False)
val = load_jsonl_file_to_records(r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\val_ds\mushroom.en-val.v2.jsonl')

[{'id': 'val-en-1',
  'soft_labels': [{'start': 0, 'end': 0, 'prob': 0.014805230312049},
   {'start': 0, 'end': 5, 'prob': 0.014804152771830002},
   {'start': 6, 'end': 9, 'prob': 0.47397157549858004},
   {'start': 10, 'end': 13, 'prob': 0.014803784899413001},
   {'start': 13, 'end': 18, 'prob': 0.5171299576759331},
   {'start': 19, 'end': 22, 'prob': 0.49482902884483304},
   {'start': 23, 'end': 24, 'prob': 0.481028497219085},
   {'start': 25, 'end': 31, 'prob': 0.503384828567504},
   {'start': 32, 'end': 37, 'prob': 0.014805689454078001},
   {'start': 38, 'end': 40, 'prob': 0.46621477603912304},
   {'start': 41, 'end': 44, 'prob': 0.45315995812416004},
   {'start': 45, 'end': 49, 'prob': 0.43575534224510104},
   {'start': 50, 'end': 56, 'prob': 0.014804939739406001},
   {'start': 57, 'end': 64, 'prob': 0.47238403558731},
   {'start': 64, 'end': 65, 'prob': 0.48982512950897206},
   {'start': 66, 'end': 68, 'prob': 0.478361010551452},
   {'start': 69, 'end': 76, 'prob': 0.5091674923896

In [33]:
# Score the predictions (`test`) against the references (`val`) and print
# the mean IoU and mean correlation.
ious, cors = main(val, test)
print(f'IoU: {ious.mean():.8f}')
print(f'Cor: {cors.mean():.8f}')

IoU: 0.19432277
Cor: 0.21401714
