# Esto es el código correspondiente a todo el flujo de trabajo.

Está dividido en distintas celdas donde cada una contiene las funciones de cada parte del procesamiento.

1. **Celda 1**: Obtención de contexto. Procesamiento de preguntas, extracción de páginas de Wikipedia, generación de resumen (_ground truth_).
2. **Celda 2**: Llamada a OpenAI. A partir del contexto obtenido previamente se contestan las preguntas del ds con GPT-4o (_ground truth GPT_).
3. **Celda 3**: Comparación entre las respuestas del dataset de Shroom con las respuestas que nosotros estamos generando, y generación de DS final.
4. **Celda 4**: Evaluación (Script oficial de Mushroom)
5. **Próximamente**: Modificación de formato y evaluación. (Va a depender de que la celda 3 se expanda entre comparación.

In [1]:
import wikipedia
import pandas as pd
import spacy
import random
import math
import numpy as np
import transformers 
import os
import argparse as ap

from FlagEmbedding import BGEM3FlagModel
from openai import OpenAI
from scipy.stats import spearmanr

model = BGEM3FlagModel('BAAI/bge-m3', use_fp16 = True)
os.environ["OPENAI_API_KEY"] = "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"

Fetching 30 files:   0%|          | 0/30 [00:00<?, ?it/s]

### CELDA 1: CONTEXTO

In [10]:
# CELDA 1: OBTENCIÓN DE CONTEXTO

def noun_list(a, lang):
    """
    Filtra la pregunta y obtiene las PoST relevantes.
    
    a = list; Lista de preguntas del dataset
    lang = 'es' or 'en'; Idioma a trabajar
    """
    if lang == 'es':
        post_spacy = spacy.load("es_core_news_sm")
    else:
        post_spacy = spacy.load("en_core_web_sm")
    noun_list = []
    nums = ['0','1','2','3','4','5','6','7','8','9']

    for _ in a:
        doc = post_spacy(_)
        sub_noun = []
        for token in doc:
            if token.pos_ == "NOUN" or token.pos_ == "PROPN" or token.pos_ == "NUM":
                sub_noun.append(token.text)
            if token.pos_ == "ADJ" and token.text[0] in nums:
                sub_noun.append(token.text)
        noun_list.append(sub_noun)
    return noun_list


def keyword_por_preg(n_list):
    """
    Junta lista de PoST previo a pasarlo por el API de Wikipedia.
    
    n_list = list; Obtenida de la función noun_list().
    """
    keyword_list = []
    for i in n_list:
        keyword = ''
        for j in i:
            keyword = keyword + j + ' '
        keyword_list.append(keyword)
    return keyword_list


def get_wikipage(text, lang, page_total):
    """
    Regresa las n páginas de Wikipedia más relevantes al query

    text = str; Texto proveniente de la función keyword_por_preg()
    lang = 'es' or 'en'; Lenguaje necesario para wikipedia
    page_total = int; Cantidad de páginas a regresar
    """
    if lang == 'es':
        wikipedia.set_lang('es')
    if lang == 'en':
        wikipedia.set_lang('en')
    page_title = wikipedia.search(text, results = page_total)
    return page_title


def wikipipeline(dataset, lang, page_count):
    """
    Genera los resúmenes que sirven como contexto de cada pregunta. 

    dataset = pd.DataFrame ; El nombre del dataset a procesar
    dataset = list; ya sea el nombre del dataset en formato dataset["model_input"] o list(set(dataset["model_input"]))
    lang = 'es' or 'en'; Idioma a trabajar, debe de coincidir con el del dataset para no generar algo incoherente
    page_count = int; Cantidad de páginas de Wikipedia a extraer
    """
    
    noun_list_perrona = noun_list(dataset, lang)
    key_list = keyword_por_preg(noun_list_perrona)

    resumen_list = []
    question_list = []
    iterador = 0
    for i in key_list:
        pages = get_wikipage(i, lang, page_count)
        resumen = ''
        for x in pages:
            try:
                page = wikipedia.WikipediaPage(x)
                page_sum = page.summary
                resumen = resumen + '' + page_sum
            except wikipedia.exceptions.DisambiguationError: # Se usa para evitar problemas al encontrar la página adecuada.
                #print(i, page)
                print(i, x)
            #page_sum = page.summary
            #resumen = resumen + '' + page_sum
        resumen_list.append(resumen)
        question_list.append(dataset[iterador])
        iterador += 1
    return resumen_list, question_list


def generate_embeddings(sum_list, q, ruta):
    """
    Genera dataframe de embeddings, y los guarda en un directorio de nuestra elección.

    sum_list = list ; Lista de resúmenes obtenida previamente.
    q = list ; Lista de preguntas obtenida previamente.
    ruta = str ; Directorio para guardar. 
    """
    len_list = [len(_) for _ in sum_list]
    max_length = max(len_list)

    embs = model.encode(
        sum_list,
        batch_size = 12,
        max_length = max_length,
    )['dense_vecs']

    embs_loco = [_ for _ in embs]
    dic = {'Embedding':embs_loco, 'Texto':sum_list, 'Keywords Pregunta': q}
    embs_df = pd.DataFrame(data=dic)
    embs_df.to_csv(ruta)
    print(f"Embedding guardados en la ruta {ruta} . Saludos")


def full_context_pipeline(dataset, lang, num, ruta):
    """
    Ejecuta todo el pipeline (todo junto alv compa).
    
    dataset = pd.DataFrame[_column_name_] ;  Columna del dataset a trabajar
    lang = 'en' or 'es'; Lenguaje a trabajar.
    num = int; Cantidad de páginas de wikipedia a recolectar.
    ruta = str; Dirección de guardado de dataset. (DEBE DE TENER NOMBRE DEL ARCHIVO)
    """
    conjunto = list(set(dataset))
    sum_set, q_set = wikipipeline(conjunto, lang, num)
    generate_embeddings(sum_set, q_set, ruta)

### CELDA 2: OPENAI

In [12]:
def get_questions(path):
    """
    Returns dataset questions in a list.

    path = str; File location.
    """
    dataset = pd.read_json(path, lines = True)
    questions  = dataset["model_input"]
    return [_ for _ in questions]

def get_embs(path):
    """
    Returns embeddings as a pandas DataFrame.

    path = str; File location.
    """
    embs = pd.read_csv(path)
    embs = embs.drop(columns = ["Unnamed: 0"])
    embs_text = embs["Texto"]
    embs_vec = embs["Embedding"]
    embs_txt = [_ for _ in embs_text]
    return embs_txt, embs_vec

def gen_answer(ques, retr):
    """
    Generates GPT based answer given a question and a context window.

    ques = str; Question extracted from the task dataset.
    retr = str; Context extracted from embeddings.
    """
    client = OpenAI()
    prompt = f"""
        You are a bot that answers trivia questions.
        Be brief, answer in short sentences highlighting important information.

        This is the trivia question you need to answer:
        {ques}.

        This is text that you should use to answer the question:
        {retr}.
    """

    response = client.chat.completions.create(
        model = "gpt-4o-mini",
        messages = [
            {"role": "system", "content": "You are a helpful assistant"},
            {
                "role": "user",
                "content": prompt,
            },
        ],
        temperature = 0.1,
    ).choices[0].message
    
    return response.content


def gpt_full(emb_dataset, ruta):
    """
    Implementación completa del pipeline

    emb_datset = pd.DataFrame ; Debe de ser el dataset que contenga el contexto de cada pregunta.
    ruta = str ; Ruta para guardar el archivo.
    """
    preguntas = emb_dataset["Keywords Pregunta"]
    contexto = emb_dataset["Texto"]
    gpt_answer = []
    for i in range(len(contexto)):
        answer = gen_answer(preguntas[i], contexto[i])
        gpt_answer.append(answer)
    dic = {'Pregunta': preguntas, 'Respuesta GPT': gpt_answer, 'Contexto': contexto}
    full_ds = pd.DataFrame(data=dic)
    full_ds.to_csv(ruta)
    print(f"Se guardó el dataset de respuestas de Chapi en: {ruta}. Viva Leo Messi gigante dios")

### CELDA 3: COMPARACIÓN Y AJUSTE DE FORMATO

In [4]:
def random_interval(texto):
    """
    Regresa una partición por intervalos aleatoria.

    texto = str; El texto a particionar
    """

    char_pos = 0
    intervalos = []
    while(char_pos < len(texto)):
        step = random.randint(1, len(texto) - char_pos)
        char_aux = char_pos + step
        intervalo_aux = [char_pos, char_aux]
        intervalos.append(intervalo_aux)
        char_pos += char_aux
    if intervalos[-1][1] == len(texto):
        return intervalos
    else:
        final_int = [char_aux, len(texto)]
        intervalos.append(final_int)
    return intervalos


def random_probalist(probs_dic):
    """
    ESTA ES UNA FUNCIÓN AUXILIAR
    Genera los soft_labels de manera aleatoria a partir de una lista de segmentación aleatoria del texto.
    
    probs_dic = dict ; Diccionario de probabilidades auxiliar 
    """
    prob_list = []
    for _ in interval_list:
        aux_list = []
        for i in _:
            prob = random.randint(0,len(probs_dic))
            proba_num = probs_dic[prob]
            probabilidades = {'start':i[0], 'prob':proba_num, 'end':i[-1]}
            aux_list.append(probabilidades)
        prob_list.append(aux_list)
    return prob_list


def json_creation(base_ds, lista_probabilidad, ruta):
    """
    Crea y guarda un dataset con el formato necesario para la evaluación.

    base_ds = pd.DataFrame ;  Correspondiente al conjunto de train/test/val cuyos valores se van a modificar
    lista_probabilidad = list ; Lista donde cada entrada corresponde a un diccionario con los "soft_labels" predecidos por nosotros.
    ruta = str; Ubicación donde se guardará el dataset.
    """
    
    new_ds = base_ds.drop(columns = ["soft_labels", "hard_labels"])
    new_ds["soft_labels"] = lista_probabilidad
    hard_labels = [recompute_hard_labels(new_ds["soft_labels"][i] for i in range(len(new_ds["soft_labels"])))]
    new_ds["hard_labels"] = hard_labels
    new_ds = new_ds.iloc[;, [0,1,2,3,4,5,7,8,5,6]]
    new_ds.to_json(ruta, orient = "records", lines = True)
    print(f"Se guardó el archivo en {ruta}. Viva Messi.")


def pipeline3(dataset, ruta):
    """
    Tercer pipeline de corrido.

    dataset = pd.DataFrame["model_output_text"] ;  Dataset a comparar. DEBE DE SER EL ARCHIVO ORIGINAL
    ruta = str ; Ruta para guardar
    """
    dic_aux = {0:0, 1:0.333333, 2:0.6666666, 3:1}
    random_split = [random_interval(_) for _ in dataset]
    proba_list = random_probalist(random_split)
    json_creation(dataset, proba_list, ruta)

### CELDA 4: EVALUACIÓN

In [14]:
def recompute_hard_labels(soft_labels):
    """optionally, infer hard labels from the soft labels provided"""
    hard_labels = [] 
    prev_end = -1
    for start, end in (
        (lbl['start'], lbl['end']) 
        for lbl in sorted(soft_labels, key=lambda span: (span['start'], span['end']))
        if lbl['prob'] > 0.5
    ):
        if start == prev_end:
            hard_labels[-1][-1] = end
        else:
            hard_labels.append([start, end])
        prev_end = end
    return hard_labels


def infer_soft_labels(hard_labels):
    """reformat hard labels into soft labels with prob 1"""
    return [
        {
            'start': start,
            'end': end,
            'prob': 1.0,
        }
        for start, end in hard_labels
    ]


def load_jsonl_file_to_records(filename, is_ref=True):
    """read data from a JSONL file and format that as a `pandas.DataFrame`.
    Performs minor format checks (ensures that some labels are present,
    optionally compute missing labels on the fly)."""
    df = pd.read_json(filename, lines=True)
    if not is_ref:
        assert ('hard_labels' in df.columns) or ('soft_labels' in df.columns), \
            f'File {filename} contains no predicted label!'
        if 'hard_labels' not in df.columns:
            df['hard_labels'] = df.soft_labels.apply(recompute_hard_labels)
        elif 'soft_labels' not in df.columns:
            df['soft_labels'] = df.hard_labels.apply(infer_soft_labels)
    # adding an extra column for convenience
    columns = ['id', 'soft_labels', 'hard_labels']
    if is_ref:
        df['text_len'] = df.model_output_text.apply(len)
        columns += ['text_len']
    df = df[columns]
    return df.sort_values('id').to_dict(orient='records')

def score_iou(ref_dict, pred_dict):
    """computes intersection-over-union between reference and predicted hard labels, for a single datapoint.
    inputs:
    - ref_dict: a gold reference datapoint,
    - pred_dict: a model's prediction
    returns:
    the IoU, or 1.0 if neither the reference nor the prediction contain hallucinations
    """
    # ensure the prediction is correctly matched to its reference
    assert ref_dict['id'] == pred_dict['id']
    # convert annotations to sets of indices
    ref_indices = {idx for span in ref_dict['hard_labels'] for idx in range(*span)}
    pred_indices = {idx for span in pred_dict['hard_labels'] for idx in range(*span)}
    # avoid division by zero
    if not pred_indices and not ref_indices: return 1.
    # otherwise compute & return IoU
    return len(ref_indices & pred_indices) / len(ref_indices | pred_indices)

def score_cor(ref_dict, pred_dict):
    """computes Spearman correlation between predicted and reference soft labels, for a single datapoint.
    inputs:
    - ref_dict: a gold reference datapoint,
    - pred_dict: a model's prediction
    returns:
    the Spearman correlation, or a binarized exact match (0.0 or 1.0) if the reference or prediction contains no variation
    """
    # ensure the prediction is correctly matched to its reference
    assert ref_dict['id'] == pred_dict['id']
    # convert annotations to vectors of observations
    ref_vec = [0.] * ref_dict['text_len']
    pred_vec = [0.] * ref_dict['text_len']
    for span in ref_dict['soft_labels']:
        for idx in range(span['start'], span['end']):
            ref_vec[idx] = span['prob']
    for span in pred_dict['soft_labels']:
        for idx in range(span['start'], span['end']):
            pred_vec[idx] = span['prob']
    # constant series (i.e., no hallucination) => cor is undef
    if len({round(flt, 8) for flt in pred_vec}) == 1 or len({round(flt, 8) for flt in ref_vec}) == 1 : 
        return float(len({round(flt, 8) for flt in ref_vec}) == len({round(flt, 8) for flt in pred_vec}))
    # otherwise compute Spearman's rho
    return spearmanr(ref_vec, pred_vec).correlation

def main(ref_dicts, pred_dicts, output_file=None):
    assert len(ref_dicts) == len(pred_dicts)
    ious = np.array([score_iou(r, d) for r, d in zip(ref_dicts, pred_dicts)])
    cors = np.array([score_cor(r, d) for r, d in zip(ref_dicts, pred_dicts)])
    if output_file is not None:
        with open(output_file, 'w') as ostr:
            print(f'IoU: {ious.mean():.8f}', file=ostr)
            print(f'Cor: {cors.mean():.8f}', file=ostr)
    return ious, cors

## CASO DE USO

In [11]:
%%time
# Celda 1
test_set_en = pd.read_json(r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\test_ds\v1\mushroom.en-tst.v1.jsonl', lines = True)
full_context_pipeline(test_set_en["model_input"], 'en', 2, r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_context.csv')

traits  Trait
techniques  Technique
Stahlberg  Stahlberg (disambiguation)
region France Vaux en Amiénois  Vaux
Pasteur crater  Pasteur (disambiguation)
Black Sabbath Eternal Idol  Eternal Idol
19521 Chaos  Chaos


You're using a XLMRobertaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Embedding guardados en la ruta C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_context.csv . Saludos
CPU times: total: 13.3 s
Wall time: 4min 21s


In [13]:
%%time
# Celda 2

en_embs = pd.read_csv(r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_context.csv')
gpt_full(en_embs, r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_gpt.csv')

RateLimitError: Error code: 429 - {'error': {'message': 'Rate limit reached for gpt-4o-mini in organization org-AGQH6BhmQW0c8kz43DO15Fc3 on requests per min (RPM): Limit 3, Used 3, Requested 1. Please try again in 20s. Visit https://platform.openai.com/account/rate-limits to learn more.', 'type': 'requests', 'param': None, 'code': 'rate_limit_exceeded'}}

In [15]:
%%time

# Celda 3
pipeline3(test_set_en, r'C:\Users\FLopezP\Documents\GitHub\Mu-SHROOM-GIL\Datasets\full_pipeline_datasets\en_test_dataset.jsonl')

NameError: name 'pipeline3' is not defined