# **Haystack RAG para few-shot con GPT y Ollama**

In [None]:
## recomendado python 3.10
#%pip install haystack-ai==2.2.1 trafilatura==1.10.0 qdrant-haystack==3.8.0
#%pip install ipywidgets widgetsnbextension pandas-profiling
#%pip install ollama-haystack==0.0.7

Requiere:
- Contenedor con qdrant (bd vectorial) y reportes en un archivo con la ruta data\LosCarrera_labeled\etiquetado_1-456_v1.01\train.jsonl
- Contenedor con langfuse y otro con postgres (se usa para tracing de las consultas)
- Archivo .env con:
    - OPENAI_API_KEY - asociada a una cuenta que tenga saldo en openai
    - LANGFUSE_HOST - url y puerto (si es contenedor docker, puede ser http://localhost:3000)
    - LANGFUSE_SECRET_KEY - key generada por langfuse
    - LANGFUSE_PUBLIC_KEY - key generada por langfuse
    - HAYSTACK_CONTENT_TRACING_ENABLED = True - requerido para habilitar el tracing
- Archivo con reportes en la ruta data\LosCarrera_labeled\etiquetado_1-456_v1.01\test.jsonl

Imports

In [None]:
import os
from dotenv import load_dotenv

from haystack import Pipeline
from haystack.dataclasses import Document
from haystack_integrations.components.retrievers.qdrant import QdrantEmbeddingRetriever
from haystack_integrations.document_stores.qdrant import QdrantDocumentStore
from haystack_integrations.components.connectors.langfuse import LangfuseConnector

# from haystack.components.converters import TextFileToDocument
# from haystack.components.preprocessors import DocumentCleaner, DocumentSplitter
from haystack.components.embedders import OpenAIDocumentEmbedder, OpenAITextEmbedder
from haystack.components.writers import DocumentWriter
from haystack.components.builders import PromptBuilder
from haystack.components.generators import OpenAIGenerator
from haystack_integrations.components.generators.ollama import OllamaGenerator

# from haystack.components.evaluators import DocumentRecallEvaluator

import pandas as pd
# Load the variables declared in the local .env file (API keys, Langfuse settings) into the environment
load_dotenv()

### SELECCIONAR MODELO

In [None]:
# Choose the chat model: "gpt-4o", "gpt-4o-mini" or "llama3.1".
# Any other value makes the RAG pipeline setup cell raise a ValueError.
CHAT_MODEL = "gpt-4o"

Función para convertir la numeración de los ids en números partiendo desde el 0

(Esto es para que sea más fácil para el modelo asociar ids)

Esta función se utiliza tanto al guardar los reportes en la bd vectorial, como al enviar el prompt y evaluar los resultados.

In [None]:
def convert_ids(entities, relations):
    """Renumber entity ids so that they start at 0 and remap the relations accordingly.

    Args:
        entities: list of dicts with the keys "id", "label", "start_offset" and "end_offset".
        relations: list of dicts with the keys "from_id", "to_id" and "type".

    Returns:
        A tuple ``(entities, relations)`` of new lists; the inputs are not modified.
        Entity ids are the positions of the entities in the input list. Relations
        whose "from_id" or "to_id" does not belong to any entity in ``entities``
        (for example because that entity was filtered out by the caller) are dropped
        instead of raising a KeyError.
    """
    # original id -> 0-based position in the entity list
    id_map = {entity["id"]: position for position, entity in enumerate(entities)}

    remapped_relations = [
        {"from_id": id_map[rel["from_id"]], "to_id": id_map[rel["to_id"]], "type": rel["type"]}
        for rel in relations
        # a relation is only meaningful when both of its endpoints are still present
        if rel["from_id"] in id_map and rel["to_id"] in id_map
    ]
    remapped_entities = [
        {
            "id": id_map[entity["id"]],
            "label": entity["label"],
            "start_offset": entity["start_offset"],
            "end_offset": entity["end_offset"],
        }
        for entity in entities
    ]
    return remapped_entities, remapped_relations


### Preprocesado dataset 
strings -> documentos

Se requiere convertir los datos del jsonl (reportes etiquetados) a un formato compatible con Haystack (clase Document).

In [None]:
# Read the labelled reports that feed the vector store and turn each one into a Haystack Document

data_path = "data/LosCarrera_labeled/etiquetado_1-456_v1.01/"
# Index the TRAIN split. The reports sent to the model later come from test.jsonl;
# indexing that same file would let the retriever hand back the query report
# together with its gold relations as a few-shot example (label leakage).
jsonl_file_path = data_path + "train.jsonl"

documents = []

for _row_index, row in pd.read_json(jsonl_file_path, lines=True).iterrows():
    # Drop entities labelled "GANGLIOS"
    entities = [entity for entity in row["entities"] if entity["label"] != "GANGLIOS"]
    # Drop relations that point to a removed entity, so the id remapping cannot fail on them
    kept_ids = {entity["id"] for entity in entities}
    relations = [rel for rel in row["relations"] if rel["from_id"] in kept_ids and rel["to_id"] in kept_ids]
    entities, relations = convert_ids(entities, relations)
    documents.append(Document(content=row["text"], meta={"id": row["id"], "entities": entities, "relations": relations}))

Ver un reporte extraído, elegido al azar

In [None]:
import random

# Pick a random position in `documents` and show both the Document object and its raw text
random_num = random.randrange(len(documents))
chosen_document = documents[random_num]

for heading, payload in (("DOCUMENTO:\n", chosen_document), ("\n\nCONTENIDO:\n", chosen_document.content)):
    print(heading)
    print(payload)

### Embedding pipeline (qdrant y OpenAI)

documentos -> vectores

Se crea un pipeline que permite subir los datos a Qdrant. Actualmente, se puede montar qdrant en un docker, asegurándose que use el puerto configurado abajo.

El pipeline usa el modelo de embeddings de OpenAI (text-embedding-3-large) para generar los vectores, mediante el componente OpenAIDocumentEmbedder de Haystack.

In [None]:
# Set up and run the indexing pipeline against the vector DB

# embedding_dim has to match the size of the vectors produced by the embedder used below
document_store = QdrantDocumentStore(url="http://localhost",
                                     port=6333,
                                     embedding_dim=3072,
                                     index="AIMA_LosCarrera_RE_v1.01",)

# Only embed and upload when the collection is empty, so re-running the cell
# does not call the embedding API again nor duplicate documents
if document_store.count_documents() == 0:
    embedder = OpenAIDocumentEmbedder(model="text-embedding-3-large")
    writer = DocumentWriter(document_store=document_store)

    indexing_pipeline = Pipeline()
    # The tracer is added to the pipeline but is not connected to any other component
    indexing_pipeline.add_component("tracer", LangfuseConnector("Qdrant Document Embedder"))
    indexing_pipeline.add_component("embedder", embedder)
    indexing_pipeline.add_component("writer", writer)

    # embedder -> writer (this connection was previously declared twice)
    indexing_pipeline.connect("embedder.documents", "writer.documents")

    result = indexing_pipeline.run(data={"documents":documents})

In [None]:
# Show how many documents the Qdrant document store currently holds
document_store.count_documents()

### RAG pipeline

Embeddings - OpenAI (se usa para convertir el nuevo reporte a vector y hacer RAG)<br>
Chat - OpenAI o LLama (genera el etiquetado una vez hecho el RAG)<br>
BD vectorial - Qdrant<br>

In [None]:
# Set up the RAG pipeline and its prompt
# from haystack.components.validators import JsonSchemaValidator

# Number of retrieved reports used as few-shot examples (0 = zero-shot prompt for the GPT models)
NUM_EXAMPLES = 0

if NUM_EXAMPLES < 0:
    # Previously a negative value left `template` undefined for the GPT models
    raise ValueError("NUM_EXAMPLES debe ser mayor o igual a 0")

text_embedder = OpenAITextEmbedder(model="text-embedding-3-large")

# Choose prompt template, retriever size and generator according to the selected model
retriever_top_k = NUM_EXAMPLES
if CHAT_MODEL == "gpt-4o-mini" or CHAT_MODEL == "gpt-4o":
    if NUM_EXAMPLES > 0:
        template_path = data_path + "RE/prompt_gpt.txt"
    else:
        template_path = data_path + "RE/prompt_gpt_zero-shot.txt"
        # The retriever stays wired into the pipeline in zero-shot mode, so it still runs;
        # it is asked for a single document instead of 0.
        # NOTE(review): presumably the zero-shot template ignores `documents` - confirm.
        retriever_top_k = 1
    llm = OpenAIGenerator(model=CHAT_MODEL)
elif CHAT_MODEL == "llama3.1":
    # NOTE(review): with NUM_EXAMPLES == 0 this branch keeps top_k=0 and there is no
    # zero-shot template for this model - confirm whether that combination is intended.
    template_path = data_path + "RE/prompt_ollama.txt"
    llm = OllamaGenerator(model=CHAT_MODEL, url="http://localhost:11434/api/generate")
else:
    raise ValueError("CHAT_MODEL debe ser 'gpt-4o', 'gpt-4o-mini' o 'llama3.1'")

retriever = QdrantEmbeddingRetriever(document_store, top_k=retriever_top_k)

# Read the template through a context manager so the file handle is always closed
with open(template_path, "r") as template_file:
    template = template_file.read()

prompt_builder = PromptBuilder(template=template)
rag_pipeline = Pipeline()

# The tracer is added to the pipeline but is not connected to any other component
rag_pipeline.add_component("tracer", LangfuseConnector("Mammography Few-Shot RAG Rel Ext "+ CHAT_MODEL))
rag_pipeline.add_component("text_embedder", text_embedder)
rag_pipeline.add_component("retriever", retriever)
rag_pipeline.add_component("prompt_builder", prompt_builder)
rag_pipeline.add_component("llm", llm)
# rag_pipeline.add_component("schema_validator", JsonSchemaValidator())

# query text -> embedding -> retrieved reports -> prompt -> generator
rag_pipeline.connect("text_embedder.embedding", "retriever.query_embedding")
rag_pipeline.connect("retriever.documents", "prompt_builder.documents")
rag_pipeline.connect("prompt_builder", "llm")
# rag_pipeline.connect("llm.response", "schema_validator.data")

#### Prompts a la API del modelo

Definir dataset a enviar mediante prompts

In [None]:
# Path of the JSONL file with the reports that will be sent to the model (test split)
reports_file_path = data_path + "test.jsonl"

# Load the reports into a DataFrame, one JSON object per line

prompt_reports = pd.read_json(reports_file_path, lines=True)

##### Loop de envío

In [None]:
import ast
import json

responses = []
# Number of reports to send to the API; one prompt is sent per report
NUM_REPORTS = 69 # 69 is the size of the whole test set

# Ids of the reports already processed in previous runs (one id per line)
processed_ids_path = "processed_ids.txt"
if os.path.exists(processed_ids_path):
    with open(processed_ids_path, "r") as f:
        processed_ids_list = f.read().splitlines()
else:
    processed_ids_list = []
# Ids read from the file are strings, so membership is always checked on str(id);
# comparing a numeric id against them would never match and every report would be re-sent.
processed_ids = set(processed_ids_list)

for i, informe in enumerate(prompt_reports["text"][:NUM_REPORTS]):
    report_id = prompt_reports["id"][i]
    # pandas returns numpy scalars for numeric columns; convert them to native Python
    # values so the id can be serialised with json.dumps afterwards
    if hasattr(report_id, "item"):
        report_id = report_id.item()
    # Reports that were already processed are not sent to the API again
    if str(report_id) in processed_ids:
        print("Reporte ", i+1, " de ", NUM_REPORTS, " ya procesado")
        continue

    entities, relations = convert_ids(prompt_reports["entities"][i], prompt_reports["relations"][i])
    # Add the text covered by each entity span under the key "span_text"
    for entity in entities:
        entity["span_text"] = informe[entity["start_offset"]:entity["end_offset"]]
    query = informe + "\nEntidades:\n" + str(json.dumps(entities, indent=4, ensure_ascii=False))

    result = rag_pipeline.run(data={"prompt_builder": {"query":query}, "text_embedder": {"text": query}})
    # Remove the markdown code fences (```json ... ```) if the model added them
    result_str = result["llm"]["replies"][0]
    result_str = result_str.replace("```json", "").replace("```", "")

    # Convert the reply into Python data WITHOUT executing it: the reply is untrusted
    # model output, so eval() must not be used. JSON is tried first; a Python-literal
    # reply (e.g. single-quoted strings) is still accepted through ast.literal_eval.
    try:
        try:
            result_json = json.loads(result_str)
        except json.JSONDecodeError:
            result_json = ast.literal_eval(result_str.strip())
    except (ValueError, SyntaxError, TypeError) as error:
        print("Error en el reporte ", i+1, " de ", NUM_REPORTS)
        print(repr(error))
    else:
        result_json = {"id": report_id, "relations": result_json}
        responses.append(result_json)
        print("Procesado reporte ", i+1, " de ", NUM_REPORTS)
        processed_ids_list.append(str(report_id))
        processed_ids.add(str(report_id))
    print("id: ", report_id)
    # print(informe)
    # print(json.dumps(result_json, indent=4, ensure_ascii=False))

# Save the list of processed ids to a txt file in the current folder.
# NOTE(review): `responses` itself is only written to disk by a later cell; if that cell
# is not run, these reports are skipped next time although their output was never saved.
with open(processed_ids_path, "w") as f:
    for item in processed_ids_list:
        f.write("%s\n" % item)
print("Número de reportes enviados a la API: ", len(responses))

##### Guardar las respuestas en un archivo JSONL

In [None]:
# Append every response to a JSONL file in the current folder, one JSON object per line
output_file_path = "output.jsonl"

with open(output_file_path, "a") as output_file:
    output_file.writelines(json.dumps(response, ensure_ascii=False) + "\n" for response in responses)

### Evaluación de desempeño

##### Funciones

In [None]:
def compare_relations(relation1, relation2):
    """Return True when both relations share the same "type", "from_id" and "to_id"."""
    return all(relation1[key] == relation2[key] for key in ("type", "from_id", "to_id"))

    
def calculate_tp_fp_fn_report(gold_relations, generated_relations, classes):
    """Count true positives, false positives and false negatives per class for one report.

    Two relations match when their "type", "from_id" and "to_id" are equal, which is
    the same criterion compare_relations() applies.

    Args:
        gold_relations: reference relations (dicts with "type", "from_id", "to_id").
        generated_relations: relations produced by the model. Items that are not dicts
            or that lack one of the three keys are ignored instead of raising.
        classes: list of relation types to evaluate; it fixes the order of the counters.

    Returns:
        A tuple ``(tp, fp, fn)`` of lists, each with one counter per entry of ``classes``.
        Relations whose type is not in ``classes`` are not counted, on either side.
    """
    def _key(relation):
        return (relation["type"], relation["from_id"], relation["to_id"])

    def _is_well_formed(relation):
        return isinstance(relation, dict) and all(k in relation for k in ("type", "from_id", "to_id"))

    num_classes = len(classes)
    tp = [0]*num_classes
    fp = [0]*num_classes
    fn = [0]*num_classes

    gold_keys = [_key(relation) for relation in gold_relations]
    # The generated side is model output, so malformed items are filtered out up front
    generated_keys = [_key(relation) for relation in generated_relations if _is_well_formed(relation)]

    # Every gold relation is either recovered by the model (tp) or missed (fn)
    for key in gold_keys:
        if key[0] not in classes:
            # Same guard the generated side already had; avoids a ValueError from classes.index
            continue
        class_position = classes.index(key[0])
        if key in generated_keys:
            tp[class_position] += 1
        else:
            fn[class_position] += 1

    # Every generated relation without a gold counterpart is a false positive
    for key in generated_keys:
        if key not in gold_keys and key[0] in classes:
            fp[classes.index(key[0])] += 1

    return tp, fp, fn

def calculate_metrics(tp, fp, fn):
    """Compute precision, recall and F1 from true positive, false positive and false negative counts.

    Each metric is 0 when its denominator is not positive.

    Returns:
        A tuple ``(precision, recall, f1)``.
    """
    predicted_total = tp+fp
    gold_total = tp+fn

    if predicted_total > 0:
        precision = tp/predicted_total
    else:
        precision = 0

    if gold_total > 0:
        recall = tp/gold_total
    else:
        recall = 0

    if precision+recall > 0:
        f1 = 2*precision*recall/(precision+recall)
    else:
        f1 = 0

    return precision, recall, f1

##### Ejecución (cálculo de métricas)

In [None]:
# Load the saved responses back from output.jsonl, one JSON object per line
with open(output_file_path, "r") as saved_responses_file:
    fixed_responses = [json.loads(line) for line in saved_responses_file]

print("Número de informes con relaciones extraídas: ", len(fixed_responses))

In [None]:
# Create the results CSV (header only) the first time, otherwise load the existing log
results_file_path = data_path + "RE/results.csv"
if not os.path.exists(results_file_path):
    results_df = pd.DataFrame(columns=["chat_model","fecha/hora", "num_reports", "macro_f1", "micro_f1", "ubicar_f1", "describir_f1"])
    results_df.to_csv(results_file_path, index=False)
else:
    results_df = pd.read_csv(results_file_path)

# Accumulators for tp, fp, fn per class, in the order given by `classes`
classes = ["ubicar", "describir"]
gold_relations_counts = {}
tp_total = [0]*len(classes)
fp_total = [0]*len(classes)
fn_total = [0]*len(classes)


# For every report, call calculate_tp_fp_fn_report and add its counts to the running totals
for i, informe in enumerate(prompt_reports["text"][:NUM_REPORTS]):
    gold_entities, gold_relations = convert_ids(prompt_reports["entities"][i], prompt_reports["relations"][i])
    # Count the gold relations of each type
    for relation in gold_relations:
        gold_relations_counts[relation["type"]] = gold_relations_counts.get(relation["type"], 0) + 1

    # Look up the saved response for this report by id; when the id appears more than
    # once in the file the last occurrence wins.
    # None means "no response saved". An empty list is a valid answer ("no relations")
    # and must still be evaluated, otherwise the gold relations of that report are never
    # counted as false negatives and recall is overestimated.
    generated_relations = None
    for response in fixed_responses:
        if response["id"] == prompt_reports["id"][i]:
            generated_relations = response["relations"]
    if generated_relations is None:
        print("No se encontraron entidades corregidas para el informe ", i)
        continue
    tp, fp, fn = calculate_tp_fp_fn_report(gold_relations, generated_relations, classes)
    tp_total = [total + count for total, count in zip(tp_total, tp)]
    fp_total = [total + count for total, count in zip(fp_total, fp)]
    fn_total = [total + count for total, count in zip(fn_total, fn)]

# Precision, recall and F1 per class
metrics_per_class = []
for i, class_name in enumerate(classes):
    print("Clase: ", class_name)
    metrics = calculate_metrics(tp_total[i], fp_total[i], fn_total[i])
    print("Precision: ", metrics[0])
    print("Recall: ", metrics[1])
    print("F1: ", metrics[2])
    metrics_per_class.append(metrics)
    # Show how many gold relations of this class were seen
    if class_name in gold_relations_counts:
        print("Cantidad de entidades de la clase: ", gold_relations_counts[class_name])

# Macro-average: unweighted mean of the per-class metrics
macro_precision = sum(class_metrics[0]/len(classes) for class_metrics in metrics_per_class)
macro_recall = sum(class_metrics[1]/len(classes) for class_metrics in metrics_per_class)
macro_f1 = sum(class_metrics[2]/len(classes) for class_metrics in metrics_per_class)

print("Macro-average")
print("Precision: ", macro_precision)
print("Recall: ", macro_recall)
print("F1: ", macro_f1)

# Micro-average: metrics computed over the counts pooled across classes

micro_precision, micro_recall, micro_f1 = calculate_metrics(sum(tp_total), sum(fp_total), sum(fn_total))

print("Micro-average")
print("Precision: ", micro_precision)
print("Recall: ", micro_recall)
print("F1: ", micro_f1)

In [None]:
# Append one row with the metrics of this run to the results log and write the CSV back
import datetime

run_row = pd.DataFrame(
    [[
        CHAT_MODEL,
        datetime.datetime.now(),
        NUM_REPORTS,
        macro_f1,
        micro_f1,
        metrics_per_class[0][2],
        metrics_per_class[1][2],
    ]],
    columns=results_df.columns,
)
results_df = pd.concat([results_df, run_row], ignore_index=True)

results_df.to_csv(results_file_path, index=False)