In [None]:
from get_response import PoliGPT
import rag_metrics as rm
import json

poligpt = PoliGPT(faiss_index_dir='../01_data/project_faiss')

FAISS inicializado - Vectores: 139471 | Dimensión: 768


In [3]:
def read_json(json_path):
    preguntas, respuestas_modelo = [], []
    with open(json_path, "r", encoding="utf-8") as f:
        data = json.load(f)

    for entry in data:
        for preg in entry["preguntas"]:
            preguntas.append(preg["pregunta"])
            respuestas_modelo.append(preg["respuesta"])

    return preguntas, respuestas_modelo

preguntas, respuestas_modelo = read_json("../01_data/preguntas.json")

In [None]:
respuestas = []
for pregunta in preguntas:  
    respuesta = poligpt.query_poligpt(pregunta, k_context=2)
    if "error" in respuesta:
        print(pregunta)
        continue
    elif respuesta['response'] != 'No dispongo de información suficiente en el contexto proporcionado.':
        respuestas.append(respuesta)

    else:
        pr

In [5]:
respuestas

[]

In [13]:
def extract_context_data(respuestas):
    """
    Toma la salida de PoliGPT (lista de dicts con 'context_used') y
    devuelve:
      - all_contexts: para cada respuesta, la lista de textos de contexto.
      - all_metadata: para cada respuesta, la lista de tuplas (doc_id, página).
    """
    all_contexts = []
    all_metadata = []
    for rsp in respuestas:
        ctxs = []
        metas = []
        for doc, score in rsp['context_used']:
            # doc es un objeto Document con atributos .metadata y .id
            ctxs.append(doc.page_content)
            # extraemos el id y la página:
            doc_id = getattr(doc, 'id', None)
            page   = doc.metadata.get('page')
            metas.append((doc_id, page))
        all_contexts.append(ctxs)
        all_metadata.append(metas)
    return all_contexts, all_metadata

contextos, metadatos = extract_context_data(respuestas)
respuestas_final = [dic['response'] for dic in respuestas]
preguntas_final = [dic['query'] for dic in respuestas]

In [14]:
def evaluate_dataset(preguntas, respuestas, contextos):
    """Aggregate the three metrics over an iterable of RAG records."""
    g_scores, f1s, qc_sims = [], [], []
    for answer, contexts, question in zip(respuestas, contextos,preguntas):
        g_scores.append(rm.grounding_score(answer, contexts))
        f1s.append(rm.context_overlap_f1(answer, contexts))
        qc_sims.append(rm.question_context_similarity(question, contexts))
    n = max(len(g_scores), 1)
    return {
        "Grounding": sum(g_scores) / n,
        "ContextOverlapF1": sum(f1s) / n,
        "QuestionContextSim": sum(qc_sims) / n,
    }

In [None]:
def evaluate_with_references(preguntas, respuestas, contextos, preguntas_referencia, model_name):
    """Aggregate both reference-based and reference-free metrics.  
    Records must include keys:
      - "respuesta" (prediction)
      - "reference" (ground truth answer)
      - "pregunta", "contextos"
    """
    ems, f1_r, rouge_l, gs, f1s, sims, c_cxt_anw, c_qst_anw, c_qst_cxt, l2_cxt_anw, l2_qst_anw, l2_qst_cxt, c_ref_anw, l2_ref_anw = [], [], [], [], [], [], [], [], [], [], [], [], [], []
    for pred, ref, ctx, q in zip(respuestas, preguntas_referencia, contextos, preguntas):
        ems.append(rm.exact_match_score(pred, ref))
        f1_r.append(rm.token_f1_score(pred, ref))
        rouge_l.append(rm.rouge_l_score(pred, ref))
        gs.append(rm.grounding_score(pred, ctx))
        f1s.append(rm.context_overlap_f1(pred, ctx))
        sims.append(rm.question_context_similarity(q, ctx))
        c_cxt_anw.append(rm.cosine_similarity_score_context(pred, ctx, model_name))
        c_qst_anw.append(rm.cosine_similarity_score(pred, q, model_name))
        c_qst_cxt.append(rm.cosine_similarity_score_context(q, ctx, model_name))
        c_ref_anw.append(rm.cosine_similarity_score(ref, pred, model_name))
        l2_cxt_anw.append(rm.avg_l2_distance_context(pred, ctx, model_name))
        l2_qst_anw.append(rm.avg_l2_distance(pred, q, model_name))
        l2_qst_cxt.append(rm.avg_l2_distance_context(q, ctx, model_name))
        l2_ref_anw.append(rm.avg_l2_distance(ref, pred, model_name))

    n = max(len(ems), 1)
    return {
        "ExactMatch": sum(ems)/n,
        "TokenF1":    sum(f1_r)/n,
        "ROUGE_L":    sum(rouge_l)/n,
        "GroundingScore":  sum(gs)/n,
        "ContextOverlapF1":  sum(f1s)/n,
        "QuestionContextSim": sum(sims)/n,
        "CosineContextAnswer": sum(c_cxt_anw)/n,
        "CosineQuestionAnswer": sum(c_qst_anw)/n,
        "CosineQuestionContext": sum(c_qst_cxt)/n,
        "CosineReferenceAnswer": sum(c_ref_anw)/n,
        "L2ContextAnswer": sum(l2_cxt_anw)/n,
        "L2QuestionAnswer": sum(l2_qst_anw)/n,
        "L2QuestionContext": sum(l2_qst_cxt)/n,
        "L2ReferenceAnswer": sum(l2_ref_anw)/n,
    }

In [16]:
print(len(preguntas_final), len(respuestas_final), len(contextos))

6 6 6


In [17]:
evaluate_dataset(preguntas_final, respuestas_final, contextos)

{'Grounding': 0.6742382380292731,
 'ContextOverlapF1': 0.5018434104390614,
 'QuestionContextSim': 0.1981582161117383}

In [18]:
evaluate_with_references(preguntas_final, respuestas_final, contextos, respuestas_modelo)

{'ExactMatch': 0.0,
 'TokenF1': 0.06753010169702935,
 'ROUGE_L': 0.0642938557423368,
 'GroundingScore': 0.6742382380292731,
 'ContextOverlapF1': 0.5018434104390614,
 'QuestionContextSim': 0.1981582161117383,
 'CosineContextAnswer': 0.854008803764979,
 'CosineQuestionAnswer': 0.8704183300336202,
 'CosineQuestionContext': 0.7997889518737793,
 'CosineReferenceAnswer': 0.20269746085007986,
 'L2ContextAnswer': 1.4268150726954143,
 'L2QuestionAnswer': 1.4146085778872173,
 'L2QuestionContext': 1.7336900035540264,
 'L2ReferenceAnswer': 3.3856134017308555}