In [1]:
import numpy as np
# from sklearn.metrics.pairwise import cosine_similarity

import sys, os

# Make the project packages importable from this notebook: resolve the
# directory two levels above the current working directory and append it to
# sys.path (only once, so re-running the cell does not add duplicates).
# This must run before the `rag_model` imports below.
project_root = os.path.abspath(os.path.join(os.getcwd(), "../.."))
if project_root not in sys.path:
    sys.path.append(project_root)
    
# NOTE(review): the star imports below are presumably where names used later
# in this notebook (e.g. PhoBertEmbedding, text_embedding, List) come from --
# confirm, and prefer explicit imports so the dependencies are visible.
from rag_model.model.Final_pipeline.final_doc_processor import *
from rag_model.model.RE.final_re import *

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Instantiate the embedding wrapper (PhoBertEmbedding is presumably provided
# by the project star imports in the first cell).
# NOTE(review): `phobert` is not referenced by any other cell visible in this
# notebook -- confirm whether it is still needed.
phobert = PhoBertEmbedding()

In [3]:
def cosine(a, b):
    """Return the cosine similarity between two vectors.

    Args:
        a: Array-like of numbers.
        b: Array-like of numbers with the same shape as ``a``.

    Returns:
        ``dot(a, b) / (norm(a) * norm(b))``. When either input has zero norm
        the ratio is undefined (0/0); ``0.0`` is returned in that case instead
        of NaN, so a degenerate embedding never counts as a match and never
        triggers a NumPy runtime warning.
    """
    a = np.asarray(a)
    b = np.asarray(b)

    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0:
        return 0.0
    return np.dot(a, b) / denom

In [None]:
def evaluate_embedding(referenced_context: List[str], retrieved_context: List[str], threshold=0.60):
    """Score retrieved contexts against reference contexts by embedding similarity.

    Duplicates are removed from both lists (first occurrence kept). Every
    remaining text is embedded once with ``text_embedding(text, 1)`` and all
    reference/retrieved pairs are compared with ``cosine``.

    * Recall    = share of reference texts that have at least one retrieved
      text with similarity >= ``threshold``.
    * Precision = share of retrieved texts that have at least one reference
      text with similarity >= ``threshold``.

    Precision is counted over the retrieved side so that it can never exceed
    1.0, even when several reference texts match the same retrieved text.

    Args:
        referenced_context: Ground-truth context strings.
        retrieved_context: Context strings returned by the retriever.
        threshold: Minimum cosine similarity for a pair to count as a match.

    Returns:
        A string of the form
        ``'Precision: <p>, Recall: <r>, F1-Score: <f1>'``. A metric whose
        denominator would be zero is reported as ``0``.
    """
    referenced_set = list(dict.fromkeys(referenced_context))
    retrieved_set = list(dict.fromkeys(retrieved_context))

    # Embed each text exactly once instead of once per (reference, retrieved)
    # pair. The second argument (1) is passed through unchanged; its meaning
    # is defined by text_embedding, which is not visible in this notebook.
    # NOTE(review): assumes text_embedding is deterministic for a given text.
    ref_embs = [text_embedding(text, 1) for text in referenced_set]
    ret_embs = [text_embedding(text, 1) for text in retrieved_set]

    # sims[i][j] = similarity between reference i and retrieved item j.
    sims = [[cosine(ref_emb, ret_emb) for ret_emb in ret_embs] for ref_emb in ref_embs]

    # `any(... >= threshold)` is "max >= threshold" but ignores NaN scores.
    matched_refs = sum(
        1 for row in sims if any(score >= threshold for score in row)
    )
    matched_rets = sum(
        1
        for j in range(len(ret_embs))
        if any(row[j] >= threshold for row in sims)
    )

    precision = matched_rets / len(retrieved_set) if len(retrieved_set) > 0 else 0
    recall    = matched_refs / len(referenced_set) if len(referenced_set) > 0 else 0
    f1_score  = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    return f'Precision: {precision}, Recall: {recall}, F1-Score: {f1_score}'

In [23]:
# Quick check of evaluate_embedding: three reference sentences against two
# retrieved sentences, using the default threshold.
print(
    evaluate_embedding(
        referenced_context=[
            'Hà Nội là thủ đô của Việt Nam',
            'Hà Nội có Lăng Bác',
            'TP. Hồ Chí Minh ở miền Nam',
        ],
        retrieved_context=[
            'TP. Hồ Chí Minh là thành phố lớn',
            'Hà Nội là thủ đô của Việt Nam',
        ],
    )
)

Precision: 1.0, Recall: 0.6666666666666666, F1-Score: 0.8


In [20]:
def jaccard(a, b):
    """Return the Jaccard similarity of two strings.

    Both strings are lower-cased and split on whitespace; the result is
    ``|A & B| / |A | B|`` over the two token sets, or ``0`` when both strings
    contain no tokens.
    """
    tokens_a = set(a.lower().split())
    tokens_b = set(b.lower().split())
    union = tokens_a | tokens_b
    return len(tokens_a & tokens_b) / len(union) if union else 0


def evaluate_jaccard(referenced_context: List[str], retrieved_context: List[str], threshold=0.6):
    """Score retrieved contexts against reference contexts by token overlap.

    Duplicates are removed from both lists while keeping first-occurrence
    order, so the result is reproducible from run to run (iterating over a
    ``set`` of strings would make the greedy matching below depend on hash
    ordering).

    Matching is greedy and one-to-one: references are visited in input order,
    each one takes the still-unused retrieved text with the highest Jaccard
    score, and the pair counts as a true positive when that score is
    >= ``threshold``. Because a retrieved text can be used only once, the
    number of true positives never exceeds either list length. Being greedy,
    the outcome can depend on the order of ``referenced_context``.

    Args:
        referenced_context: Ground-truth context strings.
        retrieved_context: Context strings returned by the retriever.
        threshold: Minimum Jaccard score for a pair to count as a match.

    Returns:
        A string of the form
        ``'Precision: <p>, Recall: <r>, F1-Score: <f1>'``. A metric whose
        denominator would be zero is reported as ``0``.
    """
    referenced_set = list(dict.fromkeys(referenced_context))
    retrieved_set = list(dict.fromkeys(retrieved_context))

    used = set()   # retrieved indices already matched
    tp = 0

    for ref in referenced_set:
        best_match = None
        # Start below any possible score so a candidate is always selected
        # when one is available, even if its score is 0.
        best_score = -1.0

        for i, ret in enumerate(retrieved_set):
            if i in used:
                continue

            score = jaccard(ref, ret)
            if score > best_score:
                best_score = score
                best_match = i

        # Require an actual candidate: without this, a reference could be
        # counted as matched when no unused retrieved text is left.
        if best_match is not None and best_score >= threshold:
            tp += 1
            used.add(best_match)

    precision = tp / len(retrieved_set) if len(retrieved_set) > 0 else 0
    recall    = tp / len(referenced_set) if len(referenced_set) > 0 else 0
    f1_score  = 2 * precision * recall / (precision + recall) if precision + recall > 0 else 0

    return f'Precision: {precision}, Recall: {recall}, F1-Score: {f1_score}'


In [22]:
# Quick check of evaluate_jaccard: three reference sentences against two
# retrieved sentences, using the default threshold.
print(
    evaluate_jaccard(
        referenced_context=[
            'Hà Nội là thủ đô của Việt Nam',
            'Hà Nội có Lăng Bác',
            'TP. Hồ Chí Minh ở miền Nam',
        ],
        retrieved_context=[
            'TP. Hồ Chí Minh là thành phố lớn',
            'Hà Nội là thủ đô của Việt Nam',
        ],
    )
)

Precision: 0.5, Recall: 0.3333333333333333, F1-Score: 0.4


### Use RAGAS

In [None]:
import dotenv
dotenv.load_dotenv()

from ragas.metrics import (
    ContextPrecision,
    LLMContextRecall
)
from tqdm import tqdm
from langchain_openai import ChatOpenAI, OpenAIEmbeddings
from ragas import evaluate
from datasets import Dataset as HFDataset

# RAGAS metric instances passed to every `evaluate(...)` call in this notebook.
metrics = [LLMContextRecall(), ContextPrecision()]

In [None]:
# Imported explicitly in this cell: no other cell in the notebook imports
# pandas by name, so `pd` would otherwise only exist if a star import happened
# to leak it.
import pandas as pd

# Two hand-written evaluation samples. Each row holds the user question, the
# list of retrieved context strings, the reference text, and a
# 'synthesizer_name' label.
df = pd.DataFrame([
    {
        "user_input": "What is CPU scheduling in operating systems?",
        "retrieved_contexts": [
            "CPU scheduling determines which process runs on the CPU.",
            "Schedulers improve efficiency and throughput."
        ],
        "reference": "CPU scheduling is the method by which the OS selects a process to run next.",
        'synthesizer_name': 'gpt-3.5-turbo'
    },
    {
        "user_input": "Explain what virtual memory is.",
        "retrieved_contexts": [
            "Virtual memory uses disk space to extend RAM.",
            "It allows executing programs larger than physical memory."
        ],
        "reference": "Virtual memory allows a computer to compensate for physical memory limitations using disk space.",
        'synthesizer_name': 'gpt-3.5-turbo'
    }
])

#### Use OpenAI

In [None]:
# Ensure every "retrieved_contexts" entry is a list: values that are already
# lists are kept as-is, anything else is wrapped in a one-element list.
df["retrieved_contexts"] = df["retrieved_contexts"].map(
    lambda contexts: contexts if isinstance(contexts, list) else [contexts]
)

# Judge LLM and embedding model, both served by OpenAI.
# NOTE(review): credentials are presumably picked up from the environment
# populated by dotenv.load_dotenv() in an earlier cell -- confirm.
llm = ChatOpenAI(model="gpt-4o-mini", temperature=0)
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")

# Convert the frame to a Hugging Face dataset and run the RAGAS metrics.
hf_ds = HFDataset.from_pandas(df)
results = evaluate(hf_ds, metrics=metrics, llm=llm, embeddings=embeddings)
print(results)

#### Or use HuggingFace

In [None]:
from langchain_community.llms import HuggingFacePipeline
from transformers import AutoTokenizer, AutoModelForCausalLM, pipeline
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from ragas import evaluate

# LLM: load the causal language model named by `model_id` and expose it to
# LangChain through a transformers text-generation pipeline.
model_id = "Qwen/Qwen2.5-1.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)
pipe = pipeline("text-generation", model=model, tokenizer=tokenizer, max_new_tokens=512)
llm = HuggingFacePipeline(pipeline=pipe)

# Embeddings
embeddings = HuggingFaceBgeEmbeddings(model_name="BAAI/bge-base-en-v1.5")

# Dataset: build it in this cell as well. This section is an alternative to
# the OpenAI one, so it must not depend on `hf_ds` having been created there
# (that would raise NameError when the OpenAI cell is skipped). `df.assign`
# returns a new frame, so `df` itself is left untouched.
hf_ds = HFDataset.from_pandas(
    df.assign(
        retrieved_contexts=df["retrieved_contexts"].apply(
            lambda x: x if isinstance(x, list) else [x]
        )
    )
)

# Evaluate
results = evaluate(hf_ds, metrics=metrics, llm=llm, embeddings=embeddings)
print(results)