# Análisis de los diferentes modelos

## 1. Importación de librerías y modulos

In [1]:
#%pip install -r requirements.txt

In [2]:
import os
from dotenv import load_dotenv
from datasets import Dataset
from ragas import evaluate
from ragas.metrics import faithfulness, answer_relevancy, context_precision, context_recall
from langchain_huggingface.llms import HuggingFacePipeline
from langchain_huggingface import HuggingFaceEmbeddings
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document
from langchain_community.llms import Ollama
from langchain_community.vectorstores import Chroma
from module import *

  from .autonotebook import tqdm as notebook_tqdm
Found Intel OpenMP ('libiomp') and LLVM OpenMP ('libomp') loaded at
the same time. Both libraries are known to be incompatible and this
can cause random crashes or deadlocks on Linux when loaded in the
same Python program.
Using threadpoolctl may cause crashes or deadlocks. For more
information and possible workarounds, please see
    https://github.com/joblib/threadpoolctl/blob/master/multiple_openmp.md



In [3]:
emb_model = "baai_small"
llm_model = "llama3"

## 2. Carga de datos 

In [4]:
document_loader = PyPDFDirectoryLoader("./data_testing")
documents = document_loader.load()

In [5]:
chunks = split_documents(documents)

## 3. Creación de la base de datos vectorial

In [6]:
db = Chroma(
    persist_directory="./database_testing", embedding_function=get_embedding_function(emb_model)
)

# Calculate Page IDs.
chunks_with_ids = calculate_chunk_ids(chunks)

# Add or Update the documents.
existing_items = db.get(include=[])  # IDs are always included by default
existing_ids = set(existing_items["ids"])
print(f"Number of existing documents in DB: {len(existing_ids)}")

# Only add documents that don't exist in the DB.
new_chunks = []
for chunk in chunks_with_ids:
    if chunk.metadata["id"] not in existing_ids:
        new_chunks.append(chunk)

if len(new_chunks):
    print(f"👉 Adding new documents: {len(new_chunks)}")
    new_chunk_ids = [chunk.metadata["id"] for chunk in new_chunks]
    db.add_documents(new_chunks, ids=new_chunk_ids)
    db.persist()
else:
    print("✅ No new documents to add")



Number of existing documents in DB: 0
👉 Adding new documents: 29


  warn_deprecated(


## 4. Preguntas de interés

Formulamos las preguntas y buscamos en la base de datos los chunks que nos ofrezcan mejor contexto para responderlas.

In [7]:
questions_en = [
    "In what year and city was Miguel de Cervantes born?",
    "In what year and city was William Shakespeare born?",
    "In what year and city was Joan Ramis i Ramis born?",
]

questions_es = [
    "¿En qué año y ciudad nació Miguel de Cervantes?",
    "¿En qué año y ciudad nació William Shakespeare?",
    "¿En qué año y ciudad nació Joan Ramis i Ramis?",
]

questions_cat = [
    "En quin any i ciutat va néixer Miguel de Cervantes?",
    "En quin any i ciutat va néixer William Shakespeare?",
    "En quin any i ciutat va néixer Joan Ramis i Ramis?",
]

questions = questions_en + questions_es + questions_cat

In [13]:
contexts = []

for question in questions:
    # Get the top 5 most relevant documents
    results = db.similarity_search_with_score(question, k=5)
    context = "\n\n---\n\n".join([doc.page_content for doc, _score in results])

    # Append the context to the list of contexts
    contexts.append(context)

Ahora creamos las respuestas esperadas para cada pregunta.

In [14]:
answers_en = [
    "Miguel de Cervantes was born in 1547 in Alcalá de Henares.",
    "William Shakespeare was born in 1564 in Stratford-upon-Avon.",
    "Joan Ramis i Ramis was born in 1746 in Palma.",
]

answers_es = [
    "Miguel de Cervantes nació en 1547 en Alcalá de Henares.",
    "William Shakespeare nació en 1564 en Stratford-upon-Avon.",
    "Joan Ramis i Ramis nació en 1746 en Palma.",
]

answers_cat = [
    "Miguel de Cervantes va néixer el 1547 a Alcalá de Henares.",
    "William Shakespeare va néixer el 1564 a Stratford-upon-Avon.",
    "Joan Ramis i Ramis va néixer el 1746 a Palma.",
]

answers = answers_en + answers_es + answers_cat

Finalmente creamos el DataFrame para evaluar los modelos.

In [15]:
data_samples = {
    'question': questions,
    'answer': answers,
    'contexts' : contexts,
    'ground_truth': answers
}

dataset = Dataset.from_dict(data_samples)

## 5. Evaluación de los modelos

In [17]:
embeddings = get_embedding_function(emb_model)
llm = Ollama(model=llm_model)



In [18]:
score = evaluate(
    dataset,
    metrics=[faithfulness, answer_relevancy, context_precision, context_recall],
    llm=llm,
    embeddings=embeddings
)

df_score = score.to_pandas()
print(df_score)

ValueError: Dataset feature "contexts" should be of type Sequence[string], got <class 'datasets.features.features.Value'>