# RAG assistant with Chroma

This notebook consists of two parts:
- Generation of the vectorstore from the 4 PDF documents provided.
- A model that answers a question using RAG, retrieving the context from the generated vectorstore.

The models used for the embeddings and the text generation are from OpenAI.
The library used to store the embeddings and build the vectorstore is Chroma.

In [1]:
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai import OpenAIEmbeddings
from langchain.chains.question_answering import load_qa_chain
from langchain.vectorstores import Chroma
import chromadb
from langchain_community.document_loaders import PyMuPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import os

from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI


In [2]:
# Shared OpenAI clients: the embeddings model and the chat LLM.
# Both read the API key from the OPENAI_API_KEY environment variable.
embedding = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))
llm = ChatOpenAI(
    model="gpt-3.5-turbo-0125",
    temperature=0.2,
    api_key=os.getenv("OPENAI_API_KEY"),
)

## Create the embeddings of the documents

In [3]:
# Function to embed a document or a list of documents
def load_doc(paths, persist_directory="./chroma_db", chunk_size=500, chunk_overlap=0):
    """
    Extract text from PDF files, embed it and store it in a persisted Chroma vectorstore.

    Each file is loaded with PyMuPDFLoader, split with RecursiveCharacterTextSplitter,
    embedded with OpenAIEmbeddings and written to the Chroma store located at
    "persist_directory".

    Arguments:
    paths: path of a PDF file, or an iterable of such paths (str or os.PathLike).
    persist_directory: directory handed to Chroma to persist the vectorstore
        (default "./chroma_db").
    chunk_size: forwarded to RecursiveCharacterTextSplitter (default 500).
    chunk_overlap: forwarded to RecursiveCharacterTextSplitter (default 0).

    Returns:
    The Chroma vectorstore object returned for the last document processed.

    Raises:
    ValueError: if "paths" contains no path at all.

    It uses the OpenAI embeddings model, so the OPENAI_API_KEY environment
    variable has to be defined.

    NOTE(review): every call goes through Chroma.from_documents again, so running
    this twice on the same files presumably stores their chunks twice - confirm
    before re-running against an existing "persist_directory".
    """
    # Accept a single path as well as a collection of paths
    if isinstance(paths, (str, os.PathLike)):
        paths = [paths]
    else:
        paths = list(paths)

    # Fail early with a clear message; otherwise the final "return vectorstore"
    # would hit a name that was never assigned
    if not paths:
        raise ValueError("load_doc needs at least one PDF path.")

    # The splitter and the embeddings model do not depend on the document,
    # so they are built once instead of once per file
    text_splitter = RecursiveCharacterTextSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap)
    embedding = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))

    # Loop to embed all the documents of the list
    for pth in paths:
        data = PyMuPDFLoader(os.fspath(pth)).load()

        # Split document in chunks
        all_splits = text_splitter.split_documents(data)

        # Embed the splits and store them in the persisted vectorstore
        vectorstore = Chroma.from_documents(
            documents=all_splits,
            embedding=embedding,
            persist_directory=persist_directory,
        )
        print(f"Vectorstore for {pth} created and stored.")
    return vectorstore


In [9]:
# List of paths to the documents to be embedded.
# The paths are built with os.path.join so that the separator matches the
# operating system running the notebook (hard-coded "\" only works on Windows).
documents_dir = os.path.join("..", "documents")
path = [
    os.path.join(documents_dir, file_name)
    for file_name in (
        "doc_2023_12_Posicionamiento_Environment.pdf",
        "Ficha tecnica CI Environment ISR_240317.pdf",
        "NORRCO004_V28_Catalogo de productos de activo vigentes.pdf",
        "Tarifas transferencias Extranjero.pdf",
    )
]

# Executing the function to embed documents
load_doc(path)

Vectorstore for ..\documents\doc_2023_12_Posicionamiento_Environment.pdf created and stored.
Vectorstore for ..\documents\Ficha tecnica CI Environment ISR_240317.pdf created and stored.
Vectorstore for ..\documents\NORRCO004_V28_Catalogo de productos de activo vigentes.pdf created and stored.
Vectorstore for ..\documents\Tarifas transferencias Extranjero.pdf created and stored.


<langchain_community.vectorstores.chroma.Chroma at 0x15cf1abe480>

## Model using RAG

In [10]:
# RAG prompt: pull the prompt published as "rlm/rag-prompt" from the LangChain hub.
# It is handed to the RetrievalQA chain below as its "prompt".
from langchain import hub
prompt = hub.pull("rlm/rag-prompt")

In [11]:
# Re-open the Chroma index persisted in the "chroma_db" directory, reusing the
# notebook-level embeddings model
vectorstore = Chroma(embedding_function=embedding, persist_directory="./chroma_db")

# RetrievalQA chain: the LLM answers using the vectorstore retriever and the RAG prompt
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type_kwargs={"prompt": prompt},
    retriever=vectorstore.as_retriever(),
)

## Testing the model

In [12]:
# Define the question and invoke the chain to query the vectorstore.
# The chain input is a dict with the question under the "query" key.
question = "¿Puedes indicarme el coste de recibir una transferencia de 5.000€ en Yenes?"
result = qa_chain.invoke({"query": question})
result

{'query': '¿Puedes indicarme el coste de recibir una transferencia de 5.000€ en Yenes?',
 'result': 'El coste de recibir una transferencia de 5.000€ en Yenes sería de 5€ como mínimo y un máximo de 250€, con una comisión de pago del 2,5‰.'}

In [13]:
# Second test question, sent through the same RetrievalQA chain
question_2 = "¿Noruega tiene convenio con la UE?"
result_2 = qa_chain.invoke({"query": question_2})
result_2

{'query': '¿Noruega tiene convenio con la UE?',
 'result': 'Sí, Noruega tiene un convenio con la UE a través del Espacio Económico Europeo (EEE), que incluye a Islandia y Liechtenstein. El EEE permite a Noruega participar en el mercado único de la UE sin ser miembro de la Unión Europea.'}