# RAG assitant with FAISS

Notebook consisting in several parts:
- Function that receives a document and a question and provides an answer. IT IS WORKING
- Separate the function in 2 parts:
    - Generation of the vectorstore from the 4 PDF documents provided.
    - Model to inference the answer from a question using RAG retrieving from the vectorstore generated

The model used for the embeddings and the text generation are from OpenAI.
And the library using for generate the embeddings and the vectorstore is FAISS.

In [3]:
import os
import sys
from dotenv import load_dotenv
from pathlib import Path
from configparser import ConfigParser
from langchain_community.chat_models import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from langchain_community.vectorstores.faiss import FAISS
from langchain.chains import StuffDocumentsChain, LLMChain
from langchain_core.prompts import PromptTemplate
from langchain_community.document_loaders import PyMuPDFLoader
from langchain_text_splitters import CharacterTextSplitter
from datetime import datetime

load_dotenv()

True

# Code for one question on one document

In [7]:
# Function that receives the path of a document and a question and returns the answer.
def summarize_2(file_path: str, query: str):

    """
    Function that extracts the information from the PDF files, then cuts the text in chunks, embed them and save them in a vectorstore.
    Then, with the retrieve information and the question, it provides an answer.
    
    The argumants are:

    file_path:
        Path of the file to embed.
    query:
        Question we need the assistant answers

    It uses OpenAI model, so it will need to define the OpenAI_key.
    """

    # Initialize the model
    model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.2, api_key=os.getenv("OPENAI_API_KEY"))
    embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))

    # Determine the FAISS index path
    file_name = str(file_path).split('/')[-1]
    faiss_index = str(file_path).replace(file_name, 'faiss_index')

    # Attempt to load existing FAISS index
    try:
        db = FAISS.load_local(faiss_index, embeddings, allow_dangerous_deserialization=True)
        print(f"Embeddings ya existentes del documento cargados")
        print(f"Numero de documentos indexados en FAISS: {db.index.ntotal}")
    except:
        # Initialize FAISS index from scratch if load fails
        init = datetime.now()
        loader = PyMuPDFLoader(file_path)
        documents = loader.load()
        print(f"Numero de paginas del pdf: {len(documents)}")
        text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
        docs = text_splitter.split_documents(documents)
        full_docs = [doc for doc in docs if len(doc.page_content) > 150]
        db = FAISS.from_documents(full_docs, embeddings)
        print(f"Numero de documentos indexados en FAISS: {db.index.ntotal}")
        db.save_local(faiss_index)
        fin = datetime.now()
        delta = round((fin-init).total_seconds()/60, 2)
        print(f"Tiempo transcurrido en carga del pdf y embeddings: {delta} min")

    # Retrieval setup
    retriever_pdf = db.as_retriever(
        search_type="similarity",
        search_kwargs={
            "k": 10,
        })

    init = datetime.now()

    # Preparing the LLM chain with the prompt template
    prompt_resum = f"""The following documents contain information about accounting and finances:
    ----------
    {{docs}}
    ----------
    Your objective as assistant is to concisely answer the input question with the information from the documents.
    The answer will be in the same language of the question.
    ----------
    Question: {query}
    Answer: 
    """
    prompt = PromptTemplate.from_template(prompt_resum)

    llm_chain = LLMChain(llm=model, prompt=prompt)
    chain = StuffDocumentsChain(
        llm_chain=llm_chain,
        document_variable_name="docs"
    )

    relevant_docs = retriever_pdf.get_relevant_documents(query=query)
    # Using invoke with correct parameters
    resum = chain.invoke({"input_documents": relevant_docs, "query": query}).get("output_text")
    pages = [doc.metadata.get('page')+1 for doc in relevant_docs]



    # Calculate elapsed time for LLM calls
    fin = datetime.now()
    delta = round((fin-init).total_seconds()/60, 2)
    print(f"Tiempo transcurrido durante las llamadas al LLM: {delta} min")

    return resum, pages

# Test the function
query = "¿Puedes indicarme el coste de recibir una transferencia de 5.000€ en Yenes?"
file_path = r"docu_ejemplo\Tarifas transferencias Extranjero.pdf"
summarize_2(file_path, query)


Embeddings ya existentes del documento cargados
Numero de documentos indexados en FAISS: 2
Tiempo transcurrido durante las llamadas al LLM: 0.04 min


('Lo siento, pero la información proporcionada en los documentos no incluye detalles sobre el coste de recibir una transferencia de 5.000€ en Yenes.',
 [2, 1])

In [6]:
# Test the function
# List of paths to the documents to be embedded
paths = [r"..\documents\doc_2023_12_Posicionamiento_Environment.pdf", 
              r"..\documents\Ficha tecnica CI Environment ISR_240317.pdf", 
              r"..\documents\NORRCO004_V28_Catalogo de productos de activo vigentes.pdf", 
              r"..\documents\Tarifas transferencias Extranjero.pdf",
             ]

# Executing the function to embed documents
query = "¿Puedes indicarme el coste de recibir una transferencia de 5.000€ en Yenes?"
summarize_2(paths, query)

Embeddings ya existentes del documento cargados
Numero de documentos indexados en FAISS: 2
Embeddings ya existentes del documento cargados
Numero de documentos indexados en FAISS: 2
Embeddings ya existentes del documento cargados
Numero de documentos indexados en FAISS: 2
Embeddings ya existentes del documento cargados
Numero de documentos indexados en FAISS: 2
Tiempo transcurrido durante las llamadas al LLM: 0.05 min


('Lo siento, pero la información proporcionada en los documentos no incluye detalles sobre el coste de recibir una transferencia de 5.000€ en Yenes.',
 [2, 1])

# Function to create embeding of all documents

In [42]:
# Function that creates the mebedding for all the documents and save it in the vectorstore "faiss_index"
def embeddings(list_paths):
    """
    Function that creates the mebedding for all the documents and save it in the vectorstore "faiss_index".
        
    The argumants are:

    list_paths:
        List of paths of the files we want to embed.

    It uses OpenAI model, so it will need to define the OpenAI_key.
    
    """

    # Initialize the model
    embeddings = OpenAIEmbeddings(api_key=os.getenv("OPENAI_API_KEY"))

    for file_path in list_paths:
        # Determine the FAISS index path
        file_name = str(file_path).split('/')[-1]
        faiss_index = str(file_path).replace(file_name, 'faiss_index')

        # Attempt to load existing FAISS index
        try:
            db = FAISS.load_local(faiss_index, embeddings, allow_dangerous_deserialization=True)
            print(f"Embeddings ya existentes del documento cargados")
            print(f"Numero de documentos indexados en FAISS: {db.index.ntotal}")
        except:
            # Initialize FAISS index from scratch if load fails
            init = datetime.now()
            loader = PyMuPDFLoader(file_path)
            documents = loader.load()
            print(f"Numero de paginas del pdf: {len(documents)}")
            text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
            docs = text_splitter.split_documents(documents)
            full_docs = [doc for doc in docs if len(doc.page_content) > 150]
            db = FAISS.from_documents(full_docs, embeddings)
            print(f"Numero de documentos indexados en FAISS: {db.index.ntotal}")
            db.save_local(faiss_index)
            fin = datetime.now()
            delta = round((fin-init).total_seconds()/60, 2)
            print(f"Tiempo transcurrido en carga del pdf y embeddings: {delta} min")

In [43]:
list_paths = ["docu_ejemplo/doc_2023_12_Posicionamiento_Environment.pdf", 
              "docu_ejemplo/Ficha tecnica CI Environment ISR_240317.pdf", 
              "docu_ejemplo/NORRCO004_V28_Catalogo de productos de activo vigentes.pdf", 
              "docu_ejemplo/Tarifas transferencias Extranjero.pdf"]

embeddings(list_paths)

Numero de paginas del pdf: 2
Numero de documentos indexados en FAISS: 2
Tiempo transcurrido en carga del pdf y embeddings: 0.01 min
Embeddings ya existentes del documento cargados
Numero de documentos indexados en FAISS: 2
Embeddings ya existentes del documento cargados
Numero de documentos indexados en FAISS: 2
Embeddings ya existentes del documento cargados
Numero de documentos indexados en FAISS: 2


# Function to answer questions

In [58]:
def summarize(query):
    """
    Function that creates the mebedding for all the documents and save it in the vectorstore "faiss_index".
        
    The argumants are:

    query:
        Question we need the assistant answers

    It uses OpenAI model, so it will need to define the OpenAI_key.
    
    """

    model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.2, api_key=os.getenv("OPENAI_API_KEY"))
    
    db = FAISS.load_local("faiss_index", model, allow_dangerous_deserialization=True)

    # Retrieval setup
    retriever_pdf = db.as_retriever(
        search_type="similarity",
        search_kwargs={
            "k": 10,
        })

    init = datetime.now()

    # Preparing the LLM chain with the prompt template
    #prompt_resum = 
    """The following documents contain information about accounting and finances:
    ----------
    {docs}
    ----------
    Your objective as assistant is to concisely answer the input question with the information from the documents.
    The answer will be in the same language of the question.
    ----------
    Question: {query}
    Answer: 
    """

    prompt_resum ="""You are an assistant for question-answering tasks. Use the following pieces of retrieved context to answer the question. If you don't know the answer, just say that you don't know. Use three sentences maximum and keep the answer concise.

    Question: {query} 

    Context: {docs} 

    Answer:
    """

    prompt = PromptTemplate.from_template(prompt_resum)

    llm_chain = LLMChain(llm=model, prompt=prompt)
    chain = StuffDocumentsChain(
        llm_chain=llm_chain,
        document_variable_name="docs"
    )

    #resum = retriever_pdf.invoke("what did he say about ketanji brown jackson")

    relevant_docs = retriever_pdf.get_relevant_documents(query=query)
    # Using invoke with correct parameters
    resum = chain.invoke({"docs": relevant_docs, "query": query}).get("output_text")
    pages = [doc.metadata.get('page')+1 for doc in relevant_docs]



    # Calculate elapsed time for LLM calls
    fin = datetime.now()
    delta = round((fin-init).total_seconds()/60, 2)
    print(f"Tiempo transcurrido durante las llamadas al LLM: {delta} min")

    return resum, pages


In [59]:
from langchain_core.messages import HumanMessage
# Test the function
query = "¿Puedes indicarme el coste de recibir una transferencia de 5.000€ en Yenes?"
summarize(query)

`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


AttributeError: 'str' object has no attribute 'content'

In [61]:
from langchain_openai import ChatOpenAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough


model = ChatOpenAI(model="gpt-3.5-turbo-0125", temperature=0.2, api_key=os.getenv("OPENAI_API_KEY"))

db = FAISS.load_local("faiss_index", model, allow_dangerous_deserialization=True)

# Retrieval setup
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={
        "k": 10,
    })

template = """Answer the question based only on the following context:

{docs}

Question: {question}
"""
prompt = ChatPromptTemplate.from_template(template)

def format_docs(docs):
    return "\n\n".join([d.page_content for d in docs])

def print_chain_input(chain_object):
    print(chain_object)
    return chain_object

chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | print_chain_input
    | prompt
    | model
    | StrOutputParser()
)

chain.invoke("¿Puedes indicarme el coste de recibir una transferencia de 5.000€ en Yenes?")


`embedding_function` is expected to be an Embeddings object, support for passing in a function will soon be removed.


TypeError: unsupported operand type(s) for |: 'dict' and 'function'

In [41]:
query = "Qué moneda tiene el target-2?"
file_path = r"docu_ejemplo\Tarifas transferencias Extranjero.pdf"
summarize(file_path, query)

Embeddings ya existentes del documento cargados
Numero de documentos indexados en FAISS: 4


ValueError: Missing some input keys: {'input_documents'}