In [None]:
#import argparse
from langchain_chroma import Chroma
from langchain.prompts import ChatPromptTemplate
from langchain_community.llms.ollama import Ollama
from langchain_community.document_loaders.pdf import PyPDFDirectoryLoader
#from get_embedding_function import get_embedding_function
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain.schema.document import Document


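- **argparse** (import currently commented out): Facilitates the creation of command-line commands and allows the code to receive parameters directly via CLI. It is not used in this notebook, where the query text is hard-coded in `main()`.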

- **Chroma**: A vector database for storing and retrieving text embeddings. Utilizes the persistent directory `CHROMA_PATH`.

- **ChatPromptTemplate**: Manages prompts for language models.

- **Ollama**: Interface for using the Ollama language model, which responds to questions based on prompts.

- **PyPDFDirectoryLoader**: Loads PDF documents from a directory.

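- **get_embedding_function**: A custom function that creates text embeddings. Its import is commented out here; the function is defined in a later cell of this notebook, which must be run before any cell that calls it.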

- **RecursiveCharacterTextSplitter**: Splits documents into smaller chunks while maintaining overlap between them.

- **Document**: Class used to organize and manipulate chunks. 

In [None]:
# Directory where the Chroma vector store is persisted.
CHROMA_PATH = "chroma_directory"
# Directory scanned for the PDF files to index.
DATA_PATH = "pdfs"

# Prompt sent to the language model; the {context} and {question}
# placeholders are filled in when the prompt is formatted.
PROMPT_TEMPLATE = (
    "\n"
    "Answer the question based only on the following context:\n"
    "\n"
    "{context}\n"
    "\n"
    "---\n"
    "\n"
    "Answer the question based on the above context: {question}\n"
)

- **CHROMA_PATH**: Directory where the Chroma database will be stored.

- **DATA_PATH**: Directory from which PDFs will be loaded.

- **PROMPT_TEMPLATE**: A prompt template used to request responses from the model, providing context and a question. 

In [None]:
def load_and_split_documents(data_path=None, chunk_size=2000, chunk_overlap=800):
    """Load every PDF found in a directory and split the content into chunks.

    Args:
        data_path: Directory to scan for PDF files. When omitted, the
            notebook-level ``DATA_PATH`` constant is used.
        chunk_size: Maximum chunk length, measured with ``len`` (characters).
        chunk_overlap: Number of characters shared by consecutive chunks.

    Returns:
        The list of chunk documents produced by the splitter, or an empty
        list when the loader returns no documents.
    """
    if data_path is None:
        data_path = DATA_PATH

    document_loader = PyPDFDirectoryLoader(data_path)
    print("ETAPA 01 " + str(document_loader))

    # Load the documents from the directory.
    documents = document_loader.load()

    if not documents:
        print("Nenhum arquivo PDF encontrado no diretório especificado.")
        return []

    # The loader is expected to return one document per PDF page (the later
    # ID code reads a "page" key), so several documents can share a "source".
    # De-duplicate, keeping first-seen order, so each file is listed once and
    # the file count is not inflated by the page count.
    sources = list(dict.fromkeys(doc.metadata.get("source") for doc in documents))
    print(f"ETAPA 02: {len(sources)} arquivos encontrados ({len(documents)} páginas):")
    for source in sources:
        print(f" - Arquivo: {source}")

    # Split the documents into overlapping chunks.
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
    )
    chunks = text_splitter.split_documents(documents)

    # Report the count only; printing the list itself would dump the full
    # text of every chunk into the cell output.
    print(f"ETAPA 03: {len(chunks)} chunks gerados")

    return chunks

# Verification call. The result is bound to a name so the notebook does not
# render the entire chunk list as the cell's output value.
pdf_chunks = load_and_split_documents()


- **Objective**: Loads all PDF documents from the specified directory and splits them into chunks.

- **document_loader**: Initializes the PDF loader.

- **documents**: Loads all documents from the `DATA_PATH` directory.

- **text_splitter**: Configures the splitter to divide documents into chunks of up to 2000 characters with an overlap of 800 characters.

- **chunks**: Divided chunks are returned.

In [None]:
def calculate_chunk_ids(chunks):
    """Assign an ID of the form ``source:page:index`` to every chunk.

    The index counts chunks that follow each other on the same source/page
    pair and restarts at 0 whenever that pair changes. The ID is written to
    ``chunk.metadata["id"]`` in place, and each step is traced with prints.

    Args:
        chunks: Iterable of objects exposing a ``metadata`` dict.

    Returns:
        The same ``chunks`` object that was passed in.
    """
    previous_page_key = None
    index_on_page = 0

    for position, chunk in enumerate(chunks, start=1):
        # Read the chunk's origin from its metadata.
        meta = chunk.metadata
        source = meta.get("source")
        page = meta.get("page")
        page_key = f"{source}:{page}"

        # Debug: show the current state.
        print(f"Iteração {position}:")
        print(f"  Fonte: {source}")
        print(f"  Página: {page}")
        print(f"  ID da Página Atual: {page_key}")
        print(f"  Último ID da Página: {previous_page_key}")

        # Same page as the previous chunk -> next index; otherwise restart.
        index_on_page = index_on_page + 1 if page_key == previous_page_key else 0

        # Build the chunk ID and store it on the chunk.
        chunk_id = f"{page_key}:{index_on_page}"
        meta["id"] = chunk_id

        # Debug: show the computed ID and the updated metadata.
        print(f"  ID do Chunk Calculado: {chunk_id}")
        print(f"  Chunk Metadata Atualizada: {chunk.metadata}")

        # Remember this page for the next iteration.
        previous_page_key = page_key

    return chunks

# Example call used to check the function.
# A small hand-written list of chunks is passed in; note that this same
# `chunks` variable is what the later `add_to_chroma(chunks)` call receives.
chunks = [
    Document(
        page_content="Conteúdo da página 1",
        metadata=dict(source="document1.pdf", page=1),
    ),
    Document(
        page_content="Conteúdo da página 1 - continuação",
        metadata=dict(source="document1.pdf", page=1),
    ),
    Document(
        page_content="Conteúdo da página 2",
        metadata=dict(source="document1.pdf", page=2),
    ),
    Document(
        page_content="Conteúdo da página 1 do documento 2",
        metadata=dict(source="document2.pdf", page=1),
    ),
]

# Run the function on the sample chunks to check the assigned IDs.
calculate_chunk_ids(chunks)


- **Objective**: Calculates and assigns unique IDs to each document chunk, based on the source (document name) and page.

- **last_page_id**: Stores the ID of the last processed page.

- **current_chunk_index**: Index of the current chunk on the same page.

- **chunk_id**: Unique ID generated for each chunk, combining the document name, page number, and chunk index.

In [None]:
def add_to_chroma(chunks):
    """Add chunks to the persistent Chroma store, skipping IDs already stored.

    Each chunk first receives an ID from ``calculate_chunk_ids``; only chunks
    whose ID is not yet in the collection are embedded and inserted.

    Args:
        chunks: Chunk documents to index. Their ``metadata`` is updated in
            place with the computed ``"id"``.
    """
    embedding_function = get_embedding_function()
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)

    chunks_with_ids = calculate_chunk_ids(chunks)

    # Only the IDs are read from the result, so no extra fields are requested.
    existing_ids = set(db.get(include=[])["ids"])

    new_chunks = [chunk for chunk in chunks_with_ids if chunk.metadata["id"] not in existing_ids]

    if new_chunks:
        db.add_documents(new_chunks, ids=[chunk.metadata["id"] for chunk in new_chunks])
        # `Chroma` from `langchain_chroma` has no `persist()` method (writes to
        # a `persist_directory` are saved automatically), so an unconditional
        # call raises AttributeError. Call it only where the class offers it.
        persist = getattr(db, "persist", None)
        if callable(persist):
            persist()
        print(f"Added {len(new_chunks)} new documents to Chroma.")
    else:
        print("No new documents to add.")

    # Extra check: report how many documents the collection now holds.
    existing_items = db.get(include=[])
    print(f"Existing documents in Chroma: {len(existing_items['ids'])}")

# NOTE(review): on a fresh kernel this call runs before the later cell that
# defines `get_embedding_function`, so that cell has to be executed first.
# Also, `chunks` here is the hand-written sample list from the cell above,
# not the output of `load_and_split_documents()` - confirm this is intended.
add_to_chroma(chunks)


- **Objective**: Adds chunks to the Chroma database, ensuring that only new chunks are added.

- **embedding_function**: Retrieves the custom embedding function.

- **db**: Initializes the connection to the Chroma database using the embedding function.

- **chunks_with_ids**: Prepares chunks with calculated IDs.

- **existing_items**: Retrieves existing documents in Chroma.

- **new_chunks**: Filters out new chunks that are not already in Chroma.

- **db.add_documents**: Adds the new chunks to the database.

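- **db.persist**: Persists the changes to the Chroma database. Note that the `Chroma` class from `langchain_chroma` (the one imported in this notebook) does not provide `persist()`; when a `persist_directory` is set, changes are saved automatically.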

In [None]:

from langchain_community.embeddings.ollama import OllamaEmbeddings

def get_embedding_function():
    """Return a new ``OllamaEmbeddings`` instance for the ``nomic-embed-text`` model.

    Both ``add_to_chroma`` and ``query_rag`` obtain their embedding function
    through this helper.
    """
    return OllamaEmbeddings(model="nomic-embed-text")

- **Import**: Imports the `OllamaEmbeddings` class for using text embeddings.

- **Function get_embedding_function**: Creates and returns an instance of `OllamaEmbeddings` with the `nomic-embed-text` model.

- **Usage**: The function provides an object that can be used to generate vector representations (embeddings) for texts.

In [None]:
def query_rag(query_text, k=5, model_name="mistral", base_url="http://127.0.0.1:11434"):
    """Answer a question using context retrieved from the Chroma store.

    The query is matched against the store, the text of the retrieved chunks
    is merged into ``PROMPT_TEMPLATE``, and the resulting prompt is sent to
    an Ollama model. Progress is traced with prints.

    Args:
        query_text: The question to answer.
        k: Number of chunks to retrieve as context.
        model_name: Name of the Ollama model to invoke.
        base_url: URL of the Ollama server.

    Returns:
        The model's response text. The chunk IDs used as sources are printed
        but not returned.
    """
    print("Initializing embedding function...")  # Debug: initializing the embedding function
    embedding_function = get_embedding_function()
    print("Embedding function initialized.")  # Debug: embedding function ready

    print(f"Loading Chroma database from {CHROMA_PATH}...")  # Debug: loading the Chroma DB
    db = Chroma(persist_directory=CHROMA_PATH, embedding_function=embedding_function)
    print("Chroma database loaded.")  # Debug: Chroma DB loaded

    print(f"Performing similarity search for query: '{query_text}'...")  # Debug: running the similarity search
    results = db.similarity_search_with_score(query_text, k=k)
    print(f"Search completed. Number of results: {len(results)}")  # Debug: search finished

    if not results:
        # With nothing retrieved the prompt carries an empty context, so the
        # "answer only from the context" instruction has nothing to ground on.
        print("Warning: no documents were retrieved; the prompt will contain an empty context.")

    # Separate the chunk texts with a visible divider inside the prompt.
    context_text = "\n\n---\n\n".join(doc.page_content for doc, _score in results)
    print("Context extracted from results.")  # Debug: context extracted from the results

    prompt_template = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
    prompt = prompt_template.format(context=context_text, question=query_text)
    print(f"Prompt created: {prompt}")  # Debug: show the created prompt

    print("Invoking the model...")  # Debug: invoking the model
    # Use Ollama through the configured server (a local one by default).
    model = Ollama(model=model_name, base_url=base_url)
    response_text = model.invoke(prompt)
    print(f"Model response received: {response_text}")  # Debug: model response received

    # IDs are the ones stored in chunk metadata; None when a chunk has no "id".
    sources = [doc.metadata.get("id", None) for doc, _score in results]
    formatted_response = f"Response: {response_text}\nSources: {sources}"
    print(formatted_response)  # Show the formatted response
    return response_text



- **Initializes** the embedding function using `OllamaEmbeddings`.

- **Loads** the Chroma database.

- **Performs** a similarity search in the database using the query text.

- **Extracts** the context from the search results.

- **Creates** a prompt for the model based on the context and the question.

- **Invokes** the Ollama model to obtain a response.

- **Formats** and prints the response along with the sources of the related documents.

In [None]:

def main():
    """Send one hard-coded sample question through ``query_rag``."""
    question = (
        "Quais são algumas das preocupações éticas associadas ao uso de algoritmos "
        "de inteligência artificial, conforme descrito no texto?"
    )
    # Debug: show the query that was received.
    print(f"Query text received: {question}")
    query_rag(question)

- **Displays** the received query text for debugging.

- **Calls** the `query_rag` function with the query text to process the search and obtain a response.

In [None]:
# Run the sample query end to end: main() prints the query and passes it to query_rag.
main()