## docs

In [None]:

https://python.langchain.com/docs/integrations/providers/chroma/#retriever
https://python.langchain.com/docs/integrations/text_embedding/
https://python.langchain.com/docs/integrations/vectorstores/
https://python.langchain.com/docs/integrations/document_loaders/
https://python.langchain.com/docs/integrations/chat/

https://python.langchain.com/docs/integrations/chat/ollama/
https://python.langchain.com/docs/integrations/llms/ollama/
https://python.langchain.com/docs/concepts/chat_models/
https://python.langchain.com/docs/concepts/structured_outputs/
https://python.langchain.com/docs/concepts/tokens/
https://python.langchain.com/docs/integrations/document_loaders/
https://python.langchain.com/docs/integrations/document_loaders/microsoft_word/
https://python.langchain.com/docs/integrations/document_loaders/image_captions/
https://python.langchain.com/docs/integrations/document_loaders/image/
https://python.langchain.com/docs/integrations/document_loaders/microsoft_excel/
https://python.langchain.com/docs/integrations/document_loaders/recursive_url/
https://python.langchain.com/docs/integrations/memory/

https://python.langchain.com/docs/additional_resources/youtube/#videos-sorted-by-views

## installs

In [None]:
%pip install -qU langchain-ollama pypdf


## imports


In [19]:
from langchain.vectorstores import Chroma
from langchain_ollama import OllamaEmbeddings
from langchain_core.prompts import ChatPromptTemplate
from langchain_ollama.llms import OllamaLLM
from langchain.document_loaders import PyPDFLoader
import asyncio
import os



## tools

In [None]:
import uuid

def generate_random_id():
    """Return a newly generated random (version 4) UUID rendered as a string."""
    fresh_uuid = uuid.uuid4()
    return str(fresh_uuid)


# Example: create one id and show it as the cell output.
random_id = generate_random_id()
random_id

In [None]:
def rename_and_standardize_file_path(file_path):
    """Return ``file_path`` with every space character replaced by an underscore.

    Only the string is transformed; nothing on disk is renamed.
    """
    # Splitting on a literal space and re-joining with "_" swaps each space
    # one-for-one, so consecutive spaces become consecutive underscores.
    return "_".join(file_path.split(" "))


# Example: standardize a sample name and show the result as the cell output.
file_path = "file name with spaces.pdf"
path = rename_and_standardize_file_path(file_path)
path

## indexing/embeddings

In [None]:

# Embedding model served by Ollama; reused later by the vector store and the query.
embeddings = OllamaEmbeddings(model="nomic-embed-text:latest")
# Load the pages asynchronously and attach metadata to each one.
# Page numbers are assigned here, by position, rather than read from the loader.
async def extract_pdf_pages(loader, file_path, doc_id):
    """Collect every page yielded by ``loader`` as a plain dict.

    Args:
        loader: Object exposing an async-iterable ``alazy_load()`` whose items
            have a ``page_content`` attribute.
        file_path: Value stored under ``metadata["source"]`` for every page.
        doc_id: Value stored under ``metadata["doc_id"]`` for every page.

    Returns:
        A list of ``{"page_content": ..., "metadata": {...}}`` dicts in the
        order the loader produced them; ``page_number`` starts at 1.
    """
    page_records = []

    # If alazy_load() raises for a given loader, loader.load() is the fallback.
    async for document in loader.alazy_load():
        page_records.append(
            {
                "page_content": document.page_content,
                "metadata": {
                    "source": file_path,
                    # 1-based position: one more than the pages gathered so far.
                    "page_number": len(page_records) + 1,
                    "doc_id": doc_id,
                },
            }
        )

    return page_records

# Location of the PDF to index, assembled from its folder and file name.
file_dir = "jupter_notebooks/"
file_name = "file.pdf"
file_path = os.path.join(file_dir, file_name)

# Fail fast with a clear message when the PDF is not where it is expected.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"file not found: {file_path}")
 
async def indexer():
    """Load the PDF at the module-level ``file_path`` and split it into parallel lists.

    Returns:
        A ``(texts, metadata)`` tuple: the page contents and the matching
        per-page metadata dicts, in the same order. Every page is tagged with
        an empty ``doc_id``.
    """
    doc_id = ""
    pdf_loader = PyPDFLoader(file_path)
    page_records = await extract_pdf_pages(pdf_loader, file_path, doc_id)

    texts = []
    metadata = []
    for record in page_records:
        texts.append(record["page_content"])
        metadata.append(record["metadata"])

    return texts, metadata

texts, metadata = await indexer()

print(texts)
print(metadata)



## retrieval/vector_store

In [None]:

# Persistent Chroma collection that uses the Ollama embedding model defined above.
vector_store = Chroma(
    collection_name="example_collection",
    embedding_function=embeddings,
    persist_directory="./chroma_langchain_db"
)

# Index the pages extracted earlier. Without this step the collection only
# contains whatever a previous session persisted, so a fresh run retrieves nothing.
# Ids are derived from source + page number, so re-running this cell overwrites
# the same entries instead of piling up duplicates (add_texts upserts by id).
# Note: each run re-embeds the pages, which costs one embedding call per page.
indexable_pages = [
    (text, meta, f"{meta['source']}#page={meta['page_number']}")
    for text, meta in zip(texts, metadata)
    if text.strip()  # blank pages (e.g. scanned images) carry nothing to retrieve
]
if indexable_pages:  # Chroma rejects an empty batch
    vector_store.add_texts(
        texts=[text for text, _, _ in indexable_pages],
        metadatas=[meta for _, meta, _ in indexable_pages],
        ids=[page_id for _, _, page_id in indexable_pages],
    )

# Must match the doc_id stored in the page metadata at indexing time.
doc_id = ''
question = f"tell me about document id {doc_id}"

# Embed the question and fetch the 3 nearest pages restricted to this doc_id.
relevant_docs = vector_store.similarity_search_by_vector(
    embedding=embeddings.embed_query(question), k=3, filter={"doc_id": doc_id}
)
for doc in relevant_docs:
    print(f"* {doc.page_content} [{doc.metadata}]")



## llm/ai


In [None]:
# Ollama model used for answering, with its sampling temperature.
model = "deepseek-coder-v2:16b"
llm = OllamaLLM(
    model=model,
    temperature=0.7,
)

# One entry per retrieved document: a "Source/Page" header line followed by the
# page text. Entries are joined with a newline to form the prompt context.
retrieved_context = "\n".join(
    f"Source: {doc.metadata.get('source', 'Unknown')}, Page: {doc.metadata.get('page_number', 'Unknown')}\n{doc.page_content}"
    for doc in relevant_docs
)

# Create the robust prompt for the LLM.
# This is an f-string: the current values of `question` and `retrieved_context`
# are interpolated when this cell runs, so re-run it after changing either one.
prompt = f"""
You are a precise and skilled assistant. Your goal is to answer the user's question strictly based on the provided context.
If the answer is not found in the context, respond with: "The answer was not found in the provided context."

Below is the context extracted from documents related to the query. For each part of your response, clearly indicate the source and the page number where the information was found.

Your task:
- Understand the user's question and rewrite the question's intent.
- Review the context and answer the question based on the provided information.
- Respond concisely, using only the context.
- For each piece of information provided, mention the source and the page number.
- If the answer cannot be found in the context, state this explicitly.
- If the answer is not found, analyze again and attempt to respond.

Question: {question}  
Context: {retrieved_context}  

"""

# Get the LLM response. Use the Runnable `invoke` API: calling the model object
# directly (`llm(prompt)`) is deprecated in LangChain and absent from newer releases.
response = llm.invoke(prompt)

print("LLM response:")
print(response)
