## Import RAG config

In [1]:
from importlib import resources as impresources
import finsight_rag.config as config
from finsight_rag.utils import load_yaml

# Locate rag_config.yaml inside the finsight_rag.config package itself, so the
# lookup does not depend on the notebook's working directory.
rag_config_path = impresources.files(config).joinpath("rag_config.yaml")
rag_config = load_yaml(rag_config_path)

# Show which settings the YAML file exposes.
print("Available RAG configurations:", list(rag_config.keys()), sep="\n")

Available RAG configurations:
['dataset_path', 'vector_store_path', 'chunk_size', 'chunk_overlap', 'batch_rows', 'embedding_model', 'gen_model', 'temperature', 'max_new_tokens', 'top_k_chunks']


In [2]:
# Settings reused by the rest of the notebook: where the PDF corpus lives,
# where the vector store is persisted, and which embedding model to load.
dataset_rag_path = rag_config["dataset_path"]
vector_store_path = rag_config["vector_store_path"]
embedding_model = rag_config["embedding_model"]

# Echo the resolved values, one per line.
print(
    f"RAG dataset path: {dataset_rag_path}",
    f"RAG vector store path: {vector_store_path}",
    f"RAG embedding model: {embedding_model}",
    sep="\n",
)

RAG dataset path: C:/Users/User/projects/FinSight-RAG/data/rag/annual-reports-2024
RAG vector store path: C:/Users/User/projects/FinSight-RAG/data/rag/annual-reports-2024-vector-store
RAG embedding model: sentence-transformers/all-MiniLM-L6-v2


## Accessing RAG vector store

In [3]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Build the embedding function from the configured model name; it has to be the
# same model that was used when the store was created.
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

# Re-open the persisted collection from disk.
vector_store = Chroma(
    persist_directory=vector_store_path,  # directory the store was saved to
    embedding_function=embeddings,
    collection_name="brazilian_annual_reports",  # collection name used at creation
)

In [8]:
# Sanity check: display the name of the collection that was actually opened.
# NOTE(review): this reads an underscore-prefixed (private) attribute of the
# Chroma wrapper, so it may break on a library upgrade.
vector_store._chroma_collection.name

'brazilian_annual_reports'

In [5]:
# Count the stored vectors from their ids alone. With include=[] the store
# returns only the ids (which are always present) instead of also loading every
# document text and metadata dict just to take a length.
num_vectors = len(vector_store.get(include=[])["ids"])
print(f"Number of vectors in the store: {num_vectors}")

Number of vectors in the store: 2161


In [6]:
# Smoke-test retrieval on the existing store: ask for the k=5 chunks most
# similar to a question about a company that is already indexed.
vector_store.similarity_search("What was BTG's revenue?", k=5)

[Document(id='d477d2ee-aeb1-441f-bba3-a097c5edf22a', metadata={'trapped': '/False', 'moddate': '2025-07-22T13:54:00-03:00', 'page': 64, 'total_pages': 183, 'chunk_index': 1, 'creator': 'Adobe InDesign 20.4 (Macintosh)', 'file_name': 'BTG-Annual-Report-2024.pdf', 'creationdate': '2025-07-22T13:53:26-03:00', 'company': 'BTG', 'page_label': '65', 'year': '2024', 'source': 'C:/Users/lucas/projects/FinSight-RAG/data/rag/annual-reports-2024/BTG-Annual-Report-2024.pdf', 'producer': 'Adobe PDF Library 17.0'}, page_content='161%\n4T 2024\nNon-liquid assets\n(BRLbi)\nLiabilities\n3,3\n37,5\n94\n114,5\n262,9\n68,5\n66,131,1\n29,7\n163,3\n172,4\n97,3\n12,7\n79,9\n60,4\nFinancial performance\nTotal revenue Adjusted net income\nLinear (Adjusted Net income)Linear (total revenues)\n9,303.5\n13,900.6\n17,247.1\n+ 132%\n+ 157%\n21,558.9\n25,054.3\n12,321.510,419.1\n8,306.5\n6,493.0\n4,049.9\n2020 2021 2022 2023 2024\nNet profit and revenues development \nBRL thousand\n65Annual Report 2024\nBTG Pactual')

In [7]:
# Drop this notebook-level handle to the store.
# NOTE(review): presumably done so the ingestor created below is the only
# object holding the persisted store open — confirm.
del vector_store

## Add a new PDF to the vector store

In [8]:
from finsight_rag.ingest.ingest_annual_reports_pdfs import IngestConfig, AnnualReportsIngestor

# Ingestion settings: paths, collection and embedding model are the same values
# used to open the store above; the chunking/batching values are forwarded
# unchanged from the YAML config.
anual_report_ingester_cfg = IngestConfig(
    corpus_path=dataset_rag_path,
    chroma_dir=vector_store_path,
    collection="brazilian_annual_reports",
    embedding_model=embedding_model,
    **{key: rag_config[key] for key in ("chunk_size", "chunk_overlap", "batch_rows")},
)

# Ingestor instance used below to add documents and to query the store.
annual_reports_ingestor = AnnualReportsIngestor(anual_report_ingester_cfg)

In [9]:
# File to ingest; it is expected to sit in the configured corpus directory.
new_pdf_filename = "Airbus-Annual-Report-2024.pdf"
# The configured path uses forward slashes, so join with "/" explicitly.
new_pdf_path = f"{dataset_rag_path}/{new_pdf_filename}"
print(f"Adding new PDF to vector store: {new_pdf_path}")

Adding new PDF to vector store: C:/Users/lucas/projects/FinSight-RAG/data/rag/annual-reports-2024/Airbus-Annual-Report-2024.pdf


In [10]:
from langchain_core.documents import Document
from langchain_community.document_loaders import PyPDFLoader

from finsight_rag.ingest.utils import extract_company_from_filename, extract_year_from_filename

# Derive the company and year tags from the file name via the ingest helpers.
company = extract_company_from_filename(new_pdf_filename)
year = extract_year_from_filename(new_pdf_filename)

print(f"Loading PDF of {company}: {new_pdf_path}")
docs = PyPDFLoader(new_pdf_path).load()
loader = PyPDFLoader(new_pdf_path)

# Tag every loaded document with the company / file name / year metadata
# fields, in that key order.
for d in docs:
    d.metadata.update(company=company, file_name=new_pdf_filename, year=year)

print(f"Number of pages in the new PDF: {len(docs)}")

Loading PDF of Airbus: C:/Users/lucas/projects/FinSight-RAG/data/rag/annual-reports-2024/Airbus-Annual-Report-2024.pdf
Number of pages in the new PDF: 24


In [12]:
# Hand the tagged documents to the ingestor so they are added to the vector
# store. NOTE(review): chunking and embedding presumably happen inside this
# call using the IngestConfig values — confirm in AnnualReportsIngestor.
annual_reports_ingestor.add_docs_to_vector_store(docs)

In [14]:
# Check that the newly added report is retrievable: query through the
# ingestor's own vector_store handle, asking for the k=5 closest chunks.
annual_reports_ingestor.vector_store.similarity_search("What was Airbus's revenue?", k=5)

[Document(id='4fe06a77-8c63-4619-ab46-862b8587a1ac', metadata={'trapped': '/False', 'year': '2024', 'page_label': 'A', 'moddate': '2025-03-21T09:14:22+00:00', 'company': 'Airbus', 'creationdate': '2025-03-21T09:10:29+00:00', 'file_name': 'Airbus-Annual-Report-2024.pdf', 'total_pages': 24, 'page': 0, 'creator': 'Adobe InDesign 19.5 (Macintosh)', 'producer': 'Adobe PDF Library 17.0', 'source': 'C:/Users/lucas/projects/FinSight-RAG/data/rag/annual-reports-2024/Airbus-Annual-Report-2024.pdf'}, page_content='Airbus Annual Report\nOverview 2024\nInnovation  \n in action'),
 Document(id='13db359e-e338-438e-aa19-45c240dc65c4', metadata={'total_pages': 24, 'creator': 'Adobe InDesign 19.5 (Macintosh)', 'creationdate': '2025-03-21T09:10:29+00:00', 'trapped': '/False', 'page': 4, 'file_name': 'Airbus-Annual-Report-2024.pdf', 'producer': 'Adobe PDF Library 17.0', 'page_label': '3', 'company': 'Airbus', 'source': 'C:/Users/lucas/projects/FinSight-RAG/data/rag/annual-reports-2024/Airbus-Annual-Report