## Import RAG config

In [3]:
from finsight_rag.ingest.utils import extract_company_from_filename, extract_year_from_filename

def test_extract_metadata_from_filename():
    """Check that the company and year are recovered from report filenames.

    Each case pairs a report filename with the company name and the year
    string that the two filename helpers are expected to return for it.
    """
    cases = (
        ("Embraer-AnnualReport-2024.pdf", "Embraer", "2024"),
        ("Petrobas-Management-Report-2024.pdf", "Petrobas", "2024"),
    )
    for report_name, expected_company, expected_year in cases:
        parsed_company = extract_company_from_filename(report_name)
        parsed_year = extract_year_from_filename(report_name)
        assert parsed_year == expected_year
        assert parsed_company == expected_company
    

In [1]:
from importlib import resources as impresources
import finsight_rag.config as config
from finsight_rag.utils import load_yaml

# Resolve rag_config.yaml inside the finsight_rag.config package and load it
# with the project's load_yaml helper.
rag_config_path = impresources.files(config).joinpath("rag_config.yaml")
rag_config = load_yaml(rag_config_path)

# Show the top-level keys so it is clear which settings are available.
print("Available RAG configurations:", list(rag_config.keys()), sep="\n")

Available RAG configurations:
['dataset_path', 'vector_store_path', 'chunk_size', 'chunk_overlap', 'batch_rows', 'embedding_model', 'gen_model', 'temperature', 'max_new_tokens', 'top_k_chunks']


In [2]:
# Pull the dataset path, vector store path and embedding model name out of
# the loaded configuration in one pass.
dataset_rag_path, vector_store_path, embedding_model = (
    rag_config[key]
    for key in ("dataset_path", "vector_store_path", "embedding_model")
)

# Echo the selected settings, one per line.
print(
    f"RAG dataset path: {dataset_rag_path}",
    f"RAG vector store path: {vector_store_path}",
    f"RAG embedding model: {embedding_model}",
    sep="\n",
)

RAG dataset path: C:/Users/User/projects/FinSight-RAG/data/rag/annual-reports-2024
RAG vector store path: C:/Users/User/projects/FinSight-RAG/data/rag/annual-reports-2024-vector-store
RAG embedding model: sentence-transformers/all-MiniLM-L6-v2


## Accessing RAG vector store

In [3]:
from langchain_chroma import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

# Build the embedding function from the model named in the config; it must be
# the same one that was used when the store was created.
embeddings = HuggingFaceEmbeddings(model_name=embedding_model)

# Open the persisted Chroma database: same directory and same collection name
# as at creation time.
vector_store = Chroma(
    embedding_function=embeddings,
    persist_directory=vector_store_path,
    collection_name="brazilian_annual_reports",
)

In [5]:
# Exploratory introspection: list the attribute names available on the loaded vector store.
dir(vector_store)

['_Chroma__ensure_collection',
 '_Chroma__query_collection',
 '_LANGCHAIN_DEFAULT_COLLECTION_NAME',
 '__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__slots__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_asimilarity_search_with_relevance_scores',
 '_chroma_collection',
 '_client',
 '_collection',
 '_collection_configuration',
 '_collection_metadata',
 '_collection_name',
 '_cosine_relevance_score_fn',
 '_embedding_function',
 '_euclidean_relevance_score_fn',
 '_get_retriever_tags',
 '_max_inner_product_relevance_score_fn',
 '_select_relevance_score_fn',
 '_similarity_search_with_relevance_scores',
 'aadd_documents',
 'aadd_texts',
 'add_documents',
 '

In [4]:
# Fetch stored records from the collection; the result is a dict that starts with an 'ids' list.
# NOTE(review): called with no arguments this looks like it returns every stored record,
# which makes for a very large cell output — consider limiting it; confirm against the langchain_chroma docs.
vector_store.get()

{'ids': ['4c5c4c46-036a-41c5-a44e-b845237ae244',
  'effa14aa-2844-4a05-a6df-a7bd603420ef',
  'ec434dcf-c713-4704-bbc2-7ebd5c487ebc',
  '756408cb-7d83-4d33-bb74-01ac2969260a',
  '5c6d1631-3569-4f14-9f74-8f8b320c5448',
  '6b14288a-ecf1-43cc-bca2-a3c15f4375dd',
  'ece2ed48-0aae-4c0a-b0a4-b3d519c19593',
  '015ab9f2-f86d-4eeb-9975-099567fa8229',
  'c22dc125-78f7-45d2-9b37-ece12b0fb980',
  'cfe61ac9-8b4c-4102-ad85-84c4dcfb44e2',
  'f6aa609a-205e-437a-b225-057abb0a1f33',
  '5f089973-c1ee-41e9-9c8d-aaa7ffe194ad',
  '70367220-4b3b-4471-852c-a278979e1058',
  'd9b32567-0836-4e2f-80f1-171051b46e0e',
  'c42a35b9-7af1-4a97-bfe1-b94e40eaac17',
  '927ccbb6-c8b6-46c9-82d4-f853e53ad5d0',
  '1861f5da-1f1d-45a1-8aa4-ec24f0131235',
  'adb64488-ab05-4153-9005-fa8ecb1f58ff',
  'db6a57de-cc21-42d4-99ae-318b330fccd7',
  '58141f86-5314-4905-b522-93e1109f679a',
  'd170c329-9104-493f-a9e7-9816cac63cb6',
  'f8586039-a833-40d5-a419-b30df4d2509e',
  '67753eff-853f-4177-84fd-6682ba9c18cf',
  '9f8cf5f2-3f1f-4767-909f-

In [5]:
# Sanity-check retrieval: ask for the k=5 closest chunks to a sample question.
# The results are Document objects carrying metadata (company, year, page, ...) and page_content.
vector_store.similarity_search("What was Vale's revenue?", k=5)

[Document(id='f70bbdaa-5a87-44b2-9a57-d7cea72fd7e4', metadata={'page': 53, 'creationdate': '2025-03-17T16:55:35+00:00', 'trapped': '/False', 'creator': 'Adobe InDesign 20.1 (Macintosh)', 'year': '2024', 'moddate': '2025-03-17T16:55:52+00:00', 'producer': 'Adobe PDF Library 17.0', 'company': 'Vale', 'total_pages': 60, 'source': 'C:/Users/User/projects/FinSight-RAG/data/rag/annual-reports-2024/Vale-Management-Report-2024.pdf', 'page_label': '54', 'file_name': 'Vale-Management-Report-2024.pdf', 'chunk_index': 4}, page_content='Payments of leasing (1,108) (1,159)\nDividends and interest on capital paid to Vale shareholders (20,662) (27,759)\nVALE\n2024 Management Report\n54'),
 Document(id='71b43d28-8f92-44b1-b314-42658ac8ebb3', metadata={'year': '2024', 'total_pages': 60, 'chunk_index': 0, 'file_name': 'Vale-Management-Report-2024.pdf', 'page': 46, 'creationdate': '2025-03-17T16:55:35+00:00', 'trapped': '/False', 'producer': 'Adobe PDF Library 17.0', 'source': 'C:/Users/User/projects/FinS