In [1]:
!pip install -q -r requirements.txt

In [2]:
# Importing packages

# Manipulating the operating system
import os

# Function to load the pre-trained embeddings model
from langchain_huggingface import HuggingFaceEmbeddings  

# Vector database
from langchain_chroma import Chroma  

# Document loaders
from langchain_community.document_loaders import DirectoryLoader, PyMuPDFLoader

# Text chunk splitter
from langchain_text_splitters import RecursiveCharacterTextSplitter

In [3]:
# Path to documents; each subject has its own sub-folder of PDF files.
# Set the RAG_DOCUMENTS_PATH environment variable to override the default location.
documents_path = os.environ.get(
    "RAG_DOCUMENTS_PATH",
    "/Users/joaovitorzimmermann/Projetos/rag_system/documents",
)

In [4]:
# Directory where vector databases will be stored (one sub-folder per subject).
# Set the RAG_VECTORDB_PATH environment variable to override the default location.
vectordb_path = os.environ.get(
    "RAG_VECTORDB_PATH",
    "/Users/joaovitorzimmermann/Projetos/rag_system/vector_dbs",
)

In [5]:
# Hugging Face name of the embeddings model used to build the vector databases
emb_model_name = "BAAI/bge-m3"

In [6]:
# Function to create the vector database from processed documents
def create_vector_db(
    subject: str,
    chunk_size: int = 512,
    chunk_overlap: int = 256,
    device: str = "cpu",
) -> None:
    """Build and persist a Chroma vector database for one subject folder.

    Loads the PDF files matching ``*.pdf`` in ``{documents_path}/{subject}``,
    splits the loaded documents into overlapping chunks, embeds the chunks with
    the Hugging Face model named by the module-level ``emb_model_name`` and
    stores the result under ``{vectordb_path}/{subject}``.

    Args:
        subject: Name of the sub-folder of ``documents_path`` to read from; the
            same name is used for the output sub-folder of ``vectordb_path``.
        chunk_size: Maximum size of each chunk handed to the text splitter.
        chunk_overlap: Overlap between consecutive chunks, kept to maintain
            context across chunk boundaries.
        device: Device passed to the embeddings model (``"cpu"`` by default).

    Returns:
        None. Progress and outcome are reported through ``print``; when no
        documents or no chunks are available the function returns early
        without creating a database.
    """

    # Message informing the start of the process
    print("\nGenerating Embeddings. Please wait...")

    # Loader for the PDF files stored in the subject's source directory
    loader = DirectoryLoader(
        f"{documents_path}/{subject}",      # Directory where source PDF files are stored
        glob = "*.pdf",                     # Pattern of files to load
        loader_cls = PyMuPDFLoader,         # PDF loading class
    )

    # Loads documents from the directory
    documents = loader.load()

    # Nothing was loaded: report it and stop
    if not documents:
        print("No documents found.")
        return

    # Text splitter used to segment documents into smaller, overlapping parts
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size = chunk_size,        # Maximum size of each chunk
        chunk_overlap = chunk_overlap   # Overlap between chunks to maintain context
    )

    # Divides documents into smaller chunks
    chunks = text_splitter.split_documents(documents)

    # Documents can load without yielding any chunk (e.g. no extractable text).
    # Stop here instead of handing an empty list to the vector store and
    # reporting success for a database that holds nothing.
    if not chunks:
        print("No text chunks could be extracted from the documents.")
        return

    # Instantiates the embeddings model configured in emb_model_name
    embedding_model = HuggingFaceEmbeddings(
        model_name = emb_model_name,                       # Chosen model
        model_kwargs = {'device': device},                 # Execution device
        encode_kwargs = {'normalize_embeddings': True}     # Normalized embeddings for similarity calculation
    )

    # NOTE(review): if the persist directory already holds a database, re-running
    # this function presumably adds the chunks again rather than replacing them;
    # confirm, and clear the directory first if duplicates are not wanted.

    # Creation of the vector database from processed documents
    Chroma.from_documents(
        chunks,                                                # Chunks generated from documents
        embedding_model,                                       # Embeddings model used
        persist_directory = f"{vectordb_path}/{subject}"       # Directory where the vector database will be stored
    )

    # Message informing that the vector database was created successfully
    print(f"\nRAG Vector Database for {subject} Created Successfully.\n")

In [None]:
# Build the vector database for the "amd" documents sub-folder
create_vector_db("amd")

In [None]:
# Build the vector database for the "nvidia" documents sub-folder
create_vector_db("nvidia")

In [7]:
# Build the vector database for the "intel" documents sub-folder
create_vector_db("intel")


Generating Embeddings. Please wait...

RAG Vector Database for intel Created Successfully.

