# Simple RAG

In [1]:
import sys
import os
from typing import List

# Path to the directory containing config.py (the project root).
# Defaults to the original checkout location; set the RAG_MASTERY_ROOT
# environment variable to run this notebook from another machine or folder
# without editing the cell.
config_path = os.environ.get(
    'RAG_MASTERY_ROOT',
    '/home/mauricio/Documents/Projects/RAG-Mastery',
)

# Add the directory to sys.path so that `config` becomes importable
if config_path not in sys.path:
    sys.path.append(config_path)

# Now you can import the API_KEY from config.py
from config import API_KEY

# Folder holding the documents to load: the data/ directory under the project root
path_to_docs = os.path.join(config_path, "data")

Import the `ChatMistralAI` class from the `langchain_mistralai` package. With this, we'll be able to use Mistral's chat models. In our case, the Mixtral 8x22B model (`open-mixtral-8x22b`) will be enough for our RAG system. We initialize the model with our API key to access Mistral's services.

In [2]:
from langchain_mistralai.chat_models import ChatMistralAI
def get_llm_model(self=None):
    """Create the Mistral chat model used by this notebook.

    Args:
        self: Optional object exposing an ``API_KEY`` attribute. When it is
            omitted (or ``None``), the module-level ``API_KEY`` imported from
            ``config`` is used instead, so the function can be called from a
            notebook cell simply as ``get_llm_model()``.

    Returns:
        A ``ChatMistralAI`` instance configured for the
        ``open-mixtral-8x22b`` model.
    """
    # This function lives at module level but kept a `self` parameter; without
    # a default it could not be called unless the caller supplied some object
    # carrying API_KEY. Fall back to the notebook-level key in that case.
    api_key = API_KEY if self is None else self.API_KEY
    return ChatMistralAI(
        model_name="open-mixtral-8x22b",
        mistral_api_key=api_key,
    )

In the next part, we use `UnstructuredLoader` because of the advantages it provides; later, we plan to build on it to extend the capabilities of our RAG system. Its main advantages are that it handles multiple file formats (.docx, .pdf, .txt) with a single loader and that it preserves the document structure. Retaining the original document layout and structure can lead to more accurate and context-aware text extraction.


In [6]:
from langchain_unstructured import UnstructuredLoader
from langchain.schema.document import Document

def load_documents(folder_path: str) -> List[Document]:
    """Load every .docx, .pdf and .txt file found directly inside a folder.

    Args:
        folder_path: Directory to scan. Only its direct entries are examined;
            sub-folders are not traversed.

    Returns:
        The documents returned by ``UnstructuredLoader`` for each matching
        file, concatenated in file-name order. An empty list is returned when
        the folder contains no matching file.
    """
    supported_suffixes = ('.docx', '.pdf', '.txt')
    documents = []
    # os.listdir() yields entries in arbitrary order; sorting keeps the
    # document order (and the file name reported below) stable across runs.
    for file in sorted(os.listdir(folder_path)):
        if file.endswith(supported_suffixes):
            loader = UnstructuredLoader(os.path.join(folder_path, file))
            documents.extend(loader.load())
            # Running total, reported once per file that was actually loaded.
            print("Document loaded lenght: ", len(documents))
    if not documents:
        # Nothing matched: say so instead of claiming success and then
        # failing with IndexError on documents[0].
        print(f"No .docx, .pdf or .txt documents found in {folder_path}")
        return documents
    print("Documents loaded successfully ✅")
    print(documents[0].metadata.get("filename"))
    return documents

In [10]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma


def split_documents(documents: list[Document], chunk_size: int = 1000, chunk_overlap: int = 200) -> List[Document]:
    """Split documents into chunks with a ``RecursiveCharacterTextSplitter``.

    Args:
        documents: Documents to split.
        chunk_size: Forwarded to the splitter as ``chunk_size``; sizes are
            measured with ``len``.
        chunk_overlap: Forwarded to the splitter as ``chunk_overlap``.

    Returns:
        The list of chunks produced by the splitter's ``split_documents``.

    Raises:
        Exception: Any error raised while building the splitter or splitting
            is printed and then re-raised unchanged.
    """
    splitter_options = dict(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        # Separator list handed to the splitter: blank line, newline, space,
        # and finally the empty string.
        separators=["\n\n", "\n", " ", ""],
    )
    try:
        splitter = RecursiveCharacterTextSplitter(**splitter_options)
        chunks = splitter.split_documents(documents)
        print("Split document successfully ✅")
        print("Documents split: ", len(chunks))
        return chunks
    except Exception as exc:
        # Report the failure in the cell output, then let it propagate.
        print(f"Error splitting documents: {exc}")
        raise


In [11]:
# Load every supported file found under path_to_docs, then split the loaded
# documents into chunks using split_documents' default chunk_size and
# chunk_overlap.
a = load_documents(path_to_docs)
b = split_documents(a)

Document loaded lenght:  1868
Documents loaded successfully ✅
2023-rulebook_final.pdf
Split document successfully ✅
Documents split:  1891
