In [4]:
from dotenv import load_dotenv
from langchain_openai import OpenAIEmbeddings
from langchain_groq import ChatGroq
from langchain_community.document_loaders import PyPDFDirectoryLoader
from langchain_community.vectorstores import Chroma
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.chains.combine_documents.stuff import create_stuff_documents_chain
from langchain.prompts import ChatPromptTemplate
from langchain.chains.retrieval import create_retrieval_chain
from langchain_core.output_parsers.string import StrOutputParser
import glob
import os

In [5]:
# LOAD THE API KEYS
# load_dotenv() copies the entries of a local .env file into os.environ
# (existing environment variables are not overridden), so the keys do not
# need to be re-assigned by hand afterwards.
load_dotenv()

# Fail early with a readable message instead of the opaque
# "TypeError: str expected, not NoneType" that os.environ[k] = None raises.
REQUIRED_KEYS = ("OPENAI_API_KEY", "GROQ_API_KEY", "LANGCHAIN_API_KEY")
missing_keys = [name for name in REQUIRED_KEYS if not os.getenv(name)]
if missing_keys:
    raise EnvironmentError(
        "Missing required environment variable(s): "
        + ", ".join(missing_keys)
        + ". Define them in your shell or in a .env file next to this notebook."
    )

# LangSmith tracing configuration for this notebook.
os.environ["LANGCHAIN_TRACING_V2"] = "true"
os.environ["LANGCHAIN_PROJECT"] = "LOIS DE FINANCE RAG"

In [6]:
# Read every PDF found under ./data and keep the loaded Document objects.
pdf_loader = PyPDFDirectoryLoader("./data")
documents = pdf_loader.load()

# Peek at the first loaded Document to check metadata and extracted text.
documents[0]

Document(metadata={'producer': 'Adobe PDF Library 10.0.1', 'creator': 'Adobe InDesign CC (Windows)', 'creationdate': '2019-12-24T15:19:17+01:00', 'moddate': '2019-12-24T15:19:24+01:00', 'trapped': '/False', 'source': 'data\\FLAW 2020.pdf', 'total_pages': 106, 'page': 0, 'page_label': '2395'}, page_content='{\nROYAUME DU MAROC\nBULLETIN OFFICIEL\nEDITION DE TRADUCTION OFFICIELLE\nEDITIONS\nTARIFS D’ABONNEMENT ABONNEMENT\nIMPRIMERIE OFFICIELLE\nRabat - Chellah\nTél. : 05.37.76.50.24 ‑ 05.37.76.50.25\n05.37.76.54.13\nCompte n° :\n310 810 1014029004423101 33\nouvert à la Trésorerie Préfectorale de Rabat \nau nom du régisseur des recettes \nde l’Imprimerie officielle\nAU MAROC\nA L’ETRANGER\n6 mois 1 an\nEdition générale...................................................................\nEdition de traduction officielle.............................................\nEdition des conventions internationales................................\nEdition des annonces légales, judiciaires et administr

In [7]:
# Split the loaded documents into overlapping chunks for embedding.
# Sizes are measured with len(), i.e. in characters; add_start_index records
# each chunk's offset in its source document under metadata["start_index"].
splitter_options = {
    "chunk_size": 500,
    "chunk_overlap": 100,
    "length_function": len,
    "add_start_index": True,
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)

splited_document = text_splitter.split_documents(documents)

In [8]:
# How many chunks did the splitter produce?
chunk_count = len(splited_document)
chunk_count

4924

In [9]:
# Inspect the text of one chunk to sanity-check the split quality.
sample_chunk = splited_document[5]
sample_chunk.page_content

'Décret n° 2-19-846 du 16 rabii II 1441 (13 décembre 2019) portant  délégation de pouvoir,  au ministre de \nl’économie, des finances et de la réforme de l’administration, en matière de financements extérieurs ...2499\nDécret n° 2-19-847 du 16 rabii II 1441 (13 décembre 2019) portant délégation de pouvoir, au ministre de \nl’économie, des finances et de la réforme de l’administration, en vue de conclure des contrats d’emprunts'

In [10]:
# Embedding model used to vectorise the chunks (library default settings).
embeddings = OpenAIEmbeddings()

In [11]:
# Build the vector index once and reuse it on later runs.
# Embedding every chunk is slow and billed per call, and re-running an
# in-memory Chroma.from_documents in the same kernel can add the same chunks
# again. Persisting the index avoids both.
# NOTE: delete CHROMA_DIR to force a rebuild after changing the PDFs,
# the chunking parameters or the embedding model.
CHROMA_DIR = "./chroma_db"

if os.path.isdir(CHROMA_DIR) and os.listdir(CHROMA_DIR):
    # A persisted index already exists: open it without re-embedding.
    vectorstore = Chroma(
        persist_directory=CHROMA_DIR,
        embedding_function=embeddings,
    )
else:
    # First run (or cache removed): embed all chunks and persist the index.
    vectorstore = Chroma.from_documents(
        splited_document,
        embeddings,
        persist_directory=CHROMA_DIR,
    )

In [12]:
# Quick retrieval sanity check straight against the vector store.
sanity_query = "what is the lois 2022"
vectorstore.similarity_search(sanity_query)

[Document(metadata={'page_label': '2605', 'total_pages': 98, 'producer': 'Adobe PDF Library 11.0', 'creator': 'Adobe InDesign CC (Windows)', 'page': 0, 'creationdate': '2021-12-21T11:52:49+00:00', 'moddate': '2021-12-21T11:52:59+00:00', 'source': 'data\\FLAW 2022.pdf', 'trapped': '/False', 'start_index': 1167}, page_content='la publication au Bulletin officiel est prévue par les lois ou les réglements en vigueur\nPages\nSOMMAIRE\nTEXTES GENERAUX\nLoi de finances pour l’année budgétaire 2022.\nDahir n° 1-21-115 du 5 joumada I 1443 (10 décembre 2021 ) portant promulgation de la loi de finances \nn° 76-21 pour l’année budgétaire 2022 ...............................................................................................2606\nMinistre de l’économie et des finances .\xa0–\xa0Délégation de\xa0pouvoir.'),
 Document(metadata={'total_pages': 110, 'creationdate': '2020-12-18T12:58:58+00:00', 'moddate': '2020-12-18T12:59:09+00:00', 'trapped': '/False', 'producer': 'Adobe PDF Library 11.0'

In [13]:
# System prompt for the RAG chain.
# Template variables: {context} (the retrieved chunks) and {input} (the
# user's question, sent as the human message).
# All instructions are placed BEFORE the context and separated from it by a
# blank line. Previously the last instruction was concatenated directly after
# "{context}", so it was glued to the end of the retrieved text.
system_prompt = (
    "You are an expert assistant for question-answering tasks of public finance. "
    "Use the following pieces of retrieved context to answer the question. "
    "If you don't know the answer, just say that you don't know. "
    "If the user asks you a question, you need to get the information from the context then add your knowledge. "
    "Remember to remove unnecessary numbers or symbols.\n\n"
    "Context: {context}"
)

prompt = ChatPromptTemplate.from_messages([
    ("system", system_prompt),
    ("human", "{input}"),
])

In [100]:
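# Debug helper (disabled): render the prompt with sample values to inspect it.
# The template variables are "context" (text of the retrieved chunks, not the
# vector store object) and "input" (the user's question).
# prompt.invoke({"context": "<retrieved text>", "input": "what is new in 2022"})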

In [14]:
# Chat model served by Groq that writes the final answer.
llm = ChatGroq(model="openai/gpt-oss-20b")

# Chain that inserts the retrieved documents into the prompt, calls the
# model, and parses the reply into a plain string.
answer_parser = StrOutputParser()
stuff_chain = create_stuff_documents_chain(
    llm=llm,
    prompt=prompt,
    output_parser=answer_parser,
)

In [15]:
# Expose the vector store as a retriever (default search settings) and wire
# it to the answer-generation chain: retrieve first, then answer.
retriever = vectorstore.as_retriever()
chain = create_retrieval_chain(
    retriever=retriever,
    combine_docs_chain=stuff_chain,
)

In [16]:
# Run the full RAG pipeline on one question and show only the answer text.
question = {"input": "what is new in 2025 LOIS"}
response = chain.invoke(question)
response["answer"]

'**What’s new in the 2025 finance law (LOI\u202fDE\u202fFINANCES\u202fN°\u202f60‑24)**  \n\n| Area | New or changed provision | Effective date |\n|------|--------------------------|----------------|\n| **General tax and revenue regime** | The law extends the application of certain provisions of the General Tax Code that were previously only applicable to the 2023 budget. | 1\u202fJanuary\u202f2025 (for acts and conventions) |\n| **Specific articles of the General Tax Code** | Article\u202f206\u202fbis and Articles\u202f293‑297 of the General Tax Code are now applied to the 2025 budget period, with the same adjustments that were made in the 2023 law. | 1\u202fJanuary\u202f2025 |\n| **Autonomous state services** | Services of the State that are managed autonomously during the 2025 fiscal year remain subject to the same legislative and regulatory framework as in previous years, but the law explicitly notes that any new decrees relating to these services must be submitted for parliamentary