In [1]:
doc_path = "./data/BOI.pdf"
model = "llama3"

In [2]:
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
import fitz  # PyMuPDF

### Extraction du PDF et conversion en documents langchain

In [3]:
if doc_path:
    try:
        doc = fitz.open(doc_path)
        print("PDF chargé avec succès.")

        if len(doc) > 0:
            # Extraction du texte de toutes les pages
            content = "\n".join([page.get_text() for page in doc])
            print("Aperçu du contenu (100 premiers caractères) :")
            print(content[:100])

            # Vérifier et transformer en Document LangChain
            if isinstance(content, str) and content.strip():
                documents = [Document(page_content=content)]
                print("Contenu converti en document LangChain.")

            else:
                print("Le fichier PDF est vide ou ne contient pas de texte lisible.")

        else:
            print("Le fichier PDF est vide.")

    except Exception as e:
        print(f"Erreur lors du chargement du PDF : {e}")
else:
    print("Veuillez fournir un chemin de fichier PDF valide.")

PDF chargé avec succès.
Aperçu du contenu (100 premiers caractères) :
Beneficial Ownership Information Report
Filing Instructions
Financial Crimes Enforcement Network
U.S
Contenu converti en document LangChain.


### Découpage en chunks

In [4]:
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1200, chunk_overlap=300)
chunks = text_splitter.split_documents(documents)
print(f"Découpage terminé ({len(chunks)} chunks générés).")
print("🔹 Premier chunk :", chunks[0])

Découpage terminé (48 chunks générés).
🔹 Premier chunk : page_content='Beneficial Ownership Information Report
Filing Instructions
Financial Crimes Enforcement Network
U.S. Department of the Treasury
Version 1.0 January 2024'


In [5]:
len(chunks)

48

In [6]:
chunks[0]

Document(metadata={}, page_content='Beneficial Ownership Information Report\nFiling Instructions\nFinancial Crimes Enforcement Network\nU.S. Department of the Treasury\nVersion 1.0 January 2024')

### Insertion dans la BD des vecteurs

In [7]:
import ollama

ollama.pull("nomic-embed-text")

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [8]:
from langchain.vectorstores import FAISS
from langchain.embeddings import OllamaEmbeddings

# Charger le modèle d'embedding
embedding_model = OllamaEmbeddings(model="nomic-embed-text")

# Convertir les documents en vecteurs et les stocker dans FAISS
vector_db = FAISS.from_documents(chunks, embedding_model)
print("Ajout terminé dans la base FAISS.")


  embedding_model = OllamaEmbeddings(model="nomic-embed-text")


Ajout terminé dans la base FAISS.


### Phase du retrieval

In [9]:
## === Retrieval ===
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain_core.output_parsers import StrOutputParser

from langchain_ollama import ChatOllama

from langchain_core.runnables import RunnablePassthrough
from langchain.retrievers.multi_query import MultiQueryRetriever

# set up our model to use
llm = ChatOllama(model=model)

# a simple technique to generate multiple questions from a single question and then retrieve documents
# based on those questions, getting the best of both worlds.
QUERY_PROMPT = PromptTemplate(
    input_variables=["question"],
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
)

retriever = MultiQueryRetriever.from_llm(
    vector_db.as_retriever(search_type="similarity", search_kwargs={"k": 5}),
    llm,
    prompt=QUERY_PROMPT
)


### Generation de la réponse

In [21]:
# RAG prompt
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}

Ensure that the response is in the same language as the question.

"""

prompt = ChatPromptTemplate.from_template(template)


chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


In [17]:
res = chain.invoke(input=("عن ماذا يتكلم هاذا ال PDF ؟",))

print(res)

هذه الوثيقة هي تعليمات تقديم معلومات ملكية نفعية. فهي تتحدث عن القانون الأمريكي المعروف باسم "قانون التشفير المالي" (The Corporate Transparency Act) والذي يطلب من بعض أشكال الشركات الأمريكية والاجانبيةتقديم معلومات حول ملاك النفعة الأثرية إلى بنك crimes enforcement network (FinCEN).


In [22]:
res = chain.invoke(input=("De quoi parle ce document ?",))

print(res)

Ce document traite des instructions pour le fichier de reporting d'informations sur la propriété bénéficiaire (BOIR) et les exigences liées à l'enregistrement de ces informations avec FinCEN.


In [25]:
res = chain.invoke(input=("What this document is about?",))

print(res)

Beneficial Ownership Information Reporting Filing Instructions for January 2024 - Version 1.0


In [20]:
res = chain.invoke(input=("what are the main points as a business owner I should be aware of?",))

print(res)

As a business owner, you should be aware of the following main points:

* The Beneficial Ownership Information Report (BOIR) must be true, correct, and complete before it can be filed with FinCEN.
* You must provide information in every field marked with an asterisk (*) in all circumstances.
* You must also provide information in all fields not marked with an asterisk that are applicable to you.
* The BOIR must include identifying document images for all company applicants and beneficial owners reported on the form.
* The images must be complete, clear, and readable and must be provided for all company applicants and beneficial owners.
* You can only attach one image file per company applicant or beneficial owner, and the attachment cannot be larger than four (4) megabytes of data.
* Only certain types of identifying documents are permitted, including a non-expired U.S. passport or State-issued driver's license.
* The BOIR must include information about all foreign addresses in the exi