In [4]:
#!pip install tiktoken

In [5]:
#!pip install chromadb

In [6]:
#!pip install langchain langchain_community langchain_ollama langchain_text_splitters

In [7]:
#!pip install pymupdf

In [8]:
# Notebook-wide settings.
# Name of the chat model handed to ChatOllama in the retrieval section.
model = "llama3"
# Location of the PDF that is loaded and indexed below.
doc_path = "./data/BOI.pdf"

In [9]:
from langchain_ollama import OllamaEmbeddings
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import Chroma
from langchain.schema import Document
import fitz  # PyMuPDF

### Extraction du PDF et conversion en documents LangChain

In [10]:
# Load the PDF at `doc_path`, extract the text of every page and wrap it in a
# single LangChain Document.
#
# `documents` is reset up front so that a failed or empty load leaves an empty
# list instead of an undefined name (fresh kernel) or a stale value from an
# earlier run of this cell.
documents = []

if doc_path:
    try:
        # The context manager closes the PDF once the text has been extracted.
        with fitz.open(doc_path) as doc:
            print("PDF chargé avec succès.")

            if len(doc) > 0:
                # Extract the text of all pages, one page per newline-separated block
                content = "\n".join(page.get_text() for page in doc)
                print("Aperçu du contenu (100 premiers caractères) :")
                print(content[:100])

                # Only build a Document when some non-whitespace text was extracted
                # (`str.join` always returns a str, so no type check is needed).
                if content.strip():
                    documents = [Document(page_content=content)]
                    print("Contenu converti en document LangChain.")

                else:
                    print("Le fichier PDF est vide ou ne contient pas de texte lisible.")

            else:
                print("Le fichier PDF est vide.")

    except Exception as e:
        # Best-effort cell: report the problem and leave `documents` empty.
        print(f"Erreur lors du chargement du PDF : {e}")
else:
    print("Veuillez fournir un chemin de fichier PDF valide.")

PDF chargé avec succès.
Aperçu du contenu (100 premiers caractères) :
Beneficial Ownership Information Report
Filing Instructions
Financial Crimes Enforcement Network
U.S
Contenu converti en document LangChain.


### Découpage en chunks

In [11]:
# Cut the loaded document(s) into overlapping pieces: at most 1200 characters
# per chunk, with 300 characters shared between neighbouring chunks.
text_splitter = RecursiveCharacterTextSplitter(
    chunk_overlap=300,
    chunk_size=1200,
)
chunks = text_splitter.split_documents(documents)

# Report how many chunks were produced and show the first one as a sanity check.
print(f"Découpage terminé ({len(chunks)} chunks générés).")
print("🔹 Premier chunk :", chunks[0])

Découpage terminé (48 chunks générés).
🔹 Premier chunk : page_content='Beneficial Ownership Information Report
Filing Instructions
Financial Crimes Enforcement Network
U.S. Department of the Treasury
Version 1.0 January 2024'


In [12]:
# Inspect the list of LangChain documents built from the PDF text
# (bare last expression, so the notebook displays its repr).
documents

[Document(metadata={}, page_content='Beneficial Ownership Information Report\nFiling Instructions\nFinancial Crimes Enforcement Network\nU.S. Department of the Treasury\nVersion 1.0 January 2024\n\nBeneficial Ownership Information Reporting Filing Instructions \nJanuary 2024 - Version 1.0\n2\nTable of Contents\nI. Who, What, When of Beneficial Ownership Information Reporting Requirements.........................3\nII. Where to Report Beneficial Ownership Information ............................................................................5\nIII. How to Report Beneficial Ownership Information.................................................................................6\na. Recommendations for Successful Filings........................................................................................6\nb. Item Instructions.................................................................................................................................10\nDisclaimer: These filing instr

In [13]:
# Number of chunks produced by the text splitter.
len(chunks)

48

In [14]:
# Display every chunk produced by the text splitter (the whole list is shown).
chunks

[Document(metadata={}, page_content='Beneficial Ownership Information Report\nFiling Instructions\nFinancial Crimes Enforcement Network\nU.S. Department of the Treasury\nVersion 1.0 January 2024'),
 Document(metadata={}, page_content='Beneficial Ownership Information Reporting Filing Instructions \nJanuary 2024 - Version 1.0\n2\nTable of Contents\nI. Who, What, When of Beneficial Ownership Information Reporting Requirements.........................3\nII. Where to Report Beneficial Ownership Information ............................................................................5\nIII. How to Report Beneficial Ownership Information.................................................................................6\na. Recommendations for Successful Filings........................................................................................6\nb. Item Instructions..............................................................................................................................

### Insertion dans la base de données vectorielle

In [15]:
import ollama

# Pull the "nomic-embed-text" model through the ollama client; the same model
# name is passed to OllamaEmbeddings when the vector store is built.
ollama.pull("nomic-embed-text")

ProgressResponse(status='success', completed=None, total=None, digest=None)

In [16]:
# Use the maintained import locations: the FAISS wrapper lives in
# langchain_community, and OllamaEmbeddings comes from langchain_ollama.
# Importing OllamaEmbeddings from `langchain.embeddings` would shadow the
# langchain_ollama class imported at the top of the notebook with the legacy one.
from langchain_community.vectorstores import FAISS
from langchain_ollama import OllamaEmbeddings

# Load the embedding model
embedding_model = OllamaEmbeddings(model="nomic-embed-text")

# Embed every chunk and store the resulting vectors in a FAISS index
# (no persistence step is performed in this cell).
vector_db = FAISS.from_documents(chunks, embedding_model)
print("Ajout terminé dans la base FAISS.")


  embedding_model = OllamaEmbeddings(model="nomic-embed-text")


Ajout terminé dans la base FAISS.


### Phase du retrieval

In [17]:
## === Retrieval ===
# Prompt, parser and runnable helpers; several of these are consumed by the
# answer-generation cell that follows.
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain.prompts import ChatPromptTemplate, PromptTemplate
from langchain.retrievers.multi_query import MultiQueryRetriever
from langchain_ollama import ChatOllama

# Chat model, selected by the notebook-level `model` setting
llm = ChatOllama(model=model)

# Multi-query retrieval: the LLM is asked to rephrase the user question into
# several variants (the prompt requests five), and documents are retrieved for
# those variants rather than for a single phrasing.
QUERY_PROMPT = PromptTemplate(
    template="""You are an AI language model assistant. Your task is to generate five
    different versions of the given user question to retrieve relevant documents from
    a vector database. By generating multiple perspectives on the user question, your
    goal is to help the user overcome some of the limitations of the distance-based
    similarity search. Provide these alternative questions separated by newlines.
    Original question: {question}""",
    input_variables=["question"],
)

# Plain similarity search over the vector store, 5 results per query
base_retriever = vector_db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5},
)

# Wrap the base retriever so each question is expanded by the LLM first
retriever = MultiQueryRetriever.from_llm(
    retriever=base_retriever,
    llm=llm,
    prompt=QUERY_PROMPT,
)


### Génération de la réponse

In [18]:
# RAG prompt: the model must answer from the retrieved context only
template = """Answer the question based ONLY on the following context:
{context}
Question: {question}
"""

prompt = ChatPromptTemplate.from_template(template)


def format_docs(docs):
    """Join the text of the retrieved documents into one plain-text context.

    Without this step the list of Document objects would be inserted into the
    prompt as its Python repr (metadata, quoting and escaped newlines included).
    """
    return "\n\n".join(doc.page_content for doc in docs)


# The input question is sent to the retriever (whose results are flattened to
# text) and, unchanged, to the `question` slot of the prompt; the model reply
# is then reduced to a plain string.
chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
    | StrOutputParser()
)


# Other questions to try:
#   "what is the document about?"
#   "what are the main points as a business owner I should be aware of?"
question = "how to report BOI?"

# Pass the question as a plain string: a one-element tuple would reach both
# prompts as its repr, e.g. ('how to report BOI?',).
res = chain.invoke(question)

print(res)

According to the provided context, here's how to report beneficial ownership information (BOI):

1. **Who must report**: Certain types of U.S. and foreign entities are required to report beneficial ownership information to FinCEN.
2. **What to report**: Beneficial ownership information includes information about the entity, its beneficial owners, and in certain cases its company applicants.
3. **When to report**: Beneficial ownership information is reported to FinCEN through Beneficial Ownership Information Reports (BOIRs).

To file a BOIR, you can:

1. Use the E-Filing submission guides for both PDF and web-based versions of the BOIR.
2. Access these guides by going to [http://boifiling.fincen.gov](http://boifiling.fincen.gov) and selecting Help.

Alternatively, FinCEN also offers system-to-system BOIR transmission via secure Application Programming Interface (API) for those who want to automate the filing process.
