In [None]:
import re
from langchain_core.documents import Document 
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
with open("data/cleaned/IFRS_9.txt", "r", encoding="utf-8") as file :
    text = file.read()

Now I will create the metadata tree before chunking the datas

In [3]:
Chapter_regex = r"\n(Chapter [1-9] [A-Z].*)\n"
Appendix_regex = r"(Appendix [A-Z] (?=[A-Z]))"
Chapters_elem = re.split(f"{Chapter_regex}|{Appendix_regex}", text)

#No intro
if "Chapter" not in Chapters_elem[0] :
    Chapters_elem = Chapters_elem[1:]

whole_text = []

In [4]:
for elem in range(0,len(Chapters_elem),3) :

    if Chapters_elem[elem] :
        true_chapter_name = Chapters_elem[elem]
    else :
        true_chapter_name = Chapters_elem[elem+1]

    pattern_parts = r"(?<![\d\.])([A-Z]?\d\.\d) (?=[A-Z])"
    nb_parts = re.findall(pattern_parts, Chapters_elem[elem+2]) 
    
    splitted = re.split(" ",true_chapter_name)
    chapter_id = splitted[1]
    chapter_txt = " "+" ".join(splitted[2:])

    if nb_parts : #With a part, only appendix with def haven't any parts

        current_type = "content"
        
        parts_ = re.split(pattern_parts, Chapters_elem[elem+2])
        parts_ = parts_[1:]
        
        for i in range(0,len(parts_),2):

            pattern_subpart = r"([A-Z]?\d\.\d\.\d+[A-Z]? (?=[A-Z]))"
            nb_subpart = re.findall(pattern_subpart, parts_[i+1])

            if nb_subpart :
            
                subparts = re.split(pattern_subpart, parts_[i+1])
                
                section_id = parts_[i].strip()
                section_txt = " "+subparts[0]
                
                subparts = subparts[1:]

                for j in range(0,len(subparts),2) : #a_pattern in subparts

                    # (a) parts
                    a_pattern = r"\n(\([a-hj-z]\) )"
                    nb_a_part = re.findall(a_pattern, subparts[j+1])

                    if nb_a_part :
                        
                        a_parts = re.split(a_pattern, subparts[j+1])#[1:] #i serves as an indication for smaller part
                        
                        subparts_id = subparts[j].strip()
                        subparts_txt = " "+a_parts[0]

                        a_parts = a_parts[1:]

                        for x in range(0,len(a_parts),2) :
                            item_nb = a_parts[x].strip()
                            item_text = " "+a_parts[x+1]

                            final_text = true_chapter_name+"\n"+section_id+section_txt+'\n'+subparts_id+subparts_txt+'\n'+item_nb+item_text
                            whole_text.append({"text" : final_text,
                                         "metadata" : {
                                             "chapter" : chapter_id,
                                             "chapter_title" : chapter_txt,
                                             "type" : current_type,
                                             "section" : section_id,
                                             "sub_section" : subparts_id,
                                             "a_part" : item_nb
                                            }})
                    
                    else :
                        subparts_id = subparts[j].strip()
                        subparts_txt = " "+subparts[j+1]
                        final_text = true_chapter_name+"\n"+section_id+section_text+"\n"+subparts_id+subparts_txt
                        whole_text.append({"text" : final_text,
                                         "metadata" : {
                                             "chapter" : chapter_id,
                                             "chapter_title" : chapter_txt,
                                             "type" : current_type,
                                             "section" : section_id,
                                             "sub_section" : subparts_id
                                            }})
                    
            else : #Chapter with part and no subpart

                a_pattern = r"\n(\([a-hj-z]\) )"
                nb_a_part = re.findall(a_pattern, parts_[i+1])
                
                if nb_a_part : #a_pattern in part
                    a_part = re.split(a_pattern, parts_[i+1])

                    section_id = parts_[i].strip()
                    section_txt = " "+a_part[0]
                    
                    a_part = a_part[1:]

                    for x in range(0,len(a_part),2) :
                        item_id = a_part[x].strip()
                        item_text = " "+a_part[x+1]
                        
                        final_text = true_chapter_name+"\n"+section_id+section_text+"\n"+item_id+item_text
                        whole_text.append({"text" : final_text,
                                         "metadata" : {
                                             "chapter" : chapter_id,
                                             "chapter_title" : chapter_txt,
                                             "type" : current_type,
                                             "section" : section_id,
                                             "a_part" : item_id
                                            }})

                else : #no a_part in the part
                    section_id = parts_[i].strip()
                    section_text = " "+parts_[i+1]
                    
                    final_text = true_chapter_name+"\n"+section_id+section_text
                    whole_text.append({"text" : final_text,
                                         "metadata" : {
                                             "chapter" :chapter_id,
                                             "chapter_title" : chapter_txt,
                                             "type" : current_type,
                                             "section" : section_id
                                            }})

    else : #If no part then it's definitions
        defs = re.split(r"(\w+_\w+) |(\n)(?=[A-Z]\w+)", Chapters_elem[elem+2])[1:]
        for i in range(0,len(defs),3) :
            if defs[i] :
                current_type = "definition"
                definition_term = defs[i].replace("_"," ")
                
                whole_text.append({"text" : true_chapter_name+'\n'+defs[i+2],
                                         "metadata" : {
                                             "chapter" :chapter_id,
                                             "chapter_title" : chapter_txt,
                                             "type" : current_type,
                                             "def_term" : definition_term
                                            }})
            else :
                current_type = "content"
                whole_text.append({"text" : true_chapter_name+'\n'+defs[i+2],
                                         "metadata" : {
                                             "chapter" :chapter_id,
                                             "chapter_title" : chapter_txt,
                                             "type" : current_type,
                                             }})


Chunk the datas

In [5]:
splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=100,
    separators=["\n\n", "\n", ". ", "; ", " "]
)

In [6]:
chunked_text = []

for entry in whole_text:
    text = entry["text"]
    metadata = entry["metadata"]

    chunks = splitter.split_text(text)

    for idx, chunk in enumerate(chunks):
        chunked_text.append({
            "text": chunk,
            "metadata": {
                **metadata,
                "chunk_id": idx 
            }
        })

Now we can stock it in langchain

In [7]:
docs = [
    Document(
        page_content=item["text"],
        metadata=item["metadata"]
    )
    for item in chunked_text]

In [8]:
model_name = "BAAI/bge-small-en"
model_kwargs = {"device": "cpu"}
encode_kwargs = {"normalize_embeddings": True}
hf = HuggingFaceEmbeddings(
    model_name=model_name, model_kwargs=model_kwargs, encode_kwargs=encode_kwargs
)

In [9]:
vectorstore = FAISS.from_documents(docs,hf)

In [None]:
FAISS.load_local()

Store the vector base locally 

In [19]:
vectorstore.save_local("faiss_ifrs_index")

Test of the vector base with simple method (similarity search)

In [22]:
query = "What are the rules around hedge accounting ?"
docs = vectorstore.similarity_search(query, k=5)

for d in docs:
    print("---")
    print(d.metadata)
    print(d.page_content)

---
{'chapter': '6', 'chapter_title': ' Hedge accounting', 'type': 'content', 'section': '6.9', 'sub_section': '6.9.6', 'chunk_id': 1}
Accounting for qualifying hedging relationships Cash flow hedges
---
{'chapter': '6', 'chapter_title': ' Hedge accounting', 'type': 'content', 'section': '6.5', 'sub_section': '6.5.10', 'chunk_id': 3}
Cash flow hedges
---
{'chapter': '6', 'chapter_title': ' Hedge accounting', 'type': 'content', 'section': '6.1', 'sub_section': '6.1.1', 'chunk_id': 1}
6.1.1 The objective of hedge accounting is to represent, in the financial statements, the effect of an entity’s risk management activities that use financial instruments to manage exposures arising from particular risks that could affect profit or loss (or other comprehensive income, in the case of investments in equity instruments for which an entity has elected to present changes in fair value in other comprehensive income in accordance with paragraph 5.7.5). This approach aims to convey the context of he