In [None]:
import pdfplumber
from pathlib import Path
import re
import numpy as np
import fitz

In [None]:
# First pass: treat every page as plain, single-column structured text.
with pdfplumber.open("data/raw/IFRS_9.pdf") as pdf:
    # One text string per page.
    text_blocks = [page.extract_text() for page in pdf.pages]
    # Tables found on each page; used later to identify the pages that need
    # table-specific cleaning.
    tables = [page.extract_tables() for page in pdf.pages]

### First preprocessing : entire text

In [None]:
#Skip the introduction and delete the appendix of amendments dates / lists of participants
#text_blocks = text_blocks[5:-10]
#tables = tables[5:-10]

In [None]:
def clean_all_text(text_blocks) :
    """Clean the per-page text extracted from the IFRS 9 PDF.

    Every page goes through the same ordered list of regex substitutions
    (header/footer removal, chapter titles on their own line, re-joining of
    paragraphs that were split over several lines). Afterwards, when a page
    starts with a lowercase character and the previous page ends with one,
    the first line of the page is moved to the end of the previous page.

    Parameters
    ----------
    text_blocks : list of (str or None)
        One entry per page. ``None`` entries are treated as empty pages.

    Returns
    -------
    list of str
        A new list with the cleaned pages; the input list is not modified.
    """
    # (pattern, replacement) pairs, applied to each page in this exact order.
    # Note: several character classes contain a literal comma (e.g. [a-z,A-Z]),
    # so a comma next to the line break is also treated as "same paragraph".
    substitutions = [
        # Header and footer
        ("IFRS 9\n", ""),
        (r"\n.*© IFRS Foundation.*", ""),
        # Put an extra line break before and after a chapter title
        (r"(Chapter .*)", r"\n\1\n"),
        # A paragraph is divided into multiple lines: fuse it back into one
        (r"([a-z,A-Z,\(\)])\n([a-z,A-Z])", r"\1 \2"),
        # Sub-sections (X.X.Y) that are not on their own line
        (r" (\d\.\d\.\d [A-Z])", r"\n\1"),
        # Specific cases: lowercase/digit continuations and text after a colon
        (r"([a-z])\n([a-z,1-9])", r"\1 \2"),
        (r"([1-9])\n([a-z])", r"\1 \2"),
        (r"(:)\n([A-Z])", r"\1 \2"),
    ]

    cleaned = []
    for block in text_blocks:
        # A page replaced by None (or never extracted) is handled as empty
        # instead of making re.sub raise a TypeError.
        block = block or ""
        for pattern, replacement in substitutions:
            block = re.sub(pattern, replacement, block)
        cleaned.append(block)

    # If a page starts with the end of the previous page's last paragraph,
    # move that first line back to the previous page. The range includes the
    # last page so that it is merged like any other page.
    for i in range(1, len(cleaned)):
        current, previous = cleaned[i], cleaned[i - 1]
        if not current or not previous:
            # Nothing to merge from or into an empty page.
            continue

        if current[0].islower() and previous[-1].islower():
            end_sentence, _, remainder = current.partition("\n")
            cleaned[i] = remainder
            cleaned[i - 1] = previous + " " + end_sentence

    return cleaned

In [None]:
# First cleaning pass, applied to the text of every page extracted above.
text_blocks = clean_all_text(text_blocks)

### Second preprocessing : definitions pages

In [None]:
def group_words_into_entries(words, y_tol=3, line_spacing_tol=12):
    """Group word tuples into entries made of vertically close lines.

    Words are 8-item tuples whose first two items are read as ``x0`` and
    ``y0``. They are processed in (y0, x0) order and compared with the ``y0``
    of the current reference line:

    * distance <= ``y_tol``: same line, the word joins the current entry;
    * distance <= ``line_spacing_tol``: next line of the same entry, the word
      joins the entry and its ``y0`` becomes the new reference;
    * otherwise the current entry is closed and the word starts a new one.

    Returns a list of entries, each entry being a list of word tuples.
    """
    ordered = sorted(words, key=lambda w: (w[1], w[0]))

    groups = []
    bucket = []
    reference_y = None  # y0 of the line the next word is compared against

    for word in ordered:
        _x0, top, _x1, _y1, _text, _block, _line, _span = word

        if reference_y is None:
            # Very first word: it opens the first entry.
            reference_y = top
            bucket.append(word)
            continue

        gap = abs(top - reference_y)

        if gap > y_tol and gap > line_spacing_tol:
            # Too far from the reference line: close the entry, start another.
            groups.append(bucket)
            bucket = [word]
            reference_y = top
            continue

        bucket.append(word)
        if gap > y_tol:
            # Continuation line: it becomes the new reference line.
            reference_y = top

    if bucket:
        groups.append(bucket)

    return groups

def entry_to_text(entry, type):
    """Join the words of an entry into a single space-separated string.

    Words are ordered by their item 6 and then item 0 before joining; item 4
    of each word is the text that is joined.
    NOTE(review): item 6 is presumably the line number within a text block, so
    words from different blocks may interleave - confirm against the caller.

    Returns ``[text, y0_of_first_ordered_word]`` when ``type == "col"``,
    otherwise just ``text``.
    """
    ordered = list(entry)
    ordered.sort(key=lambda word: (word[6], word[0]))
    joined = " ".join(word[4] for word in ordered)

    if type != "col":
        return joined
    return [joined, ordered[0][1]]
    
def normalize_def(term):
    """Turn a term into a lowercase ASCII slug followed by an underscore.

    The term is lowercased and passed through ``unidecode``; every run of
    characters other than ``a-z`` / ``0-9`` becomes a single underscore,
    leading and trailing underscores are removed, and one underscore is
    appended (so an empty term gives ``"_"``).
    """
    import re
    from unidecode import unidecode

    ascii_lower = unidecode(term.lower())
    slug = re.sub(r"_+", "_", re.sub(r"[^a-z0-9]+", "_", ascii_lower))
    return slug.strip("_") + "_"

In [None]:
def detect_if_multi_col(doc, page_num):
    """Rebuild the text of a page laid out as two columns.

    The page's words are read with ``page.get_text_words()`` and histograms
    of their ``x0`` / ``y0`` positions (199 bins each) are used to locate:

    * the first x-bin holding more than 4 word starts (left column) and the
      single x-bin holding more than 20 word starts (column separator);
    * the first y-bin holding more than 15 word starts, taken as the top of
      the text that lies below the two-column area (presumably full-width
      lines - TODO confirm on the real pages).

    Left/right entries are built with ``group_words_into_entries``; a right
    entry whose first word is within 1 unit of ``y0`` of a left entry is
    prefixed with ``normalize_def(left_text)``. Left entries without a
    matching right entry are not emitted.

    Parameters
    ----------
    doc : indexable collection of pages (a PyMuPDF document in this notebook)
    page_num : int
        Index of the page in ``doc``.

    Returns
    -------
    str or None
        The rebuilt page text (one entry per line), or ``None`` when the page
        has no words or does not look like a two-column page.
    """
    page = doc[page_num]

    # Extract the words once and reuse them for every step below.
    words = page.get_text_words()
    if not words:
        return None

    x0 = np.array([w[0] for w in words], dtype=float)
    y0 = np.array([w[1] for w in words], dtype=float)

    # --- Column part ---
    width = page.rect.width
    bins_x = np.linspace(0, width, 200)
    density_x, _ = np.histogram(x0, bins=bins_x)

    # First bin with more than 4 word starts: left edge of the first column.
    left_candidates = np.where(density_x > 4)[0]
    if len(left_candidates) == 0:
        return None
    idx_left = left_candidates[0]

    # Exactly one strongly populated bin is expected for the second column.
    big_peaks = np.where(density_x > 20)[0]
    if len(big_peaks) != 1:
        return None

    idx_sep = big_peaks[0]

    # The two peaks must be more than 20 bins apart, otherwise the page is
    # not treated as two-column.
    if abs(idx_left - idx_sep) <= 20:
        return None

    # idx_sep > idx_left + 20 here, so idx_sep - 1 is always a valid index.
    midpoint_x = bins_x[idx_sep - 1]

    # --- Row part ---
    height = page.rect.height
    bins_y = np.linspace(0, height, 200)
    density_y, _ = np.histogram(y0, bins=bins_y)

    row_peaks = np.where(density_y > 15)[0]

    # If no row separator is detected, everything is two-column.
    if len(row_peaks) == 0:
        cutoff_y = height
    else:
        # Clamp at 0 so that a peak in the very first bin does not wrap
        # around to the last bin edge through index -1.
        cutoff_y = bins_y[max(row_peaks[0] - 1, 0)]

    left_words = [w for w in words if (w[0] < midpoint_x and w[1] < cutoff_y)]
    right_words = [w for w in words if (w[0] >= midpoint_x and w[1] < cutoff_y)]
    below_words = [w for w in words if w[1] >= cutoff_y]

    left_texts = [entry_to_text(e, "col") for e in group_words_into_entries(left_words)]
    right_texts = [entry_to_text(e, "col") for e in group_words_into_entries(right_words)]
    below_texts = [entry_to_text(e, "row") for e in group_words_into_entries(below_words)]

    final_te = []
    for rtxt, ry in right_texts:
        for ltxt, ly in left_texts:
            if abs(ry - ly) < 1:
                # Same row: prefix the right text with the normalised left text.
                final_te.append(normalize_def(ltxt) + " " + rtxt)
                break
        else:
            # No left entry starts on this row: keep the right text alone.
            final_te.append(rtxt)

    final_te.extend(below_texts)

    return "\n".join(final_te)

In [None]:
# Same file as the pdfplumber cell above, opened with PyMuPDF. The relative
# path keeps the notebook runnable outside the original author's machine.
doc = fitz.open("data/raw/IFRS_9.pdf")

In [None]:
# Pages 63-67 (presumably the definitions pages) are rebuilt with the
# two-column logic.
for i in range(63,68) :
    multi_col_text = detect_if_multi_col(doc,i)
    # detect_if_multi_col returns None when the page is empty or not detected
    # as two-column; keep the existing text in that case instead of
    # replacing the page with None.
    if multi_col_text is not None :
        text_blocks[i] = multi_col_text

### Third preprocessing : table pages

In [None]:
# Sort the pages holding more than 5 drawings into two lists:
#   tables_    -> pdfplumber extracted at least one table on the page
#   ocr_needed -> no table was recognised by pdfplumber (candidate for OCR)
ocr_needed, tables_ = [], []

for page in range(len(doc)) :
    if len(doc[page].get_drawings()) <= 5 :
        continue
    (tables_ if tables[page] else ocr_needed).append(page)

Identified tables

In [None]:
def _markdown_cell(cell):
    """Return a table cell as single-line text that is safe inside a markdown row."""
    if cell is None:
        # Empty cells are rendered as empty text.
        return ""
    text = str(cell)
    # A line break between two lowercase letters is a wrapped sentence.
    text = re.sub(r"([a-z])\n([a-z])", r"\1 \2", text)
    # A hyphen at the end of a line is a word split over two lines.
    text = re.sub(r"-\n([a-z])", r"\1", text)
    # Remaining line breaks are kept as explicit breaks inside the cell.
    return text.replace("\n", "<br>")


def to_markdown(headers, cols, continued = False):
    """Render one table row (and optionally its header) as markdown.

    Parameters
    ----------
    headers : sequence
        Header cells. Written, followed by the ``|---|`` separator line,
        unless ``continued`` is true.
    cols : sequence
        Cells of the data row.
    continued : bool
        True when the table continues from a previous page, in which case the
        header lines are skipped.

    Returns
    -------
    str
        Markdown text in which every emitted row is terminated by ``|`` and a
        newline, so that rows coming from consecutive pages can be
        concatenated.
    """
    md = ""

    if not continued :
        md += "".join(f"| {_markdown_cell(header)} " for header in headers)
        md += "|\n"

        md += "|---" * len(headers)
        md += "|\n"

    if cols :
        md += "".join(f"| {_markdown_cell(cell)} " for cell in cols)
        # Close the data row like the header rows.
        md += "|\n"

    return md

def transform_table_to_use(tables_, text_blocks) :
    """Replace the text of the pages listed in ``tables_`` by a table-aware version.

    For each page, the vertical extent of the table returned by
    ``find_table()`` is computed, and up to 10 words located above / below it
    are collected.

    * More than 5 words below the table: the page text is cut so that it
      starts at the first 5 words found below the table; the following lines
      are kept, one per line.
      NOTE(review): the table itself is not written in this branch - confirm
      that dropping it is intended.
    * Otherwise the page becomes the markdown rendering of the first two rows
      of the first extracted table; the header is skipped when the word
      ``"...continued"`` appears above the table.

    Pages without a detected table, without a second table row, or whose
    anchor text cannot be found are left unchanged.

    Relies on the notebook-level ``pdf`` (pdfplumber document) and ``tables``
    (per-page ``extract_tables()`` results).
    NOTE(review): ``pdf`` was opened in a ``with`` block that has already
    exited when this runs - confirm the installed pdfplumber still allows
    page access after close.

    Parameters
    ----------
    tables_ : iterable of int
        Page indices to process.
    text_blocks : list of str
        Per-page text; modified in place.

    Returns
    -------
    list of str
        The same ``text_blocks`` list.
    """
    for num in tables_ :

        page = pdf.pages[num]
        found = page.find_table()
        if found is None :
            # No table detected on the page: keep the page text as it is.
            continue

        # Vertical extent of the table, taken from its cell boxes.
        y_min = min((cell[1] for cell in found.cells), default=float("inf"))
        y_max = max((cell[3] for cell in found.cells), default=0)

        # First 10 words above and below the table.
        before = []
        after = []
        for word in page.extract_words() :
            if word["top"] < y_min and len(before) < 10 :
                before.append(word["text"])
            if word["bottom"] > y_max and len(after) < 10 :
                after.append(word["text"])

        if len(after) > 5 :
            # Text follows the table: keep the page from that text onwards.
            after_text = " ".join(after[:5])
            lines = (text_blocks[num] or "").split("\n")

            kept = None
            for index, line in enumerate(lines) :
                if after_text in line :
                    kept = [after_text + line.split(after_text)[1]] + lines[index + 1:]
                    break

            if kept is None :
                # Anchor not found: leave the page untouched rather than
                # replacing it with an empty string.
                continue

            # One line per entry, so that paragraphs are not glued together.
            new_page = "\n".join(kept)

        else :
            rows = tables[num][0] if tables[num] else []
            if len(rows) < 2 :
                # Header and data row are both required to build the table.
                continue

            headers, cols = rows[0], rows[1]
            new_page = to_markdown(headers, cols, continued="...continued" in before)

        text_blocks[num] = new_page

    return text_blocks

In [None]:
# Rewrite the pages on which pdfplumber recognised a table (indices in tables_).
text_blocks = transform_table_to_use(tables_, text_blocks)

In [None]:
# Display the page indices that hold drawings but no table recognised by pdfplumber.
ocr_needed

The first page flagged for OCR will be deleted later, so only page 73 really needs OCR. I actually used ChatGPT to get the text and the layout directly from that page, as it needed very specific instructions to give good results.

In [None]:
# Text version of the derecognition flow chart (paragraph B3.2.1), stored as a
# one-element list. Every line carries an explicit \n escape in addition to the
# physical line break, so consecutive lines are separated by two newlines.
layout = ['''IFRS 9\n
Derecognition of financial assets (Section 3.2)\n
\n
B3.2.1\n
The following flow chart illustrates the evaluation of whether and to what extent a financial asset is derecognised.\n
\n
Consolidate all subsidiaries [Paragraph 3.2.1]\n
\n
Determine whether the derecognition principles below are applied to a part or all of an asset (or group of similar assets) [Paragraph 3.2.2]\n
\n
Have the rights to the cash flows from the asset expired? [Paragraph 3.2.3(a)]\n
Yes → Derecognise the asset\n
No → Continue\n
\n
Has the entity transferred its rights to receive the cash flows from the asset? [Paragraph 3.2.4(a)]\n
Yes → Continue\n
No → Continue\n
\n
Has the entity assumed an obligation to pay the cash flows from the asset that meets the conditions in paragraph 3.2.5? [Paragraph 3.2.4(b)]\n
No → Continue to recognise the asset\n
Yes → Continue\n
\n
Has the entity transferred substantially all risks and rewards? [Paragraph 3.2.6(a)]\n
Yes → Derecognise the asset\n
No → Continue\n
\n
Has the entity retained substantially all risks and rewards? [Paragraph 3.2.6(b)]\n
Yes → Continue to recognise the asset\n
No → Continue\n
\n
Has the entity retained control of the asset? [Paragraph 3.2.6(c)]\n
No → Derecognise the asset\n
Yes → Continue to recognise the asset to the extent of the entity’s continuing involvement\n
\n
A436\n
© IFRS Foundation\n
''']

In [None]:
# Collapse the one-element list into a plain string, then use it as the text
# of page index 73 (index in the full, not yet trimmed, page list).
layout = "".join(layout)
text_blocks[73] = layout

Get rid of unnecessary pages

In [None]:
# Skip the introduction (first 5 pages) and drop the appendix of amendment
# dates / lists of participants (last 10 pages).
# Note: the list is re-assigned to its own slice, so running this cell a second
# time trims 5 and 10 more pages.
text_blocks = text_blocks[5:-10]

In [None]:
# Second cleaning pass, run on the trimmed list after the definitions, table
# and flow-chart pages were replaced above.
text_blocks = clean_all_text(text_blocks)

In [None]:
# Build the final text. Pages are joined with a newline so that the last line
# of a page is not glued to the first line of the next one; empty or missing
# pages are skipped.
cc = "\n".join(block for block in text_blocks if block)
# Collapse runs of blank lines into a single line break.
cc = re.sub(r"\n{2,}","\n", cc)

Preprocessing is nearly finished; I still need to find a solution for a slight problem with chapter detection on multi-column pages.