Guideline parsing

First of all, we must prepare the environment by installing the required packages.

In [4]:
%pip install beautifulsoup4 bs4 requests pathvalidate cleantext




Then, we are going to download the first page of the guidelines and parse it to extract the link (URL path) of each guideline.

In [3]:
from bs4 import BeautifulSoup
import requests

website_root = "https://uroweb.org" #Guidelines domain
folder_guidelines_root = "/guidelines"  #Path of the guidelines index page
guidelines_url = website_root + folder_guidelines_root
#Download the index page. The timeout prevents the cell from hanging forever on a stalled
#connection, and raise_for_status() stops us from silently parsing an HTTP error page
guidelines_index_response = requests.get(guidelines_url, timeout=30)
guidelines_index_response.raise_for_status()
html_document = guidelines_index_response.text #Page body as HTML text
guidelines_index_soup = BeautifulSoup(html_document, 'html.parser') #"Soup" object, makes the document easier to navigate
all_guidelines_a_class = guidelines_index_soup.find_all("a", {"class": "guideline-card"}) #Every <a> tag carrying the "guideline-card" class

#Collect and list the href of each guideline card
all_guidelines_url = []
for guideline in all_guidelines_a_class:
    all_guidelines_url.append(guideline['href'])
    print(guideline['href'])

/guidelines/prostate-cancer
/guidelines/non-muscle-invasive-bladder-cancer
/guidelines/upper-urinary-tract-urothelial-cell-carcinoma
/guidelines/muscle-invasive-and-metastatic-bladder-cancer
/guidelines/primary-urethral-carcinoma
/guidelines/renal-cell-carcinoma
/guidelines/testicular-cancer
/guidelines/penile-cancer
/guidelines/sexual-and-reproductive-health
/guidelines/non-neurogenic-female-luts
/guidelines/urethral-strictures
/guidelines/management-of-non-neurogenic-male-luts
/guidelines/chronic-pelvic-pain
/guidelines/urological-infections
/guidelines/neuro-urology
/guidelines/urolithiasis
/guidelines/paediatric-urology
/guidelines/urological-trauma
/guidelines/renal-transplantation


For proper indexing and vectorization, it is better not to split the text at arbitrary points, but to subdivide it along its own structure.
We can do this by splitting the text into the paragraphs introduced by each h* header tag.
We can repeat this recursively, descending one header level at a time, until each file is under our desired length.

In the following cell, we are going to declare the functions that perform the split; the variables that regulate the split are declared in the cell after it.

In [11]:
import os
from pathvalidate import sanitize_filename

#May be worth noticing that if you already have a model running, you can use their tokenizer function to get the real amount of tokens
# This works on llama.cpp, different inference software may have different methods
## len(llm_model.tokenize(bytes("This is a test", encoding='utf-8')))

#We start from <h4> because <h3> are the chapters titles
def h4_splitter(
        full_paragraph_raw,
        chapter_folder_path,
        token_subdivider,
        maximum_token_lenght,
        ):
    """Split one paragraph into files at its <h3>/<h4> headers.

    A sub-folder named after the first row (first 15 sanitized characters) is
    created under ``chapter_folder_path``. Rows are accumulated header by header;
    each accumulated sub-paragraph is written to ``<header>.txt`` inside that
    folder, or handed to ``h5_splitter`` when its estimated token count exceeds
    ``maximum_token_lenght``.

    Args:
        full_paragraph_raw: Non-empty sequence of parsed rows exposing ``.name``,
            ``.text`` and ``.find_next_sibling()`` (presumably BeautifulSoup
            elements, the first one being the paragraph header - verify
            against the caller).
        chapter_folder_path: Folder in which the paragraph sub-folder is created.
        token_subdivider: Approximate number of characters per token.
        maximum_token_lenght: Maximum estimated tokens allowed in a single file.
    """
    header_list = ['h3', 'h4']  #Target headers for this split cycle
    file_path = os.path.join(
        chapter_folder_path,
        sanitize_filename(full_paragraph_raw[0].text.replace(" ", "_").lower()[:15]) #Sanitized folder name cut to the first 15 characters
        )
    os.makedirs(file_path, exist_ok=True)

    def flush(filename, text, rows):
        #Save the accumulated sub-paragraph, or delegate it to the next header level when too long
        text_tokens = len(text)//token_subdivider #Token estimate: characters divided by the approximate token length
        if text_tokens > maximum_token_lenght:
            #All these print are for debug purposes, you can remove them
            print("Split to h5")
            print("File: " + filename)
            print("Lenght: " + str(text_tokens))
            print("")
            h5_splitter(rows, file_path, token_subdivider, maximum_token_lenght)
        else:
            with open(os.path.join(file_path, filename), "w", encoding="utf-8") as f:
                f.write(text)

    filename_sanitized = None   #None until the first usable header has been met
    full_sub_paragraph_raw = [] #Rows of the sub-paragraph being accumulated
    full_paragraph_txt = ""     #Text of the sub-paragraph being accumulated
    for row in full_paragraph_raw:
        if row.name in header_list:
            #find_next_sibling() returns None when the header is the last element:
            #treat that as "not followed by another header" instead of crashing
            next_sibling = row.find_next_sibling()
            if next_sibling is not None and next_sibling.name in header_list:
                continue    #Header immediately followed by another target header: skip it
            if filename_sanitized is not None:
                #A sub-paragraph is in progress: close it before starting the new one
                flush(filename_sanitized, full_paragraph_txt, full_sub_paragraph_raw)
                full_paragraph_txt = ""
                full_sub_paragraph_raw = []
            full_paragraph_txt += row.text + ". "   #Header text, terminated by a dot and a space
            full_sub_paragraph_raw.append(row)
            filename_sanitized = sanitize_filename(row.text.replace(" ", "_").lower()[:25] + ".txt") #File name cut to the first 25 characters
        else:
            full_paragraph_txt += row.text + " "    #Body row, followed by a space
            full_sub_paragraph_raw.append(row)
    #The last sub-paragraph is still pending at the end of the loop
    if filename_sanitized is None:
        if not full_paragraph_txt:
            return
        #No usable header was found: name the file after the first row rather than failing
        filename_sanitized = sanitize_filename(full_paragraph_raw[0].text.replace(" ", "_").lower()[:25] + ".txt")
    flush(filename_sanitized, full_paragraph_txt, full_sub_paragraph_raw)

def h5_splitter(
        full_paragraph_raw,
        chapter_folder_path,
        token_subdivider,
        maximum_token_lenght,
        ):
    """Split one paragraph into files at its <h3>/<h4>/<h5> headers.

    A sub-folder named after the first row (first 15 sanitized characters) is
    created under ``chapter_folder_path``. Rows are accumulated header by header;
    each accumulated sub-paragraph is written to ``<header>.txt`` inside that
    folder, or handed to ``h6_splitter`` when its estimated token count exceeds
    ``maximum_token_lenght``.

    Args:
        full_paragraph_raw: Non-empty sequence of parsed rows exposing ``.name``,
            ``.text`` and ``.find_next_sibling()`` (presumably BeautifulSoup
            elements, the first one being the paragraph header - verify
            against the caller).
        chapter_folder_path: Folder in which the paragraph sub-folder is created.
        token_subdivider: Approximate number of characters per token.
        maximum_token_lenght: Maximum estimated tokens allowed in a single file.
    """
    header_list = ['h3', 'h4', 'h5']    #Target headers for this split cycle
    file_path = os.path.join(
        chapter_folder_path,
        sanitize_filename(full_paragraph_raw[0].text.replace(" ", "_").lower()[:15]) #Sanitized folder name cut to the first 15 characters
        )
    os.makedirs(file_path, exist_ok=True)   #Create the folder if it doesn't exist

    def flush(filename, text, rows):
        #Save the accumulated sub-paragraph, or delegate it to the next header level when too long
        text_tokens = len(text)//token_subdivider #Token estimate: characters divided by the approximate token length
        if text_tokens > maximum_token_lenght:
            #All these print are for debug purposes, you can remove them
            print("Split to h6")
            print("File: " + filename)
            print("Lenght: " + str(text_tokens))
            print("")
            h6_splitter(rows, file_path, token_subdivider, maximum_token_lenght)
        else:
            with open(os.path.join(file_path, filename), "w", encoding="utf-8") as f:
                f.write(text)

    filename_sanitized = None   #None until the first usable header has been met
    full_sub_paragraph_raw = [] #Rows of the sub-paragraph being accumulated
    full_paragraph_txt = ""     #Text of the sub-paragraph being accumulated
    for row in full_paragraph_raw:
        if row.name in header_list:
            #find_next_sibling() returns None when the header is the last element:
            #treat that as "not followed by another header" instead of crashing
            next_sibling = row.find_next_sibling()
            if next_sibling is not None and next_sibling.name in header_list:
                continue    #Header immediately followed by another target header: skip it
            if filename_sanitized is not None:
                #A sub-paragraph is in progress: close it before starting the new one
                flush(filename_sanitized, full_paragraph_txt, full_sub_paragraph_raw)
                full_paragraph_txt = ""
                full_sub_paragraph_raw = []
            full_paragraph_txt += row.text + ". "   #Header text, terminated by a dot and a space
            full_sub_paragraph_raw.append(row)
            filename_sanitized = sanitize_filename(row.text.replace(" ", "_").lower()[:25] + ".txt") #File name cut to the first 25 characters
        else:
            full_paragraph_txt += row.text + " "    #Body row, followed by a space
            full_sub_paragraph_raw.append(row)
    #The last sub-paragraph is still pending at the end of the loop
    if filename_sanitized is None:
        if not full_paragraph_txt:
            return
        #No usable header was found: name the file after the first row rather than failing
        filename_sanitized = sanitize_filename(full_paragraph_raw[0].text.replace(" ", "_").lower()[:25] + ".txt")
    flush(filename_sanitized, full_paragraph_txt, full_sub_paragraph_raw)

def h6_splitter(
        full_paragraph_raw,
        chapter_folder_path,
        token_subdivider,
        maximum_token_lenght,
        ):
    """Split one paragraph into files at its <h3>..<h6> headers.

    A sub-folder named after the first row (first 15 sanitized characters) is
    created under ``chapter_folder_path``. Each accumulated sub-paragraph is
    written to ``<header>.txt``. Since there is no deeper header level to
    descend to, a sub-paragraph whose estimated token count exceeds
    ``maximum_token_lenght`` is cut in half (``<header>-1.txt``,
    ``<header>-2.txt``), and any half that is still too long is cut once more
    (``<header>-<half>-<quarter>.txt``). Quarters are written without a
    further length check.

    Blind halving may cut sentences and lose context, so a manual check of the
    produced files is recommended.

    Args:
        full_paragraph_raw: Non-empty sequence of parsed rows exposing ``.name``,
            ``.text`` and ``.find_next_sibling()`` (presumably BeautifulSoup
            elements, the first one being the paragraph header - verify
            against the caller).
        chapter_folder_path: Folder in which the paragraph sub-folder is created.
        token_subdivider: Approximate number of characters per token.
        maximum_token_lenght: Maximum estimated tokens allowed in a single file.
    """
    header_list = ['h3', 'h4', 'h5', 'h6']  #Target headers for this split cycle
    file_path = os.path.join(
        chapter_folder_path,
        sanitize_filename(full_paragraph_raw[0].text.replace(" ", "_").lower()[:15]) #Sanitized folder name cut to the first 15 characters
        )
    os.makedirs(file_path, exist_ok=True)   #Create the folder if it doesn't exist

    def write_text(filename, text):
        #Write one output file inside the paragraph folder
        with open(os.path.join(file_path, filename), "w", encoding="utf-8") as f:
            f.write(text)

    def flush(filename, text):
        #Save the accumulated sub-paragraph, halving (and at most quartering) it when too long
        text_tokens = len(text)//token_subdivider #Token estimate: characters divided by the approximate token length
        if text_tokens <= maximum_token_lenght:
            write_text(filename, text)
            return
        #All these print are for debug purposes, you can remove them
        print("Still too long..." + str(text_tokens))
        print("Proceding to half split")
        print("")
        print(filename)
        base_name = filename.replace(".txt", "")
        #enumerate() advances the half counter on every iteration, so the files of
        #the second half can never overwrite those of the first half
        for i1, split in enumerate(split_string_in_half(text), start=1):
            split_tokens = len(split)//token_subdivider #Measure this half, not the whole paragraph
            if split_tokens > maximum_token_lenght:
                print("Still still too long..." + str(split_tokens))
                print("Proceding to second half split")
                print("")
                print(filename)
                for i2, split2 in enumerate(split_string_in_half(split), start=1):
                    write_text(base_name + "-" + str(i1) + "-" + str(i2) + ".txt", split2)
            else:
                write_text(base_name + "-" + str(i1) + ".txt", split)

    filename_sanitized = None   #None until the first usable header has been met
    full_sub_paragraph_raw = [] #Rows of the sub-paragraph being accumulated
    full_paragraph_txt = ""     #Text of the sub-paragraph being accumulated
    for row in full_paragraph_raw:
        if row.name in header_list:
            #find_next_sibling() returns None when the header is the last element:
            #treat that as "not followed by another header" instead of crashing
            next_sibling = row.find_next_sibling()
            if next_sibling is not None and next_sibling.name in header_list:
                continue    #Header immediately followed by another target header: skip it
            if filename_sanitized is not None:
                #A sub-paragraph is in progress: close it and reset the accumulators in
                #every case, so the new header always starts a fresh sub-paragraph
                flush(filename_sanitized, full_paragraph_txt)
                full_paragraph_txt = ""
                full_sub_paragraph_raw = []
            full_paragraph_txt += row.text + ". "   #Header text, terminated by a dot and a space
            full_sub_paragraph_raw.append(row)
            filename_sanitized = sanitize_filename(row.text.replace(" ", "_").lower()[:25] + ".txt")    #File name cut to the first 25 characters
        else:
            full_paragraph_txt += row.text + " "    #Body row, followed by a space
            full_sub_paragraph_raw.append(row)
    #The last sub-paragraph is still pending at the end of the loop
    if filename_sanitized is None:
        if not full_paragraph_txt:
            return
        #No usable header was found: name the file after the first row rather than failing
        filename_sanitized = sanitize_filename(full_paragraph_raw[0].text.replace(" ", "_").lower()[:25] + ".txt")
    flush(filename_sanitized, full_paragraph_txt)

#File closure could be delegated to its own function, but for the purpose of this script it's not necessary
#A lot of cleanup could be done on this code, but for its limited scope is not necessary. In case webpage structure changes, the code must be updated and maybe this time done properly
        
#Function to split a string in half
def split_string_in_half(s):
    """Return ``s`` cut at its midpoint as a two-item list ``[first, second]``.

    The cut index is ``len(s) // 2``, so for an odd length the second item
    gets the extra character.
    """
    cut = len(s) // 2
    return [s[:cut], s[cut:]]

In the following cell, we are going to iterate over each guideline URL, download and parse the webpage, and split it into files using the functions declared above.

In [13]:

#Chapters to skip: they do not contain useful information and may pollute the data
chapters_to_skip = ['introduction', 'methods', 'references', 'conflict-of-interest', 'citation-information']
#Since tokens for LLM are about 4 characters long, we can use this to estimate the maximum length of the prompt
token_subdivider = 4        #We use 4 characters as a token length
maximum_token_lenght = 4096 #We use 4096 as the maximum token length for our model; Must be changed based on your model
request_timeout = 30        #Seconds to wait for the website before giving up, so a stalled connection cannot hang the cell

def _save_or_split_paragraph(paragraph_rows, paragraph_text, filename, folder_path, token_subdivider, maximum_token_lenght):
    """Write one <h3> paragraph to ``folder_path/filename``, or pass it to h4_splitter when too long."""
    text_tokens = len(paragraph_text)//token_subdivider #Token estimate: characters divided by the approximate token length
    if text_tokens > maximum_token_lenght:
        #Too long for a single file: split it on the <h4> headers
        h4_splitter(paragraph_rows, folder_path, token_subdivider, maximum_token_lenght)
        return
    #Short enough: save it as a single file
    os.makedirs(folder_path, exist_ok=True)
    with open(os.path.join(folder_path, filename), "w", encoding="utf-8") as f:
        f.write(paragraph_text)

#We iterate on each guideline
for url in all_guidelines_url:
    guideline_index_url = website_root + url    #Guideline index url
    guideline_response = requests.get(guideline_index_url, timeout=request_timeout)
    guideline_response.raise_for_status()   #Fail loudly instead of parsing an HTTP error page
    html_document = guideline_response.text #Page body as HTML text
    guideline_soup = BeautifulSoup(html_document, 'html.parser') #"Soup" object, makes the document easier to navigate
    guideline_index = guideline_soup.find_all('li', {'class': 'guideline-chapters__item'})   #All the chapters of this guideline
    guideline_name = url.rsplit('/', 1)[-1] #Guideline name, taken from the last url segment
    guideline_folder = os.path.join(sanitize_filename(folder_guidelines_root), sanitize_filename(guideline_name)) #Output folder of this guideline
    os.makedirs(guideline_folder, exist_ok=True)
    #Chapter names and urls, in page order
    index_name = []
    index_url = []
    for item in guideline_index:
        index_name.append(item.text)
        index_url.append(item.a['href'])
    number_of_chapters = len(index_name)
    #Not used by this cell; kept in case other cells rely on these names
    guideline_content_raw = []
    full_page_raw = []
    #We iterate on each chapter
    for i, chapter_href in enumerate(index_url):
        guideline_chapter_url = website_root + chapter_href   #Chapter url
        chapter_title_name_from_url = chapter_href.rsplit('/', 1)[-1] #Chapter name, taken from the last url segment
        print("Starting processing: " + chapter_title_name_from_url) #Print for debug purposes
        #We skip the chapters we listed above
        if chapter_title_name_from_url in chapters_to_skip:
            print("Skipping " + chapter_title_name_from_url + " chapter")
            continue
        chapter_response = requests.get(guideline_chapter_url, timeout=request_timeout)
        chapter_response.raise_for_status() #Fail loudly instead of parsing an HTTP error page
        html_document = chapter_response.text   #Page body as HTML text
        guideline_chapter_soup = BeautifulSoup(html_document, 'html.parser')
        #The next few steps could be done in a single line, but for clarity purpose they are split
        guideline_text = guideline_chapter_soup.find('article', {'class': 'guideline-text'}) #The article tag with the guideline text
        if guideline_text is None or not guideline_text.contents:
            #The page layout is not the expected one: report it and move on to the next chapter
            print("No guideline text found, skipping: " + guideline_chapter_url)
            continue
        article_text_in_generator = guideline_text.contents #Direct children of the article
        chapter_title = article_text_in_generator[0].text   #The first child is used as the chapter title
        chapter_folder = chapter_title.lower().replace(" ", "_") #Chapter folder name
        chapter_folder_path = os.path.join(guideline_folder, sanitize_filename(chapter_folder)) #Chapter folder path
        os.makedirs(chapter_folder_path, exist_ok=True)
        #Placeholders: they only reach the disk if the chapter contains no <h3> header at all
        filename_sanitized = "_PLACEHOLDER_FILENAME_"
        definitive_path = "_PLACEHOLDER_PATHNAME_"
        full_paragraph_raw = []    #Rows of the <h3> paragraph being accumulated
        full_paragraph_txt = "_PLACEHOLDER_CONTENT_"    #Text of the <h3> paragraph being accumulated
        full_sub_paragraph_txt = ""
        in_paragraph_loop = 0   #1 while an <h3> paragraph is being accumulated
        #Loop on the chapter content
        for row in article_text_in_generator:
            if row.name == 'h2':    #<h2> headers are skipped
                continue
            if row.name == 'h3':
                #A new <h3> closes the paragraph in progress, if any
                if in_paragraph_loop == 1:
                    _save_or_split_paragraph(
                        full_paragraph_raw,
                        full_paragraph_txt,
                        filename_sanitized,
                        chapter_folder_path,
                        token_subdivider,
                        maximum_token_lenght,
                    )
                #Start the new paragraph from this header
                filename_sanitized = sanitize_filename(row.text.replace(" ", "_").lower()[:25] + ".txt")  #File name cut to the first 25 characters
                full_paragraph_txt = row.text + ". "  #Header text, terminated by a dot and a space
                full_paragraph_raw = [row]
                in_paragraph_loop = 1
            else:   #Any other row belongs to the paragraph in progress
                full_paragraph_txt += row.text + " "  #Body row, followed by a space
                full_paragraph_raw.append(row)
        #At the end of the chapter the last paragraph is still pending: save it or split it
        _save_or_split_paragraph(
            full_paragraph_raw,
            full_paragraph_txt,
            filename_sanitized,
            chapter_folder_path,
            token_subdivider,
            maximum_token_lenght,
        )
        #We reset the variables for a new chapter
        full_paragraph_raw = []
        in_paragraph_loop = 0

    print("End of guideline: " + guideline_index_url + "\n")
        
    

Starting processing: introduction
Skipping introduction chapter
Starting processing: methods
Skipping methods chapter
Starting processing: epidemiology-and-aetiology
Starting processing: classification-and-staging-systems
Starting processing: diagnostic-evaluation
Starting processing: treatment
Split to h5
File: 6.2.1._deferred_treatment.txt
Lenght: 7511

Split to h6
File: 6.2.1.2._active_surveilla.txt
Lenght: 5249

Split to h5
File: 6.2.2._radical_prostatect.txt
Lenght: 6738

Split to h6
File: 6.2.2.3._surgical_techniq.txt
Lenght: 4266

Split to h5
File: 6.2.3._radiotherapy.txt
Lenght: 8293

Split to h6
File: 6.2.3.1._external_beam_ra.txt
Lenght: 5596

Split to h5
File: 6.3.5._adjuvant_treatment.txt
Lenght: 4111

Split to h5
File: 6.4.5._treatment_of_psa-o.txt
Lenght: 9606

Split to h6
File: 6.4.5.1._treatment_of_psa.txt
Lenght: 6063

Split to h5
File: 6.7.8._treatment_after_do.txt
Lenght: 4594

Starting processing: followup
Starting processing: quality-of-life-outcomes-in-prostate-ca

Once extracted, the text must be cleaned.
We do this as a separate step so the cleaning is non-destructive: the cleaning parameters can be changed and the cleaning re-run without downloading everything again.

In [13]:
import cleantext
import re
import glob
import os
from pathvalidate import sanitize_filename

#Root folder that receives the cleaned files
processed_file_folder = "rag_sources"
#Folder holding the raw extracted text, derived from the same guidelines root used by the download step
txt_folder_path = sanitize_filename(folder_guidelines_root)
#The cleaned files are written under <processed_file_folder>/<txt_folder_path>
processed_file_path = os.path.join(processed_file_folder, txt_folder_path)

#Define the function to clean the text
def txt_processer(file_path):
    """Read a UTF-8 text file and return its cleaned content.

    Bracketed numeric citations such as ``[12]`` or ``[3,4,5]`` are removed
    first, then the text is passed to ``cleantext.clean`` with the options
    listed below.

    Args:
        file_path: Path of the text file to clean.

    Returns:
        The value returned by ``cleantext.clean`` for the file content.
    """
    #The context manager closes the file even if reading fails
    with open(file_path, "r", encoding="utf-8") as source_file:
        extracted_text = source_file.read()
    extracted_text_without_citations = re.sub(r'\[\d+(,\d+)*\]', '', extracted_text) #Remove the citations numbers
    cleaned_text = cleantext.clean(
            extracted_text_without_citations,
            clean_all=False,    #If True, it will set all the others parameters to True
            extra_spaces=True,  #Remove extra white spaces
            stemming=False,     #Perform stemming
            stopwords=True,     #Removal of selected language stopwords
            stp_lang='english', #Stopwords language
            lowercase=True,     #Lowercase all the text
        )
    return cleaned_text

#Create the output root if it doesn't exist
os.makedirs(processed_file_path, exist_ok=True)
filename = None #Stays None when no file is found, so the final print cannot fail
#Iterating on each .txt file under the raw text folder, at any depth
for filename in glob.iglob(txt_folder_path + "/**/**.txt", recursive=True):
    print(filename)
    #os.path.split splits the path in two parts: the folders and the file name (extension included)
    source_folder_path, document_file_name = os.path.split(filename)
    #Mirror the folder structure under processed_file_path. The path is rebuilt from the
    #part relative to txt_folder_path, so only the root is swapped even when a sub-folder
    #name happens to contain the same text as txt_folder_path
    relative_folder_path = os.path.relpath(source_folder_path, txt_folder_path)
    processed_folder_path = os.path.normpath(os.path.join(processed_file_path, relative_folder_path))
    #We create the folder if it doesn't exist
    os.makedirs(processed_folder_path, exist_ok=True)
    cleaned_text = txt_processer(filename) #Invoke the function to clean the text
    with open(os.path.join(processed_folder_path, document_file_name), "w", encoding="utf-8") as file:
        file.write(cleaned_text)   #Write the cleaned text on the file

#Report the last processed file, if any
if filename is not None:
    print("Processed: " + filename + "\n")


guidelines\chronic-pelvic-pain\2._methodology\2.1._methods.txt
guidelines\chronic-pelvic-pain\2._methodology\2.2._review.txt
guidelines\chronic-pelvic-pain\3._epidemiology_aetiology_and_pathophysiology\3.3._abdominal_aspects_of.txt
guidelines\chronic-pelvic-pain\3._epidemiology_aetiology_and_pathophysiology\3.4._summary_of_evidence_.txt
guidelines\chronic-pelvic-pain\3._epidemiology_aetiology_and_pathophysiology\3.1._chronic_vi\3.1.1._incidence.txt
guidelines\chronic-pelvic-pain\3._epidemiology_aetiology_and_pathophysiology\3.1._chronic_vi\3.1.2._prevalence.txt
guidelines\chronic-pelvic-pain\3._epidemiology_aetiology_and_pathophysiology\3.1._chronic_vi\3.1.3._influence_on_quali.txt
guidelines\chronic-pelvic-pain\3._epidemiology_aetiology_and_pathophysiology\3.1._chronic_vi\3.1.4._costs.txt
guidelines\chronic-pelvic-pain\3._epidemiology_aetiology_and_pathophysiology\3.1._chronic_vi\3.1._chronic_visceral_pai.txt
guidelines\chronic-pelvic-pain\3._epidemiology_aetiology_and_pathophysiology