In [26]:
# %pip install llama-index-readers-file
# %pip install --u llama-index
# %pip install llama-parse
# %pip install openai
# %pip install langchain
# %pip install --upgrade numpy
# %pip install tiktoken
# %pip install python-dotenv
# %pip install -U sentence-transformers
# %pip install langchain_experimental
# %pip install langchain_openai
# %pip install --upgrade langchain_openai langchain_core


In [27]:
import os
import openai
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader, Document
from llama_parse import LlamaParse  # pip install llama-parse

import nest_asyncio
nest_asyncio.apply()

# import param
# from huggingface_hub import login

# Loading Environment variables:
dotenv_path = 'KEYs.env'  
_ = load_dotenv(dotenv_path)

openai.api_key = os.environ['OPENAI_API_KEY']
PINECONE_API_KEY=os.environ['PINECONE_API_KEY']
LLAMA_CLOUD_API_KEY=os.environ['LLAMA_CLOUD_API_KEY']
# hf_token = os.environ['hf_token']
# login(token=hf_token)

In [28]:
# Read document

parser = LlamaParse(
    api_key=LLAMA_CLOUD_API_KEY,
    result_type="markdown"  # "markdown" and "text" are available
    #verbose=True
)

def get_meta(file_path):
    return {"source": file_path}

file_extractor = {".pdf": parser}
reader = SimpleDirectoryReader(input_dir="C:/Users/marlo/OneDrive/Desktop/Anaconda/Fun/Deep_Learning/Semantic_Search_Project/Database_NEW",
                               #input_files=["013-028l_S3_Klassifikation-Diagnostik-Therapie-Urtikaria_2022-04.pdf"],
                                recursive=True, 
                                num_files_limit=3,
                                file_extractor=file_extractor,
                                filename_as_id=True,
                                file_metadata=get_meta
                                )
docs = reader.load_data()

print(type(docs), "\n")
print(len(docs), "\n")
print(type(docs[0]))
print(type(docs[0].text))

Started parsing the file under job_id e35c1b91-a7c4-46f5-8d5c-fa1f0a090beb
.Started parsing the file under job_id 2c88a1e0-717e-41e7-84d0-21aad27ea33c
....Started parsing the file under job_id 38612bea-8460-4773-be82-3ac738ad7e02
.<class 'list'> 

3 

<class 'llama_index.core.schema.Document'>
<class 'str'>


### 2. Text Normalization

In [29]:
# Normalize Text
import re
# List of expressions to replace
expressions_to_replace = [
    "bzw\.", "z\. B\.", "med\.", "Dr\.", "zit\.", "n\.", "e.V\.", "rer\.", "nat\.", "Prof\.", "B.A\.",
    "Ca\.", "ca\.", "usw\.", "v\. a\.", "p\. p\.", "s\. o\.", "s\. u\.", "sog\.", "u\. a\.", "vs\.",
    "Min\.", "et al\.", "ärztl\.", "evtl\.", "ggf\."
]

for i in range(len(docs)):
    print("iteration:",i)
    # Replace the expressions in the markdown text
    for expression in expressions_to_replace:
        docs[i].text = re.sub(expression, expression.replace("\\.", ""), docs[i].text)

iteration: 0
iteration: 1
iteration: 2


### 4. Chunking Algorithm

In [31]:
# NEW Chunker Alg
'''
Method:
1. Markdown Chunking: First use the MarkdownTextSplitter to do “rule-based chunking” using the titles and headers.
2. Semantic Chunking:
    2.1. Check the chunk size of the resulting chunks
    2.2. If chunk size is below min token threshold (i.e. 20 tokens) --> merge the chunk with the previous chunk
    2.3. If chunk size exceeds the max chunk size of the embedding model (i.e. 512 for e5-multilingual) --> subchunk that chunk with the SemanticChunker.
        2.3.1. Find the chunks with token size above max token limit (512 tokens)
        2.3.2. Split the chunks into subchunks using breakpoint value of 95 (to begin with)
        2.3.3. Check subchunks:
            2.3.3.1 if the created subchunks are smaller than min token limit --> merge with previous subchunk
            2.3.3.2 Substitute the original "parent chunk" with its subchunks (Note: Metadata of parent chunks are copied to children)
        4. Iteratively check again the size each resulting subchunk. (go back to 2.3.1)
        5. If the subchunks cannot be broken down any further, (i.e. SemanticChunker is unable to break at threshold 95) --> reduce threshold to 85

'''
'''
Improvements:
- Whenever there is a header, a new chunk must start. i.e. cannot have header in the middle of chunk
- Embedding similarity idea also for merging when below min limit.
'''

# MarkdownTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
import tiktoken
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from tqdm import tqdm
import re
import json

# helper functions

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    #num_tokens = len(encoding.encode(string, disallowed_special = set()))

    return num_tokens

def semantic_chunker(threshold_amount):
    """Creates an instance of a SemanticChunker. Input threshold value for breakpoint splitting"""
    text_splitter = SemanticChunker(
        OpenAIEmbeddings(),
        breakpoint_threshold_type="percentile",
        #breakpoint_threshold_type="standard_deviation"
        #breakpoint_threshold_type="interquartile"
        breakpoint_threshold_amount = threshold_amount
    )
    return text_splitter

def print_all_chunks(list_of_docs,with_page_content=False, with_num_tokens=True):

    for i in range(len(list_of_docs)):
        num_of_tokens = num_tokens_from_string(list_of_docs[i].page_content, "cl100k_base")
        if with_num_tokens == True:
            print(f"num. of tokens in chunk {i} is: {num_of_tokens}")
        if with_page_content == True:
            #print(f"-----------------------------CHUNK {i} ----------------------------------")
            #print(list_of_docs[i].page_content)
            print(f"---")
            print(list_of_docs[i].page_content)

def find_idxs_below_min_chunk_size(list_of_docs, min_chunk_size=20, show_indices=True):

    idxs_below_min_chunk_size = []

    for i in range(len(list_of_docs)):
        num_of_tokens = num_tokens_from_string(list_of_docs[i].page_content, "cl100k_base")

        # if chunk size is below min_chunk_size, append index to list
        if num_of_tokens < min_chunk_size:
            idxs_below_min_chunk_size.append(i)

    num_of_idxs_in_list = len(idxs_below_min_chunk_size)

    if show_indices==True:
        print(f"{num_of_idxs_in_list} indices below min threshold: {idxs_below_min_chunk_size}")

    return idxs_below_min_chunk_size

def find_idxs_above_max_chunk_size(list_of_docs, max_chunk_size=512):

    idxs_above_max_chunk_size = []

    for i in range(len(list_of_docs)):
        num_of_tokens = num_tokens_from_string(list_of_docs[i].page_content, "cl100k_base")

        # if chunk size is below min_chunk_size, append index to list
        if num_of_tokens > max_chunk_size:
            idxs_above_max_chunk_size.append(i)

    num_of_idxs_in_list = len(idxs_above_max_chunk_size)

    print(f"{num_of_idxs_in_list} indices above max threshold: {idxs_above_max_chunk_size}")

    return idxs_above_max_chunk_size

def remove_below_min_chunks_list(list_of_docs, list_blw_min_idx, min_chunk_size=20):

    # inverse list order to not get indexing problems
    sorted_max_indices = sorted(list_blw_min_idx, reverse=True)
    #print(sorted_max_indices)

    for m in sorted_max_indices:

        num_of_tokens = num_tokens_from_string(list_of_docs[m].page_content, "cl100k_base")

        if num_of_tokens < min_chunk_size and m != 0:
            update_data = {
                "page_content": list_of_docs[m-1].page_content + " \n" + list_of_docs[m].page_content,
            }
            new_doc_node = list_of_docs[m-1].copy(update = update_data)

            # delete the two old nodes and subsitute it with the new node
            list_of_docs.pop(m)
            list_of_docs.pop(m-1)

            #insert new merged node into list
            list_of_docs.insert(m-1,new_doc_node)

def list_of_str_2_list_of_docs(list_of_str, doc_obj):

    list_of_docs = []

    for n in range(len(list_of_str)):

        data = {
            "page_content": list_of_str[n]
        }
        new_doc_obj = doc_obj.copy(update = data)
        list_of_docs.append(new_doc_obj)

    #print(list_of_docs)
    return list_of_docs

def remove_above_max_chunks_list(list_of_docs, list_abv_max_idx, breakpoint_thresh_value=95):

    # inverse list order to not get indexing problems
    sorted_max_indices = sorted(list_abv_max_idx, reverse=True)
    #print(sorted_max_indices)

    for m in tqdm(sorted_max_indices):

            # semantically split chunk
            subchunks = semantic_chunker(breakpoint_thresh_value).split_text(list_of_docs[m].page_content)

            # right now subchunks is a list of str (i.e. each element only contains text).
            # convert this list of strings into a list of docs
            node = list_of_docs[m].copy() # create copy of parent node (needed for list_of_str_2_list_of_docs fct.)
            docs = list_of_str_2_list_of_docs(subchunks, node)

            # Check, that the created subchunks are of token size > min_chunk_size (=20) --> if not, merge them with previous subchunk
            subchunk_idxs_below_min_thresh = find_idxs_below_min_chunk_size(docs, min_chunk_size=100, show_indices=False)
            remove_below_min_chunks_list(docs, subchunk_idxs_below_min_thresh, min_chunk_size=100)

            num_of_subchunks=len(docs)

            #create copy of the node
            copy_node_2 = list_of_docs[m].copy()
            #print(copy_node)

            # delete node from list
            list_of_docs.pop(m)

            for n in reversed(range(num_of_subchunks)):

                update_data = {
                    "page_content": docs[n].page_content
                }
                copy_node_2_updated = copy_node_2.copy(update = update_data)

                #insert new merged node into list
                list_of_docs.insert(m, copy_node_2_updated)

def write_results_in_txt(list_of_docs,leitlinien_doc,list_of_above_max_indices, output_file):
    with open(output_file, 'w', encoding='utf-8') as file:
        file.write(f"-----------------------------RESULTS ----------------------------------\n")
        file.write(f"Leitlinien doc: {leitlinien_doc} \n")
        file.write(f"final num. of chunks: {len(list_of_docs)}\n")
        file.write(f"chunks over max. token limit: {list_of_above_max_indices}\n")

        for i in range(len(list_of_docs)):
            num_of_tokens = num_tokens_from_string(list_of_docs[i].page_content, "cl100k_base")
            file.write(f"num. of tokens in chunk {i} is: {num_of_tokens} \n")
            file.write(f"-----------------------------CHUNK {i} ----------------------------------\n")
            file.write(list_of_docs[i].page_content + '\n')

# Page number functions
'''
Method: "Unsexy but works"
1. Read Markdown Text. Whenever the page delimiter (i.e. "---") is encountered, write the current page count above the line (e.g. (Page x)) (Note: Llamaparse reliably detects all page breaks)
2. For each Header (marked by ###,##,#) write page number into the Header
3. Let MarkdownHeaderTextSplitter split the text (During Chunking). This will automatically create chunks with metadata containing "Header" information.
4. Now we extract the Page number previously written into the Header, and write it as a new entry in the metadata dictionary under "Page Number"
5. Delete all the (Page x) added previously to the headers and page_content to restore our original markdown text.

'''

def count_and_update_delimiters(markdown_text):
    # Compile regex pattern to match "---" delimiters on separate lines
    delimiter_pattern = re.compile(r'^\s*---\s*$', re.MULTILINE)

    # Initialize count
    delimiter_count = 0

    # Function to replace each "---" delimiter with "--- {count} delimiter(s) found"
    def replace(match):
        nonlocal delimiter_count
        delimiter_count += 1
        return f'\n(Page {delimiter_count})\n---\n'

    # Replace each "---" delimiter
    updated_markdown_text = re.sub(delimiter_pattern, replace, markdown_text)

    return updated_markdown_text

def modify_headers_with_page_numbers(markdown_text):
    # Compile regex pattern to match headers (###, ##, #) and "page_nr: x"
    header_pattern = re.compile(r'^(#+)\s+(.*)$', re.MULTILINE)
    page_number_pattern = re.compile(r'(Page \s*(\d+))', re.IGNORECASE)

    # Function to replace each header with header + page number
    def replace(match):
        header_level = match.group(1)
        header_text = match.group(2)
        page_nr_match = page_number_pattern.search(markdown_text, match.end())
        if page_nr_match:
            page_nr = page_nr_match.group(1)
            return f'{header_level} {header_text} ({page_nr})'
        else:
            return match.group(0)

    # Replace each header
    updated_markdown_text = re.sub(header_pattern, replace, markdown_text)

    return updated_markdown_text

def update_metadata_with_page_numbers(md_header_splits):
    '''
    This function:
    1. reads the page numbers, previously inserted into the headers, and writes them into a seperate metadata entry: Page Number
    2. deletes all the page numbers inserted previously into the header metadata entries
    '''
    # Compile regex pattern to extract page numbers from headers
    page_number_pattern = re.compile(r'\(Page\s+(\d+)\)')

    for header_split in md_header_splits:
        metadata = header_split.metadata
        page_content = header_split.page_content
        page_numbers = []

        if metadata:
            last_header_value = list(metadata.values())[-1]
            page_number_match = page_number_pattern.search(last_header_value)
            if page_number_match:
                page_number = int(page_number_match.group(1))
                page_numbers.append(page_number)
                metadata['Page Number'] = page_numbers
            for key, value in metadata.items():
                if re.search(r'\bHeader\b', key):
                  # Remove the page number and surrounding brackets from the header
                  value_without_page_number = re.sub(page_number_pattern, '', value).strip()

                  # Update the metadata dictionary
                  metadata[key] = value_without_page_number
                if key == 'Page Number':
                    page_nums = set(int(match.group(1)) for match in page_number_pattern.finditer(page_content))
                    for num in page_nums:
                        if num not in page_numbers:
                            page_numbers.append(num)
                            metadata['Page Number'] = page_numbers

def update_page_content(md_header_splits):
    '''
    This function deletes all the page numbers (i.e. (Page x)) inserted previously into the page content
    '''
    # Compile regex pattern to match (Page x)
    page_number_pattern = re.compile(r'\(Page\s+\d+\)')

    for header_split in md_header_splits:
        page_content = header_split.page_content
        # Remove (Page x) from the page content
        updated_page_content = re.sub(page_number_pattern, '', page_content)
        # Update the page_content in md_header_splits
        header_split.page_content = updated_page_content

def update_metadata_with_source(md_header_splits,guidline_metadata):
    '''
    This function includes the document source into the final chunks metadata
    '''
    for header_split in md_header_splits:
        header_split.metadata['source'] = guidline_metadata['Guideline_Name']

def update_metadata_with_validity(md_header_splits, guidline_metadata):
    '''
    This function includes the document source into the final chunks metadata
    '''
    for header_split in md_header_splits:
        # split.metadata['Page Number'][0] +=1
        if "abgelaufen" not in guidline_metadata['Guideline_Name']:
            header_split.metadata["Gültigkeit"] = "Gültig"
        else:
            header_split.metadata["Gültigkeit"] = "Abgelaufen"

def update_metadata_with_Fachgesellschaft(md_header_splits, guidline_metadata):
    '''
    This function includes the Fachgesellschaft into the final chunks metadata
    '''
    for header_split in md_header_splits:
        header_split.metadata['Fachgesellschaft'] = guidline_metadata['Fachgesellschaft']

def update_metadata_with_href(md_header_splits, guidline_metadata):
    '''
    This function includes the download_href into the final chunks metadata
    '''
    for header_split in md_header_splits:
        header_split.metadata['href'] = guidline_metadata['download_href']

# 1. MarkdownTextSplitter

headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

markdown_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on, strip_headers=False)

# Settings
min_chunk_size = 300
max_chunk_size = 1000 #ada: 8191, e-5: 512

file_path = 'guideline_metadata.json'
with open(file_path, 'r', encoding='utf-8') as file:
    json_metadata = json.load(file)

# List containing all chunks
doc_chunks = []

d = 0
for d in range(len(docs)):

    documents = docs[d]
    source_name = documents.metadata['source'].split('\\')[-1]
    print("source_name:",source_name)
    guidline_metadata = json_metadata[source_name]
    print("guidline_metadata:",guidline_metadata)
    
    # add page number data into markdown
    updated_text = count_and_update_delimiters(documents.text)
    modified_text = modify_headers_with_page_numbers(updated_text)

    # Perform Markdown Text Splitting
    md_header_splits = markdown_splitter.split_text(modified_text)
    #md_header_splits = markdown_splitter.split_text(documents[0].text)

    # add page metadata
    update_metadata_with_page_numbers(md_header_splits)
    update_metadata_with_source(md_header_splits, guidline_metadata)
    update_metadata_with_validity(md_header_splits, guidline_metadata)
    update_metadata_with_Fachgesellschaft(md_header_splits, guidline_metadata)
    update_metadata_with_href(md_header_splits, guidline_metadata)
    update_page_content(md_header_splits)

    print("----------------------------------MarkdownHeaderTextSplitter----------------------------------")

    print(f"num. of chunks after MarkdownHeaderTextSplitter: {len(md_header_splits)}")

    for i in range(len(md_header_splits)):
        str_ = num_tokens_from_string(md_header_splits[i].page_content, "cl100k_base")
        print(f"num. of tokens in chunk {i} is: {str_}")
        print(md_header_splits[i].metadata)

    print("Indices that are below min/above max token size: ")
    list_of_below_min_indices = find_idxs_below_min_chunk_size(md_header_splits, min_chunk_size)
    list_of_above_max_indices= find_idxs_above_max_chunk_size(md_header_splits, max_chunk_size)
    i = 0
    #num_of_idxs_in_list = []
    previous_indices = None
    breakpoint_thresh_value = 95

    print("----------------------------------MIN_MAX_SUBCHUNKING:----------------------------------")
    while len(list_of_below_min_indices) > 0 or len(list_of_above_max_indices) > 0:

        list_of_below_min_indices = find_idxs_below_min_chunk_size(md_header_splits, min_chunk_size)

        if len(list_of_below_min_indices) > 0:
            remove_below_min_chunks_list(md_header_splits, list_of_below_min_indices, min_chunk_size)
            print(f"-----------------------CHUNKS AFTER MIN REMOVAL {i}--------------------------")
            print(f"num. of chunks after {i} MIN_SUBCHUNKING: {len(md_header_splits)}")
            #print_all_chunks(md_header_splits, with_page_content=False)

        list_of_above_max_indices= find_idxs_above_max_chunk_size(md_header_splits, max_chunk_size)

        # Check, if two consecutive numbers in the num_of_idxs_in_list = [] are the same, decrease the breakpoint threshold by 10.
        if previous_indices is not None and list_of_above_max_indices == previous_indices and breakpoint_thresh_value!=75:
            breakpoint_thresh_value=breakpoint_thresh_value-10

        if list_of_above_max_indices == previous_indices and breakpoint_thresh_value==75:
            print("Finished!")
            break

        # Update the previous indices for the next iteration
        previous_indices = list_of_above_max_indices

        if len(list_of_above_max_indices) > 0:
            remove_above_max_chunks_list(md_header_splits, list_of_above_max_indices, breakpoint_thresh_value)
            print(f"-----------------------CHUNKS AFTER MAX REMOVAL {i}--------------------------")
            print(f"num. of chunks after {i} MAX_SUBCHUNKING: {len(md_header_splits)}")
            #print_all_chunks(md_header_splits, with_page_content=False)

        #print(md_header_splits)
        print(len(md_header_splits))

        i = i+1

    print("-----------------------FINAL CHUNKS--------------------------")

    doc_chunks.append(md_header_splits)
    print_all_chunks(md_header_splits, with_page_content=False)

    leitlinien_doc= documents.id_
    output_file = f"chunking_results_{d}.txt"
    open(output_file, 'w').close()
    write_results_in_txt(md_header_splits,leitlinien_doc,list_of_above_max_indices, output_file)


source_name: 001-005l_S1_Rueckenmarksnahe-Regionalanaesthesien-Thrombembolieprophylaxe-antithrombotische-Medikation_2021-10_1.pdf
guidline_metadata: {'Fachgesellschaft': ['Anästhesiologie & Intensivmedizin'], 'download_href': 'https://register.awmf.org/assets/guidelines/001-005l_S1_Rueckenmarksnahe-Regionalanaesthesien-Thrombembolieprophylaxe-antithrombotische-Medikation_2021-10_1.pdf', 'Guideline_Name': '001-005l_S1_Rueckenmarksnahe-Regionalanaesthesien-Thrombembolieprophylaxe-antithrombotische-Medikation_2021-10_1.pdf'}
----------------------------------MarkdownHeaderTextSplitter----------------------------------
num. of chunks after MarkdownHeaderTextSplitter: 145
num. of tokens in chunk 0 is: 696
{'source': '001-005l_S1_Rueckenmarksnahe-Regionalanaesthesien-Thrombembolieprophylaxe-antithrombotische-Medikation_2021-10_1.pdf', 'Gültigkeit': 'Gültig', 'Fachgesellschaft': ['Anästhesiologie & Intensivmedizin'], 'href': 'https://register.awmf.org/assets/guidelines/001-005l_S1_Rueckenmark

100%|██████████| 21/21 [00:33<00:00,  1.58s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 0--------------------------
num. of chunks after 0 MAX_SUBCHUNKING: 102
102
28 indices below min threshold: [4, 14, 15, 35, 37, 38, 43, 45, 47, 55, 57, 58, 59, 60, 62, 63, 69, 70, 71, 73, 74, 77, 82, 84, 90, 92, 93, 95]
-----------------------CHUNKS AFTER MIN REMOVAL 1--------------------------
num. of chunks after 1 MIN_SUBCHUNKING: 79
20 indices above max threshold: [3, 4, 9, 10, 12, 14, 18, 21, 22, 23, 25, 26, 29, 30, 40, 47, 50, 57, 71, 72]


100%|██████████| 20/20 [00:31<00:00,  1.59s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 1--------------------------
num. of chunks after 1 MAX_SUBCHUNKING: 105
105
16 indices below min threshold: [4, 12, 17, 21, 35, 42, 45, 58, 65, 66, 67, 72, 94, 95, 96, 98]
-----------------------CHUNKS AFTER MIN REMOVAL 2--------------------------
num. of chunks after 2 MIN_SUBCHUNKING: 91
21 indices above max threshold: [3, 5, 6, 10, 13, 15, 17, 18, 22, 25, 28, 30, 32, 34, 35, 39, 40, 50, 61, 68, 84]


100%|██████████| 21/21 [00:32<00:00,  1.57s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 2--------------------------
num. of chunks after 2 MAX_SUBCHUNKING: 110
110
11 indices below min threshold: [4, 7, 8, 14, 18, 24, 25, 41, 67, 79, 103]
-----------------------CHUNKS AFTER MIN REMOVAL 3--------------------------
num. of chunks after 3 MIN_SUBCHUNKING: 101
20 indices above max threshold: [3, 5, 7, 11, 14, 17, 19, 21, 25, 28, 32, 33, 35, 37, 40, 50, 60, 71, 78, 94]


100%|██████████| 20/20 [00:26<00:00,  1.33s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 3--------------------------
num. of chunks after 3 MAX_SUBCHUNKING: 113
113
10 indices below min threshold: [4, 13, 17, 21, 39, 43, 48, 70, 82, 106]
-----------------------CHUNKS AFTER MIN REMOVAL 4--------------------------
num. of chunks after 4 MIN_SUBCHUNKING: 103
18 indices above max threshold: [3, 5, 7, 11, 14, 17, 22, 26, 29, 35, 37, 39, 42, 52, 62, 73, 80, 96]


100%|██████████| 18/18 [00:22<00:00,  1.28s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 4--------------------------
num. of chunks after 4 MAX_SUBCHUNKING: 112
112
9 indices below min threshold: [4, 13, 17, 21, 39, 43, 69, 81, 105]
-----------------------CHUNKS AFTER MIN REMOVAL 5--------------------------
num. of chunks after 5 MIN_SUBCHUNKING: 103
18 indices above max threshold: [3, 5, 7, 11, 14, 17, 22, 26, 29, 35, 37, 39, 42, 52, 62, 73, 80, 96]


100%|██████████| 18/18 [00:22<00:00,  1.25s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 5--------------------------
num. of chunks after 5 MAX_SUBCHUNKING: 126
126
17 indices below min threshold: [4, 12, 14, 18, 22, 28, 33, 43, 47, 56, 68, 79, 80, 91, 93, 117, 119]
-----------------------CHUNKS AFTER MIN REMOVAL 6--------------------------
num. of chunks after 6 MIN_SUBCHUNKING: 109
17 indices above max threshold: [3, 5, 7, 11, 14, 17, 22, 26, 29, 35, 38, 42, 46, 57, 67, 78, 102]


100%|██████████| 17/17 [00:20<00:00,  1.23s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 6--------------------------
num. of chunks after 6 MAX_SUBCHUNKING: 131
131
20 indices below min threshold: [4, 12, 14, 18, 22, 28, 33, 37, 38, 49, 53, 54, 61, 62, 74, 85, 86, 98, 122, 124]
-----------------------CHUNKS AFTER MIN REMOVAL 7--------------------------
num. of chunks after 7 MIN_SUBCHUNKING: 114
12 indices above max threshold: [3, 5, 7, 14, 17, 22, 26, 29, 40, 62, 72, 83]


100%|██████████| 12/12 [00:14<00:00,  1.21s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 7--------------------------
num. of chunks after 7 MAX_SUBCHUNKING: 124
124
10 indices below min threshold: [4, 16, 20, 26, 31, 46, 69, 80, 81, 93]
-----------------------CHUNKS AFTER MIN REMOVAL 8--------------------------
num. of chunks after 8 MIN_SUBCHUNKING: 114
12 indices above max threshold: [3, 5, 7, 14, 17, 22, 26, 29, 40, 62, 72, 83]
Finished!
-----------------------FINAL CHUNKS--------------------------
num. of tokens in chunk 0 is: 696
num. of tokens in chunk 1 is: 321
num. of tokens in chunk 2 is: 707
num. of tokens in chunk 3 is: 1304
num. of tokens in chunk 4 is: 977
num. of tokens in chunk 5 is: 1299
num. of tokens in chunk 6 is: 439
num. of tokens in chunk 7 is: 1017
num. of tokens in chunk 8 is: 898
num. of tokens in chunk 9 is: 823
num. of tokens in chunk 10 is: 964
num. of tokens in chunk 11 is: 785
num. of tokens in chunk 12 is: 777
num. of tokens in chunk 13 is: 703
num. of tokens in chunk 14 is: 1370
num. of tokens 

100%|██████████| 41/41 [01:26<00:00,  2.11s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 0--------------------------
num. of chunks after 0 MAX_SUBCHUNKING: 272
272
45 indices below min threshold: [12, 21, 25, 51, 61, 81, 96, 102, 140, 150, 157, 167, 168, 169, 170, 171, 172, 180, 184, 185, 188, 189, 191, 201, 203, 209, 211, 212, 214, 215, 216, 217, 221, 222, 228, 234, 239, 241, 244, 253, 254, 255, 257, 261, 267]
-----------------------CHUNKS AFTER MIN REMOVAL 1--------------------------
num. of chunks after 1 MIN_SUBCHUNKING: 234
39 indices above max threshold: [10, 12, 13, 15, 21, 27, 31, 41, 49, 54, 55, 60, 75, 80, 81, 85, 87, 89, 92, 94, 123, 138, 141, 146, 153, 164, 171, 183, 194, 197, 198, 201, 205, 209, 211, 219, 221, 225, 228]


100%|██████████| 39/39 [01:02<00:00,  1.61s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 1--------------------------
num. of chunks after 1 MAX_SUBCHUNKING: 288
288
34 indices below min threshold: [38, 87, 94, 107, 114, 163, 170, 177, 179, 191, 199, 213, 224, 225, 227, 228, 231, 235, 236, 237, 240, 242, 251, 254, 263, 264, 267, 271, 272, 273, 274, 278, 279, 282]
-----------------------CHUNKS AFTER MIN REMOVAL 2--------------------------
num. of chunks after 2 MIN_SUBCHUNKING: 257
26 indices above max threshold: [15, 26, 37, 63, 64, 69, 85, 92, 96, 97, 99, 102, 106, 109, 158, 163, 170, 181, 188, 200, 225, 229, 231, 239, 241, 246]


100%|██████████| 26/26 [00:39<00:00,  1.52s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 2--------------------------
num. of chunks after 2 MAX_SUBCHUNKING: 282
282
17 indices below min threshold: [39, 89, 103, 120, 169, 176, 184, 196, 204, 216, 248, 251, 260, 261, 264, 269, 271]
-----------------------CHUNKS AFTER MIN REMOVAL 3--------------------------
num. of chunks after 3 MIN_SUBCHUNKING: 265
19 indices above max threshold: [15, 38, 64, 65, 71, 87, 95, 107, 116, 164, 170, 177, 188, 195, 233, 237, 239, 247, 249]


100%|██████████| 19/19 [00:28<00:00,  1.50s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 3--------------------------
num. of chunks after 3 MAX_SUBCHUNKING: 280
280
13 indices below min threshold: [39, 90, 121, 169, 177, 185, 197, 205, 248, 251, 260, 261, 264]
-----------------------CHUNKS AFTER MIN REMOVAL 4--------------------------
num. of chunks after 4 MIN_SUBCHUNKING: 267
18 indices above max threshold: [15, 38, 64, 65, 71, 88, 96, 118, 166, 172, 179, 190, 197, 235, 239, 241, 249, 251]


100%|██████████| 18/18 [00:27<00:00,  1.52s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 4--------------------------
num. of chunks after 4 MAX_SUBCHUNKING: 281
281
13 indices below min threshold: [39, 73, 91, 122, 178, 186, 198, 206, 249, 252, 261, 262, 265]
-----------------------CHUNKS AFTER MIN REMOVAL 5--------------------------
num. of chunks after 5 MIN_SUBCHUNKING: 268
17 indices above max threshold: [15, 38, 64, 65, 71, 88, 96, 118, 173, 180, 191, 198, 236, 240, 242, 250, 252]


100%|██████████| 17/17 [00:27<00:00,  1.63s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 5--------------------------
num. of chunks after 5 MAX_SUBCHUNKING: 281
281
13 indices below min threshold: [39, 73, 91, 122, 178, 186, 198, 206, 249, 252, 261, 262, 265]
-----------------------CHUNKS AFTER MIN REMOVAL 6--------------------------
num. of chunks after 6 MIN_SUBCHUNKING: 268
17 indices above max threshold: [15, 38, 64, 65, 71, 88, 96, 118, 173, 180, 191, 198, 236, 240, 242, 250, 252]


100%|██████████| 17/17 [00:25<00:00,  1.51s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 6--------------------------
num. of chunks after 6 MAX_SUBCHUNKING: 305
305
28 indices below min threshold: [42, 43, 80, 99, 107, 132, 133, 189, 197, 208, 209, 212, 219, 221, 259, 265, 266, 267, 268, 269, 271, 273, 282, 283, 284, 285, 287, 289]
-----------------------CHUNKS AFTER MIN REMOVAL 7--------------------------
num. of chunks after 7 MIN_SUBCHUNKING: 283
8 indices above max threshold: [78, 96, 183, 190, 210, 256, 264, 267]


100%|██████████| 8/8 [00:15<00:00,  1.88s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 7--------------------------
num. of chunks after 7 MAX_SUBCHUNKING: 297
297
12 indices below min threshold: [79, 98, 186, 194, 215, 261, 263, 271, 273, 275, 276, 281]
-----------------------CHUNKS AFTER MIN REMOVAL 8--------------------------
num. of chunks after 8 MIN_SUBCHUNKING: 286
6 indices above max threshold: [78, 96, 183, 190, 210, 256]


100%|██████████| 6/6 [00:08<00:00,  1.40s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 8--------------------------
num. of chunks after 8 MAX_SUBCHUNKING: 292
292
6 indices below min threshold: [79, 98, 186, 194, 215, 262]
-----------------------CHUNKS AFTER MIN REMOVAL 9--------------------------
num. of chunks after 9 MIN_SUBCHUNKING: 286
6 indices above max threshold: [78, 96, 183, 190, 210, 256]
Finished!
-----------------------FINAL CHUNKS--------------------------
num. of tokens in chunk 0 is: 683
num. of tokens in chunk 1 is: 975
num. of tokens in chunk 2 is: 345
num. of tokens in chunk 3 is: 312
num. of tokens in chunk 4 is: 301
num. of tokens in chunk 5 is: 939
num. of tokens in chunk 6 is: 493
num. of tokens in chunk 7 is: 404
num. of tokens in chunk 8 is: 846
num. of tokens in chunk 9 is: 524
num. of tokens in chunk 10 is: 951
num. of tokens in chunk 11 is: 618
num. of tokens in chunk 12 is: 802
num. of tokens in chunk 13 is: 767
num. of tokens in chunk 14 is: 454
num. of tokens in chunk 15 is: 358
num. of tokens

100%|██████████| 3/3 [00:04<00:00,  1.45s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 0--------------------------
num. of chunks after 0 MAX_SUBCHUNKING: 24
24
3 indices below min threshold: [0, 1, 21]
-----------------------CHUNKS AFTER MIN REMOVAL 1--------------------------
num. of chunks after 1 MIN_SUBCHUNKING: 22
1 indices above max threshold: [1]


100%|██████████| 1/1 [00:01<00:00,  1.34s/it]


-----------------------CHUNKS AFTER MAX REMOVAL 1--------------------------
num. of chunks after 1 MAX_SUBCHUNKING: 23
23
2 indices below min threshold: [0, 1]
-----------------------CHUNKS AFTER MIN REMOVAL 2--------------------------
num. of chunks after 2 MIN_SUBCHUNKING: 22
1 indices above max threshold: [1]


100%|██████████| 1/1 [00:01<00:00,  1.23s/it]

-----------------------CHUNKS AFTER MAX REMOVAL 2--------------------------
num. of chunks after 2 MAX_SUBCHUNKING: 23
23
1 indices below min threshold: [0]
-----------------------CHUNKS AFTER MIN REMOVAL 3--------------------------
num. of chunks after 3 MIN_SUBCHUNKING: 23
0 indices above max threshold: []
23
1 indices below min threshold: [0]
-----------------------CHUNKS AFTER MIN REMOVAL 4--------------------------
num. of chunks after 4 MIN_SUBCHUNKING: 23
0 indices above max threshold: []
Finished!
-----------------------FINAL CHUNKS--------------------------
num. of tokens in chunk 0 is: 209
num. of tokens in chunk 1 is: 967
num. of tokens in chunk 2 is: 483
num. of tokens in chunk 3 is: 570
num. of tokens in chunk 4 is: 816
num. of tokens in chunk 5 is: 380
num. of tokens in chunk 6 is: 821
num. of tokens in chunk 7 is: 747
num. of tokens in chunk 8 is: 372
num. of tokens in chunk 9 is: 576
num. of tokens in chunk 10 is: 770
num. of tokens in chunk 11 is: 582
num. of tokens in




In [32]:
print(md_header_splits[0])
print(len(md_header_splits))

page_content='## Hygieneempfehlungen für die Regionalanästhesie   \nSAWMF online  \naktueller Stand: 11/2014  \npubliziert bei: Das Portal der wissenschaftlichen Medizin  \nAWMF-Register Nr. 001/014 Klasse: S1 \n### S1 Leitlinie   \nHygieneempfehlungen für die Regionalanästhesie - Überarbeitete Handlungsempfehlung des AK Regionalanästhesie der Deutschen Gesellschaft für Anästhesiologie (DGAI) ("Die 10 Gebote")  \n|1*|2*|1|3|4|\n|---|---|---|---|---|\n|K. Kerwat|S. \nSchulz-Stübner|T. Steinfeldt|P. Kessler|T. Volk|\n|P. Gastmeier|C. Geffers|T. Ermert|M.G. Boschin|T.' metadata={'Header 2': 'Hygieneempfehlungen für die Regionalanästhesie', 'Page Number': [1], 'source': '001-014l_S1_Hygieneempfehlungen__RegionalAnästhesie_2014-11-abgelaufen.pdf', 'Gültigkeit': 'Abgelaufen', 'Fachgesellschaft': ['Anästhesiologie & Intensivmedizin'], 'href': 'https://register.awmf.org/assets/guidelines/001-014l_S1_Hygieneempfehlungen__RegionalAnästhesie_2014-11-abgelaufen.pdf'}
23


### Pincone Loading

In [33]:
from pinecone import Pinecone
from sentence_transformers import SentenceTransformer

pinecone = Pinecone(api_key=PINECONE_API_KEY)

INDEX_NAME = 'pinecone-test'
index = pinecone.Index(INDEX_NAME)
index.describe_index_stats()

model = SentenceTransformer("BAAI/bge-small-en-v1.5") # 'all-MiniLM-L6-v2'

In [37]:
# Preprocessing Data Load for splits
vector_limit=10

splits = md_header_splits[:vector_limit]
splits_metadata_source = []
splits_metadata_page = []
splits_metadata_Gültigkeit = []
splits_page_content = []
#preprocess
for i in range(vector_limit):
    splits_page_content.append(splits[i].page_content)
    metadata = splits[i].metadata
    print(metadata)
    print(metadata["source"])
    splits_metadata_source.append(metadata['source'])
    splits_metadata_page.append(metadata['Page Number'][0])
    splits_metadata_Gültigkeit.append(metadata['Gültigkeit'])
    #splits_metadata.append(splits[i].metadata)


print(splits_metadata_source)
print(len(splits_metadata_source))

print(splits_metadata_page)
print(len(splits_metadata_page))

print(splits_page_content)
print(len(splits_page_content))

{'Header 2': 'Hygieneempfehlungen für die Regionalanästhesie', 'Page Number': [1], 'source': '001-014l_S1_Hygieneempfehlungen__RegionalAnästhesie_2014-11-abgelaufen.pdf', 'Gültigkeit': 'Abgelaufen', 'Fachgesellschaft': ['Anästhesiologie & Intensivmedizin'], 'href': 'https://register.awmf.org/assets/guidelines/001-014l_S1_Hygieneempfehlungen__RegionalAnästhesie_2014-11-abgelaufen.pdf'}
001-014l_S1_Hygieneempfehlungen__RegionalAnästhesie_2014-11-abgelaufen.pdf
{'Header 2': 'Hygieneempfehlungen für die Regionalanästhesie', 'Header 3': 'S1 Leitlinie', 'Page Number': [1], 'source': '001-014l_S1_Hygieneempfehlungen__RegionalAnästhesie_2014-11-abgelaufen.pdf', 'Gültigkeit': 'Abgelaufen', 'Fachgesellschaft': ['Anästhesiologie & Intensivmedizin'], 'href': 'https://register.awmf.org/assets/guidelines/001-014l_S1_Hygieneempfehlungen__RegionalAnästhesie_2014-11-abgelaufen.pdf'}
001-014l_S1_Hygieneempfehlungen__RegionalAnästhesie_2014-11-abgelaufen.pdf
{'Header 2': 'Hygieneempfehlungen für die Regi

In [38]:
# Creating new index
from pinecone import Pinecone
from pinecone import ServerlessSpec
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("BAAI/bge-small-en-v1.5") # 'all-MiniLM-L6-v2'

pinecone = Pinecone(api_key=PINECONE_API_KEY)

INDEX_NAME = 'leitliniengpt-vdb'

# if INDEX_NAME in [index.name for index in pinecone.list_indexes()]:
#     pinecone.delete_index(INDEX_NAME)
# print(INDEX_NAME)
# pinecone.create_index(name=INDEX_NAME, 
#     dimension=model.get_sentence_embedding_dimension(),      #  dimension=384 - dimensionality of bge-small-en-v1.5
#     metric='cosine',
#     spec=ServerlessSpec(cloud='aws', region='eu-west-1'))

In [39]:
index = pinecone.Index(INDEX_NAME)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 10}},
 'total_vector_count': 10}

In [None]:
# for i in tqdm(range(len(splits))):

#     # create metadata batch
#     metadatas = {'page_content': splits_page_content[i],'source': splits_metadata_source[i], 'page': splits_metadata_page[i], 'Gültigkeit': splits_metadata_Gültigkeit[i]}
#     # create embeddings
#     xc = model.encode(splits_page_content[i])
#     xc = xc.tolist()
#     print(xc)
#     # create records list for upsert
#     records =   {
#                 "id": i, 
#                 "values": xc, 
#                 "metadata": metadatas
#                 }
#     print(records)
#     print(type(xc[0]))
#     # upsert to Pinecone
#     index.upsert(vectors=records)

  0%|          | 0/10 [00:00<?, ?it/s]

  0%|          | 0/10 [00:00<?, ?it/s]

[-0.016539108008146286, 0.018893864005804062, -0.018540034070611, 0.02588261291384697, 0.03043079376220703, 0.036336399614810944, -0.0036059461999684572, 0.022562047466635704, 0.017635414376854897, -0.057421304285526276, 0.03999806195497513, -0.09104007482528687, -0.010623509995639324, -0.013552345335483551, 0.01217682845890522, 0.011277548037469387, 0.006350613199174404, 0.02119830995798111, -0.03916080296039581, -0.0041095116175711155, 0.07600763440132141, -0.05886286124587059, 0.03582432121038437, 0.03411406651139259, 0.03134357929229736, 0.0073353610932827, 0.022068919613957405, -0.03944979980587959, -0.01054752990603447, -0.1930847316980362, 0.031758010387420654, -0.04000283032655716, 0.00982184149324894, -0.0259333997964859, -0.05434231832623482, 0.006540659815073013, -0.0564926341176033, 0.04913867637515068, -0.03705724701285362, -0.018569467589259148, -0.01105196587741375, 0.021430788561701775, 0.003682562615722418, -0.0212184377014637, -0.02183716930449009, -0.0036817705258727




ValueError: Invalid vector value passed: cannot interpret type <class 'str'>

In [40]:
print(len(splits))

10


In [41]:
# Upserting Data to pinecone
batch_size=5

for i in tqdm(range(0, len(splits), batch_size)):
    # find end of batch
    i_end = min(i+batch_size, len(splits))
    # create IDs batch
    ids = [str(x) for x in range(i, i_end)]
    # create metadata batch
    metadatas = [
                    {'text': text,'source': source, 'page': page, 'Gültigkeit': gültigkeit}
                    for text, source, page, gültigkeit in zip(splits_page_content[i:i_end], splits_metadata_source[i:i_end], splits_metadata_page[i:i_end], splits_metadata_Gültigkeit[i:i_end])
                ]

    # create embeddings
    print(splits_page_content[i:i_end])
    xc = model.encode(splits_page_content[i:i_end])
    print(type(xc.tolist()))
    xc = xc.tolist()
    # create records list for upsert
    records = zip(ids, xc, metadatas)
    # upsert to Pinecone
    index.upsert(vectors=records)

  0%|          | 0/2 [00:00<?, ?it/s]

['## Hygieneempfehlungen für die Regionalanästhesie   \nSAWMF online  \naktueller Stand: 11/2014  \npubliziert bei: Das Portal der wissenschaftlichen Medizin  \nAWMF-Register Nr. 001/014 Klasse: S1 \n### S1 Leitlinie   \nHygieneempfehlungen für die Regionalanästhesie - Überarbeitete Handlungsempfehlung des AK Regionalanästhesie der Deutschen Gesellschaft für Anästhesiologie (DGAI) ("Die 10 Gebote")  \n|1*|2*|1|3|4|\n|---|---|---|---|---|\n|K. Kerwat|S. \nSchulz-Stübner|T. Steinfeldt|P. Kessler|T. Volk|\n|P. Gastmeier|C. Geffers|T. Ermert|M.G. Boschin|T.', 'Wiesmann|\n|H. Wulf|1 Klinik für Anästhesie und Intensivtherapie, Universitätsklinikum Marburg des UKGM, Philipps-Universität Marburg|1 Klinik für Anästhesie und Intensivtherapie, Universitätsklinikum Marburg des UKGM, Philipps-Universität Marburg|1 Klinik für Anästhesie und Intensivtherapie, Universitätsklinikum Marburg des UKGM, Philipps-Universität Marburg|1 Klinik für Anästhesie und Intensivtherapie, Universitätsklinikum Marburg 

 50%|█████     | 1/2 [00:02<00:02,  2.05s/it]

['## 9. Diskonnektion erhöhen das Infektionsrisiko   \nDiskonnektionen und Manipulationen an Regionalanästhesiekathetern sind auf das erforderliche Minimum zu beschränken Im Fall einer geplanten Diskonnektion des Systems, z.B. für Nachinjektionen oder Systemwechsel, wird eine Sprühdesinfektion der Konnektionsstelle mit einem alkoholischen Hautdesinfektionsmittel (30s Einwirkzeit) empfohlen \n## 10. Infektzeichen sind Warnsignale für die Patientensicherheit   \nVorgehen bei Infektzeichen:  \na) Im Falle generalisierter Infektzeichen ist ein Regionalanalgesiekatheter grundsätzlich wie Fremdmaterial in Analogie zum zentralen Venenkatheter zu betrachten und muss ggf entfernt bzw gewechselt werden  \nb) Bei lokalen Infektionszeichen (Rötung, Schwellung, Druckschmerz) ist der Regionalanästhesiekatheter zügig zu entfernen  \nc) Bei schwerwiegendem Befund ist eine kalkulierte Antibiotikatherapie in Erwägung zu ziehen  \nd) Treten unter oder nach rückenmarknahen Regionalanästhesieverfahren neur

100%|██████████| 2/2 [00:03<00:00,  1.83s/it]
