In [2]:
# %pip install llama-index-readers-file
# %pip install --upgrade llama-index
# %pip install llama-parse
# %pip install openai
# %pip install langchain
# %pip install --upgrade numpy
# %pip install tiktoken
# %pip install python-dotenv
# %pip install -U sentence-transformers
# %pip install langchain_experimental
# %pip install langchain_openai
# %pip install --upgrade langchain_openai langchain_core
# %pip install pinecone
# %pip install sentence_transformers


In [3]:
import logging
import os

LOG = logging.getLogger(__name__)
# Set the logging level to DEBUG
LOG.setLevel(logging.DEBUG)

# SECURITY: LlamaCloud API keys are no longer stored in the notebook. Any key that was
# ever committed in this cell must be treated as compromised and rotated.
# Keys are read from the environment (optionally populated from the local KEYs.env file):
#   LLAMA_CLOUD_API_KEYS = comma-separated list of keys used for rotation
#   LLAMA_CLOUD_API_KEY  = single-key fallback when no list is configured
try:
    from dotenv import load_dotenv
    load_dotenv('KEYs.env')
except ImportError:
    # python-dotenv missing: variables already exported in the process environment still work.
    pass


def _parse_api_keys(raw):
    """Split a comma-separated string into a list of non-empty, stripped keys."""
    return [key.strip() for key in raw.split(",") if key.strip()]


api_keys = API_KEY_LIST = _parse_api_keys(os.environ.get("LLAMA_CLOUD_API_KEYS", ""))
folder_names = [f"{i}" for i in range(0, 79)]

# Chose Index
index = 0

if api_keys:
    # There are far more folders than keys, so wrap around instead of raising IndexError
    # once the folder index exceeds the number of keys.
    LLAMA_CLOUD_API_KEY = api_keys[index % len(api_keys)]
else:
    LLAMA_CLOUD_API_KEY = os.environ.get("LLAMA_CLOUD_API_KEY")
    if not LLAMA_CLOUD_API_KEY:
        LOG.warning("No LlamaCloud API key configured: set LLAMA_CLOUD_API_KEYS or LLAMA_CLOUD_API_KEY")
folder_name = folder_names[index]

# Never write the key itself to the log; only report which slot was selected.
LOG.debug("Selected LlamaCloud API key slot %d (%d key(s) configured)", index, len(api_keys))
LOG.debug("Set folder_name to %s", folder_name)

In [4]:
import os
import openai
from dotenv import load_dotenv
from llama_index.core import SimpleDirectoryReader, Document
from llama_parse import LlamaParse  # pip install llama-parse

import nest_asyncio
nest_asyncio.apply()

# Secrets are kept in a local dotenv file; load it into the process environment first.
dotenv_path = 'KEYs.env'
_ = load_dotenv(dotenv_path)

# Index the environment directly (no .get) so a missing key fails right here.
_env = os.environ
openai.api_key = _env['OPENAI_API_KEY']
PINECONE_API_KEY = _env['PINECONE_API_KEY']
# LLAMA_CLOUD_API_KEY = _env['LLAMA_CLOUD_API_KEY']


In [5]:
# Read document

# Options for the LlamaParse client that handles every PDF below.
_parser_options = {
    "api_key": LLAMA_CLOUD_API_KEY,
    "result_type": "markdown",  # "markdown" and "text" are available
    # "verbose": True,
}
parser = LlamaParse(**_parser_options)

def get_meta(file_path):
    """Return the per-file metadata dict: the file path stored under the "source" key."""
    metadata = dict(source=file_path)
    return metadata

# Root directory that holds the numbered PDF batches. It can be overridden with the
# LEITLINIEN_INPUT_ROOT environment variable so the notebook is not tied to one machine;
# the default is the original hard-coded location.
input_root = os.environ.get(
    "LEITLINIEN_INPUT_ROOT",
    "C:/Users/marlo/OneDrive/Desktop/Anaconda/Fun/Deep_Learning/Semantic_Search_Project/LeitlinienGPT/New_Folder",
).rstrip("/\\")
input_dir = f"{input_root}/{folder_name}"

# Route every PDF through the LlamaParse client; get_meta records the file path as "source".
file_extractor = {".pdf": parser}
reader = SimpleDirectoryReader(input_dir=input_dir,
                                recursive=True,
                                num_files_limit=2,
                                file_extractor=file_extractor,
                                filename_as_id=True,
                                file_metadata=get_meta
                                )
docs = reader.load_data()

# Logger calls take a %-style format string; passing extra positional values without
# placeholders (as before) breaks message formatting once a handler emits the record.
LOG.debug("docs type: %s", type(docs))
LOG.debug("number of docs: %d", len(docs))
if docs:
    LOG.debug("first doc type: %s", type(docs[0]))
    LOG.debug("first doc text type: %s", type(docs[0].text))
else:
    # Previously docs[0] raised IndexError here when nothing was loaded.
    LOG.warning("No documents were loaded from %s", input_dir)

Started parsing the file under job_id 25d78a8b-64c8-4f82-867f-70afbf11a192
Started parsing the file under job_id 1282ae3b-dd15-451a-95c3-ae9e35288af8


### 2. Text Normalization

In [6]:
# Normalize Text
import re

# Abbreviations whose periods should be dropped during normalisation.
# They are written as literal text (not regex) and escaped when compiled below.
expressions_to_replace = [
    "bzw.", "z. B.", "med.", "Dr.", "zit.", "n.", "e.V.", "rer.", "nat.", "Prof.", "B.A.",
    "Ca.", "ca.", "usw.", "v. a.", "p. p.", "s. o.", "s. u.", "sog.", "u. a.", "vs.",
    "Min.", "et al.", "ärztl.", "evtl.", "ggf."
]


def _abbreviation_rule(abbreviation):
    """Return (compiled pattern, replacement) for one literal abbreviation.

    The pattern only matches when the abbreviation is not glued to a preceding word
    character. Without that guard "n." also matched the end of words such as "werden."
    and removed genuine sentence-ending periods.
    The replacement drops every period that is followed by whitespace or ends the
    abbreviation ("z. B." -> "z B", "e.V." -> "e.V"), the same output as before.
    """
    pattern = re.compile(r"(?<!\w)" + re.escape(abbreviation))
    replacement = re.sub(r"\.(?=\s|$)", "", abbreviation)
    return pattern, replacement


_abbreviation_rules = [_abbreviation_rule(expression) for expression in expressions_to_replace]

for i in range(len(docs)):
    LOG.debug("iteration: %d", i)
    # Replace the expressions in the markdown text
    normalized_text = docs[i].text
    for pattern, replacement in _abbreviation_rules:
        normalized_text = pattern.sub(replacement, normalized_text)
    docs[i].text = normalized_text

### 4. Chunking Algorithm

In [7]:
# NEW Chunker Alg
'''
Method:
1. Markdown Chunking: First use the MarkdownTextSplitter to do “rule-based chunking” using the titles and headers.
2. Semantic Chunking:
    2.1. Check the chunk size of the resulting chunks
    2.2. If chunk size is below min token threshold (i.e. 20 tokens) --> merge the chunk with the previous chunk
    2.3. If chunk size exceeds the max chunk size of the embedding model (i.e. 512 for e5-multilingual) --> subchunk that chunk with the SemanticChunker.
        2.3.1. Find the chunks with token size above max token limit (512 tokens)
        2.3.2. Split the chunks into subchunks using breakpoint value of 95 (to begin with)
        2.3.3. Check subchunks:
            2.3.3.1 if the created subchunks are smaller than min token limit --> merge with previous subchunk
            2.3.3.2 Substitute the original "parent chunk" with its subchunks (Note: Metadata of parent chunks are copied to children)
        4. Iteratively check again the size each resulting subchunk. (go back to 2.3.1)
        5. If the subchunks cannot be broken down any further, (i.e. SemanticChunker is unable to break at threshold 95) --> reduce threshold to 85

'''
'''
Improvements:
- Whenever there is a header, a new chunk must start. i.e. cannot have header in the middle of chunk
- Embedding similarity idea also for merging when below min limit.
'''

# MarkdownTextSplitter
from langchain.text_splitter import MarkdownHeaderTextSplitter
import tiktoken
from langchain_experimental.text_splitter import SemanticChunker
from langchain_openai.embeddings import OpenAIEmbeddings
from tqdm import tqdm
import re
import json

# helper functions

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Return the number of tokens in ``string`` under the named tiktoken encoding.

    Args:
        string: Text to measure.
        encoding_name: Name passed to ``tiktoken.get_encoding`` (e.g. "cl100k_base").

    Returns:
        Length of the encoded token sequence.
    """
    encoding = tiktoken.get_encoding(encoding_name)
    # Document text is only being measured here, so special-token literals such as
    # "<|endoftext|>" are encoded as ordinary text instead of raising an error.
    return len(encoding.encode(string, disallowed_special=()))

def semantic_chunker(threshold_amount):
    """Build a SemanticChunker backed by OpenAIEmbeddings.

    Breakpoints use the "percentile" threshold type; ``threshold_amount`` is passed
    through as the breakpoint threshold amount. A new instance is created on every call.
    """
    splitter_options = {
        "breakpoint_threshold_type": "percentile",
        # alternatives the author tried: "standard_deviation", "interquartile"
        "breakpoint_threshold_amount": threshold_amount,
    }
    return SemanticChunker(OpenAIEmbeddings(), **splitter_options)

def print_all_chunks(list_of_docs, with_page_content=False, with_num_tokens=True):
    """Log a per-chunk summary at DEBUG level.

    Args:
        list_of_docs: Chunks exposing a ``page_content`` string.
        with_page_content: When truthy, log a "---" separator and the chunk text.
        with_num_tokens: When truthy, log the cl100k_base token count of each chunk.
    """
    for i, doc in enumerate(list_of_docs):
        if with_num_tokens:
            # Tokenise only when the count is actually reported.
            num_of_tokens = num_tokens_from_string(doc.page_content, "cl100k_base")
            LOG.debug("num. of tokens in chunk %d is: %d", i, num_of_tokens)
        if with_page_content:
            LOG.debug("---")
            # "%s" keeps literal percent signs in the chunk text from being read as format specs.
            LOG.debug("%s", doc.page_content)

def find_idxs_below_min_chunk_size(list_of_docs, min_chunk_size=20, show_indices=True):
    """Return the indices of chunks with fewer than ``min_chunk_size`` tokens.

    Args:
        list_of_docs: Chunks exposing a ``page_content`` string.
        min_chunk_size: Exclusive lower bound in cl100k_base tokens.
        show_indices: When truthy, log the number of hits and the index list.

    Returns:
        Ascending list of positions in ``list_of_docs`` that are below the threshold.
    """
    idxs_below_min_chunk_size = [
        i
        for i, doc in enumerate(list_of_docs)
        if num_tokens_from_string(doc.page_content, "cl100k_base") < min_chunk_size
    ]

    if show_indices:
        LOG.debug("%d indices below min threshold: %s",
                  len(idxs_below_min_chunk_size), idxs_below_min_chunk_size)

    return idxs_below_min_chunk_size

def find_idxs_above_max_chunk_size(list_of_docs, max_chunk_size=512):
    """Return the indices of chunks with more than ``max_chunk_size`` tokens.

    Token counts use the cl100k_base encoding. The number of hits and the index list
    are always logged at DEBUG level.
    """
    idxs_above_max_chunk_size = [
        position
        for position, chunk in enumerate(list_of_docs)
        # keep only chunks strictly above the maximum
        if num_tokens_from_string(chunk.page_content, "cl100k_base") > max_chunk_size
    ]
    num_of_idxs_in_list = len(idxs_above_max_chunk_size)

    LOG.debug(f"{num_of_idxs_in_list} indices above max threshold: {idxs_above_max_chunk_size}")

    return idxs_above_max_chunk_size

def remove_below_min_chunks_list(list_of_docs, list_blw_min_idx, min_chunk_size=20):
    """Merge undersized chunks into their predecessor, modifying ``list_of_docs`` in place.

    Indices are processed from highest to lowest so that a merge never shifts a position
    that is still waiting to be handled. The token count is re-measured for each index,
    because an earlier merge may already have enlarged that chunk. The chunk at index 0
    has no predecessor and is left untouched. The merged chunk is a copy of the
    predecessor whose text is "<previous> \\n<current>".
    """
    for position in sorted(list_blw_min_idx, reverse=True):
        current = list_of_docs[position]
        still_too_small = num_tokens_from_string(current.page_content, "cl100k_base") < min_chunk_size
        if not still_too_small or position == 0:
            continue

        previous = list_of_docs[position - 1]
        merged = previous.copy(
            update={"page_content": previous.page_content + " \n" + current.page_content}
        )

        # Swap the two old nodes for the single merged node.
        list_of_docs[position - 1:position + 1] = [merged]

def list_of_str_2_list_of_docs(list_of_str, doc_obj):
    """Turn plain strings into document objects modelled on ``doc_obj``.

    Each result is ``doc_obj.copy(update={"page_content": text})``, so everything except
    the text is taken from the template. The order of ``list_of_str`` is preserved.
    """
    return [doc_obj.copy(update={"page_content": text}) for text in list_of_str]

def remove_above_max_chunks_list(list_of_docs, list_abv_max_idx, breakpoint_thresh_value=95,
                                 min_subchunk_size=100):
    """Replace oversized chunks with semantically split sub-chunks, in place.

    Args:
        list_of_docs: Chunk list to modify.
        list_abv_max_idx: Positions of the chunks that exceed the maximum size.
        breakpoint_thresh_value: Percentile threshold handed to ``semantic_chunker``.
        min_subchunk_size: Sub-chunks below this token count are merged into the previous
            sub-chunk (default 100, the value that used to be hard-coded here).

    Each sub-chunk is a copy of its parent with only ``page_content`` replaced, so the
    parent's metadata is carried over to the children.
    """
    if not list_abv_max_idx:
        return

    # Build the splitter (and its embeddings client) once for the whole pass
    # instead of once per oversized chunk.
    splitter = semantic_chunker(breakpoint_thresh_value)

    # Walk from the highest index down so replacements never shift pending positions.
    for m in tqdm(sorted(list_abv_max_idx, reverse=True)):
        parent = list_of_docs[m]

        # split_text returns plain strings; wrap them as copies of the parent node.
        subchunk_texts = splitter.split_text(parent.page_content)
        subchunks = list_of_str_2_list_of_docs(subchunk_texts, parent)

        # Merge sub-chunks that came out too small into their predecessor.
        too_small = find_idxs_below_min_chunk_size(
            subchunks, min_chunk_size=min_subchunk_size, show_indices=False)
        remove_below_min_chunks_list(subchunks, too_small, min_chunk_size=min_subchunk_size)

        # Substitute the parent with its sub-chunks, preserving their order.
        list_of_docs[m:m + 1] = subchunks

def write_results_in_txt(list_of_docs,leitlinien_doc,list_of_above_max_indices, output_file):
    """Write a plain-text chunking report to ``output_file`` (UTF-8, overwritten).

    The report starts with a summary (document id, number of chunks, indices still above
    the token limit) followed by, for every chunk, its cl100k_base token count, a
    separator line and the chunk text.
    """
    summary_lines = (
        f"-----------------------------RESULTS ----------------------------------\n",
        f"Leitlinien doc: {leitlinien_doc} \n",
        f"final num. of chunks: {len(list_of_docs)}\n",
        f"chunks over max. token limit: {list_of_above_max_indices}\n",
    )
    with open(output_file, 'w', encoding='utf-8') as file:
        for line in summary_lines:
            file.write(line)

        for i, chunk in enumerate(list_of_docs):
            num_of_tokens = num_tokens_from_string(chunk.page_content, "cl100k_base")
            file.write(f"num. of tokens in chunk {i} is: {num_of_tokens} \n")
            file.write(f"-----------------------------CHUNK {i} ----------------------------------\n")
            file.write(chunk.page_content + '\n')

# Page number functions
'''
Method: "Unsexy but works"
1. Read Markdown Text. Whenever the page delimiter (i.e. "---") is encountered, write the current page count above the line (e.g. (Page x)) (Note: Llamaparse reliably detects all page breaks)
2. For each Header (marked by ###,##,#) write page number into the Header
3. Let MarkdownHeaderTextSplitter split the text (During Chunking). This will automatically create chunks with metadata containing "Header" information.
4. Now we extract the Page number previously written into the Header, and write it as a new entry in the metadata dictionary under "Page Number"
5. Delete all the (Page x) added previously to the headers and page_content to restore our original markdown text.

'''

def count_and_update_delimiters(markdown_text):
    """Number the page delimiters in ``markdown_text``.

    Every line consisting only of "---" (optionally surrounded by whitespace) is replaced
    by "\\n(Page N)\\n---\\n", where N counts the delimiters from 1 in document order.
    Text without delimiters is returned unchanged.
    """
    delimiter_pattern = re.compile(r'^\s*---\s*$', re.MULTILINE)

    pieces = []
    cursor = 0
    for delimiter_count, hit in enumerate(delimiter_pattern.finditer(markdown_text), start=1):
        # Keep the text before the delimiter, then emit the numbered delimiter.
        pieces.append(markdown_text[cursor:hit.start()])
        pieces.append(f'\n(Page {delimiter_count})\n---\n')
        cursor = hit.end()
    pieces.append(markdown_text[cursor:])

    return ''.join(pieces)

def modify_headers_with_page_numbers(markdown_text):
    """Append "(Page N)" to every markdown header line.

    N is taken from the first page marker that follows the header. Only the exact
    marker written by ``count_and_update_delimiters`` is accepted: a line "(Page N)"
    directly followed by a "---" line. The previous loose, case-insensitive search for
    "Page <number>" could pick up ordinary body text (e.g. "see page 99") and tag the
    header with the wrong page.

    Headers with no marker after them (for example on the last page when the text does
    not end with a delimiter) are returned unchanged.
    """
    header_pattern = re.compile(r'^(#+)\s+(.*)$', re.MULTILINE)
    marker_pattern = re.compile(r'^\(Page (\d+)\)\n---$', re.MULTILINE)

    def replace(match):
        header_level = match.group(1)
        header_text = match.group(2)
        # Search the original text from the end of this header onwards.
        marker = marker_pattern.search(markdown_text, match.end())
        if marker is None:
            return match.group(0)
        return f'{header_level} {header_text} (Page {marker.group(1)})'

    return header_pattern.sub(replace, markdown_text)

def update_metadata_with_page_numbers(md_header_splits):
    '''
    Move the "(Page N)" markers out of the header metadata, in place.

    For every split that has metadata:
    1. the page number found in the LAST metadata value becomes the first entry of a new
       'Page Number' list,
    2. the "(Page N)" marker is removed from every metadata entry whose key contains the
       word "Header",
    3. if a 'Page Number' entry exists, any further page numbers found in the split's
       page_content are appended to it (without duplicates).
    Splits without metadata are skipped.
    '''
    marker = re.compile(r'\(Page\s+(\d+)\)')

    for chunk in md_header_splits:
        meta = chunk.metadata
        body = chunk.page_content
        if not meta:
            continue

        pages = []
        header_hit = marker.search(list(meta.values())[-1])
        if header_hit:
            pages.append(int(header_hit.group(1)))
            meta['Page Number'] = pages

        # Iterate over a snapshot of the keys; values are rewritten while looping.
        for key in list(meta):
            if re.search(r'\bHeader\b', key):
                meta[key] = re.sub(marker, '', meta[key]).strip()
            if key == 'Page Number':
                pages_in_body = set(int(hit.group(1)) for hit in marker.finditer(body))
                for number in pages_in_body:
                    if number not in pages:
                        pages.append(number)
                        meta['Page Number'] = pages

def update_page_content(md_header_splits):
    '''
    Strip every "(Page x)" marker from the page_content of each split, in place.
    '''
    page_number_pattern = re.compile(r'\(Page\s+\d+\)')

    for chunk in md_header_splits:
        chunk.page_content = page_number_pattern.sub('', chunk.page_content)

def update_metadata_with_source(md_header_splits,guidline_metadata):
    '''
    Store the guideline name (guidline_metadata['Guideline_Name']) under 'source'
    in the metadata of every split.
    '''
    for chunk in md_header_splits:
        chunk.metadata.update(source=guidline_metadata['Guideline_Name'])

def update_metadata_with_validity(md_header_splits, guidline_metadata):
    '''
    Store the validity flag under "Gültigkeit" in the metadata of every split:
    "Abgelaufen" when the guideline name contains "abgelaufen", otherwise "Gültig".
    '''
    for chunk in md_header_splits:
        is_expired = "abgelaufen" in guidline_metadata['Guideline_Name']
        chunk.metadata["Gültigkeit"] = "Abgelaufen" if is_expired else "Gültig"

def update_metadata_with_Fachgesellschaft(md_header_splits, guidline_metadata):
    '''
    Copy guidline_metadata['Fachgesellschaft'] into the metadata of every split.
    '''
    for chunk in md_header_splits:
        chunk.metadata.update(Fachgesellschaft=guidline_metadata['Fachgesellschaft'])

def update_metadata_with_href(md_header_splits, guidline_metadata):
    '''
    Copy guidline_metadata['download_href'] into the metadata of every split as 'href'.
    '''
    for chunk in md_header_splits:
        chunk.metadata.update(href=guidline_metadata['download_href'])

# 1. MarkdownTextSplitter

# Split on the first three heading levels: "#" -> "Header 1", "##" -> "Header 2", ...
headers_to_split_on = [("#" * level, f"Header {level}") for level in (1, 2, 3)]

# strip_headers=False keeps the heading line inside the chunk text.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on,
    strip_headers=False,
)

# Settings: token limits used by the min/max sub-chunking loop
min_chunk_size = 300
max_chunk_size = 1000 #ada: 8191, e-5: 512

# Per-guideline metadata, looked up by PDF file name in the chunking loop.
file_path = 'guideline_metadata.json'
with open(file_path, 'r', encoding='utf-8') as file:
    json_metadata = json.load(file)

# Collects one list of final chunks per processed document.
doc_chunks = []

for d, documents in enumerate(docs):

    # The reader stores the full file path under 'source'; keep only the file name.
    # Splitting on both separators works for Windows and POSIX paths alike
    # (the previous split('\\') returned the whole path on POSIX).
    source_name = re.split(r'[\\/]', documents.metadata['source'])[-1]
    LOG.debug("source_name: %s", source_name)
    if source_name not in json_metadata:
        raise KeyError(f"No metadata entry for {source_name!r} in {file_path}")
    guidline_metadata = json_metadata[source_name]
    LOG.debug("guidline_metadata: %s", guidline_metadata)

    # add page number data into markdown
    updated_text = count_and_update_delimiters(documents.text)
    modified_text = modify_headers_with_page_numbers(updated_text)

    # Perform Markdown Text Splitting
    md_header_splits = markdown_splitter.split_text(modified_text)

    # add page metadata, then strip the temporary "(Page x)" markers from the text
    update_metadata_with_page_numbers(md_header_splits)
    update_metadata_with_source(md_header_splits, guidline_metadata)
    update_metadata_with_validity(md_header_splits, guidline_metadata)
    update_metadata_with_Fachgesellschaft(md_header_splits, guidline_metadata)
    update_metadata_with_href(md_header_splits, guidline_metadata)
    update_page_content(md_header_splits)

    LOG.debug("----------------------------------MarkdownHeaderTextSplitter----------------------------------")

    LOG.debug(f"num. of chunks after MarkdownHeaderTextSplitter: {len(md_header_splits)}")

    for i in range(len(md_header_splits)):
        str_ = num_tokens_from_string(md_header_splits[i].page_content, "cl100k_base")
        LOG.debug(f"num. of tokens in chunk {i} is: {str_}")
        LOG.debug("%s", md_header_splits[i].metadata)

    LOG.debug("Indices that are below min/above max token size: ")
    list_of_below_min_indices = find_idxs_below_min_chunk_size(md_header_splits, min_chunk_size)
    list_of_above_max_indices= find_idxs_above_max_chunk_size(md_header_splits, max_chunk_size)
    i = 0
    previous_indices = None
    breakpoint_thresh_value = 95

    LOG.debug("----------------------------------MIN_MAX_SUBCHUNKING:----------------------------------")
    while len(list_of_below_min_indices) > 0 or len(list_of_above_max_indices) > 0:

        list_of_below_min_indices = find_idxs_below_min_chunk_size(md_header_splits, min_chunk_size)

        if len(list_of_below_min_indices) > 0:
            remove_below_min_chunks_list(md_header_splits, list_of_below_min_indices, min_chunk_size)
            LOG.debug(f"-----------------------CHUNKS AFTER MIN REMOVAL {i}--------------------------")
            LOG.debug(f"num. of chunks after {i} MIN_SUBCHUNKING: {len(md_header_splits)}")

        list_of_above_max_indices= find_idxs_above_max_chunk_size(md_header_splits, max_chunk_size)

        # If the oversized indices did not change since the previous round, the splitter made
        # no progress: lower the breakpoint threshold by 10 (95 -> 85 -> 75).
        if previous_indices is not None and list_of_above_max_indices == previous_indices and breakpoint_thresh_value!=75:
            breakpoint_thresh_value=breakpoint_thresh_value-10

        # NOTE(review): this check runs right after the decrement above, so the loop stops as
        # soon as the threshold reaches 75 and no split is attempted at 75 — confirm intent.
        if list_of_above_max_indices == previous_indices and breakpoint_thresh_value==75:
            LOG.debug("Finished!")
            break

        # Update the previous indices for the next iteration
        previous_indices = list_of_above_max_indices

        if len(list_of_above_max_indices) > 0:
            remove_above_max_chunks_list(md_header_splits, list_of_above_max_indices, breakpoint_thresh_value)
            LOG.debug(f"-----------------------CHUNKS AFTER MAX REMOVAL {i}--------------------------")
            LOG.debug(f"num. of chunks after {i} MAX_SUBCHUNKING: {len(md_header_splits)}")

        LOG.debug("%d", len(md_header_splits))

        i = i+1

    LOG.debug("-----------------------FINAL CHUNKS--------------------------")

    doc_chunks.append(md_header_splits)
    print_all_chunks(md_header_splits, with_page_content=False)

    # write_results_in_txt opens the file in 'w' mode itself, so no separate truncation is needed.
    leitlinien_doc= documents.id_
    output_file = f"chunking_results_{d}.txt"
    write_results_in_txt(md_header_splits,leitlinien_doc,list_of_above_max_indices, output_file)


100%|██████████| 25/25 [00:56<00:00,  2.25s/it]
100%|██████████| 25/25 [00:44<00:00,  1.77s/it]
100%|██████████| 16/16 [00:28<00:00,  1.75s/it]
100%|██████████| 13/13 [00:22<00:00,  1.69s/it]
100%|██████████| 13/13 [00:23<00:00,  1.79s/it]
100%|██████████| 10/10 [00:16<00:00,  1.68s/it]
100%|██████████| 9/9 [00:15<00:00,  1.69s/it]
100%|██████████| 8/8 [00:13<00:00,  1.70s/it]
100%|██████████| 38/38 [01:20<00:00,  2.11s/it]
100%|██████████| 23/23 [00:41<00:00,  1.82s/it]
100%|██████████| 19/19 [00:31<00:00,  1.64s/it]
100%|██████████| 16/16 [00:31<00:00,  1.95s/it]
100%|██████████| 15/15 [00:25<00:00,  1.67s/it]
100%|██████████| 15/15 [00:25<00:00,  1.68s/it]
100%|██████████| 3/3 [00:04<00:00,  1.43s/it]
100%|██████████| 2/2 [00:02<00:00,  1.13s/it]


### Pinecone Loading

In [8]:
# Preprocessing Data Load for splits
# vector_limit=10

# One inner list per document, index-aligned with doc_chunks.
splits = doc_chunks
splits_metadata_source = [[] for _ in range(len(splits))]
splits_metadata_page = [[] for _ in range(len(splits))]
splits_metadata_Gültigkeit = [[] for _ in range(len(splits))]
splits_page_content = [[] for _ in range(len(splits))]
previous_page_number = 0  # Initialize previous page number

#preprocess
for i in range(len(splits)):
    # Page numbering restarts with every document; without this reset the fallback below
    # continued from the last page of the previous document.
    previous_page_number = 0
    for j in range(len(splits[i])):
        splits_page_content[i].append(splits[i][j].page_content)
        metadata = splits[i][j].metadata
        LOG.debug("metadata: %s", metadata)
        splits_metadata_source[i].append(metadata['source'])
        splits_metadata_Gültigkeit[i].append(metadata['Gültigkeit'])
        # Extract page number
        if 'Page Number' in metadata:  # Check if 'Page Number' exists
            current_page_number = metadata['Page Number'][0]
            previous_page_number = current_page_number
        else:  # No page recorded for this chunk: fall back to the page after the last known one
            current_page_number = previous_page_number + 1
        splits_metadata_page[i].append(current_page_number)


# LOG.debug(splits_metadata_source)
# LOG.debug(len(splits_metadata_source))

# LOG.debug(splits_metadata_page)
# LOG.debug(len(splits_metadata_page))

# LOG.debug(splits_page_content)
# LOG.debug(len(splits_page_content))

In [9]:
# Creating new index
from pinecone import Pinecone, ServerlessSpec
from sentence_transformers import SentenceTransformer

# NOTE(review): "bge-small-en" looks like an English-language embedding model while the
# guideline text is German — confirm this choice is intended.
model = SentenceTransformer("BAAI/bge-small-en-v1.5") # 'all-MiniLM-L6-v2'

pinecone = Pinecone(api_key=PINECONE_API_KEY)

INDEX_NAME = 'leitliniengpt-vdb'

# This import was indented at top level before, which is an IndentationError.
from tqdm.autonotebook import tqdm


## This code resets pinecone
# WARNING: destructive — an existing index with this name is deleted and recreated empty.
if INDEX_NAME in [existing.name for existing in pinecone.list_indexes()]:
    pinecone.delete_index(INDEX_NAME)
LOG.debug(INDEX_NAME)
pinecone.create_index(name=INDEX_NAME,
    dimension=model.get_sentence_embedding_dimension(),      #  dimension=384 - dimensionality of bge-small-en-v1.5
    metric='cosine',
    spec=ServerlessSpec(cloud='aws', region='eu-west-1'))

In [10]:
# Open a handle to the index named by INDEX_NAME and display its current statistics.
# NOTE(review): this rebinds `index`, which an earlier cell uses as the integer selector
# for the API key / folder — re-running that cell afterwards replaces this handle.
index = pinecone.Index(INDEX_NAME)
index.describe_index_stats()

{'dimension': 384,
 'index_fullness': 0.0,
 'namespaces': {'': {'vector_count': 262}},
 'total_vector_count': 262}

In [11]:
# Number of processed documents, then the number of chunks in each.
LOG.debug("Documents: %d", len(splits))
for i in splits:
    # The value needs a %-placeholder; passing it as a bare extra argument breaks formatting.
    LOG.debug("Chunks: %d", len(i))

In [19]:
# Sanity check: print the first chunk of every document (nothing for an empty document).
for doc_num, doc_chunk_list in enumerate(splits):
    for first_chunk in doc_chunk_list[:1]:
        print(first_chunk.page_content)

publiziert bei: AWMF-Register-Nr. 001-005, Klassifikation S1
## publiziert bei:   
Analgesie, Sedierung und Delirmanagement in der S3-Leitlinie Intensivmedizin (DAS-Leitlinie 2020)  
AWMF-Registernummer: 001/012 
### Federführende Fachgesellschaften   
- Deutsche Gesellschaft für Anästhesiologie und Intensivmedizin (DGAI)
- Deutsche Interdisziplinäre Vereinigung für Intensiv- und Notfallmedizin (DIVI)


In [13]:
## publiziert bei:   \nAnalgesie, Sedierung und Delirmanagement in der S3-Leitlinie Intensivmedizin (DAS-Leitlinie 2020)  \nAWMF-Registernummer: 001/012 \n### Federführende Fachgesellschaften   \n- Deutsche Gesellschaft für Anästhesiologie und Intensivmedizin (DGAI)\n- Deutsche Interdisziplinäre Vereinigung für Intensiv- und Notfallmedizin (DIVI)

SyntaxError: 'break' outside loop (668683560.py, line 1)

In [None]:
# Upserting Data to pinecone
batch_size=5

# The last used vector ID is persisted here so that IDs keep increasing across runs.
id_file_path = "last_used_id.txt"

# Initialize ID counter
current_id = 0

# Check if the ID file exists
if os.path.exists(id_file_path):
    # Read the last used ID from the file; an empty file counts as "start from 0"
    with open(id_file_path, "r") as id_file:
        stored_id = id_file.read().strip()
    current_id = int(stored_id) if stored_id else 0

for doc_num in range(len(splits)):
    num_chunks = len(splits[doc_num])
    for i in tqdm(range(0, num_chunks, batch_size)):
        # find end of batch
        i_end = min(i+batch_size, num_chunks)

        # create IDs batch with increasing numbers
        ids = [str(current_id + x) for x in range(i_end - i)]
        # Increment ID counter
        current_id += i_end - i

        # create metadata batch
        metadatas = [
                        {'text': text,'source': source, 'page': page, 'Gültigkeit': gültigkeit}
                        for text, source, page, gültigkeit in zip(splits_page_content[doc_num][i:i_end], splits_metadata_source[doc_num][i:i_end], splits_metadata_page[doc_num][i:i_end], splits_metadata_Gültigkeit[doc_num][i:i_end])
                    ]

        # create embeddings
        xc = model.encode(splits_page_content[doc_num][i:i_end])
        xc = xc.tolist()
        # create records list for upsert (a real list, not a one-shot zip iterator)
        records = list(zip(ids, xc, metadatas))
        # upsert to Pinecone
        index.upsert(vectors=records)
    # Report the document's own chunk count; i_end was stale or undefined for empty documents.
    LOG.debug(f"Doc {doc_num} with {num_chunks} chunks upserted!")
    LOG.debug(f"Total upserted: {current_id}")

# Write the last used ID to the file
with open(id_file_path, "w") as id_file:
    id_file.write(str(current_id))

  0%|          | 0/5 [00:00<?, ?it/s]

100%|██████████| 5/5 [00:09<00:00,  1.95s/it]


Doc 0 with 21 chunks upserted!
Total upserted: 152


100%|██████████| 22/22 [00:49<00:00,  2.25s/it]

Doc 1 with 110 chunks upserted!
Total upserted: 262





In [None]:
# %pip install PyPDF2
# %pip install PyCryptodome 
import os
from PyPDF2 import PdfReader
import shutil

def count_pdf_pages(pdf_path):
    """Return the number of pages in the PDF at ``pdf_path``.

    Best effort: any error while opening or reading the file is logged at DEBUG level
    and reported as 0 pages instead of being raised.
    """
    try:
        page_total = len(PdfReader(pdf_path).pages)
    except Exception as e:
        LOG.debug(f"Error reading {pdf_path}: {e}")
        page_total = 0
    return page_total

def organize_pdfs(folder_path, max_pages_per_folder=1000, output_root='New_Folder'):
    """Copy the PDFs in ``folder_path`` into numbered sub-folders of ``output_root``.

    Files are handled in sorted name order, so the folder assignment is reproducible.
    A new sub-folder ("0", "1", ...) is started whenever adding the next PDF would push
    the current one above ``max_pages_per_folder``. A single PDF that is larger than the
    limit gets a folder of its own.

    Args:
        folder_path: Directory containing the source ``.pdf`` files.
        max_pages_per_folder: Page budget per sub-folder.
        output_root: Directory in which the numbered sub-folders are created
            (default 'New_Folder', the value that used to be hard-coded).
    """
    os.makedirs(output_root, exist_ok=True)

    pdf_files = sorted(f for f in os.listdir(folder_path) if f.endswith('.pdf'))
    current_folder_count = 0
    current_page_count = 0
    total_pages = 0

    for pdf_file in pdf_files:
        pdf_path = os.path.join(folder_path, pdf_file)
        num_pages = count_pdf_pages(pdf_path)

        # Roll over BEFORE choosing the destination. Previously the overflowing PDF was
        # still copied into the already-full folder. Never roll over from an empty folder.
        if current_page_count > 0 and current_page_count + num_pages > max_pages_per_folder:
            current_folder_count += 1
            current_page_count = 0

        current_folder_path = os.path.join(output_root, f'{current_folder_count}')
        os.makedirs(current_folder_path, exist_ok=True)

        shutil.copy(pdf_path, os.path.join(current_folder_path, pdf_file))
        current_page_count += num_pages
        # Count every file, so the pages of the last folder are included in the total.
        total_pages += num_pages

    folders_used = current_folder_count + 1 if pdf_files else 0
    LOG.debug("Organized %d PDFs into %d folders inside '%s'.", len(pdf_files), folders_used, output_root)
    LOG.debug("total_pages: %d", total_pages)

folder_path = 'Database_NEW'
organize_pdfs(folder_path)

In [None]:
# SECURITY: API keys are no longer stored in the notebook. Any key that was ever committed
# here must be treated as compromised and rotated.
# Provide the keys as a comma-separated list in the LLAMA_CLOUD_API_KEYS environment variable.
API_KEY_LIST = [
    key.strip()
    for key in os.environ.get("LLAMA_CLOUD_API_KEYS", "").split(",")
    if key.strip()
]