In [1]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import os

def ocr_pdf_pages(input_pdf_path, output_text_file):
    # Open the PDF file
    pdf_document = fitz.open(input_pdf_path)

    with open(output_text_file, "w", encoding="utf-8") as text_file:
        for page_number in range(pdf_document.page_count):
            # Extract the page as an image
            page = pdf_document.load_page(page_number)
            image = page.get_pixmap()
            img = Image.frombytes("RGB", (image.width, image.height), image.samples)

            # Perform OCR using pytesseract
            ocr_result = pytesseract.image_to_string(img)

            # Write OCR result to the text file along with the page number
            text_file.write(ocr_result + "\n")
            text_file.write(f"_Page: {page_number + 1}"+ "\n\n")

    # Close the PDF file
    pdf_document.close()


def ocr_folder(input_folder, output_folder):
    # Create the output folder if it doesn't exist
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)

    # Iterate through all files in the input folder
    for filename in os.listdir(input_folder):
        if filename.endswith(".pdf"):
            # Generate the output file path
            input_file_path = os.path.join(input_folder, filename)
            output_file_path = os.path.join(output_folder, filename.replace(".pdf", ".txt"))

            # Perform OCR on the PDF and save the text to the output file
            ocr_pdf_pages(input_file_path, output_file_path)




In [3]:


input_folder = "Library"  # Replace with the path to your input folder containing PDFs
output_folder = "Library_TEXT"  # The folder where OCR-generated text files will be saved

ocr_folder(input_folder, output_folder)


In [7]:
import os
import re

library_dict = {}

for filename in os.listdir('Library_TEXT'):
    if filename.endswith('.txt'):
        with open(os.path.join('Library_TEXT', filename), 'r', encoding='utf-8') as file:
            content = file.read()

            # Find all occurrences of "Page: {page number}" and split the content accordingly
            pages_split = re.split(r'(_Page:\s*\d+)', content)

            # Create a dictionary for pages, using the page number as the key
            pages_info = {'page_1': pages_split[0].strip()}  # Adding the first page (before the first "Page:" marker)
            for i in range(1, len(pages_split), 2):
                page_number = int(re.search(r'\d+', pages_split[i]).group()) + 1  # Extracting page number
                pages_info[f'page_{page_number}'] = pages_split[i + 1].strip()

            library_dict[filename] = {
                'number_of_pages': len(pages_info),
                'pages': pages_info
            }



In [8]:
library_dict

{'Naloxone Distribution Project.txt': {'number_of_pages': 8,
  'pages': {'page_1': "Naloxone Distribution Project\n\nAbout the NDP\n\nDHCS created the Naloxone Distribution Project (NDP) to combat opioid overdose-related deaths\nthroughout California, The NOP aims to address the opioid crisis by reducing opioid overdose\ndeaths through the provision of free naloxone. For more information on the NOP, please visit\n\nthe CaliforniaMMAT (httns//anw.californiamat.ora/matproject/naloxone-cistribution:\n\npisiest/ website\n\nHow to Apply Via the New Online NDP Portal\n\nApplications to the NDP must be submitted via the NDP online application form\n(https://aurrerahealthgroup.qualtrics.com/ife/form/SV_3aqWz9n74FH7tVs). Please review the below\ntable and FAQs for any additional required materials for your organization's application\n\n‘To apply for naloxone through the NOP:\n\n1. Obtain a standing order.\n2. Gather the required supplemental materials\n3, Complete the NDP online application for

In [9]:
chunk_dict = {}

for filename, file_info in library_dict.items():
    pages = file_info['pages']
    file_chunks = {}

    chunk_number = 1

    for i in range(1, file_info['number_of_pages'] + 1):
        page_key = f'page_{i}'
        page_text = pages[page_key]

        # Split the current page into two halves
        split_index = len(page_text) // 2
        top_half = page_text[:split_index]
        bottom_half = page_text[split_index:]

        # Create a full page chunk
        file_chunks[f'chunk{chunk_number}'] = {'text': top_half + bottom_half, 'pages': [i]}
        chunk_number += 0.5

        # If there's a next page, create a half-and-half chunk
        if i < file_info['number_of_pages']:
            next_page_key = f'page_{i + 1}'
            next_page_text = pages[next_page_key]
            next_top_half = next_page_text[:len(next_page_text) // 2]
            file_chunks[f'chunk{chunk_number}'] = {'text': bottom_half + next_top_half, 'pages': [i, i + 1]}
            chunk_number += 0.5

    chunk_dict[filename] = {'number_of_chunks': len(file_chunks), 'chunks': file_chunks}




In [10]:
chunk_dict

{'Naloxone Distribution Project.txt': {'number_of_chunks': 15,
  'chunks': {'chunk1': {'text': "Naloxone Distribution Project\n\nAbout the NDP\n\nDHCS created the Naloxone Distribution Project (NDP) to combat opioid overdose-related deaths\nthroughout California, The NOP aims to address the opioid crisis by reducing opioid overdose\ndeaths through the provision of free naloxone. For more information on the NOP, please visit\n\nthe CaliforniaMMAT (httns//anw.californiamat.ora/matproject/naloxone-cistribution:\n\npisiest/ website\n\nHow to Apply Via the New Online NDP Portal\n\nApplications to the NDP must be submitted via the NDP online application form\n(https://aurrerahealthgroup.qualtrics.com/ife/form/SV_3aqWz9n74FH7tVs). Please review the below\ntable and FAQs for any additional required materials for your organization's application\n\n‘To apply for naloxone through the NOP:\n\n1. Obtain a standing order.\n2. Gather the required supplemental materials\n3, Complete the NDP online app

In [11]:
import tiktoken

def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

# Lists to hold the token counts
token_counts = []

# Iterate through the documents and chunks, analyzing the token count
for document_name, document_data in chunk_dict.items():
    for chunk_name, chunk_data in document_data['chunks'].items():
        text = chunk_data['text']
        token_count = num_tokens_from_string(text, "cl100k_base")
        token_counts.append(token_count)

# Calculate and print the max, min, and average token counts
max_token_count = max(token_counts)
min_token_count = min(token_counts)
average_token_count = sum(token_counts) / len(token_counts)

print(f"Max token count: {max_token_count}")
print(f"Min token count: {min_token_count}")
print(f"Average token count: {average_token_count}")







Max token count: 898
Min token count: 0
Average token count: 241.62476547842402


In [12]:
from llama_index.schema import TextNode, NodeRelationship, RelatedNodeInfo

nodes = []

# Iterate through the chunk_dict
for filename, file_info in chunk_dict.items():
    chunks = file_info['chunks']
    previous_node = None

    for chunk_key, chunk_info in chunks.items():
        text_chunk = chunk_info['text']
        page_numbers = chunk_info['pages']

        # Create a new TextNode for the chunk
        node = TextNode(text=f"Document: {filename}, Pages: {page_numbers}, Text: {text_chunk}")

        # If there's a previous node, set up the NEXT and PREVIOUS relationships
        if previous_node is not None:
            node.relationships[NodeRelationship.PREVIOUS] = RelatedNodeInfo(node_id=previous_node.node_id)
            previous_node.relationships[NodeRelationship.NEXT] = RelatedNodeInfo(node_id=node.node_id)
            # If needed, you can also include metadata in the relationship
            # previous_node.relationships[NodeRelationship.NEXT] = RelatedNodeInfo(node_id=node.node_id, metadata={"key": "val"})

        nodes.append(node)
        previous_node = node



In [13]:
from llama_index import VectorStoreIndex
import openai
openai.api_key = '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'
index = VectorStoreIndex(nodes)

In [None]:
nodes

[TextNode(id_='52234db5-525f-4a16-a931-eebe56f9d944', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='0768e8ea-12fa-4a20-95e1-b4d77391d378', node_type=None, metadata={}, hash=None)}, hash='eb5b8945faf6cab0fc8c21f39e3d5f1f89d2a360f9cea8b5eec8c4fe1c6752a9', text="Document: Naloxone Distribution Project.txt, Pages: [1], Text: Naloxone Distribution Project\n\nAbout the NDP\n\nDHCS created the Naloxone Distribution Project (NDP) to combat opioid overdose-related deaths\nthroughout California, The NOP aims to address the opioid crisis by reducing opioid overdose\ndeaths through the provision of free naloxone. For more information on the NOP, please visit\n\nthe CaliforniaMMAT (httns//anw.californiamat.ora/matproject/naloxone-cistribution:\n\npisiest/ website\n\nHow to Apply Via the New Online NDP Portal\n\nApplications to the NDP must be submitted via the NDP online application 

In [14]:

from llama_index.retrievers import VectorIndexRetriever
from llama_index.query_engine import RetrieverQueryEngine
from llama_index.indices.postprocessor import SimilarityPostprocessor


# Calculate and print the max, min, and average token counts
max_token_count = max(token_counts)
average_token_count = sum(token_counts) / len(token_counts)

# Determine the value of similarity_top_k based on the max and average token counts
if max_token_count < 1250 and average_token_count < 400:
    similarity_top_k = 5
elif max_token_count > 1250:
    similarity_top_k = 2
elif average_token_count > 400:
    similarity_top_k = 3
else:
    similarity_top_k = 3  # Default value, you can change as needed

# Configure retriever
retriever = VectorIndexRetriever(
    index=index,
    similarity_top_k=similarity_top_k,
)

# Assemble the query engine with SimilarityPostprocessor with similarity_cutoff=0.5
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    node_postprocessors=[
        SimilarityPostprocessor(similarity_cutoff=0.5)
    ]
)

# Query the index
response = query_engine.query("What is an opiod")

# Print the response
print(str(response))

# Get and print the sources
source_nodes = response.source_nodes




An opioid is a type of drug that is used to relieve pain. It works by binding to opioid receptors in the brain and other parts of the body, which can reduce the perception of pain and produce a feeling of euphoria. Opioids can be either natural or synthetic, and include drugs such as morphine, codeine, oxycodone, hydrocodone, and fentanyl.


In [15]:
source_nodes[1]

NodeWithScore(node=TextNode(id_='524341aa-b74a-4dc7-a821-4a2833624f0d', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={<NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='33e49207-3283-4ae0-85c8-995a18d518f8', node_type=None, metadata={}, hash=None), <NodeRelationship.NEXT: '3'>: RelatedNodeInfo(node_id='3b7c9b1b-5868-4e29-a263-288a3ba6df9e', node_type=None, metadata={}, hash=None)}, hash='e7af595d2f2f28668658c6724b48dff7323e9dede7db63ac2e2efedcd98b7801', text='Document: JD-OSF-Allowable-Expenses.txt, Pages: [12, 13], Text:  harms assciated wih iiravenous dug use, neu supple,\nStang, space peer support serdces referrals orem ean\nCheclng,conactonstocare and he fal ange of har ret and\ntreatment services profed by tase programs,\n\n+ Bipaning acters totasing and resent fr rfecious sceses such\na0 HIV and Hepatls C resthing from inavenous op use,\n\n+ Suppotg mae uns tht fr or prov reer fo harm eduon\nScntoesraumort ecov

In [16]:
# Assuming 'source_node' is the NodeWithScore object you provided
source_node = source_nodes[1]

# Extracting the document name
document_name = re.search(r'Document: (.*?),', source_node.node.text).group(1)

# Extracting the pages
pages = re.search(r'Pages: (\[\d+(?:, \d+)*\])', source_node.node.text).group(1)

# Extracting the similarity score
similarity_score = source_node.score

# Print the results
print("Document Name:", document_name)
print("Pages:", pages)
print("Similarity Score:", similarity_score)


Document Name: JD-OSF-Allowable-Expenses.txt
Pages: [12, 13]
Similarity Score: 0.8022662618517444


In [None]:

import textwrap
import re

for node_with_score in source_nodes:
    text_node = node_with_score.node
    

    # Extracting the document name
    document_name = re.search(r'Document: (.*?),', document_text).group(1)

    # Extracting the pages
    pages = re.search(r'Pages: (\[\d+(?:, \d+)*\])', document_text).group(1)

    # Getting the similarity score
    similarity_score = node_with_score.score

   
    print(f"Document Name: {document_name}\nPages: {pages}\nSimilarity Score: {similarity_score}\n")
  

NameError: name 'source_nodes' is not defined