In [1]:
import fitz
import os
import re
from google import genai
from google.genai import types
import pathlib
import keyring
# Show the kernel's working directory; the relative PDF paths used in later cells resolve against it.
print("Current working directory:", os.getcwd())


Current working directory: /Users/william/github/GradBoxLLM/experimental


In [2]:
def extractPdfContentWithMetadata(pdfFilePath):
    """
    Extract text blocks from a PDF together with page numbers and the document title.

    Header heuristic: a block whose stripped text matches ``^(#+) (.*)`` is a
    header. Every following non-empty block is flagged as a continuation of
    that header until a whitespace-only block is encountered, at which point
    the continuation ends. This may need refinement depending on the PDF's
    structure.

    Args:
        pdfFilePath (str): The path to the PDF file.

    Returns:
        tuple: A tuple containing:
            - list: One dictionary per text block with 'text' (block content),
              'page' (1-based page number) and 'is_header' (bool).
            - str: The title from the PDF metadata, or the file's base name
              when the metadata holds no usable title.
    """
    fullContentWithMetadata = []
    lastHeader = None
    # Open through a context manager so the document is closed even if
    # extraction raises part-way through (the handle used to be leaked).
    with fitz.open(pdfFilePath) as document:
        # Guard against a missing metadata mapping before looking up the title.
        documentTitle = (document.metadata or {}).get("title", None)
        if not documentTitle:
            documentTitle = os.path.basename(pdfFilePath)

        for pageNumber, page in enumerate(document, start=1):
            for block in page.get_text("blocks"):
                # The last tuple element is the block type; 0 marks a text block.
                if block[-1] != 0:
                    continue
                textContent = block[4]  # text content of the block
                strippedText = textContent.strip()
                isHeaderLine = False
                if re.match(r"^(#+) (.*)", strippedText):
                    # A '#'-prefixed block starts a new header.
                    isHeaderLine = True
                    lastHeader = strippedText
                elif lastHeader and not strippedText:
                    # An empty block after a header ends the header continuation.
                    lastHeader = None
                elif lastHeader:
                    # A non-empty block right after a header is treated as its continuation.
                    isHeaderLine = True

                fullContentWithMetadata.append({
                    "text": textContent,
                    "page": pageNumber,
                    "is_header": isHeaderLine
                })
    return fullContentWithMetadata, documentTitle


In [13]:
# Run the PyMuPDF-based extractor on the textbook PDF, keeping both the block list and the title.
(extractedPdfBlocks, documentTitles) = extractPdfContentWithMetadata(
    "../googleDrive/assets/textbooks/Medical-surgical_nursing--Preparation_for_practice_(2010).pdf"
)

In [18]:
# Print a 50-block slice (indices 2000-2049) of the extracted blocks for inspection.
print(extractedPdfBlocks[2000:2050])

[{'text': '2. Intervene by refining the nursing\ndiagnoses to behavioral goals in\ncollaboration with the patient, formulating\nthe desired outcomes for problem\nresolution in the form of behavioral\nobjectives, generating and selecting an\nalternate course of actions, and\nimplementing the action alternatives.\n', 'page': 156, 'is_header': False}, {'text': '3. Evaluate to appraise the degree to which\nthe actual outcomes match the desired\noutcomes (the behavioral objectives).\n', 'page': 156, 'is_header': False}, {'text': '4. Revise the plan by redesigning the\nbehavioral objectives that have not been\nattained; by selecting and implementing\nother action alternatives; and by\ncollecting and analyzing more data to\nevolve new nursing diagnoses, new\nbehavioral objectives, and other action\nalternatives, all in collaboration with the\npatient.\n', 'page': 156, 'is_header': False}, {'text': '1. Identify a particular problem or condition\nof the patient or family or a problem in\nthe se

In [20]:
    import pdfplumber

    def extract_headers(pdf_path):
        """Collect, for every page, the words set in that page's largest font size.

        The "header" heuristic is purely size based: whichever font size is the
        largest on a page is taken to be the header size for that page.

        Args:
            pdf_path (str): The path to the PDF file.

        Returns:
            dict: Maps 1-based page numbers to the list of word strings whose
            size equals the largest size found on that page (empty list when
            the page yields no words).
        """
        headers_by_page = {}
        with pdfplumber.open(pdf_path) as pdf:
            for page_index, page in enumerate(pdf.pages):
                page_words = page.extract_words(extra_attrs=["fontname", "size"])

                # Largest size on this page, never below the initial floor of 0.
                largest_size = max([0] + [entry["size"] for entry in page_words])

                # Every word printed at that size counts as part of a header.
                headers_by_page[page_index + 1] = [
                    entry["text"] for entry in page_words if entry["size"] == largest_size
                ]
        return headers_by_page

In [21]:
# Apply the font-size heuristic to the same textbook PDF to get candidate header words per page.
headers = extract_headers(
    "../googleDrive/assets/textbooks/Medical-surgical_nursing--Preparation_for_practice_(2010).pdf"
)

In [23]:
# Print the complete page-number -> header-words mapping returned by extract_headers.
print(headers)

{1: [], 2: ['Kathleen', 'S.Osborn,RN,MS,EdD', 'Cheryl', 'E.Wraa,RN,MSN', 'Annita', 'B.Watson,RN,MS,DNSc'], 3: ['S4CARLISLE'], 4: ['ABOUT', 'THE', 'AUTHORS'], 5: ['THANK', 'YOU'], 6: ['S4CARLISLE'], 7: ['Contributors'], 8: ['S4CARLISLE'], 9: ['S4CARLISLE'], 10: ['S4CARLISLE'], 11: ['S4CARLISLE'], 12: ['S4CARLISLE'], 13: ['FOREWORD'], 14: ['A'], 15: ['Research', 'Opportunities', 'and', 'Clinical', 'Impact.'], 16: ['Complementary', 'and', 'Alternative', 'Therapies.', 'Gerontological', 'Considerations.', 'Cultural', 'Considerations.', 'Genetic', 'Considerations.', 'Ethical', 'Issues.'], 17: ['Critical', 'Alerts.', 'Diagnostic', 'Tests.', 'Patient', 'Teaching', '&', 'Discharge', 'Priorities.', 'Nursing', 'Process:', 'Patient', 'Care', 'Plan.', 'Pharmacology', 'Summary', 'Tables.'], 18: ['Clinical', 'Preparation.', 'Clinical', 'Preparation', 'Assignments.', 'National', 'Guidelines.', 'Health', 'Promotion.', 'Risk', 'Factors.'], 19: ['ACKNOWLEDGMENTS'], 20: ['UNIT', '1'], 21: ['UNIT', '2'], 2

In [None]:



# --- Function: headerChunkingWithMetadata ---
# Chunks pre-extracted text content based on headers, preserving metadata.
def headerChunkingWithMetadata(contentWithMetadata):
    """
    Group pre-extracted text blocks into chunks keyed by the header they follow.

    Blocks flagged 'is_header' act as chunk boundaries: any text gathered so far
    is stored under the header that was active, then gathering restarts. Only a
    header block whose stripped text matches ``^(#+) (.*)`` replaces the active
    header; other header-flagged blocks (continuations) merely close the running
    chunk and their own text is not stored anywhere.

    Args:
        contentWithMetadata (list): Dictionaries with 'text', 'page' and
                                     'is_header' keys, as produced by
                                     extractPdfContentWithMetadata.

    Returns:
        dict: Maps each header string to a list of chunks. Each chunk is a
              dictionary with 'text' (stripped) and 'metadata', where
              metadata['page_numbers'] is the sorted list of contributing pages.
              Text seen before any header is filed under "Introduction".
    """
    chunksByHeader = {}
    activeHeader = "Introduction"
    pendingLines = []
    pendingPages = set()

    def storeChunk(header, lines, pages):
        # Nothing gathered since the last boundary -> nothing to store.
        if not lines:
            return
        chunksByHeader.setdefault(header, []).append({
            "text": "\n".join(lines).strip(),
            "metadata": {"page_numbers": sorted(pages)}
        })

    for entry in contentWithMetadata:
        blockText = entry['text']
        blockPage = entry['page']
        flaggedAsHeader = entry['is_header']

        if not flaggedAsHeader:
            # Ordinary content: keep gathering for the running chunk.
            pendingLines.append(blockText)
            pendingPages.add(blockPage)
            continue

        # Header (or continuation): close whatever was gathered under the old header.
        storeChunk(activeHeader, pendingLines, pendingPages)
        headerCandidate = blockText.strip()
        if re.match(r"^(#+) (.*)", headerCandidate):
            activeHeader = headerCandidate
        pendingLines = []
        pendingPages = set()

    # Store the trailing chunk that no later header closed.
    storeChunk(activeHeader, pendingLines, pendingPages)
    return chunksByHeader

# --- Function: extractTextFromPdfGemini ---
# Extracts text content from a PDF file directly using Gemini API.
def extractTextFromPdfGemini(pdfFilePath, apiKey):
    """
    Extracts text content from a PDF file directly using Gemini API in a single call.

    The PDF is read from disk and sent inline as bytes, together with a text
    prompt, to the 'gemini-2.0-flash' model.

    Args:
        pdfFilePath (str): The path to the PDF file.
        apiKey (str): Your Gemini API key.

    Returns:
        str: The extracted text content, or None on error (the error is printed).
    """
    try:
        # --- Initialize Gemini API client ---
        client = genai.Client(api_key=apiKey)

        # --- Read PDF file bytes from the given path ---
        pdf_bytes = pathlib.Path(pdfFilePath).read_bytes()

        # --- Generate content through client.models; the model is selected per call ---
        # (the google.genai client has no `genai_model` factory, so the previous
        # call always raised and this function always returned None)
        response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=[
                types.Part.from_bytes(data=pdf_bytes, mime_type='application/pdf'), # Include PDF bytes directly
                "Extract all text content from this PDF file." # Prompt
            ]
        )
        # --- Return the extracted text from the Gemini API response ---
        return response.text
    except Exception as e:
        # Best-effort helper: report the failure and signal it with None.
        print(f"Error extracting text from PDF using Gemini API: {e}")
        return None

# --- Function: transcribePdfToHtmlGemini ---
# Transcribes a PDF file to HTML using Gemini API, preserving layout.
def transcribePdfToHtmlGemini(pdfFilePath, apiKey):
    """
    Transcribes a PDF file to HTML directly using Gemini API in a single call, preserving layout.

    The PDF is read from disk and sent inline as bytes, together with a
    conversion prompt, to the 'gemini-2.0-flash' model. Markdown code-fence
    markers are removed from the reply before it is returned.

    Args:
        pdfFilePath (str): The path to the PDF file.
        apiKey (str): Your Gemini API key.

    Returns:
        str: The HTML content of the PDF, or None on error (the error is printed).
    """
    try:
        # --- Initialize Gemini API client ---
        client = genai.Client(api_key=apiKey)

        # --- Read PDF file bytes from the given path ---
        pdf_bytes = pathlib.Path(pdfFilePath).read_bytes()

        # --- Generate content through client.models; the model is selected per call ---
        # (the google.genai client has no `genai_model` factory, so the previous
        # call always raised and this function always returned None)
        response = client.models.generate_content(
            model='gemini-2.0-flash',
            contents=[
                types.Part.from_bytes(data=pdf_bytes, mime_type='application/pdf'), # Include PDF bytes directly
                "Convert this PDF file to HTML, preserving the original layout and formatting." # Prompt
            ]
        )
        # --- Gemini might return HTML in markdown code blocks, remove them ---
        # Note: this strips every ``` marker in the reply, not only the outer fence.
        htmlContent = response.text.replace("```html", "").replace("```", "").strip()
        # --- Return the extracted HTML content ---
        return htmlContent
    except Exception as e:
        # Best-effort helper: report the failure and signal it with None.
        print(f"Error transcribing PDF to HTML using Gemini API: {e}")
        return None

# --- Function: get_api_key_from_keychain ---
# Placeholder function to retrieve API key from keychain securely.
# Replace with your actual keychain interaction code.
def get_api_key_from_keychain():
    """
    Placeholder function: Replace with your actual keychain API key retrieval code.
    This example assumes you are using 'keyring' and have stored your key under
    the service name 'gemini-api-key' with the username 'user'.

    Returns:
        str: The API key stored in the keychain.

    Raises:
        SystemExit: With exit status 1 when no key is stored, after printing
            an explanatory message.
    """
    # --- Retrieve API key from keychain using 'keyring' library ---
    api_key = keyring.get_password("gemini-api-key", "user") # Replace "gemini-api-key" and "user" as needed
    # --- Check if API key was successfully retrieved ---
    if not api_key:
        print("Error: Gemini API key not found in keychain.")
        print("Please ensure you have stored your API key in keychain with the correct service name and username.")
        # Raise SystemExit directly: the bare exit() helper is injected by the
        # site module and, called without arguments, reports success (status 0).
        raise SystemExit(1)
    # --- Return the retrieved API key ---
    return api_key

# --- Main execution block ---
# This block is executed when the script is run directly.
if __name__ == "__main__":
    # End-to-end demo: fetch the API key, ask for a PDF path, extract and chunk
    # the PDF with PyMuPDF, then run the two Gemini-based conversions and save
    # their results next to the notebook.

    # --- 1. Get API Key from Keychain ---
    # Retrieve the Gemini API key from the keychain using the secure function.
    # No global configuration step is needed (google.genai has no `configure`
    # function, so calling it raised AttributeError); the key is handed to each
    # Gemini helper below, which builds its own client from it.
    apiKey = get_api_key_from_keychain()

    # --- 2. Get PDF File Path from User Input ---
    # Prompt the user to enter the file path to their PDF textbook.
    pdfFilePath = input("Enter the path to your PDF textbook file: ")

    # --- 3. Extract Content and Metadata using PyMuPDF ---
    # Extract text content, page numbers, and document title from the PDF using PyMuPDF.
    print("\nExtracting content and metadata from PDF using PyMuPDF...")
    contentWithMetadata, textbookName = extractPdfContentWithMetadata(pdfFilePath)

    # --- 4. Check if content extraction was successful ---
    # An empty block list is treated as a failed extraction.
    if contentWithMetadata:
        print(f"\nTextbook Name (Source Identifier): {textbookName}")

        # --- 5. Perform Header Chunking with Metadata ---
        # Chunk the extracted content based on headers, preserving metadata (page numbers).
        print("\nPerforming Header Chunking with Metadata...")
        headerBasedChunksWithMetadata = headerChunkingWithMetadata(contentWithMetadata)

        # --- 6. Output Example of Header Chunks with Metadata (PyMuPDF Extraction) ---
        # Print a simplified example of the header chunks and their metadata for PyMuPDF extraction.
        print("\n--- Example Header Chunks with Metadata (PyMuPDF Extraction) ---")
        for header, chunks in headerBasedChunksWithMetadata.items():
            first_chunk = chunks[0] # Just show the first chunk for brevity
            page_numbers = first_chunk['metadata']['page_numbers']
            # Truncate long chunks to 100 characters followed by an ellipsis.
            chunk_snippet = first_chunk['text'][:100] + "..." if len(first_chunk['text']) > 100 else first_chunk['text']
            print(f"\nHeader: {header}")
            print(f"  Pages: {page_numbers}")
            print(f"  Chunk Snippet: {chunk_snippet}")
            print("  ---")

        # --- 7. Gemini API - Text Extraction (Example) ---
        # Example of using Gemini API to extract text directly from the PDF.
        print("\n--- Gemini API Text Extraction (Example - Snippet) ---")
        extractedTextGemini = extractTextFromPdfGemini(pdfFilePath, apiKey)
        # --- Check if Gemini text extraction was successful ---
        if extractedTextGemini:
            print(extractedTextGemini[:500] + "...\n[...rest of Gemini extracted text is in 'gemini_extracted_text_output.txt']")
            with open("gemini_extracted_text_output.txt", "w", encoding="utf-8") as geminiTextFile:
                geminiTextFile.write(extractedTextGemini)
            print("\nGemini extracted text saved to 'gemini_extracted_text_output.txt'")
        else:
            print("Gemini API Text Extraction failed.")

        # --- 8. Gemini API - HTML Transcription (Example) ---
        # Example of using Gemini API to transcribe the PDF to HTML.
        print("\n--- Gemini API HTML Transcription (Example - Snippet) ---")
        htmlContentGemini = transcribePdfToHtmlGemini(pdfFilePath, apiKey)
        # --- Check if Gemini HTML transcription was successful ---
        if htmlContentGemini:
            print(htmlContentGemini[:500] + "...\n[...rest of Gemini HTML transcription is in 'gemini_pdf_html_output.html']")
            with open("gemini_pdf_html_output.html", "w", encoding="utf-8") as geminiHtmlFile:
                geminiHtmlFile.write(htmlContentGemini)
            print("\nGemini PDF transcribed to HTML and saved to 'gemini_pdf_html_output.html'")
        else:
            print("Gemini API HTML Transcription failed.")


    else:
        print("PDF content extraction using PyMuPDF failed.")

    # --- 9. Final Output Summary and Next Steps ---
    # Print a summary of the generated output files and outline the next steps for RAG pipeline development.
    # NOTE(review): this summary is printed even when extraction failed, and this
    # block itself only writes 'gemini_extracted_text_output.txt' and
    # 'gemini_pdf_html_output.html'; the other two files listed are not produced here.
    print("\n--- Gemini PDF Processing and Metadata Extraction - COMPLETE ---")
    print("\nOutputs:")
    print("- 'extracted_text_output.txt': Text extracted using PyMuPDF (for chunking).")
    print("- 'gemini_extracted_text_output.txt': Text extracted directly by Gemini API.")
    print("- 'pdf_html_output.html': HTML transcription of PDF using PyMuPDF (if you used HTML transcription).")
    print("- 'gemini_pdf_html_output.html': HTML transcription of PDF using Gemini API.")
    print("\nNext Steps:")
    print("1. Choose your preferred text output ('extracted_text_output.txt' or 'gemini_extracted_text_output.txt' or parsed 'gemini_pdf_html_output.html') for embedding.")
    print("2. Use chunking methods (header, semantic, etc.) to prepare text for embedding.")
    print("3. Embed chunks using your locally hosted embedding pipeline.")
    print("4. Store embeddings and metadata (textbook name, page numbers) in your vector database for RAG.")
    print("5. Build your RAG chatbot, retrieving chunks and metadata to answer user queries and provide source attribution.")