In [None]:
# Optional one-time setup: uncomment the lines below to install the
# notebook's dependencies, then comment them out again.
# NOTE(review): inside a notebook prefer `%pip install ...` (it targets the
# running kernel's environment) and pin versions for reproducibility.
# !pip install llama_index.llms.azure_openai
# !pip install llama_index.embeddings.azure_openai
# !pip install python-dotenv
# !pip install pymupdf
# !pip install azure
# !pip install azure-ai-documentintelligence
# !pip install surya-ocr
# !pip install pytesseract
# !pip install pandas
# !pip install llama-index llama-index-experimental
# Load settings from a local .env file so that later cells can read them
# through os.environ (e.g. AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and
# AZURE_DOCUMENT_INTELLIGENCE_API_KEY used by _az_analyze).
from dotenv import load_dotenv
load_dotenv()  # take environment variables from .env.





# File-Based Node Parsers:

In [None]:
# SimpleFileNodeParser

from llama_index.core.node_parser import SimpleFileNodeParser
from llama_index.readers.file import FlatReader
from llama_index.core import Document  # Import the Document class
from pathlib import Path
import fitz  # PyMuPDF

def extract_text_from_pdf(pdf_path) -> str:
    """Return the concatenated plain text of every page of a PDF (PyMuPDF).

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The ``page.get_text()`` output of each page joined in page order
        with no separator. An empty string if the document has no pages.

    Raises:
        Any exception raised by ``fitz.open`` or ``page.get_text``; the
        document is closed before the exception propagates.
    """
    # The context manager closes the document even when a page fails to
    # extract, so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        # join() assembles the result in one pass instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)

# Read the whole PDF into one plain-text string. The path is relative, so it
# resolves against the notebook's current working directory.
# NOTE: `pdf_text` and `document` are rebound by the HierarchicalNodeParser
# cell further down; after a full run they refer to that cell's PDF instead.
pdf_text = extract_text_from_pdf("./data/Semester 11 Results.pdf")

# Wrap the extracted text in a single Document (no metadata is attached)
document = Document(text=pdf_text)

# Run the file-based node parser over that one document.
# NOTE(review): SimpleFileNodeParser presumably picks its parsing strategy
# from file-type metadata on the document; none is set here — confirm the
# resulting nodes are what you expect.
parser = SimpleFileNodeParser()
pdf_nodes = parser.get_nodes_from_documents([document])

# Uncomment to inspect the parsed nodes
# for i in pdf_nodes: 
#     print(i)
# print(pdf_nodes)




# Relation-Based Node Parsers:

In [None]:
# HierarchicalNodeParser

from llama_index.core.node_parser import HierarchicalNodeParser
from llama_index.core import Document
from pathlib import Path
import fitz  # PyMuPDF to extract text from PDF

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path) -> str:
    """Return the concatenated plain text of every page of a PDF (PyMuPDF).

    NOTE: this redefines (and shadows) the function of the same name from
    the SimpleFileNodeParser cell above; keep the two in sync or drop one.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Returns:
        The ``page.get_text()`` output of each page joined in page order
        with no separator. An empty string if the document has no pages.

    Raises:
        Any exception raised by ``fitz.open`` or ``page.get_text``; the
        document is closed before the exception propagates.
    """
    # The context manager closes the document even when a page fails to
    # extract, so the underlying file handle is never leaked.
    with fitz.open(pdf_path) as doc:
        # join() assembles the result in one pass instead of repeated `+=`.
        return "".join(page.get_text() for page in doc)

# Read the whole PDF into one plain-text string. The path is relative, so it
# resolves against the notebook's current working directory.
# NOTE: this rebinds `pdf_text` and `document`, which the
# SimpleFileNodeParser cell above also assigns (for a different PDF).
pdf_text = extract_text_from_pdf("./data/FYP B Final Clean Report.pdf")

# Wrap the extracted text in a single Document (no metadata is attached)
document = Document(text=pdf_text)

# Build a HierarchicalNodeParser with three chunk sizes, listed from the
# largest to the smallest.
# NOTE(review): the sizes are presumably token counts — confirm against the
# HierarchicalNodeParser documentation.
node_parser = HierarchicalNodeParser.from_defaults(
    chunk_sizes=[2048, 512, 128]  # Define hierarchy levels for chunking
)

# Parse the single document into hierarchical nodes
hierarchical_nodes = node_parser.get_nodes_from_documents([document])

# Uncomment to inspect the parsed hierarchical nodes
# for node in hierarchical_nodes:
#     print(node)



# Azure Document Intelligence

In [None]:
import os
from typing import List, Optional, Union
from io import BufferedIOBase

# Import necessary modules and types from LlamaIndex and Azure SDK
from llama_index.core.bridge.pydantic import Field
from llama_index.core.readers.base import BasePydanticReader
from llama_index.core.schema import Document
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import (
    DocumentAnalysisFeature,
    AnalyzeResult,
    AnalyzeDocumentRequest,
    ContentFormat
)

# Type alias for "a file to analyze": a file path given as a string, the raw
# file content as bytes, or a binary file-like object (io.BufferedIOBase).
FileInput = Union[str, bytes, BufferedIOBase]


# class AzDocumentIntelligenceParse(BasePydanticReader):
"""
A parser that integrates with the Azure Document Intelligence service.
It sends files to Azure's Document Intelligence API for analysis and
converts the results into a format usable by LlamaIndex.
"""

# API key for Azure's Document Intelligence service
# api_key = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_API_KEY")
# # Base URL or endpoint for the Azure Document Intelligence service
# base_url = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")

def _az_analyze(
        file_input: FileInput,  # The file to be analyzed, can be a path, bytes, or buffer
        extra_info: Optional[dict] = None,  # Additional metadata for the file
        verbose: bool = False,  # Whether to print additional information for debugging
) -> List[Document]:
    """
    Analyze a file with the Azure Document Intelligence "prebuilt-layout"
    model and return its content, rendered as Markdown, as LlamaIndex documents.

    The service endpoint and API key are read from the environment variables
    AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT and AZURE_DOCUMENT_INTELLIGENCE_API_KEY.

    Args:
        file_input: What to analyze. ``bytes`` are uploaded as-is, a binary
            file-like object (``BufferedIOBase``) is uploaded from its current
            position and left open for the caller, and anything else is
            converted with ``str()`` and opened as a file path.
        extra_info: Optional metadata stored on the returned document; an
            empty dict is used when it is ``None`` or empty.
        verbose: When True, print the endpoint being called. The API key is
            never printed because notebook outputs are saved with the file.

    Returns:
        A one-element list holding a ``Document`` whose text is the Markdown
        content returned by the service.

    Raises:
        KeyError: If either environment variable is not set.
        OSError: If ``file_input`` is a path that cannot be opened.
    """
    # Retrieve the Azure endpoint and API key from environment variables
    endpoint = os.environ["AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT"]
    key = os.environ["AZURE_DOCUMENT_INTELLIGENCE_API_KEY"]

    if verbose:
        # Deliberately omit the key: printing it would leak the secret into
        # the notebook's stored output.
        print(f"Azure Document Intelligence endpoint: {endpoint}")

    # Initialize the Azure Document Intelligence client with the endpoint and API key
    document_intelligence_client = DocumentIntelligenceClient(
        endpoint=endpoint, credential=AzureKeyCredential(key)
    )

    def _begin(stream):
        # Start the analysis of one binary stream with the layout model and
        # ask for the result content in Markdown.
        return document_intelligence_client.begin_analyze_document(
            "prebuilt-layout",  # The prebuilt model for analyzing document layouts
            analyze_request=stream,  # The file content to be analyzed
            features=[],  # No additional features are specified here
            output_content_format=ContentFormat.MARKDOWN,  # Markdown output
            content_type="application/octet-stream",  # Raw binary upload
        )

    if isinstance(file_input, bytes):
        # Raw content: wrap it in an in-memory stream instead of mistaking
        # its repr for a file path.
        from io import BytesIO

        poller = _begin(BytesIO(file_input))
    elif isinstance(file_input, BufferedIOBase):
        # Already an open binary stream; the caller keeps ownership of it.
        poller = _begin(file_input)
    else:
        # Treat everything else as a file path (str, pathlib.Path, ...).
        with open(str(file_input), "rb") as f:
            poller = _begin(f)

    # Wait for the analysis to complete and get the result
    result: AnalyzeResult = poller.result()

    # Wrap the Markdown content and any caller-supplied metadata in a Document
    return [
        Document(
            text=result.content,  # The analyzed text content (Markdown)
            metadata=extra_info or {},  # Empty dict when no metadata was provided
        )
    ]

    # def load_data(
    #         self,
    #         file_path: Union[List[FileInput], FileInput],  # Input can be a single file or a list of files
    #         extra_info: Optional[dict] = None,  # Additional metadata to be associated with the file(s)
    # ) -> List[Document]:
    #     """
    #     Load data by sending the file(s) to Azure Document Intelligence for analysis.
    #     Returns the parsed data in the form of Document objects.
    #     """
        # Call the _az_analyze function to perform the analysis and return the document(s)
        # return self._az_analyze(file_path, extra_info)

# print(_az_analyze(file_input = "./data/FYP B Final Clean Report.pdf"))