## Install packages and set variables

In [None]:
# Package installation for this notebook. Only the openai install is active;
# the commented-out lines are presumably for packages that are already
# installed -- uncomment them on a fresh environment.
#%pip install azure-search-documents==11.5.1 --quiet
#%pip install azure-identity --quiet
#%pip install python-dotenv --quiet
#%pip install pymupdf --quiet
# NOTE(review): later cells also import azure.storage.blob, whose package
# (azure-storage-blob) is not listed here -- confirm it is installed.
%pip install openai --quiet

In [None]:
# Load credentials
from dotenv import load_dotenv
import os
load_dotenv()

# Read the Azure AI Search settings from the environment.
AI_SEARCH_ENDPOINT = os.getenv('AI_SEARCH_ENDPOINT')
AI_SEARCH_KEY = os.getenv('AI_SEARCH_KEY')

# Ensure all required environment variables are set; report every missing
# name at once so the user can fix them in a single pass.
missing_vars = [
    name
    for name, value in (
        ('AI_SEARCH_ENDPOINT', AI_SEARCH_ENDPOINT),
        ('AI_SEARCH_KEY', AI_SEARCH_KEY),
    )
    if not value
]
if missing_vars:
    raise ValueError(f"Environment variables {', '.join(missing_vars)} must be set.")

# Show the endpoint for confirmation, but never echo the secret key: notebook
# output is saved with the file and is easily shared or committed.
print(f"AI_SEARCH_ENDPOINT: {AI_SEARCH_ENDPOINT}")
print("AI_SEARCH_KEY: ******** (set)")

# NOTE: this initial value is replaced by container_name + "-index" in the
# cell that creates the search index.
index_name: str = "soc_index"

## Convert PDF to images

In [None]:
import fitz  # PyMuPDF
import os
from azure.storage.blob import BlobServiceClient
import uuid

def convert_pdf_to_images(pdf_path, blob_connection_string=None, container_name=None):
    """
    Render each page of a PDF to an image file and optionally upload the images.

    Images are written to a directory named "<pdf name>_images" located next to
    the PDF, one file per page named "page<N>.jpg" where N is the zero-based
    page number. When both blob_connection_string and container_name are given,
    every image is also uploaded to that container as "<pdf name>/page<N>.jpg",
    overwriting any blob that already has that name.

    Args:
    - pdf_path (str): Path to the PDF file.
    - blob_connection_string (str, optional): Azure Blob Storage connection string.
    - container_name (str, optional): Name of the Azure Blob Storage container.
      The container is created if it does not exist.

    Returns:
    - list: Local image file paths, in page order.
    """
    # Create a directory based on the PDF filename in the same directory as the original file
    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    output_dir = os.path.join(os.path.dirname(pdf_path), f"{pdf_name}_images")
    os.makedirs(output_dir, exist_ok=True)

    saved_image_paths = []

    # The context manager closes the document even if rendering or uploading fails.
    with fitz.open(pdf_path) as pdf_document:
        # Resolve the target container once instead of once per page.
        container_client = None
        if blob_connection_string and container_name:
            blob_service_client = BlobServiceClient.from_connection_string(blob_connection_string)
            container_client = blob_service_client.get_container_client(container_name)
            if not container_client.exists():
                container_client.create_container()

        # Convert each page of the PDF into an image
        for page_num in range(pdf_document.page_count):
            page = pdf_document.load_page(page_num)
            pix = page.get_pixmap()
            image_path = os.path.join(output_dir, f'page{page_num}.jpg')
            pix.save(image_path)
            saved_image_paths.append(image_path)

            # Upload the image just written when blob storage was requested
            if container_client is not None:
                blob_client = container_client.get_blob_client(blob=f'{pdf_name}/page{page_num}.jpg')
                with open(image_path, "rb") as data:
                    blob_client.upload_blob(data, overwrite=True)

    return saved_image_paths

# Convert the SOC 2 report PDF into one image per page.

BLOB_STORAGE_ACCOUNT_CONNECTION_STRING = os.getenv('BLOB_STORAGE_ACCOUNT_CONNECTION_STRING')

# Build the path with os.path.join rather than a literal backslash: "\A" is
# not a valid string escape (Python warns about it) and a hard-coded "\"
# separator only resolves on Windows.
pdf_path = os.path.join("data", "Azure DevOps - SOC 2 Type II Report (2023-10-01-to 2024-09-30).pdf")
container_name = "frompdf"
# Neither the connection string nor container_name is passed here, so the
# images are only saved locally and nothing is uploaded.
image_paths = convert_pdf_to_images(pdf_path)
print(f"Converted images saved at: {image_paths}")

## Create a documents payload

In [None]:
import base64
from mimetypes import guess_type

def local_image_to_data_url(image_path):
    """
    Encode a local file as a base64 "data:" URL.

    The MIME type is guessed from the file name; when it cannot be guessed,
    "application/octet-stream" is used. The file is read in binary mode and
    its whole content is embedded in the returned string.
    """
    guessed_type = guess_type(image_path)[0]
    media_type = "application/octet-stream" if guessed_type is None else guessed_type

    with open(image_path, "rb") as handle:
        raw_bytes = handle.read()

    payload = base64.b64encode(raw_bytes).decode("utf-8")
    return "data:" + media_type + ";base64," + payload

In [None]:
from openai import AzureOpenAI
import json

def gpt4o_imagefile(image_file):
    """
    Extract the text of an image file with an Azure OpenAI "gpt-4o" chat completion.

    The image is embedded in the request as a base64 data URL built by
    local_image_to_data_url. The system prompt asks the model to render
    tables in Markdown table syntax.

    Reads OPENAI_API_ENDPOINT and OPENAI_API_KEY from the environment
    (KeyError if either is missing). The API version can be overridden with
    OPENAI_API_VERSION.

    Args:
    - image_file (str): Path to the local image file.

    Returns:
    - The message content of the first completion choice.
    """
    
    system_prompt = """
    You are an AI assistance that extracts text from the image. You are especially good at extracting tables.
    When you see a table
    You format the table like this:
    
    Monthly Savings
    | Month    | Savings |Details      |
    | -------- | ------- |------------ |
    | January  | $250    | for holiday |
    | February | $80     | pension     |
    | March    | $420    | new cat     |
"""

    # Image ("image_url") message parts need a chat-completions API version
    # newer than 2023-05-15, so default to a later GA version and allow an
    # override through the environment.
    # NOTE(review): confirm the default against the versions enabled for the
    # target Azure OpenAI resource.
    api_version = os.getenv('OPENAI_API_VERSION', '2024-06-01')

    client = AzureOpenAI(
        azure_endpoint=os.environ['OPENAI_API_ENDPOINT'],
        api_key=os.environ['OPENAI_API_KEY'],
        api_version=api_version,
        )

    response = client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {
                "role": "system",
                "content": system_prompt,
            },
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Extract text from the image"},
                    {
                        "type": "image_url",
                        "image_url": {"url": local_image_to_data_url(image_file)},
                    },
                ],
            },
        ],
        max_tokens=2000,
        # temperature 0.0 is requested for the extraction call
        temperature=0.0,
    )
    
    return response.choices[0].message.content

In [None]:
import os


# Run the GPT-4o extraction over a slice of the page images and collect the
# results in memory as [extracted_text, image_file_stem] pairs.
start_page = 120  # first index into image_paths (inclusive)
end_page = 201  # last index into image_paths (exclusive)
extracted_data = []

for image_path in image_paths[start_page:end_page]:
    data = gpt4o_imagefile(image_path)
    # Keep only the file name without directory or extension, e.g. "page120".
    image_file_name = os.path.basename(image_path)
    filename_only = os.path.splitext(image_file_name)[0]
    extracted_data.append([data, filename_only])


In [None]:
# Write each page's extracted text to its own Markdown file, named after the
# source image. The file name is relative, so files land in the current
# working directory.
for text, filename_only in extracted_data:
    filename = f"extracted_data_{filename_only}.md"
    with open(filename, "w", encoding="utf-8") as f:
        f.write(text)
    print(f"Wrote extracted text to -> {filename}")

In [None]:
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient


# Fail early with a clear message when the connection string was not loaded
# from the environment, instead of passing None to the storage client.
if not BLOB_STORAGE_ACCOUNT_CONNECTION_STRING:
    raise ValueError("Environment variable BLOB_STORAGE_ACCOUNT_CONNECTION_STRING must be set.")

# Initialize the BlobServiceClient
blob_service_client = BlobServiceClient.from_connection_string(BLOB_STORAGE_ACCOUNT_CONNECTION_STRING)
# This container name is reused by later cells to derive the data source,
# index, skillset and indexer names.
container_name = "toindexsoc"

# Get the container client
container_client = blob_service_client.get_container_client(container_name)

# Ensure the container exists
if not container_client.exists():
    container_client.create_container()

# Upload each element in the extracted_data list as a separate markdown file
for text, filename_only in extracted_data:
    blob_name = f"extracted_data_{filename_only}.md"
    blob_client = container_client.get_blob_client(blob_name)

    # Upload the markdown text to Blob Storage, replacing any existing blob
    blob_client.upload_blob(text, overwrite=True)

    print(f"Extracted markdown uploaded to blob storage at: {blob_name}")



In [None]:
from azure.search.documents.indexes import SearchIndexerClient
from azure.search.documents.indexes.models import (
    SearchIndexerDataContainer,
    SearchIndexerDataSourceConnection
)

# AzureKeyCredential is otherwise only imported by a later cell; import it
# here so this cell also works when the notebook is run top to bottom.
from azure.core.credentials import AzureKeyCredential

# Create a data source connection that points Azure AI Search at the blob
# container holding the extracted markdown files.
indexer_client = SearchIndexerClient(endpoint=AI_SEARCH_ENDPOINT, credential=AzureKeyCredential(AI_SEARCH_KEY))
container = SearchIndexerDataContainer(name=container_name)
data_source_connection = SearchIndexerDataSourceConnection(
    name=container_name+"-connection",
    type="azureblob",
    connection_string=BLOB_STORAGE_ACCOUNT_CONNECTION_STRING,
    container=container
)
data_source = indexer_client.create_or_update_data_source_connection(data_source_connection)

print(f"Data source '{data_source.name}' created or updated")

In [None]:
from azure.core.credentials import AzureKeyCredential
from azure.search.documents.indexes import SearchIndexClient
from azure.search.documents.indexes.models import (
    SearchField,
    SearchFieldDataType,
    VectorSearch,
    HnswAlgorithmConfiguration,
    VectorSearchProfile,
    AzureOpenAIVectorizer,
    AzureOpenAIVectorizerParameters,
    SearchIndex
)

# The vectorizer below needs the Azure OpenAI endpoint and key. No earlier
# cell assigns these names, so read them from the same environment variables
# that gpt4o_imagefile uses (KeyError if either is missing).
OPENAI_API_ENDPOINT = os.environ['OPENAI_API_ENDPOINT']
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

AZURE_SEARCH_CREDENTIAL = AzureKeyCredential(AI_SEARCH_KEY)
# Create a search index
index_name = container_name+"-index"
index_client = SearchIndexClient(endpoint=AI_SEARCH_ENDPOINT, credential=AZURE_SEARCH_CREDENTIAL)
fields = [
    SearchField(name="parent_id", type=SearchFieldDataType.String),
    SearchField(name="title", type=SearchFieldDataType.String),
    # chunk_id is the document key of the index
    SearchField(name="chunk_id", type=SearchFieldDataType.String, key=True, sortable=True, filterable=True, facetable=True, analyzer_name="keyword"),
    SearchField(name="chunk", type=SearchFieldDataType.String, sortable=False, filterable=False, facetable=False),
    # 1536-dimension vector field bound to the HNSW profile defined below
    SearchField(name="text_vector", type=SearchFieldDataType.Collection(SearchFieldDataType.Single), vector_search_dimensions=1536, vector_search_profile_name="myHnswProfile")
    ]

# Configure the vector search configuration: one HNSW algorithm, one profile
# that ties the algorithm to the Azure OpenAI vectorizer.
vector_search = VectorSearch(
    algorithms=[
        HnswAlgorithmConfiguration(name="myHnsw"),
    ],
    profiles=[
        VectorSearchProfile(
            name="myHnswProfile",
            algorithm_configuration_name="myHnsw",
            vectorizer_name="myOpenAI",
        )
    ],
    vectorizers=[
        AzureOpenAIVectorizer(
            vectorizer_name="myOpenAI",
            kind="azureOpenAI",
            parameters=AzureOpenAIVectorizerParameters(
                resource_url=OPENAI_API_ENDPOINT,
                deployment_name=os.getenv("EMBEDDINGS_MODEL_NAME"),
                model_name="text-embedding-ada-002",
                api_key=OPENAI_API_KEY
            ),
        ),
    ],
)

# Create the search index
index = SearchIndex(name=index_name, fields=fields, vector_search=vector_search)
result = index_client.create_or_update_index(index)
print(f"{result.name} created")

In [None]:
from azure.search.documents.indexes.models import (
    SplitSkill,
    InputFieldMappingEntry,
    OutputFieldMappingEntry,
    AzureOpenAIEmbeddingSkill,
    EntityRecognitionSkill,
    SearchIndexerIndexProjection,
    SearchIndexerIndexProjectionSelector,
    SearchIndexerIndexProjectionsParameters,
    IndexProjectionMode,
    SearchIndexerSkillset,
    CognitiveServicesAccountKey,
    WebApiSkill
)

AZURE_AI_KEY = os.getenv('AZURE_AI_KEY')

# The embedding skill below needs the Azure OpenAI endpoint and key. No
# earlier cell is guaranteed to have assigned these names, so read them from
# the same environment variables that gpt4o_imagefile uses (KeyError if
# either is missing).
OPENAI_API_ENDPOINT = os.environ['OPENAI_API_ENDPOINT']
OPENAI_API_KEY = os.environ['OPENAI_API_KEY']

# Create a skillset
skillset_name = container_name+"-skillset"

# Split each document's content into "pages" of at most 20000 characters
# with a 500-character overlap; the chunks are exposed as /document/pages.
split_skill = SplitSkill(
    description="Split skill to chunk documents",
    text_split_mode="pages",
    context="/document",
    maximum_page_length=20000,
    page_overlap_length=500,
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/content"),
    ],
    outputs=[
        OutputFieldMappingEntry(name="textItems", target_name="pages")
    ],
)

# Generate one embedding per chunk (context /document/pages/*) and store it
# as text_vector on that chunk.
embedding_skill = AzureOpenAIEmbeddingSkill(
    description="Skill to generate embeddings via Azure OpenAI",
    context="/document/pages/*",
    resource_url=OPENAI_API_ENDPOINT,
    api_key=OPENAI_API_KEY,
    deployment_name=os.getenv("EMBEDDINGS_MODEL_NAME"),
    model_name="text-embedding-ada-002",
    dimensions=1536,
    inputs=[
        InputFieldMappingEntry(name="text", source="/document/pages/*"),
    ],
    outputs=[
        OutputFieldMappingEntry(name="embedding", target_name="text_vector")
    ],
)

# Project every chunk into the target index as its own document (chunk text,
# vector and source file name); parent documents themselves are not indexed.
index_projections = SearchIndexerIndexProjection(
    selectors=[
        SearchIndexerIndexProjectionSelector(
            target_index_name=index_name,
            parent_key_field_name="parent_id",
            source_context="/document/pages/*",
            mappings=[
                InputFieldMappingEntry(name="chunk", source="/document/pages/*"),
                InputFieldMappingEntry(name="text_vector", source="/document/pages/*/text_vector"),
                InputFieldMappingEntry(name="title", source="/document/metadata_storage_name"),
            ],
        ),
    ],
    parameters=SearchIndexerIndexProjectionsParameters(
        projection_mode=IndexProjectionMode.SKIP_INDEXING_PARENT_DOCUMENTS
    ),
)

cognitive_services_account = CognitiveServicesAccountKey(key=AZURE_AI_KEY)

# Skills are listed in pipeline order: chunk first, then embed each chunk.
skills = [split_skill, embedding_skill]

skillset = SearchIndexerSkillset(
    name=skillset_name,
    description="Skillset to chunk documents and generating embeddings",
    skills=skills,
    index_projection=index_projections,
    cognitive_services_account=cognitive_services_account
)

client = SearchIndexerClient(endpoint=AI_SEARCH_ENDPOINT, credential=AZURE_SEARCH_CREDENTIAL)
client.create_or_update_skillset(skillset)
print(f"{skillset.name} created")

In [None]:
from azure.search.documents.indexes.models import (
    SearchIndexer,
    FieldMapping
)

# Define the indexer that ties together the data source, the skillset and the
# target index, then register it with the search service.
indexer_name = f"{container_name}-indexer"

# No extra indexer parameters are supplied.
indexer_parameters = None

# Client used to create or update the indexer
indexer_client = SearchIndexerClient(endpoint=AI_SEARCH_ENDPOINT, credential=AZURE_SEARCH_CREDENTIAL)

indexer = SearchIndexer(
    name=indexer_name,
    description="Indexer to index documents and generate embeddings",
    skillset_name=skillset_name,
    target_index_name=index_name,
    data_source_name=data_source.name,
    # Copy the blob's file name (metadata_storage_name) into the index's title field
    field_mappings=[
        FieldMapping(source_field_name="metadata_storage_name", target_field_name="title"),
    ],
    parameters=indexer_parameters,
)

indexer_result = indexer_client.create_or_update_indexer(indexer)

print(f' {indexer_name} is created and running. Give the indexer a few minutes before running a query.')