In [1]:
import os
import re
import base64
import json

from dotenv import load_dotenv

load_dotenv()

# Import Azure libraries for document analysis
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from typing import Optional

# Import custom utility modules
from utils.pdfparser import DocumentAnalysisPdfParser
from utils.textsplitter import TextSplitter, SplitPage

# Base name of the report to process, read from the REPORT_NAME variable
# (load_dotenv() above makes values from a .env file visible to os.getenv).
report_name = os.getenv("REPORT_NAME")
# Bare expression so the notebook echoes the value as the cell output.
report_name

'SOC.05 - ADP GETS'

Create Index

In [4]:
# import os

# from azure.core.credentials import AzureKeyCredential
# from azure.search.documents.indexes import SearchIndexClient

# from azure.search.documents.indexes.models import (
#     SearchIndex,
#     SearchField,
#     SearchFieldDataType,
#     SimpleField,
#     SearchableField,
#     VectorSearch,
#     VectorSearchProfile,
#     HnswAlgorithmConfiguration,
#     SemanticConfiguration,
#     SemanticField,
#     SemanticPrioritizedFields,
#     SemanticSearch,
# )

# from dotenv import load_dotenv

# load_dotenv()

# service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
# index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
# key = os.getenv("AZURE_SEARCH_API_KEY")


# def get_index(name: str):
#     fields = [
#         SimpleField(name="id", type=SearchFieldDataType.String, key=True),
#         SearchableField(
#             name="content",
#             type=SearchFieldDataType.String,
#             sortable=True,
#             filterable=True,
#             facetable=True,
#         ),
#         SearchField(
#             name="embedding",
#             type=SearchFieldDataType.Collection(SearchFieldDataType.Single),
#             searchable=True,
#             vector_search_dimensions=1536,
#             vector_search_profile_name="my-vector-config",
#         ),
#         SimpleField(
#             name="category",
#             type=SearchFieldDataType.String,
#             filterable=True,
#             facetable=True,
#         ),
#         SimpleField(
#             name="sourcepage",
#             type=SearchFieldDataType.String,
#             filterable=True,
#             facetable=True,
#         ),
#         SimpleField(
#             name="sourcefile",
#             type=SearchFieldDataType.String,
#             filterable=True,
#             facetable=True,
#         ),
#         SimpleField(
#             name="pdf_page_num",
#             type=SearchFieldDataType.Int32,
#             filterable=True,
#             facetable=True,
#         ),
#         SimpleField(
#             name="section",
#             type=SearchFieldDataType.String,
#             filterable=True,
#             facetable=True,
#         ),
#     ]
#     vector_search = VectorSearch(
#         profiles=[
#             VectorSearchProfile(
#                 name="my-vector-config",
#                 algorithm_configuration_name="my-algorithms-config",
#             )
#         ],
#         algorithms=[HnswAlgorithmConfiguration(name="my-algorithms-config")],
#     )

#     semantic_config = SemanticConfiguration(
#         name="my-semantic-config",
#         prioritized_fields=SemanticPrioritizedFields(
#             title_field=None,
#             content_fields=[SemanticField(field_name="content")],
#         ),
#     )

#     # Create the semantic settings with the configuration
#     semantic_search = SemanticSearch(configurations=[semantic_config])

#     return SearchIndex(
#         name=name,
#         fields=fields,
#         vector_search=vector_search,
#         semantic_search=semantic_search,
#     )


# if __name__ == "__main__":
#     credential = AzureKeyCredential(key)
#     index_client = SearchIndexClient(service_endpoint, credential)

#     # Delete Index
#     index_client.delete_index(index_name)

#     # Create Index
#     index = get_index(index_name)
#     index_client.create_or_update_index(index)

#     print("Created Index",f"{index_name}")

Chunk the Documents

In [None]:
import os
import re
import base64
import json

from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient
from typing import Optional

from utils.pdfparser import DocumentAnalysisPdfParser
from utils.textsplitter import TextSplitter, SplitPage

from dotenv import load_dotenv

load_dotenv()

# Azure Document Intelligence connection settings and the report to chunk,
# all read from the environment. `credential` holds the raw key string here;
# it is wrapped in AzureKeyCredential where the analysis client is created.
endpoint = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_ENDPOINT")
credential = os.getenv("AZURE_DOCUMENT_INTELLIGENCE_KEY")
# Used as the PDF base name, in chunk ids and in the sourcefile/sourcepage fields.
report_name = os.getenv("REPORT_NAME")

class Section:
    """
    A section of a page that is stored in a search service; these sections are
    used as context by the Azure OpenAI service.

    Attributes:
        split_page: The chunk produced by the text splitter that this section wraps.
        content: Caller-supplied content associated with the section.
        category: Optional category label; ``None`` when not provided.
        pdf_page_no: Page number of the chunk within the PDF.
        section: Name of the report section the chunk belongs to.
    """

    def __init__(
        self,
        split_page: SplitPage,
        content: str,
        pdf_page_no: int,
        section: str,
        category: Optional[str] = None,
    ):
        # Plain value holder: every argument is stored unchanged.
        self.split_page, self.content = split_page, content
        self.category = category
        self.pdf_page_no, self.section = pdf_page_no, section

def create_documents(result, pdf_file):
    """
    Turn a document-analysis result into the list of chunk documents for the search index.

    The result is parsed into pages, the pages are split into chunks, and each
    chunk becomes one dictionary with a stable, index-safe id.

    Args:
        result: Analysis result handed to ``DocumentAnalysisPdfParser.parse``.
        pdf_file: Stored as ``content`` on each intermediate ``Section``; it is
            not used when building the returned dictionaries.

    Returns:
        list[dict]: One dictionary per chunk with the keys ``id``, ``content``,
        ``category``, ``section``, ``sourcepage``, ``sourcefile`` and
        ``pdf_page_num``. The list is empty when the splitter yields no chunks.
    """
    pdf_parser = DocumentAnalysisPdfParser(endpoint=endpoint, credential=credential)
    text_splitter = TextSplitter(has_image_embeddings=False)

    pages = list(pdf_parser.parse(result=result))

    sections = [
        Section(
            split_page=split_page,
            content=pdf_file,
            category=None,
            pdf_page_no=split_page.pdf_page_num,
            section=split_page.section,
        )
        for split_page in text_splitter.split_pages(pages)
    ]

    # Both id components depend only on the report name, so build them once.
    # Hoisting them also keeps double quotes out of the f-string expression,
    # which is a SyntaxError on Python versions before 3.12.
    safe_report_name = re.sub("[^0-9a-zA-Z_-]", "_", report_name)
    report_name_hex = base64.b16encode(report_name.encode("utf-8")).decode("ascii")

    # Every section is emitted. The previous per-batch loop rebound `documents`
    # on each iteration, so only the last batch of 1000 was returned, and an
    # empty section list raised UnboundLocalError. The running index below is
    # the same value the batch arithmetic produced, so existing ids are unchanged.
    return [
        {
            "id": f"file-{safe_report_name}-{report_name_hex}-page-{section_index}",
            "content": section.split_page.text,
            "category": section.category,
            "section": section.section,
            "sourcepage": f"{report_name}#page={section.split_page.page_num+1}",
            "sourcefile": f"{report_name}",
            "pdf_page_num": section.split_page.pdf_page_num,
        }
        for section_index, section in enumerate(sections)
    ]

if __name__ == "__main__":
    # Input PDF and output chunk file, both derived from the report name
    soc_report_path = os.path.join("data", "document", f"{report_name}.pdf")
    chunks_folder = os.path.join("data", "chunks")
    file_name = f"{report_name}-chunks.txt"
    file_path = os.path.join(chunks_folder, file_name)

    # Ensure the output folder exists
    os.makedirs(chunks_folder, exist_ok=True)

    document_analysis_client = DocumentAnalysisClient(
        endpoint, AzureKeyCredential(credential)
    )

    # Analyse the PDF with the prebuilt layout model and chunk the result while
    # the file handle is still open (it is passed on to create_documents).
    with open(soc_report_path, "rb") as pdf_file:
        poller = document_analysis_client.begin_analyze_document(
            "prebuilt-layout", document=pdf_file
        )
        result = poller.result()
        documents = create_documents(result, pdf_file)

    # Persist the chunks as pretty-printed JSON
    with open(file_path, "w", encoding="utf-8") as output:
        json.dump(documents, output, indent=4)

print("Chunks Created for Document",f"{report_name}")

Chunks Created for Document SOC.05 - ADP GETS


Adding the Chunks to the Vector Index

In [4]:
import os
import json

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from tqdm import tqdm

from dotenv import load_dotenv

load_dotenv()

# Azure AI Search connection settings and the report whose chunks are indexed,
# all read from the environment.
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")
key = os.getenv("AZURE_SEARCH_API_KEY")
report_name = os.getenv("REPORT_NAME")

# Echo the report name so the cell output shows which report is being indexed.
print(report_name)

def get_embeddings(text: str):
    """
    Request an embedding for ``text`` from Azure OpenAI.

    The endpoint and key come from the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_API_KEY`` environment variables. A new client is created on
    every call.

    Args:
        text: The text to embed; it is sent as a single-element input list.

    Returns:
        The ``embedding`` attribute of the first entry in the response data.
    """
    # There are a few ways to get embeddings. This is just one example.
    import openai

    azure_client = openai.AzureOpenAI(
        api_version="2023-03-15-preview",
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    )
    response = azure_client.embeddings.create(
        model="text-embedding-ada-002",
        input=[text],
    )
    # Exactly one input was sent, so the first data entry is its embedding.
    return response.data[0].embedding

def get_documents():
    """
    Load the saved chunks for ``report_name`` and attach an embedding to each one.

    Reads ``data/chunks/<report_name>-chunks.txt`` (a JSON list of chunk
    dictionaries) and calls ``get_embeddings`` once per chunk that has text.

    Returns:
        list[dict]: One dictionary per non-empty chunk with the keys ``id``,
        ``content``, ``embedding``, ``category``, ``section``, ``sourcepage``,
        ``sourcefile`` and ``pdf_page_num``. Chunks whose ``content`` is empty
        are skipped.
    """
    chunks_folder = os.path.join("data", "chunks")
    file_name = f"{report_name}-chunks.txt"
    file_path = os.path.join(chunks_folder, file_name)

    # The chunk file is written with encoding="utf-8" by the chunking cell;
    # read it back the same way instead of relying on the platform default.
    with open(file_path, encoding="utf-8") as chunks_file:
        chunks = json.load(chunks_file)

    documents = []

    for chunk in tqdm(chunks):
        content = chunk["content"]

        # Nothing to embed for an empty chunk. The old "empty page" fallback
        # sat inside this same truthiness check and could never be reached.
        if not content:
            continue

        documents.append(
            {
                "id": chunk["id"],
                "content": content,
                "embedding": get_embeddings(content),
                "category": chunk["category"],
                "section": chunk["section"],
                "sourcepage": chunk["sourcepage"],
                "sourcefile": chunk["sourcefile"],
                "pdf_page_num": chunk["pdf_page_num"],
            }
        )

    return documents

if __name__ == "__main__":
    credential = AzureKeyCredential(key)
    client = SearchClient(service_endpoint, index_name, credential)
    documents = get_documents()

    # Azure AI Search accepts at most 1000 indexing actions per request, so the
    # documents are uploaded in slices instead of one oversized batch.
    MAX_UPLOAD_BATCH_SIZE = 1000

    # upload_documents reports failures per document rather than raising, so
    # collect the keys of every document the service rejected.
    failed_keys = []
    for batch_start in range(0, len(documents), MAX_UPLOAD_BATCH_SIZE):
        upload_results = client.upload_documents(
            documents=documents[batch_start : batch_start + MAX_UPLOAD_BATCH_SIZE]
        )
        failed_keys.extend(
            upload_result.key
            for upload_result in upload_results
            if not upload_result.succeeded
        )

    if not documents:
        print("No chunks found to index for", f"{report_name}")
    elif failed_keys:
        print(
            f"Failed to index {len(failed_keys)} of {len(documents)} chunks for",
            f"{report_name}:",
            failed_keys,
        )
    else:
        print("Added to Index",f"{report_name}")

SOC.05 - ADP GETS


100%|██████████| 74/74 [01:09<00:00,  1.07it/s]


Added to Index SOC.05 - ADP GETS


Searching/Retrieving from the Index

In [None]:
import os
import json

from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient
from azure.search.documents.models import VectorizedQuery
from azure.search.documents.models import QueryType

from dotenv import load_dotenv

load_dotenv()

# Read the service connection settings and query configuration from the environment
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")  # Azure Search service endpoint
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")  # Name of the Azure Search index
key = os.getenv("AZURE_SEARCH_API_KEY")  # API key for authentication
k_nearest_neighbors = 50  # Number of nearest neighbors requested by the vector query
report_name = os.getenv("REPORT_NAME")  # Report name used in the search filter and the output file name

# Echo the report name so the cell output shows which report is being queried
print(report_name)

def get_embeddings(text: str):
    """
    Return the Azure OpenAI embedding for ``text``.

    Connection details are read from ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_API_KEY``; a fresh client is constructed on each call.

    Args:
        text: The text to embed, sent as the only element of the input list.

    Returns:
        The ``embedding`` attribute of the first entry in the response data.
    """
    # There are a few ways to get embeddings. This is just one example.
    import openai

    client_settings = {
        "azure_endpoint": os.getenv("AZURE_OPENAI_ENDPOINT"),
        "api_key": os.getenv("AZURE_OPENAI_API_KEY"),
        "api_version": "2023-03-15-preview",
    }
    azure_client = openai.AzureOpenAI(**client_settings)

    response = azure_client.embeddings.create(
        input=[text], model="text-embedding-ada-002"
    )
    # One input in, one embedding out.
    return response.data[0].embedding

def semantic_query(query):
    """
    Run a hybrid (text + vector) semantic search restricted to the current report.

    The query text is embedded with ``get_embeddings`` and sent together with
    the raw text, using the ``my-semantic-config`` semantic configuration.
    Results are filtered to ``sourcefile == report_name`` and
    ``section == 'Section 3'``.

    Args:
        query: The natural-language query text.

    Returns:
        The iterable returned by ``SearchClient.search``; each item exposes the
        selected fields ``id``, ``sourcefile``, ``content`` and ``pdf_page_num``.
    """
    search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))
    vector_query = VectorizedQuery(
        vector=get_embeddings(query),
        k_nearest_neighbors=k_nearest_neighbors,
        fields="embedding",
    )

    # OData string literals are delimited by single quotes, and an embedded
    # quote must be doubled. Without this a report name containing an
    # apostrophe produces an invalid (or unintended) filter expression.
    escaped_report_name = str(report_name).replace("'", "''")

    results = search_client.search(
        query_type=QueryType.SEMANTIC,
        semantic_configuration_name="my-semantic-config",
        search_text=query,
        vector_queries=[vector_query],
        filter=f"sourcefile eq '{escaped_report_name}' and section eq 'Section 3'",
        select=["id", "sourcefile", "content", "pdf_page_num"],
    )

    return results

if __name__ == "__main__":
    # Output location for the retrieved chunks (created if missing)
    search_output_folder = os.path.join("data", "search")
    os.makedirs(search_output_folder, exist_ok=True)

    query = (
        "sap applications in scope or sap systems in scope or sap platforms in scope"
    )
    results = semantic_query(query=query)

    file_name = f"{report_name}-search.txt"
    file_path = os.path.join(search_output_folder, file_name)

    # Number the hits from 1 in the order the service returned them
    indexed_results = [
        {
            "index": position,
            "id": hit["id"],
            "sourcefile": hit["sourcefile"],
            "content": hit["content"],
            "pdf_page_num": hit["pdf_page_num"],
        }
        for position, hit in enumerate(results, start=1)
    ]

    # Save the numbered hits as pretty-printed JSON
    with open(file_path, "w", encoding="utf-8") as output:
        json.dump(indexed_results, output, indent=4)

    print("The Chunks are Retrived and Saved as-", f"{file_name}")

SOC.05 - ADP GETS
The Chunks are Retrived and Saved as- SOC.05 - ADP GETS-search.txt


In [None]:
import os
import json

# Importing necessary modules from Azure SDK
from azure.core.credentials import AzureKeyCredential
from azure.search.documents import SearchClient

from azure.search.documents.models import VectorizedQuery
from azure.search.documents.models import QueryType

from dotenv import load_dotenv

load_dotenv()

# Read the service connection settings and query configuration from the environment
service_endpoint = os.getenv("AZURE_SEARCH_SERVICE_ENDPOINT")  # Azure Search service endpoint
index_name = os.getenv("AZURE_SEARCH_INDEX_NAME")  # Name of the Azure Search index
key = os.getenv("AZURE_SEARCH_API_KEY")  # API key for authentication
k_nearest_neighbors = 50  # Number of nearest neighbors requested by the vector query
report_name = os.getenv("REPORT_NAME")  # Report name used in the search filter and the output file name

# Echo the report name so the cell output shows which report is being queried
print(report_name)

def get_embeddings(text: str):
    """
    Get the embedding vector for the query text from Azure OpenAI.

    The endpoint and API key are fetched from the ``AZURE_OPENAI_ENDPOINT`` and
    ``AZURE_OPENAI_API_KEY`` environment variables, and a new client is built
    on every call.

    Args:
        text: The text to embed; it is submitted as a one-element input list.

    Returns:
        The ``embedding`` attribute of the first entry in the response data.
    """
    # Local import: the OpenAI package is only needed when embeddings are requested
    import openai

    azure_client = openai.AzureOpenAI(
        api_version="2023-03-15-preview",
        api_key=os.getenv("AZURE_OPENAI_API_KEY"),
        azure_endpoint=os.getenv("AZURE_OPENAI_ENDPOINT"),
    )

    response = azure_client.embeddings.create(
        model="text-embedding-ada-002",
        input=[text],
    )

    # Only one input was submitted, so its embedding is the first data entry.
    return response.data[0].embedding

def semantic_query(query):
    """
    Perform a hybrid (text + vector) semantic query on Azure Search.

    The query text is embedded with ``get_embeddings`` and searched together
    with the raw text using the ``my-semantic-config`` semantic configuration.
    Results are restricted to ``sourcefile == report_name`` and
    ``section == 'Section 3'``.

    Args:
        query: The natural-language query text.

    Returns:
        The iterable returned by ``SearchClient.search``; each item exposes the
        selected fields ``id``, ``sourcefile``, ``content`` and ``pdf_page_num``.
    """
    # Initialize the Azure Search client with the service endpoint and credentials
    search_client = SearchClient(service_endpoint, index_name, AzureKeyCredential(key))

    # Vector part of the query, matched against the "embedding" index field
    vector_query = VectorizedQuery(
        vector=get_embeddings(query),
        k_nearest_neighbors=k_nearest_neighbors,
        fields="embedding",
    )

    # OData string literals are delimited by single quotes, and an embedded
    # quote must be doubled. Without this a report name containing an
    # apostrophe produces an invalid (or unintended) filter expression.
    escaped_report_name = str(report_name).replace("'", "''")

    results = search_client.search(
        query_type=QueryType.SEMANTIC,  # Semantic ranking on top of the text/vector match
        semantic_configuration_name="my-semantic-config",
        search_text=query,
        vector_queries=[vector_query],
        filter=f"sourcefile eq '{escaped_report_name}' and section eq 'Section 3'",
        select=["id", "sourcefile", "content", "pdf_page_num"],  # Fields returned per hit
    )

    return results

# Entry point: run one semantic query and save the numbered hits as JSON
if __name__ == "__main__":
    # Output folder for search results (created if it does not exist)
    search_output_folder = os.path.join("data", "search")
    os.makedirs(search_output_folder, exist_ok=True)

    # Alternative query kept for reference:
    # query = ("sap applications in scope or sap systems in scope or sap platforms in scope")

    query = ("What services does ADP offer? What is ADP TotalSource? What types of retirement plans does ADP Retirement Services administer?")

    # Alternative prompt-style query kept for reference:
    # query = (
    #     "Please extract the relevant and **unique** information from the report regarding **SAP applications, systems, and platforms**. "
    #     "The extracted information should specifically focus on the following:\n\n"
    #
    #     "1. **Tabular Data**: Extract any tables that list or describe SAP applications, systems, or platforms. "
    #     "Include key details such as their names, functionalities, versions, and any other identifying attributes.\n\n"
    #
    #     "2. **Bullet Points**: If the report uses bullet points to list or explain specific SAP applications, systems, or platforms, "
    #     "extract only those points that provide **unique** and actionable details about the systems in scope. Avoid extracting general or repetitive content.\n\n"
    #
    #     "3. **Textual Information**: Extract relevant textual content that mentions **specific SAP applications**, **systems**, or **platforms**. "
    #     "Focus on their role, functionalities, integrations, and how they are being actively implemented or used. Prioritize **unique** details "
    #     "that differentiate the systems from generic or introductory content.\n\n"
    #
    #     "**Additional Instructions**:\n"
    #     "- Ensure the information is **unique** and not redundant.\n"
    #     "- Focus only on **SAP systems actively in scope**.\n"
    # )

    results = semantic_query(query=query)

    # Alternative output name kept for reference:
    # file_name = f"{report_name}-search.txt"
    file_name = f"{report_name}-simple-search-update.txt"
    file_path = os.path.join(search_output_folder, file_name)

    # Consume the search results and serialise them BEFORE opening the output
    # file. Opening in "w" mode truncates the file immediately, so iterating
    # inside the `with` block meant a failed search or serialisation left an
    # empty file and destroyed the previous output.
    indexed_results = [
        {
            "index": position,  # 1-based rank in the order returned by the service
            "id": hit["id"],
            "sourcefile": hit["sourcefile"],
            "content": hit["content"],
            "pdf_page_num": hit["pdf_page_num"],
        }
        for position, hit in enumerate(results, start=1)
    ]
    serialized_results = json.dumps(indexed_results, indent=4)

    with open(file_path, "w", encoding="utf-8") as output:
        output.write(serialized_results)

    print("The Chunks are Retrived and Saved as-", f"{file_name}")

SOC.05 - ADP GETS
The Chunks are Retrived and Saved as- SOC.05 - ADP GETS-simple-search-update.txt


In [None]:
import os
import json
import numpy as np
import pandas as pd

# NOTE(review): this value is never read in this cell; both loops below rebind
# `report_name` before using it. Unlike the os.getenv(...) calls in the other
# cells, os.environ[...] raises KeyError when REPORT_NAME is not set.
report_name = os.environ["REPORT_NAME"]

def extract_column_value_from_llm(data):
    """
    Collect the application names predicted by the LLM.

    Walks every predicted column of every predicted row of every table entity
    under ``data["extractedTagValues"]["TableEntities"]`` and keeps the
    ``ColumnValue`` of columns whose ``ColumnKey`` is
    ``tag_IT_applications_tbl_applicationName``.

    Args:
        data: Parsed LLM output (nested dictionaries and lists).

    Returns:
        list: The matching column values, in document order.
    """
    return [
        column["ColumnValue"]
        for table_entity in data["extractedTagValues"]["TableEntities"]
        for row in table_entity["PredictedRows"]
        for column in row["PredictedColumns"]
        if column["ColumnKey"] == "tag_IT_applications_tbl_applicationName"
    ]

def extract_column_value_from_report(df):
    """
    Collect the application names listed in the report sheet.

    Reads the ``IT applications, IT processes and ITGCs`` column, skips its
    first three entries (presumably header/template rows; confirm against the
    workbook layout) and gathers values until the
    ``Insert additional rows as needed`` marker is reached.

    Args:
        df: Object indexable by column name whose column provides ``to_list()``
            (a pandas DataFrame in this notebook).

    Returns:
        list: The values between the skipped rows and the marker, or up to the
        end of the column when the marker is absent.
    """
    end_marker = "Insert additional rows as needed"
    column_values = df["IT applications, IT processes and ITGCs"].to_list()

    collected = []
    for entry in column_values[3:]:
        if entry == end_marker:
            return collected
        collected.append(entry)
    return collected

if __name__ == "__main__":
    # Compare, for every .xlsx report in data/reports, the application names in
    # the report sheet with the names extracted by the LLM
    # (data/output/<report>-it-apps.txt), then print per-report and average
    # coverage / hallucination rates.

    reports_folder = os.path.join("data", "reports")
    llm_output_folder = os.path.join("data", "output")

    # The paths are relative to the current working directory. Fail with the
    # resolved absolute path so a wrong working directory is easy to spot.
    if not os.path.isdir(reports_folder):
        raise FileNotFoundError(
            f"Reports folder not found: {os.path.abspath(reports_folder)}"
        )

    cummulative_output = {
        "report_name": [],
        "report_output": [],
        "llm_output": [],
        "is_carved_out": [],
    }

    for report_file_name in os.listdir(reports_folder):
        if not report_file_name.endswith(".xlsx"):
            continue

        report_name = os.path.splitext(report_file_name)[0]
        cummulative_output["report_name"].append(report_name)

        # Ground truth: application names listed in the report workbook
        file_path = os.path.join(reports_folder, f"{report_name}.xlsx")
        df = pd.read_excel(file_path, sheet_name="IT apps, IT processes & ITGCs")
        report_output = extract_column_value_from_report(df)
        cummulative_output["report_output"].append(report_output)

        # Prediction: application names extracted by the LLM
        file_path = os.path.join(llm_output_folder, f"{report_name}-it-apps.txt")
        with open(file_path, "r") as f:
            data = json.load(f)
        llm_ouput = extract_column_value_from_llm(data)
        cummulative_output["llm_output"].append(llm_ouput)

        # A report that lists no applications is treated as carved out
        cummulative_output["is_carved_out"].append(len(report_output) == 0)

    # Coverage: share of the report's items that also appear in the LLM output.
    # Hallucination: share of the LLM's items that do not appear in the report.

    header = "Report Name    |    Coverage    |    Hallucination"
    print("-" * len(header))
    print(header)
    print("-" * len(header))

    avg_cov = []
    avg_hall = []

    for i, report_name in enumerate(cummulative_output["report_name"]):
        local_report_output = cummulative_output["report_output"][i]
        local_llm_ouput = cummulative_output["llm_output"][i]

        # Coverage
        if len(local_report_output) == 0:
            coverage = "N/A (Carved out)"
        else:
            count = sum(1 for item in local_report_output if item in local_llm_ouput)
            coverage = count / len(local_report_output)
            avg_cov.append(coverage)
            coverage = f"{coverage * 100}%"

        # Hallucination
        if len(local_llm_ouput) == 0:
            hallunication = "Nil"
        else:
            count = sum(1 for item in local_llm_ouput if item not in local_report_output)
            hallunication = count / len(local_llm_ouput)
            avg_hall.append(hallunication)
            hallunication = f"{hallunication * 100}%"

        print(f"{report_name} | {coverage} | {hallunication}")

    # np.mean of an empty list is nan and emits a RuntimeWarning, so print
    # "N/A" when no report contributed a value to an average.
    average_coverage = f"{np.mean(avg_cov) * 100}%" if avg_cov else "N/A"
    average_hallucination = f"{np.mean(avg_hall) * 100}%" if avg_hall else "N/A"

    print("-" * len(header))
    print(f"Average | {average_coverage} | {average_hallucination}")
    print("-" * len(header))

FileNotFoundError: [WinError 3] The system cannot find the path specified: 'data/reports'