Before Running the Code.
============


In [11]:
# Install the dependencies listed in requirements.txt into the kernel's environment.
# NOTE: --force-reinstall reinstalls the listed packages even when they are already installed.
%pip install --force-reinstall -r requirements.txt

### Initializing 

This demo runs the extraction against a single PDF file. Before running, set `target_file_name` in the next cell to the name of the file you want to process (the file is expected in the `pdf` folder).

In [1]:
# import installed libraries
from dotenv import load_dotenv
import os
import json
import datetime
import base64
import requests
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import AnalyzeDocumentRequest, ContentFormat, AnalyzeResult
from openai import AzureOpenAI
import re


# load environment variables
# override=True lets values from local.env replace variables already present in the
# process environment.
load_dotenv("local.env", override=True)
# Name of the PDF to process. Later cells look for it under the "pdf" folder and derive
# every output file name (.md, .json, .txt) from it by replacing the ".pdf" suffix.
target_file_name = "Meta-Reports-First-Quarter-2024-Results-2024.pdf"


### Initializing Doc Intel for text extraction

In [2]:

# Read the Document Intelligence connection settings from the environment
# (populated from local.env by load_dotenv in the first cell).
docintel_endpoint = os.getenv("AZURE_DOC_INTEL_ENDPOINT")
docintel_key = os.getenv("AZURE_DOC_INTEL_KEY")

# Fail fast with a readable message instead of letting a missing setting surface
# later as an opaque error from inside the SDK.
if not docintel_endpoint or not docintel_key:
    raise RuntimeError(
        "AZURE_DOC_INTEL_ENDPOINT and AZURE_DOC_INTEL_KEY must both be set "
        "(check local.env)."
    )

# Echo the endpoint so it is visible which resource this run talks to.
print(docintel_endpoint)

# Client used by the OCR cell to analyze the PDF.
document_intelligence_client = DocumentIntelligenceClient(
    endpoint=docintel_endpoint,
    credential=AzureKeyCredential(docintel_key),
)



https://jhl-doc-intel.cognitiveservices.azure.com/


### Running OCR, converting contents to Markdown format

In [3]:
# Build <working dir>/pdf/<target_file_name>. "__file__" is a plain string here, so
# os.path.abspath() resolves it against the current working directory.
path_to_sample_documents = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath("__file__")), "pdf", target_file_name)
)
print(path_to_sample_documents)

# Send the raw PDF bytes to the "prebuilt-layout" model and ask for Markdown output.
with open(path_to_sample_documents, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout",
        analyze_request=f,
        content_type="application/octet-stream",
        output_content_format=ContentFormat.MARKDOWN,
    )
# Wait for the analysis result.
result: AnalyzeResult = poller.result()

# Persist the extracted Markdown as <working dir>/markdown/<name>.md so the later
# cells can reload it without re-running OCR.
md_file_name = target_file_name.replace(".pdf", ".md")
md_file_path = os.path.join(
    os.path.dirname(os.path.abspath("__file__")), "markdown", md_file_name
)
os.makedirs(os.path.dirname(md_file_path), exist_ok=True)
with open(md_file_path, 'w', encoding='utf-8') as md_file:
    md_file.write(result.content)

c:\Users\juhyunlee\workspace\notebooks\doc-facts-and-summary-extraction\pdf\Meta-Reports-First-Quarter-2024-Results-2024.pdf


### Initializing the Azure OpenAI Client

In [4]:
# define openai Client
# Connection settings are read from environment variables.
aoai_api_key = os.getenv("AZURE_OPENAI_KEY")
aoai_api_endpoint =  os.getenv("AZURE_OPENAI_ENDPOINT")
# The deployment name is passed as `model` by the chat-completion helper functions.
aoai_api_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

# Shared client; the helper functions defined in later cells call
# client.chat.completions.create(...) on it.
client = AzureOpenAI(
    api_key=aoai_api_key,
    api_version="2023-05-15",
    azure_endpoint=aoai_api_endpoint
)

### Defining functions to handle API Calls to Azure OpenAI

In [5]:
def clean_json_string(json_string):
    """Remove a Markdown code fence that wraps an entire model response.

    The text is stripped of surrounding whitespace first, so a fenced block that
    is preceded or followed by blank lines/spaces is still recognised. Two fence
    styles are unwrapped:

    * a fence tagged ``json`` (any letter case), e.g. "```json ... ```"
    * a bare fence whose opening "```" is immediately followed by a newline

    Fences tagged with any other language, and text that is not fully wrapped
    in a fence, are returned unchanged apart from the outer strip.

    Args:
        json_string: Raw text returned by the model.

    Returns:
        The unwrapped (or merely stripped) text.
    """
    text = json_string.strip()
    # Either "```json" + optional whitespace, or a bare "```" + newline. Requiring the
    # newline for the bare form keeps other language tags (```python) from matching.
    pattern = r'^```(?:json\s*|[ \t]*\r?\n)(.*?)\s*```$'
    match = re.match(pattern, text, flags=re.DOTALL | re.IGNORECASE)
    if match is None:
        return text
    return match.group(1).strip()

def getResponseFromAoAI(systemPrompt, userPrompt):
    """Run one chat completion and return the text of the first choice.

    Uses the module-level `client` and `aoai_api_deployment_name`, and always
    requests temperature 0.

    Args:
        systemPrompt: Content of the "system" message.
        userPrompt: Content of the "user" message.

    Returns:
        The `message.content` of the first choice in the response.
    """
    completion = client.chat.completions.create(
        # The deployment name you chose when you deployed the GPT-35-Turbo or GPT-4 model.
        model=aoai_api_deployment_name,
        messages=[
            {"role": "system", "content": systemPrompt},
            {"role": "user", "content": userPrompt},
        ],
        # response_format={ "type": "json_object" }, #requires ptu enabled gpt4
        temperature=0,
    )
    return completion.choices[0].message.content

### Passing in the entire markdown to LLM as Context to extract information from

In [6]:
# Reload the Markdown saved by the OCR cell. That file is written with encoding='utf-8',
# so it must be read back as UTF-8 as well; relying on the platform default encoding
# can raise UnicodeDecodeError or garble characters where the default is not UTF-8.
md_file_name = target_file_name.replace(".pdf", ".md")
md_file_path = os.path.join(os.path.dirname(os.path.abspath("__file__")), "markdown", md_file_name)
with open(md_file_path, 'r', encoding='utf-8') as file:
    full_file_text = file.read()


# System prompt: instructs the model to answer only from the supplied context, in JSON.
prepped_systemPrompt = '''
    You are a helpful assistant that helps extract information, Do not generate output that isn't in properly formatted JSON.
    Context will be providied with triple quotes. Answer only within the user given context. if there is no answer found, always return the key with empty string value.
    Always provide answer in JSON Object.
'''
# User prompt: the whole document wrapped in triple quotes, followed by the question.
# Alternative question: "Provide the Business name, Company name, Legal entity name, Date of incorporation?"
prepped_userPrompt = '''
    """
        {context}
    """

    Provide the revenue, capital expenditures, products annouced?
'''.format(context=full_file_text)


aoaiResp = getResponseFromAoAI(
    prepped_systemPrompt,
    prepped_userPrompt
)

# Remove a wrapping code fence (if any) before parsing the reply as JSON.
cleaned_response = clean_json_string(aoaiResp)
extracted = json.loads(cleaned_response)

print(json.dumps(extracted, indent=2))

{
  "revenue": "$ 36,455",
  "capital_expenditures": "$6.72 billion",
  "products_announced": "Meta Al with Llama 3"
}


### Running prompt to Summarize the Content as a whole by passing in the markdown

In [7]:
# Reload the Markdown saved by the OCR cell. That file is written with encoding='utf-8',
# so it must be read back as UTF-8 as well; relying on the platform default encoding
# can raise UnicodeDecodeError or garble characters where the default is not UTF-8.
md_file_name = target_file_name.replace(".pdf", ".md")
md_file_path = os.path.join(os.path.dirname(os.path.abspath("__file__")), "markdown", md_file_name)
with open(md_file_path, 'r', encoding='utf-8') as file:
    full_file_text = file.read()

# System prompt: restricts the summary to the user-supplied context.
prepped_systemPrompt = '''
    You are a helpful assistant that helps summarize information, only use the context given by user.
    Context will be provided via triple quotes.
'''

# User prompt: the whole document wrapped in triple quotes, followed by the instruction.
prepped_userPrompt = '''
    """
        {context}
    """

    Summarize the information provided in the context.
'''.format(context=full_file_text)

aoaiResp = getResponseFromAoAI(
    prepped_systemPrompt,
    prepped_userPrompt
)

print(aoaiResp)

Meta Platforms, Inc. reported its financial results for the first quarter of 2024, ending March 31, 2024. The company saw significant growth, with key highlights including:

- Revenue increased by 27% to $36.455 billion compared to $28.645 billion in Q1 2023.
- Income from operations jumped by 91% to $13.818 billion from $7.227 billion the previous year.
- Net income more than doubled, rising by 117% to $12.369 billion from $5.709 billion in Q1 2023.
- Diluted earnings per share (EPS) also more than doubled, going up by 114% to $4.71 from $2.20.
- Daily active people (DAP) across Meta's family of apps grew by 7% year-over-year to 3.24 billion.
- Ad impressions increased by 20% and the average price per ad rose by 6% year-over-year.
- Total costs and expenses were up by 6% to $22.64 billion.
- Capital expenditures, including principal payments on finance leases, were $6.72 billion.
- The company repurchased $14.64 billion of its Class A common stock and paid $1.27 billion in dividends.


<br />
<br />

<hr />
<h2>Markdown Header chunking strategy for larger files</h2>
<hr />

<br />
<br />

### Chunking the File into Markdown Headers

In [8]:
from langchain_text_splitters import MarkdownHeaderTextSplitter

# Header levels to split on, paired with the metadata key recorded for each level.
headers_to_split_on = [
    ("#", "Header 1"),
    ("##", "Header 2"),
    ("###", "Header 3"),
]

# Reload the Markdown saved by the OCR cell. That file is written with encoding='utf-8',
# so read it back as UTF-8 rather than with the platform default encoding.
md_file_name = target_file_name.replace(".pdf", ".md")
md_file_path = os.path.join(os.path.dirname(os.path.abspath("__file__")), "markdown", md_file_name)
with open(md_file_path, 'r', encoding='utf-8') as file:
    md_data = file.read()

# strip_headers=False keeps each header line inside its chunk's text.
markdown_splitter = MarkdownHeaderTextSplitter(
    headers_to_split_on=headers_to_split_on, strip_headers=False
)
md_header_splits = markdown_splitter.split_text(md_data)

print(md_header_splits)

[Document(page_content='FACEBOOK\n==='), Document(page_content='## Meta Reports First Quarter 2024 Results  \nMENLO PARK, Calif., April 24, 2024 /PRNewswire/ -- Meta Platforms, Inc. (Nasdaq: META) today reported financial results for the quarter ended March 31, 2024.  \n"It\'s been a good start to the year," said Mark Zuckerberg, Meta founder and CEO. "The new version of Meta Al with Llama 3 is another step towards building the world\'s leading Al. We\'re seeing healthy growth across our apps and we continue making steady progress building the metaverse as well."', metadata={'Header 2': 'Meta Reports First Quarter 2024 Results'}), Document(page_content='### First Quarter 2024 Financial Highlights  \n| In millions, except percentages and per share amounts | Three Months Ended March 31, || % Change |\n|| 2024 | 2023 ||\n| - | - | - | - |\n| Revenue | $ 36,455 | $ 28,645 | 27 % |\n| Costs and expenses | 22,637 | 21,418 | 6 % |\n| Income from operations | $ 13,818 | $ 7,227 | 91 % |\n| Ope

### Saving the chunks into json file temporarily

In [9]:
# Report how many chunks the header splitter produced.
print(len(md_header_splits))

# Flatten each chunk into a plain dict so it can be serialised with json.
mdjson = [
    {"metadata": chunk.metadata, "page_content": chunk.page_content}
    for chunk in md_header_splits
]

# Store the chunks as <working dir>/json/<name>.json.
# Note: md_file_name is reused here and now holds the ".json" file name.
md_file_name = target_file_name.replace(".pdf", ".json")
mdjson_file_path = os.path.join(
    os.path.dirname(os.path.abspath("__file__")), "json", md_file_name
)
os.makedirs(os.path.dirname(mdjson_file_path), exist_ok=True)
with open(mdjson_file_path, 'w') as json_file:
    json.dump(mdjson, json_file)

9


### Running the prompts to extract factual information, summaries, and insights

<span style="color:red">This may take a few minutes, depending on the size of the file.</span>

In [10]:

# Load the chunk list saved by the previous cell.
mdjson_file_name = target_file_name.replace(".pdf", ".json")
mdjson_file_path = os.path.join(os.path.dirname(os.path.abspath("__file__")), "json", mdjson_file_name)
with open(mdjson_file_path, 'r', encoding='utf-8') as json_file:
    mdjson_data = json.load(json_file)

# The system prompt is identical for every chunk, so it is built once outside the loop.
# It has no "{context}" placeholder (the chunk text is sent as the user message), so the
# former per-iteration .format(context=...) call did nothing and has been removed.
# The whitespace inside the literal is kept exactly as before.
prepped_systemPrompt = '''
        You are a helpful assistant that helps extract factual information, summerize and give insights from given markdown.
        For factual, only extract information from the given context. DO NOT make any assumptions. Extract as much information as possible.
        For summaries, provide a concise summary of the context given.
        For insights, provide a list of insights from the context given. Focus on business elements, such as key markets, business eviolution history, etc.
        Provide the answer in list format, with header.

        here is example of how to provide the answer in list format:
        ### Factual Information
        - Business Name: Microsoft

        ### Summary
        - Microsoft is a technology company.

        ### Insights
        - Microsoft was founded in 1975.
    '''

# One model response per chunk, collected in order.
chunk_responses = []

for mdjson_item in mdjson_data:

    # The chunk's markdown text is the whole user message.
    mdstr = mdjson_item["page_content"]
    prepped_userPrompt = mdstr

    aoaiResp = getResponseFromAoAI(
        prepped_systemPrompt,
        prepped_userPrompt
    )
    # Strips surrounding whitespace (and a wrapping code fence, if the model added one).
    cleaned_response = clean_json_string(aoaiResp)
    chunk_responses.append(cleaned_response)

# Each response is followed by a blank line, exactly as the previous += loop produced.
listoffacts = "".join(response + "\n\n" for response in chunk_responses)

# save list of facts to txt file
# NOTE: this file is written with the platform default encoding on purpose; the cell
# that reads it back opens it the same way, so the two must be changed together.
mdjson_file_name = target_file_name.replace(".pdf", " - results.txt")
listoffacts_file_path = os.path.join(os.path.dirname(os.path.abspath("__file__")), "txt", mdjson_file_name)
os.makedirs(os.path.dirname(listoffacts_file_path), exist_ok=True)
with open(listoffacts_file_path, 'w') as txt_file:
    txt_file.write(listoffacts)



In [13]:
def group_information_by_section(input_text):
    """Group the lines of a '###'-sectioned text into facts, summary and insights.

    A line whose first non-blank characters are '###' is a section header. Headers
    containing 'Factual Information', 'Summary' or 'Insights' (checked in that
    order) switch the active section; any other '###' header leaves the active
    section unchanged. Header lines themselves are never stored.

    Content lines are added to the active section:
    * bullet lines (first non-blank character is '-') are kept as they are,
      including any leading indentation, so nested bullets are not re-prefixed;
    * other non-blank lines are stored as '- <stripped text>';
    * blank lines, and lines seen before the first recognised header, are dropped.

    Args:
        input_text: Text containing the sections to group.

    Returns:
        A tuple of three lists of strings: (factual_info, summary, insights).
    """
    # Lists holding the lines collected for each section.
    factual_info = []
    summary = []
    insights = []

    # The list currently being filled; None until a recognised header is seen.
    current_section = None

    for line in input_text.strip().split('\n'):
        stripped = line.strip()

        # Compare against the stripped line so indented headers are still recognised
        # instead of being stored as content.
        if stripped.startswith('###'):
            if 'Factual Information' in stripped:
                current_section = factual_info
            elif 'Summary' in stripped:
                current_section = summary
            elif 'Insights' in stripped:
                current_section = insights
            continue

        if current_section is None or not stripped:
            continue

        if stripped.startswith('-'):
            # Already a bullet (possibly indented): keep the line untouched.
            current_section.append(line)
        else:
            # Plain text: turn it into a bullet.
            current_section.append(f'- {stripped}')

    return factual_info, summary, insights
  
# Load the combined per-chunk results and bucket the lines by section.
mdjson_file_name = target_file_name.replace(".pdf", " - results.txt")
listoffacts_file_path = os.path.join(
    os.path.dirname(os.path.abspath("__file__")), "txt", mdjson_file_name
)
with open(listoffacts_file_path, "r") as file:
    input_text = file.read()
factual_info, summary, insights = group_information_by_section(input_text)


# Factual information -> "<name> - factual information.txt"
fi_file_name = target_file_name.replace(".pdf", " - factual information.txt")
fi_file_path = os.path.join(
    os.path.dirname(os.path.abspath("__file__")), "txt", fi_file_name
)
factual_info_str = "### Factual Information\n" + "\n".join(factual_info)
os.makedirs(os.path.dirname(fi_file_path), exist_ok=True)
with open(fi_file_path, 'w') as txt_file:
    txt_file.write(factual_info_str)

# Summaries -> "<name> - summary.txt"
sum_file_name = target_file_name.replace(".pdf", " - summary.txt")
sum_file_path = os.path.join(
    os.path.dirname(os.path.abspath("__file__")), "txt", sum_file_name
)
summary_str = "### Summary\n" + "\n".join(summary)
os.makedirs(os.path.dirname(sum_file_path), exist_ok=True)
with open(sum_file_path, 'w') as txt_file:
    txt_file.write(summary_str)

# Insights -> "<name> - insight.txt"
insig_file_name = target_file_name.replace(".pdf", " - insight.txt")
insig_file_path = os.path.join(
    os.path.dirname(os.path.abspath("__file__")), "txt", insig_file_name
)
insights_str = "### Insights\n" + "\n".join(insights)
os.makedirs(os.path.dirname(insig_file_path), exist_ok=True)
with open(insig_file_path, 'w') as txt_file:
    txt_file.write(insights_str)





### Running overall summary

In [51]:
# Load the per-chunk summaries written by the previous cell.
sum_file_name = target_file_name.replace(".pdf", " - summary.txt")
sum_file_path = os.path.join(os.path.dirname(os.path.abspath("__file__")), "txt", sum_file_name)
with open(sum_file_path, 'r') as file:
    summary_data = file.read()

# The prompt literals keep their original inner whitespace, so the text sent to the
# model is unchanged even though the statements are no longer nested in the `with`.
prepped_systemPrompt = '''
        You are a helpful assistant that helps summarize information, only use the context given by user.
        Be as detailed as possible.
        Context will be provided as list, and wrapped in triple quotes.
    '''
prepped_userPrompt = '''
        """
            {context}
        """

        Summarize the information provided in the context.
    '''.format(context=summary_data)

aoaiResp = getResponseFromAoAI(prepped_systemPrompt, prepped_userPrompt)
print(aoaiResp)

# Save the overall summary. sum_file_name / sum_file_path are reassigned here and
# now point at the "- overall summary.txt" file.
sum_file_name = target_file_name.replace(".pdf", " - overall summary.txt")
sum_file_path = os.path.join(os.path.dirname(os.path.abspath("__file__")), "txt", sum_file_name)
os.makedirs(os.path.dirname(sum_file_path), exist_ok=True)
with open(sum_file_path, 'w') as txt_file:
    txt_file.write(aoaiResp)


### Summary
- The context provided does not contain any content to summarize.
- Meta Platforms, Inc. reported its financial results for the first quarter of 2024, which ended on March 31, 2024. CEO Mark Zuckerberg described the start of the year as good and highlighted the release of Meta AI with Llama 3, indicating it as a step towards becoming the world's leading AI. He also noted healthy growth across Meta's apps and steady progress in building the metaverse.
- The financial highlights for the first quarter of 2024 show significant growth compared to the same period in 2023. Revenue increased by 27%, while income from operations saw a dramatic rise of 91%. Net income more than doubled with a 117% increase, and diluted earnings per share also more than doubled, showing a 114% increase. Operating margin improved from 25% to 38%. The provision for income taxes went up by 14%, and the effective tax rate decreased from 22% to 13%.
- The company reported a significant increase in daily ac

### Simulating similar information using temperature and system prompt

In [None]:
def getResponseFromAoAiWithTemp(systemPrompt, userPrompt, temp):
    """Run one chat completion at a caller-chosen temperature and return the reply text.

    Uses the module-level `client` and `aoai_api_deployment_name`.

    Args:
        systemPrompt: Content of the "system" message.
        userPrompt: Content of the "user" message.
        temp: Sampling temperature; any falsy value (None, 0) results in 0.

    Returns:
        The `message.content` of the first choice in the response.
    """
    completion = client.chat.completions.create(
        # The deployment name you chose when you deployed the GPT-35-Turbo or GPT-4 model.
        model=aoai_api_deployment_name,
        messages=[
            {"role": "system", "content": systemPrompt},
            {"role": "user", "content": userPrompt},
        ],
        # response_format={ "type": "json_object" }, #requires ptu enabled gpt4
        temperature=temp or 0,
    )
    return completion.choices[0].message.content


# Load the overall summary written by the previous cell; it serves as the example
# the model is asked to imitate.
sum_file_name = target_file_name.replace(".pdf", " - overall summary.txt")
overall_sum_file_path = os.path.join(os.path.dirname(os.path.abspath("__file__")), "txt", sum_file_name)
with open(overall_sum_file_path, 'r') as file:
    overall_summary_data = file.read()
print(overall_summary_data)

# The prompt literals keep their original inner whitespace so the text sent to the
# model is unchanged.
prepped_systemPrompt = '''
        You are an assistant that helps generating training information.
        Example will be provided in triple quotes.
        generate a training example similar to the example provided, with new name and dollar amount.
        do not copy the example provided.
    '''
# NOTE(review): the user prompt still ends with "Summarize ..." while the system prompt
# asks for a new training example; confirm which instruction is intended.
prepped_userPrompt = '''
        """
            {context}
        """

        Summarize the information provided in the context.
    '''.format(context=overall_summary_data)

# temp=1 to get more varied output than the temperature-0 helper.
aoaiRespWithTemp = getResponseFromAoAiWithTemp(
    prepped_systemPrompt,
    prepped_userPrompt,
    temp=1
)

print(aoaiRespWithTemp)

# Save the simulated text to its own file. The previous code computed this path but
# then wrote to sum_file_path, overwriting the real overall summary.
simulatied_sum_file_name = target_file_name.replace(".pdf", " - overall summary simulated.txt")
simulated_sum_file_path = os.path.join(os.path.dirname(os.path.abspath("__file__")), "txt", simulatied_sum_file_name)
os.makedirs(os.path.dirname(simulated_sum_file_path), exist_ok=True)
with open(simulated_sum_file_path, 'w') as txt_file:
    txt_file.write(aoaiRespWithTemp)