# Generate QnA synthetic dataset from a Complex PDF using Azure AI Document Intelligence

### Overview
We process the PDF by dividing it into three parts.

- **Text-heavy** - Text-heavy PDF can be processed with open source without the need to use toolkits like Azure AI Document Intelligence or Unstructured.
- **Image-heavy** - Image-heavy PDF can be converted the entire page to images and let a multimodal LLM like GPT-4o summarize each page.
- **Mixed** - After reading the document with Azure AI Document Intelligence, we replace the image descriptions inside the figure tags with text summarized by a multimodal LLM. (Often the image descriptions are blank or have only a short caption.)

## 1. Read & Preprocess PDF file
---

### Split the PDFs into individual pages
Only use a poration of the PDF documents for testing

In [1]:

import os, shutil, random
import openai
from dotenv import load_dotenv
from langchain_community.document_loaders.csv_loader import CSVLoader

load_dotenv()

raw_data_dir = "raw_data"
splitted_raw_data_dir = f"splitted_{raw_data_dir}"

#file_path = f"{raw_data_dir}/prod-unst-pdf/[Sales Talk] 3. QnA3_Handling Objection_(S24)_240227.pdf"
file_path = f"{raw_data_dir}/prod-unst-pdf/SM-S92X_UG_UU_Kor_Rev.1.1_240129.pdf"


In [2]:
import fitz

# Open the first PDF document
doc1 = fitz.open(file_path)
split_pages = [(1, 15)]

for idx, s in enumerate(split_pages):
    # Create a new empty PDF document
    doc2 = fitz.open()

    # Insert the first 2 pages of doc1 into doc2
    doc2.insert_pdf(doc1, from_page=s[0], to_page=s[1])

    # Save the modified document
    doc2.save(f"{raw_data_dir}/prod-unst-pdf/s24-user-manual-part{idx}.pdf")

In [3]:
from util.common_utils import delete_folder_and_make_folder
from util.preprocess import remove_short_sentences, remove_small_images, analyze_pdf_page_content, split_pdf

file_path = f"{raw_data_dir}/prod-unst-pdf/s24-user-manual-part0.pdf"
result = analyze_pdf_page_content(file_path)
delete_folder_and_make_folder(splitted_raw_data_dir)    

print("### PDF Content Analysis Result:")
for content_type, pages in result.items():
    print(f"{content_type} pages: {pages}")
    split_pdf(file_path, f"{splitted_raw_data_dir}/{content_type}.pdf", pages)

### PDF Content Analysis Result:
Text pages: [0, 1, 8, 10, 12, 13]
Mixed pages: [2, 3, 4, 5, 6, 7, 11, 14]
Image pages: [9]


In [4]:
import os
from azure.core.credentials import AzureKeyCredential
from azure.ai.documentintelligence import DocumentIntelligenceClient
from azure.ai.documentintelligence.models import ContentFormat
from openai import AzureOpenAI

doc_intelligence_endpoint = os.getenv("AZURE_DOC_INTELLIGENCE_ENDPOINT")
doc_intelligence_key = os.getenv("AZURE_DOC_INTELLIGENCE_KEY")

document_intelligence_client = DocumentIntelligenceClient(
    endpoint=doc_intelligence_endpoint, 
    credential=AzureKeyCredential(doc_intelligence_key),
    headers={"x-ms-useragent":"sample-code-figure-understanding/1.0.0"},
)

aoai_api_endpoint = os.getenv("AZURE_OPENAI_ENDPOINT")
aoai_api_key = os.getenv("AZURE_OPENAI_API_KEY")
aoai_api_version = os.getenv("AZURE_OPENAI_API_VERSION")
aoai_deployment_name = os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME")

client = AzureOpenAI(
    api_key=aoai_api_key,  
    api_version=aoai_api_version,
    base_url=f"{aoai_api_endpoint}/openai/deployments/{aoai_deployment_name}",
    max_retries=1
)

### Case 1: Mixed page (Images and text mixed appropriately)
After reading the document with Azure AI Document Intelligence, we replace the image descriptions inside the figure tags with text summarized by a multimodal LLM. (Often the image descriptions are blank or have only a short caption.)


#### Analyze Document

In [5]:
pdf_mixed_path = f"{splitted_raw_data_dir}/Mixed.pdf"

with open(pdf_mixed_path, "rb") as f:
    poller = document_intelligence_client.begin_analyze_document(
        "prebuilt-layout", analyze_request=f, content_type="application/octet-stream", 
        output_content_format=ContentFormat.MARKDOWN 
    )

result = poller.result()
md_content = result.content

#### Updates the content of the figure description (empty content or caption) with the image summary text generated by gpt-4o.

In [6]:
%%time
from util.preprocess import (
    image_complexity, is_bounding_box_larger_than, crop_image_from_file, 
    understand_image_with_gpt, update_figure_description
)
output_folder = "pdf_mixed_tmp"
delete_folder_and_make_folder(output_folder)
language = "Korean"
max_tokens = 1024
input_file_path = file_path

if result.figures:
    print("Figures:")
    for idx, figure in enumerate(result.figures):
        figure_content = ""
        img_description = ""
        #print(f"Figure #{idx} has the following spans: {figure.spans}")
        
        for i, span in enumerate(figure.spans):
            #print(f"Span #{i}: {span}")
            figure_content += md_content[span.offset:span.offset + span.length]
        #print(f"Original figure content in markdown: {figure_content}")

        # Note: figure bounding regions currently contain both the bounding region of figure caption and figure body
        if figure.caption:
            caption_region = figure.caption.bounding_regions
            #print(f"\tCaption: {figure.caption.content}")
            #print(f"\tCaption bounding region: {caption_region}")
            for region in figure.bounding_regions:
                if region not in caption_region:
                    #print(f"\tFigure body bounding regions: {region}")
                    # To learn more about bounding regions, see https://aka.ms/bounding-region
                    boundingbox = (
                            region.polygon[0],  # x0 (left)
                            region.polygon[1],  # y0 (top)
                            region.polygon[4],  # x1 (right)
                            region.polygon[5]   # y1 (bottom)
                        )

                    if is_bounding_box_larger_than(boundingbox):
                        #print(f"\tFigure body bounding box in (x0, y0, x1, y1): {boundingbox}")
                        cropped_image = crop_image_from_file(input_file_path, region.page_number - 1, boundingbox) # page_number is 1-indexed

                        if image_complexity(cropped_image)[0] == "Complex":
                            # Get the base name of the file
                            base_name = os.path.basename(input_file_path)
                            # Remove the file extension
                            file_name_without_extension = os.path.splitext(base_name)[0]

                            output_file = f"{file_name_without_extension}_cropped_image_{idx}.png"
                            cropped_image_filename = os.path.join(output_folder, output_file)

                            cropped_image.save(cropped_image_filename)
                            print(f"\tFigure {idx} cropped and saved as {cropped_image_filename}")

                            try: 
                                image_summarization = understand_image_with_gpt(client, aoai_deployment_name, cropped_image_filename, "", max_tokens=max_tokens, language=language)
                            except openai.BadRequestError as e:
                                print(f"BadRequestError: {e}")
                                image_summarization = ""
                            img_description += image_summarization

                            print(f"\tDescription of figure {idx}: {img_description}")
                        else:
                            print(f'simple image at idx {idx}')

        else:
            #print("\tNo caption found for this figure.")
            for region in figure.bounding_regions:
                #print(f"\tFigure body bounding regions: {region}")
                # To learn more about bounding regions, see https://aka.ms/bounding-region
                boundingbox = (
                        region.polygon[0],  # x0 (left)
                        region.polygon[1],  # y0 (top
                        region.polygon[4],  # x1 (right)
                        region.polygon[5]   # y1 (bottom)
                    )

                if is_bounding_box_larger_than(boundingbox):                    
                    #print(f"\tFigure body bounding box in (x0, y0, x1, y1): {boundingbox}")

                    cropped_image = crop_image_from_file(input_file_path, region.page_number - 1, boundingbox) # page_number is 1-indexed

                    if image_complexity(cropped_image)[0] == "Complex":
                        # Get the base name of the file
                        base_name = os.path.basename(input_file_path)
                        # Remove the file extension
                        file_name_without_extension = os.path.splitext(base_name)[0]

                        output_file = f"{file_name_without_extension}_cropped_image_{idx}.png"
                        cropped_image_filename = os.path.join(output_folder, output_file)
                        # cropped_image_filename = f"data/cropped/image_{idx}.png"
                        cropped_image.save(cropped_image_filename)
                        #print(f"\tFigure {idx} cropped and saved as {cropped_image_filename}")

                        try:
                            image_summarization = understand_image_with_gpt(client, aoai_deployment_name, cropped_image_filename, "", max_tokens=max_tokens, language=language)
                        except openai.BadRequestError as e:
                            print(f"BadRequestError: {e}")
                            image_summarization = ""
                        img_description += image_summarization
                        print(f"\tDescription of figure {idx}: {img_description}")
                    else:
                        print(f'simple image at idx {idx}')

        
        md_content = update_figure_description(md_content, img_description, idx)

Figures:
4.7014 5.3766
	Description of figure 0: 이미지에는 여러 기능과 애플리케이션 목록이 표시되어 있습니다. 항목에는 Samsung Notes, Samsung Members, Samsung Kids, Samsung Global Goals, Samsung Find, 삼성닷컴, Galaxy Wearable, PENUP, 캘린더, 리마인더, 라디오, 음성 녹음, 내 파일, 시계, 계산기, Gaming Hub, 게임 부스터, SmartThings, 콘텐츠 공유하기, Music Share, Smart View, Wi-Fi 다이렉트 등이 있습니다.

이중 몇몇 항목들은 다음과 같이 한국어와 영어로 병기되어 있습니다:
- Samsung Notes(삼성 노트)
- 삼성메디아(Samsung Members)
- Samsung Kids(삼성 키즈)
- 삼성 글로벌 목표들(Samsung Global Goals)
- Samsung Find(삼성 파인드)

마지막으로, 이미지 하단에는 "보안 접근성"이라는 제목과 함께 "보안 접근성" 기능을 위한 정보가 나열되어 있습니다.
5.693300000000001 1.6506000000000007
	Description of figure 1: 이 이미지에는 텍스트가 나열되어 있습니다. 텍스트는 다음과 같습니다:

1. Google(구글) 앱
2. 이동통신 사업자 앱
3. 인터넷
4. Samsung Pay(삼성 페이)

텍스트는 주로 영어와 한국어로 혼합되어 있습니다.
4.5553 5.4625
	Description of figure 2: 이 이미지에는 한국어로 작성된 목록이 보입니다. 일부 항목은 페이지 번호와 함께 나와 있으며, 이는 아마도 책이나 메뉴얼의 목차로 보입니다. 

목차의 일부 내용은 다음과 같습니다:

- 절약 모드
- 네트워크 설정
- 듀얼 메신저
- 디지털 웰빙 및 자녀 보호 기능
- 디바이스 케어
- 애플리케이션
- 접근성
- 소프트웨어 업데이트
- 휴대전화 정보
- 이동통신 사업자

In [7]:
# from IPython.display import display, Markdown, Latex
# display(Markdown(md_content[:200]))

Generate chunks for mixed pages

In [8]:
from langchain.text_splitter import RecursiveCharacterTextSplitter
import re

text_splitter = RecursiveCharacterTextSplitter(
    separators=[
        r'<!-- PageNumber="\d+" -->',
        r"\n\n",
        r"\n",
        " ",
        ".",
        "",
    ],   
    is_separator_regex = True,    
    chunk_size=2000,
    chunk_overlap=200,
)

mixed_chunks = text_splitter.split_text(md_content)
print("Length of splits (mixed case): " + str(len(mixed_chunks)))

Length of splits (mixed case): 3


### Case 2: Text-heavy
Text-heavy PDFs can be processed with open source without the need to use toolkits like Azure AI Document Intelligence or Unstructured.

In [9]:
from langchain_community.document_loaders.pdf import PyMuPDFLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

pdf_text_path = f"{splitted_raw_data_dir}/Text.pdf"
loader = PyMuPDFLoader(pdf_text_path)
documents = loader.load()

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1200, 
    chunk_overlap=200
)

text_chunks = text_splitter.split_documents(documents)

for idx, chunk in enumerate(text_chunks):
    print(f"Chunk {idx}\n{chunk}")
    print("="*80)
    if idx == 2:
        break

text_chunks = [d.page_content for d in text_chunks]
print("Length of splits (text-heay case): " + str(len(text_chunks)))

Chunk 0
page_content='2\n시작하기\n4\t\n각 부분의 이름과 역할\n11\t\n배터리 충전하기\n15\t\nNano-SIM 카드 및 eSIM\n19\t\n전원 켜기/끄기\n20\t\n제품 초기 설정\n20\t\n삼성 계정\n20\t\n이전 기기의 데이터 가져오기(Smart Switch)\n22\t\n화면 알아두기\n30\t\n알림창\n31\t\n화면 캡처 및 화면 녹화\n33\t\n문자 입력\n35\t\n텍스트 추출\n앱과 기능 알아보기\n36\t\n앱 설치 및 관리\n37\t\nS펜(Galaxy S24 Ultra)\n50\t\n전화\n54\t\n연락처\n56\t\n메시지\n58\t\n카메라\n74\t\n갤러리\n79\t\nAR 존\n84\t\n빅스비\n85\t\n빅스비 비전\n86\t\n멀티윈도우(여러 앱 동시에 사용하기)\n89\t\n삼성 인터넷\n90\t\nSamsung Pay(삼성 페이)\n차례\n93\t\n삼성 헬스\n94\t\nSamsung Notes(삼성 노트)\n98\t\nSamsung Members(삼성 멤버스)\n99\t\nSamsung Kids(삼성 키즈)\n99\t\nSamsung Global Goals(삼성 글로벌 골)\n99\t\nSamsung Find(삼성 파인드)\n100\t 삼성닷컴\n100\t Galaxy Wearable(갤럭시 웨어러블)\n100\t PENUP(펜업)(Galaxy S24 Ultra)\n100\t 캘린더\n101\t\n리마인더(할 일 알림 받기)\n102\t 라디오\n103\t 음성 녹음\n104\t 내 파일(파일 확인 및 관리하기)\n104\t 시계\n105\t 계산기\n105\t Gaming Hub(게이밍 허브)\n106\t 게임 부스터(게임 환경 설정하기)\n107\t\nSmartThings(스마트싱스)\n108\t 콘텐츠(파일) 공유하기\n109\t Music Share(뮤직 쉐어)\n110\t\nSmart View(스마트 뷰)(TV 화면으로 보기)\n111\t\nWindows와 연결

### Case 3: Image-heavy
Image-heavy PDF can be converted the entire page to images and let a multimodal LLM like GPT-4o summarize each page.

### Preprocess Image

In [10]:
image_dir = "./pdf_image_tmp"
delete_folder_and_make_folder(image_dir) 

In [11]:
import fitz
from glob import glob

pdf_image_path = f"{splitted_raw_data_dir}/Image.pdf"
doc = fitz.open(pdf_image_path)
#clip_x, clip_y = 10, 45
clip_x, clip_y = 10, 10

for i, page in enumerate(doc):
    x, y, w, h = page.rect
    clip = fitz.Rect(x+clip_x, y+clip_y, w-clip_x, h-clip_y)
    page.set_cropbox(clip)
    pix = page.get_pixmap()
    pix.save(f"{image_dir}/page_{i:03d}.jpg")

images = sorted(glob(os.path.join(image_dir, "*.jpg")))

In [12]:
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI

max_tokens = 1024
llm = AzureChatOpenAI(
    temperature=0, 
    max_tokens=max_tokens,
    openai_api_version="2024-05-01-preview",
    azure_deployment="gpt-4o"                       
)

system_prompt = "You are an assistant tasked with describing table or image, specialized in Smartphone product."
system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)
human_prompt = [
    {
        "type": "image_url",
        "image_url": {
            "url": "data:image/png;base64," + "{image_base64}",
        },
    },
    {
        "type": "text",
        "text": '''Given image, give a concise summary in Korean. Don't insert any XML tag such as <text> and </text> when answering.'''
    },
]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

prompt = ChatPromptTemplate.from_messages(
    [
        system_message_template,
        human_message_template
    ]
)

summarize_chain = prompt | llm | StrOutputParser()
#summarize_chain = {"image_base64": lambda x:x} | prompt | llm_text | StrOutputParser()

In [13]:
%%time
from util.preprocess import encode_image_base64
#images = glob(os.path.join(image_path, "*.jpg"))
base64_images = [encode_image_base64(img_path) for img_path in images]
image_summaries = summarize_chain.batch(base64_images, {"max_concurrency": 8})
image_summaries = remove_short_sentences(image_summaries)


CPU times: user 25.1 ms, sys: 16.9 ms, total: 42 ms
Wall time: 4.42 s


In [14]:
print("Length of image_summaries (image-heavy case): " + str(len(image_summaries)))

Length of image_summaries (image-heavy case): 1


## 2. Construct QnA Pairs
----

### Option 1. 
Leverage the azure-ai-generative package. The QADataGenerator class in this package makes it easy to generate QnA synthetic questions. However, using this class as is has the disadvantage of not being able to use custom prompts, so we inherited from it and created the CustomQADataGenerator class.


In [15]:
from util.qa import CustomQADataGenerator
model_config = {
    "deployment": os.getenv("AZURE_OPENAI_DEPLOYMENT_NAME"),
    "model": "gpt-4o",
    "max_tokens": 2000,
}

qa_generator = CustomQADataGenerator(model_config=model_config, templates_dir="./prompt_template/ko")

In [16]:
import asyncio
from collections import Counter
from typing import Dict
import os
from azure.ai.generative.synthetic.qa import QAType
concurrency = 6  # number of concurrent calls
sem = asyncio.Semaphore(concurrency)

#qa_type = QAType.CONVERSATION
qa_type = QAType.LONG_ANSWER

async def generate_async(text: str) -> Dict:
    async with sem:
        return await qa_generator.generate_async(
            text=text,
            qa_type=qa_type,
            num_questions=3,  # Number of questions to generate per text
        )

In [17]:
input_batch = mixed_chunks + text_chunks + image_summaries
results = await asyncio.gather(*[generate_async(text) for text in input_batch], return_exceptions=True)

question_answer_list = []
token_usage = Counter()
for result in results:
    if isinstance(result, Exception):
        raise result  # exception raised inside generate_async()
    question_answer_list.append(result["question_answers"])
    token_usage += result["token_usage"]

print("Successfully generated QAs")

Successfully generated QAs


In [24]:
question_answer_list[0]

[('Galaxy S24 Ultra의 이미지에는 어떤 기능과 애플리케이션 목록이 표시되어 있습니까?',
  'Galaxy S24 Ultra의 이미지에는 Samsung Notes, Samsung Members, Samsung Kids, Samsung Global Goals, Samsung Find, 삼성닷컴, Galaxy Wearable, PENUP, 캘린더, 리마인더, 라디오, 음성 녹음, 내 파일, 시계, 계산기, Gaming Hub, 게임 부스터, SmartThings, 콘텐츠 공유하기, Music Share, Smart View, Wi-Fi 다이렉트 등의 기능과 애플리케이션 목록이 표시되어 있습니다.\n'),
 ('Galaxy S24+의 다이어그램에는 어떤 부품들이 표시되어 있습니까?',
  'Galaxy S24+의 다이어그램에는 수화부/스피커, 전면 카메라, 근접/조도 센서, 화면, 마이크, 에어 벤트 홈, 음량 버튼, 측면 버튼, 지문 인식 센서 등의 부품들이 표시되어 있습니다. 이 설명은 스마트폰 부품들의 위치와 역할을 시각적으로 안내합니다.\n'),
 ('"제품 사용 시 알아두기" 항목에는 어떤 내용이 포함되어 있습니까?',
  '"제품 사용 시 알아두기" 항목에는 제품 사용 시 주의 사항, 구성품 및 별매품 안내 사항, 제품 방수 및 방진 기능 주의사항, 제품에서 열이 발생하는 경우, 개인정보 및 제품 분실 시 대비 등의 내용이 포함되어 있습니다.')]

### Option 2. 
You write the entire sequence of code to create a QnA dataset without using a separate toolkit. 

In [19]:
from langchain_openai import AzureChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import JsonOutputParser
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from util.qa_pair import get_qna_prompt_template, QAPair

llm = AzureChatOpenAI(
    temperature=0, 
    max_tokens=1024,
    openai_api_version=aoai_api_version,
    azure_deployment=aoai_deployment_name                    
)

parser = JsonOutputParser(pydantic_object=QAPair)
prompt = get_qna_prompt_template()
#prompt = get_qna_repair_cost_prompt_template()
chain = prompt | llm | parser

In [20]:
input_batch = []

for doc in mixed_chunks:
    dic = {"context": doc, "domain": "Samsung Galaxy S series Smartphone, especially S23 and S24 series", "num_questions": "3"}
    input_batch.append(dic)

for doc in text_chunks:
    dic = {"context": doc, "domain": "Samsung Galaxy S series Smartphone, especially S23 and S24 series", "num_questions": "3"}
    input_batch.append(dic)

for doc in image_summaries:
    dic = {"context": doc, "domain": "Samsung Galaxy S series Smartphone, especially S23 and S24 series", "num_questions": "3"}
    input_batch.append(dic)        


In [21]:
%%time
#input_query = {"context": md_content, "domain": "Samsung Galaxy S series Smartphone", "num_questions": "3"}
qa_pair = chain.batch(input_batch, {"max_concurrency": 5})

CPU times: user 137 ms, sys: 17.3 ms, total: 154 ms
Wall time: 12.6 s


## 3. Save to jsonl for fine-tuning
---

In [25]:
save_jsonl(qa_pair, f"{output_dir}/{save_filename}.jsonl")
dd

In [22]:
import json
from util.common_utils import convert_to_oai_format, save_jsonl

output_dir = './dataset'
os.makedirs(output_dir, exist_ok=True)

system_prompt_msg = """You are an AI assistant that provides guidance to help users self-service resolve abnormalities in their Galaxy mobile phone.\n
Please answer the questions accurately. If the question is in Korean, write your answer in Korean. If the question is in English, write your answer in English."""

save_filename = "cs-self-solve"
oai_qa_pair = convert_to_oai_format(question_answer_list)

#save_jsonl(qa_pair, f"{output_dir}/{save_filename}.jsonl")
save_jsonl(oai_qa_pair, f"{output_dir}/{save_filename}-oai.jsonl")

### Clean up

In [23]:
!rm -rf {splitted_raw_data_dir} pdf_image_tmp pdf_mixed_tmp outputs_tmp images