# Generate a synthetic QnA dataset from an image-heavy PDF

### Split PDF into individual pages

In [1]:
# import fitz
# raw_data_dir = "raw_data"

# file_path = f"{raw_data_dir}/prod-unst-pdf/SM-S91X_UM_UU_Kor_Rev.1.1_231205.pdf"
# # Open the first PDF document
# doc1 = fitz.open(file_path)
# split_pages = [(4, 122), (4, 194)]

# for idx, s in enumerate(split_pages):
#     # Create a new empty PDF document
#     doc2 = fitz.open()

#     # Insert pages s[0]..s[1] of doc1 (0-based, inclusive) into doc2
#     doc2.insert_pdf(doc1, from_page=s[0], to_page=s[1])

#     # Save the modified document
#     doc2.save(f"{raw_data_dir}/prod-unst-pdf/s23-user-manual-part{idx}.pdf")

In [4]:
from dotenv import load_dotenv
import os, shutil, random
from langchain_community.document_loaders.csv_loader import CSVLoader
from util.preprocess import remove_short_sentences, remove_small_images

# Read settings from a local .env file into the process environment.
# NOTE(review): presumably this provides the Azure OpenAI endpoint/key used by
# the AzureChatOpenAI cells below — confirm.
load_dotenv()

raw_data_dir = "raw_data"
image_dir = "./image"

# Always start from an empty image directory: the rendering cell later globs
# every *.jpg in it, so files from an earlier run must not survive.
if os.path.isdir(image_dir):
    shutil.rmtree(image_dir)
os.makedirs(image_dir, exist_ok=True)

### Preprocess PDF file (image part)

In [5]:
import fitz
from glob import glob

# Source PDF; the same path is loaded again by the text-extraction cell.
file_path = f"{raw_data_dir}/prod-unst-pdf/[Sales Guide] 2.CXP Brief_Nightography_(S23)_230201.pdf"

# Margin trimmed from each page edge (in page coordinate units) before rendering.
#clip_x, clip_y = 10, 45
clip_x, clip_y = 10, 10

# Render every page to its own JPEG. The context manager closes the document
# as soon as rendering is finished so the file handle is not leaked.
with fitz.open(file_path) as doc:
    for i, page in enumerate(doc):
        # page.rect unpacks to its two corners (x0, y0, x1, y1) — not
        # (x, y, width, height) — so the margin is added to the first corner
        # and subtracted from the second.
        x0, y0, x1, y1 = page.rect
        clip = fitz.Rect(x0 + clip_x, y0 + clip_y, x1 - clip_x, y1 - clip_y)
        page.set_cropbox(clip)
        pix = page.get_pixmap()
        pix.save(f"{image_dir}/page_{i:03d}.jpg")

# Zero-padded file names make the lexicographic sort match page order.
images = sorted(glob(os.path.join(image_dir, "*.jpg")))

# Completion-token cap handed to the image-summary LLM in the next cell.
max_tokens = 1024

In [6]:
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate

from langchain_openai import AzureChatOpenAI
# Chat model that receives the rendered page images (Azure deployment "gpt-4o").
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    openai_api_version="2024-05-01-preview",
    max_tokens=max_tokens,  # cap on generated tokens per response
    temperature=0,
)

system_prompt = "You are an assistant tasked with describing table or image, specialized in Smartphone product."
system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)

# Multimodal human turn: one inline image followed by the text instruction.
# {image_base64} is the only template variable of the whole prompt.
human_prompt = [
    {
        "type": "image_url",
        "image_url": {
            # The page renders are saved as .jpg files, so the data URL declares
            # image/jpeg rather than image/png.
            # NOTE(review): assumes encode_image_base64 returns the file's bytes
            # unchanged (no re-encoding to another format) — confirm.
            "url": "data:image/jpeg;base64,{image_base64}",
        },
    },
    {
        "type": "text",
        "text": '''Given image, give a concise summary in Korean. Don't insert any XML tag such as <text> and </text> when answering.'''
    },
]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

prompt = ChatPromptTemplate.from_messages(
    [
        system_message_template,
        human_message_template
    ]
)

# prompt -> chat model -> plain string (the summary text).
summarize_chain = prompt | llm | StrOutputParser()
#summarize_chain = {"image_base64": lambda x:x} | prompt | llm_text | StrOutputParser()

In [7]:
%%time
from util.preprocess import encode_image_base64
#images = glob(os.path.join(image_dir, "*.jpg"))
# Base64-encode every rendered page, keeping the (sorted) order of `images`.
base64_images = [encode_image_base64(page_image) for page_image in images]

# Run the summary chain over all pages, at most 8 requests in flight at once.
summary_batch_config = {"max_concurrency": 8}
raw_summaries = summarize_chain.batch(base64_images, config=summary_batch_config)

# Post-filter with the project helper.
# NOTE(review): presumably drops summaries too short to be useful — confirm
# against util.preprocess.remove_short_sentences.
image_summaries = remove_short_sentences(raw_summaries)

CPU times: user 82.1 ms, sys: 32 ms, total: 114 ms
Wall time: 4.81 s


In [8]:
# Preview the first three image summaries as a quick sanity check.
image_summaries[:3]

['이미지는 스마트폰 제품의 변경 사항을 기록한 표입니다. 주요 변경 사항은 다음과 같습니다:\n- 2022년 12월 9일: 첫 번째 버전 업로드 (지속적인 업데이트 예정)\n- 2023년 1월 20일: #6 텍스트 업데이트 (2x Wider ±3° OIS -> 2x Wider OIS)\n- 2023년 2월 1일: 제품 이름 및 로고 업데이트',
 '삼성 갤럭시 S23 시리즈의 광고 이미지입니다. "나이토그래피"라는 문구와 함께 "최고의 야간 카메라"라는 문구가 포함되어 있습니다.',
 '공유 가능한 멋진 야간 사진을 찍어보세요. S23 시리즈로 밤을 밝히며, 당신의 세계에서 "비공식 사진작가"가 되어보세요.']

### Preprocess PDF file (text part)

In [15]:
from langchain_community.document_loaders import PyMuPDFLoader

# Load the same PDF that was rendered to images, this time for its text.
loader = PyMuPDFLoader(file_path=file_path)
docs = loader.load()

In [16]:
import re
import tiktoken
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter

# Tokenizer for the "o200k_base" encoding; the text splitter measures chunk
# sizes with it through tiktoken_len.
tokenizer = tiktoken.get_encoding('o200k_base')


def tiktoken_len(text):
    """Return the number of tokens `text` encodes to with the module-level `tokenizer`.

    `disallowed_special=()` disables the special-token check, so text that
    happens to contain a special-token string is encoded as ordinary text
    instead of raising.
    """
    return len(tokenizer.encode(text, disallowed_special=()))

# Candidate split boundaries, passed to the splitter in this order.
# The trailing "" is the last-resort separator.
chunk_separators = [
    "\n\n",
    "\n",
    " ",
    ".",
    ",",
    "\u200b",  # Zero-width space
    "\uff0c",  # Fullwidth comma
    "\u3001",  # Ideographic comma
    "\uff0e",  # Fullwidth full stop
    "\u3002",  # Ideographic full stop
    "",
]

# Lengths are measured in tokens (tiktoken_len), not characters:
# chunk_size 1024 with an overlap of 100.
text_splitter = RecursiveCharacterTextSplitter(
    separators=chunk_separators,
    length_function=tiktoken_len,
    chunk_size=1024,
    chunk_overlap=100,
)

# Alternative kept for reference: split the loaded documents directly.
# split_docs = text_splitter.split_documents(docs)
# print(f'Number of splitted docs: {len(split_docs)}')

# Collapse runs of spaces inside each page, then join all pages into a single
# string (blank line between pages) so the splitter sees one continuous text.
multi_space = re.compile(' +')
page_texts = [multi_space.sub(' ', page.page_content) for page in docs]
joined_docs = '\n\n'.join(page_texts)

split_docs = text_splitter.split_text(joined_docs)
print(f'Number of splitted docs: {len(split_docs)}')


Number of splitted docs: 1


In [17]:
# Preview up to the first five text chunks.
split_docs[:5]

['1st version upload (Subject to continuous updates)\n2022.12.09\n#6 Text updated (2x Wider ±3° OIS → 2x Wider OIS)\n2023.01.20\nProduct name and logo updated\n2023.02.01\nTRAINING USE ONLY\nTRAINING USE ONLY\nRetail | Training\n\n\nTRAINING USE ONLY\nTRAINING USE ONLY\nRetail | Training\n\n\nTRAINING USE ONLY\nGET SHAREABLE, EPIC SHOTS. \nBE THE “UNOFFICIAL PHOTOGRAPHER”\nIN YOUR WORLD.\nBRIGHTEN UP YOUR NIGHTS\nWITH THE S23 SERIES.\n※ “Nightography” is the word that represents the Galaxy’s night \nshooting experience and brilliant results, not a specific mode name.\n“Can you send me that?”\n“Send me too!”\n“I want to post it!”\nRetail | Training\n\n\nTRAINING USE ONLY\nAI Stereo Depth with \nDual Pixel AF\nImpressive portrait selfies with \nprecise, fast focusing (Dual Pixel \nAF) and stunning bokeh effects \n(AI Stereo Depth)\nAI Object-aware Engine\nAuthentic colors\nand textures\nNights are the \nbest time for selfies\nRetail | Training\n\n\nTRAINING USE ONLY\nRecord Saturday nigh

### Construct QnA Pairs


In [18]:
from langchain_openai import AzureChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import JsonOutputParser
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from util.qa_pair import get_qna_prompt_template, QAPair

# Chat model used to generate the QnA pairs. This rebinds `llm`; the earlier
# summarize_chain keeps the model object it was built with.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    openai_api_version="2024-05-01-preview",
    max_tokens=1024,  # cap on generated tokens per response
    temperature=0,
)

# QnA prompt from the project helper. This rebinds `prompt`, which previously
# held the image-summary prompt.
prompt = get_qna_prompt_template()
#prompt = get_qna_repair_cost_prompt_template()

# Parse the model's reply as JSON, with QAPair supplied as the pydantic schema.
parser = JsonOutputParser(pydantic_object=QAPair)

# prompt -> chat model -> parsed JSON.
chain = prompt | llm | parser

In [19]:
# Generation settings shared by every context, named once instead of being
# repeated in two copy-pasted loops.
qna_domain = "Samsung Galaxy S series Smartphone"
questions_per_context = "3"

# One prompt input per context: all text chunks first, then all image
# summaries, in their existing order.
input_batch = [
    {"context": context, "domain": qna_domain, "num_questions": questions_per_context}
    for context in [*split_docs, *image_summaries]
]


In [20]:
%%time
# Generate QnA pairs for every context, at most 8 requests in flight at once.
qa_pair = chain.batch(input_batch, config={"max_concurrency": 8})

### Save to jsonl for fine-tuning

In [85]:
import json
from util.common_utils import convert_to_oai_format, save_jsonl

# Directory that receives the generated dataset; created if missing.
output_dir = './dataset'
os.makedirs(output_dir, exist_ok=True)

# System message passed to convert_to_oai_format for the fine-tuning records.
# NOTE(review): the prompt mentions a "user manual" while the PDF processed in
# this notebook is a sales guide — confirm the wording is intended.
system_prompt_msg = (
    "You are an AI assistant that is familiar with the details of the user manual for your Galaxy mobile phone.\n"
    "Please answer the questions accurately. If the question is in Korean, write your answer in Korean. If the question is in English, write your answer in English.\n"
)
# Alternative system message (self-repair guide), kept for reference:
# system_prompt_msg = """You are an AI assistant that guides users on how to self-repair their Galaxy series mobile phones.
# Please answer the questions accurately. If the question is in Korean, write your answer in Korean. If the question is in English, write your answer in English.
# """

# NOTE(review): the file name says "s24" while the source PDF name contains
# "S23" — confirm the intended name.
save_filename = "cs-prod-qna-handling2-s24"

# Convert the generated pairs with the project helper, passing the system
# message along (presumably producing OpenAI chat fine-tuning records — confirm
# against util.common_utils.convert_to_oai_format).
oai_qa_pair = convert_to_oai_format(qa_pair, system_prompt_msg=system_prompt_msg)

# Only the converted records are written; the raw-pair dump stays disabled.
#save_jsonl(qa_pair, f"{output_dir}/{save_filename}.jsonl")
oai_output_path = f"{output_dir}/{save_filename}-oai.jsonl"
save_jsonl(oai_qa_pair, oai_output_path)