# Generate QnA synthetic dataset from multiple PDFs - Image-heavy PDF 

In [1]:
import time
from dotenv import load_dotenv
import os, shutil, random
from langchain_community.document_loaders.csv_loader import CSVLoader
from util.preprocess import convert_html_to_md, remove_short_sentences, remove_small_images
import glob
import pandas as pd
# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Azure OpenAI endpoint/key that
# AzureChatOpenAI reads later in the notebook — confirm against the .env file.
load_dotenv()

# Root folder of the raw input data.
raw_data_dir = "raw_data"
# Folder that is searched for the sales PDFs (the name says "csv", but the
# pattern below only matches *.pdf files).
csv_path = f"{raw_data_dir}/prod-unst-pdf"
# File names start with a literal "[" (e.g. "[Sales Guide] ..."). In glob
# syntax "[" opens a character class, so the literal bracket is written as
# "[[]" instead of relying on how an unterminated "[" happens to be handled.
# glob returns paths in arbitrary order; sort them so that the subset picked
# by the following cell is the same on every run and machine.
all_files = sorted(glob.glob(os.path.join(csv_path, "[[]Sales*.pdf")))

In [3]:
# Keep only the first two discovered PDFs (presumably to keep this run short —
# remove the slice to process every file).
all_files = all_files[:2]

In [4]:
# Display the selected PDF paths as the cell output.
all_files

['raw_data/prod-unst-pdf/[Sales Guide] 1.Workshop Deck_(S23)_230201.pdf',
 'raw_data/prod-unst-pdf/[Sales Talk] 4. Why Galaxy_(S24)_240116.pdf']

In [6]:
import json
import fitz
from glob import glob
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI

from util.preprocess import encode_image_base64
from langchain_openai import AzureChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import JsonOutputParser
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler
from util.qa_pair import get_qna_prompt_template, QAPair
from util.common_utils import convert_to_oai_format, save_jsonl

# Upper bound on the number of tokens the model may generate per response.
max_tokens = 1024

# Chat client for the "gpt-4o" Azure OpenAI deployment, used by the chains
# built later in the notebook.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    openai_api_version="2024-05-01-preview",
    max_tokens=max_tokens,
    temperature=0,
)

# System role: instructs the model to describe tables/images of smartphone material.
system_prompt = "You are an assistant tasked with describing table or image, specialized in Smartphone product."

# Human message = one image part followed by one text instruction.
# "{image_base64}" is the template variable filled with the encoded page image.
_image_part = {
    "type": "image_url",
    "image_url": {"url": "data:image/png;base64,{image_base64}"},
}
_instruction_part = {
    "type": "text",
    "text": "Given image, give a concise summary in Korean. Don't insert any XML tag such as <text> and </text> when answering.",
}
human_prompt = [_image_part, _instruction_part]

system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

# Full chat prompt: system instruction first, then the image + text request.
prompt = ChatPromptTemplate.from_messages([system_message_template, human_message_template])

## 2. Preprocess each PDF file 
---

In [7]:
# Pipeline executed for every PDF in `all_files`:
#   1. render each page (minus the cover) to an image in a scratch directory,
#   2. ask the LLM for a Korean summary of every page image,
#   3. ask the LLM for QA pairs based on each summary,
#   4. save the QA pairs as "<idx>-oai.jsonl" in `output_dir`.

# Bind the glob *function* under a private name. Elsewhere in this notebook
# the name `glob` is bound both to the module (`import glob`) and to the
# function (`from glob import glob`), so calling `glob(...)` here would fail
# whenever the cell with `import glob` was executed last.
from glob import glob as _list_files

# ---- Loop-invariant setup (built once instead of once per PDF) ------------
image_path = "./image"        # scratch directory for the rendered page images
output_dir = './dataset_tmp'  # receives one jsonl file per processed PDF
os.makedirs(output_dir, exist_ok=True)

# Margin trimmed from each page edge before rendering.
#clip_x, clip_y = 10, 45
clip_x, clip_y = 10, 10

# Prompt for the image-summary step: system role + (image, instruction).
# NOTE(review): pages are written as .jpg below while the data URL declares
# image/png — confirm what encode_image_base64 returns and align the MIME type.
summary_prompt = ChatPromptTemplate.from_messages(
    [
        SystemMessagePromptTemplate.from_template(
            "You are an assistant tasked with describing table or image, specialized in Smartphone product."
        ),
        HumanMessagePromptTemplate.from_template(
            [
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "data:image/png;base64," + "{image_base64}",
                    },
                },
                {
                    "type": "text",
                    "text": '''Given image, give a concise summary in Korean. Don't insert any XML tag such as <text> and </text> when answering.'''
                },
            ]
        ),
    ]
)
summarize_chain = summary_prompt | llm | StrOutputParser()

# Chain for the QA-generation step; output is parsed as JSON (schema: QAPair).
parser = JsonOutputParser(pydantic_object=QAPair)
qna_prompt = get_qna_prompt_template()
#qna_prompt = get_qna_repair_cost_prompt_template()
chain = qna_prompt | llm | parser

# System message stored with every fine-tuning sample. The embedded
# indentation and trailing spaces are kept exactly as before so that the
# generated dataset is byte-identical.
system_prompt_msg = (
    "You are an AI assistant that is familiar with the details of the user manual for your Galaxy mobile phone.\n"
    "    Please answer the questions accurately. If the question is in Korean, write your answer in Korean. If the question is in English, write your answer in English.\n"
    "    "
)

for idx, file_path in enumerate(all_files):

    print(f"\n##### Idx {idx} - Processing {file_path}...")

    # Start from an empty scratch directory so that page images of the
    # previously processed PDF are never picked up again.
    if os.path.isdir(image_path):
        shutil.rmtree(image_path)
    os.makedirs(image_path, exist_ok=True)

    # The context manager closes the PDF as soon as rendering is done.
    with fitz.open(file_path) as pdf:
        pdf.delete_page(0)  # 1st page is the cover page, so we delete it.

        for i, page in enumerate(pdf):
            # page.rect holds the corner coordinates (x0, y0, x1, y1);
            # shrink it by the margin on every side.
            bounds = page.rect
            clip = fitz.Rect(
                bounds.x0 + clip_x,
                bounds.y0 + clip_y,
                bounds.x1 - clip_x,
                bounds.y1 - clip_y,
            )
            page.set_cropbox(clip)
            pix = page.get_pixmap()
            pix.save(f"{image_path}/page_{i:03d}.jpg")

    # Zero-padded file names make the lexicographic sort equal to page order.
    images = sorted(_list_files(os.path.join(image_path, "*.jpg")))

    ### Generate image summaries
    print(f"### Generating image summaries using LLM - path: {file_path}")

    start = time.time()
    base64_images = [encode_image_base64(img_path) for img_path in images]
    image_summaries = summarize_chain.batch(base64_images, {"max_concurrency": 8})
    image_summaries = remove_short_sentences(image_summaries)
    end = time.time()

    print(f"Elasped {end - start:.5f} ses for generating image summaries using LLM")

    ### Generate QA pair
    print(f"### Generating QA pairs using LLM - path: {file_path}")
    start = time.time()

    # One request per page summary, asking for 3 questions each.
    input_batch = [
        {"context": summary, "domain": "Samsung Galaxy S series Smartphone", "num_questions": "3"}
        for summary in image_summaries
    ]
    qa_pair = chain.batch(input_batch, {"max_concurrency": 8})
    end = time.time()

    # This timing covers the QA step (the message used to repeat the
    # image-summary wording, which made the two timings indistinguishable).
    print(f"Elasped {end - start:.5f} ses for generating QA pairs using LLM")

    ### Save to jsonl for fine-tuning
    print("### Saving QA pairs to jsonl")
    oai_qa_pair = convert_to_oai_format(qa_pair, system_prompt_msg=system_prompt_msg)
    save_jsonl(oai_qa_pair, f"{output_dir}/{idx}-oai.jsonl")


##### Idx 0 - Processing raw_data/prod-unst-pdf/[Sales Guide] 1.Workshop Deck_(S23)_230201.pdf...
### Generating image summaries using LLM - path: raw_data/prod-unst-pdf/[Sales Guide] 1.Workshop Deck_(S23)_230201.pdf
Elasped 75.40643 ses for generating image summaries using LLM
### Generating QA pairs using LLM - path: raw_data/prod-unst-pdf/[Sales Guide] 1.Workshop Deck_(S23)_230201.pdf
Elasped 54.81720 ses for generating image summaries using LLM
### Saving QA pairs to jsonl

##### Idx 1 - Processing raw_data/prod-unst-pdf/[Sales Talk] 4. Why Galaxy_(S24)_240116.pdf...
### Generating image summaries using LLM - path: raw_data/prod-unst-pdf/[Sales Talk] 4. Why Galaxy_(S24)_240116.pdf
Elasped 10.72489 ses for generating image summaries using LLM
### Generating QA pairs using LLM - path: raw_data/prod-unst-pdf/[Sales Talk] 4. Why Galaxy_(S24)_240116.pdf
Elasped 7.34492 ses for generating image summaries using LLM
### Saving QA pairs to jsonl


In [8]:
import os, shutil, random
from util.preprocess import convert_html_to_md
import json
import glob
import pandas as pd

# Merge every per-PDF "*-oai.jsonl" shard from `dataset_dir` into a single
# JSONL file that holds one JSON object per line.
dataset_dir = "dataset_tmp"
output_path = 'dataset/prod-unst-sales-brochure-oai.jsonl'

# Sorted so that the merged file has the same record order on every run
# (glob itself returns paths in arbitrary order).
all_files = sorted(glob.glob(os.path.join(dataset_dir, "*-oai.jsonl")))

result = []
for f in all_files:
    # 'utf-8-sig' transparently drops a leading BOM if the shard has one.
    with open(f, 'r', encoding='utf-8-sig') as infile:
        for line_no, line in enumerate(infile, start=1):
            if not line.strip():
                continue  # blank lines carry no record
            try:
                result.append(json.loads(line))
            except ValueError as err:
                # Keep going, but say exactly which line could not be parsed.
                print(f"{f}:{line_no}: skipped invalid JSON line ({err})")

# The target folder may not exist yet; without this the open() below fails
# with FileNotFoundError.
os.makedirs(os.path.dirname(output_path), exist_ok=True)

# This would output jsonl
# NOTE(review): the file is written with a UTF-8 BOM ('utf-8-sig'), which is
# presumably what the downstream fine-tuning upload expects — confirm before
# changing the encoding.
with open(output_path, 'w', encoding='utf-8-sig') as outfile:
    for entry in result:
        outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")