In [1]:
# Import PyMuPDF
import fitz
raw_data_dir = "raw_data"

file_path = f"{raw_data_dir}/prod-unst-pdf/SM-S92X_UG_UU_Kor_Rev.1.1_240129.pdf"
# Open the full source manual; page ranges are copied out of it below.
doc1 = fitz.open(file_path)
# (from_page, to_page) pairs handed straight to insert_pdf.
# PyMuPDF page numbers are zero-based and to_page is inclusive.
split_pages = [(10, 15)]

try:
    # NOTE: `idx` is deliberately left as a plain loop variable; a later cell
    # reads its final value to locate the last part file written here.
    for idx, s in enumerate(split_pages):
        # Start from an empty PDF and copy the requested page range into it.
        doc2 = fitz.open()
        try:
            doc2.insert_pdf(doc1, from_page=s[0], to_page=s[1])

            # Write this part next to the source manual.
            doc2.save(f"{raw_data_dir}/prod-unst-pdf/s24-user-manual-part{idx}.pdf")
        finally:
            # Release the per-part document even if insert/save fails.
            doc2.close()
finally:
    # The source document is only needed while splitting.
    doc1.close()

In [2]:
from dotenv import load_dotenv
import os, shutil, random
from unstructured.cleaners.core import clean_bullets, clean_extra_whitespace, remove_punctuation
from langchain_community.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader, UnstructuredAPIFileLoader
from langchain_community.document_loaders.csv_loader import CSVLoader, UnstructuredCSVLoader
from util.preprocess import remove_short_sentences, remove_small_images

# Populate the process environment via python-dotenv (presumably the
# credentials used by the Azure OpenAI clients further down -- confirm).
load_dotenv()

# Directory that receives the image blocks extracted from the PDF.
image_path = "./image"
# NOTE(review): `idx` is the loop variable left over from the PDF-splitting
# cell, so this points at the last part that cell wrote.
file_path = f"{raw_data_dir}/prod-unst-pdf/s24-user-manual-part{idx}.pdf"

# Start from a clean slate so images from a previous run are not mixed in.
if os.path.isdir(image_path):
    shutil.rmtree(image_path)


## 1. Read & Preprocess PDF file
---

### Read PDF

In [3]:
%%time

# Size settings shared by this cell and later ones (chunk_overlap and
# max_tokens are not used by the loader itself).
max_tokens = 1024
chunk_overlap = 100
combine_text_under_n_chars = 1000
new_after_n_chars = 1200
chunk_size = 1500

# Title-based chunking limits forwarded to unstructured.
chunking_kwargs = {
    "chunking_strategy": "by_title",
    "max_characters": chunk_size,
    "new_after_n_chars": new_after_n_chars,
    # Text with fewer characters than this is combined.
    "combine_text_under_n_chars": combine_text_under_n_chars,
}

# Image / table extraction settings forwarded to unstructured.
extraction_kwargs = {
    "extract_images_in_pdf": True,
    "extract_image_block_types": ["Image", "Table"],
    "extract_image_block_output_dir": image_path,
    "extract_image_block_to_payload": False,  # False: save the images to image_path
    "hi_res_model_name": "yolox_quantized",  # other options: "detectron2_onnx", "yolox"
    # Original intent: get tables as HTML via the table transformer.
    # NOTE(review): unstructured documents this option as a list of file types
    # (e.g. [] or ['pdf', 'jpg', 'png', 'xls', 'xlsx', 'heic']); passing True
    # looks unintended -- confirm against the installed version.
    "skip_infer_table_types": True,
}

loader = UnstructuredFileLoader(
    file_path=file_path,
    mode="elements",
    languages=["kor+eng"],
    post_processors=[clean_bullets, clean_extra_whitespace, remove_punctuation],
    **chunking_kwargs,
    **extraction_kwargs,
)
docs = loader.load()

CPU times: user 36 s, sys: 4.03 s, total: 40 s
Wall time: 40.4 s


In [4]:
# from langchain import hub
# from langchain_community.document_loaders import AzureAIDocumentIntelligenceLoader
# from langchain.text_splitter import MarkdownHeaderTextSplitter
# from langchain.vectorstores.azuresearch import AzureSearch

# # Initiate Azure AI Document Intelligence to load the document. You can either specify file_path or url_path to load the document.
# loader = AzureAIDocumentIntelligenceLoader(
#     file_path=file_path, 
#     api_key=os.getenv("AZURE_DOC_INTELLIGENCE_API_KEY"), 
#     api_endpoint=os.getenv("AZURE_DOC_INTELLIGENCE_ENDPOINT"), 
#     api_model="prebuilt-layout"
# )
# docs = loader.load()

# # Split the document into chunks based on markdown headers.
# headers_to_split_on = [
#     ("#", "Header 1"),
#     ("##", "Header 2"),
#     ("###", "Header 3"),
#     ("####", "Header 4"),
# ]
# text_splitter = MarkdownHeaderTextSplitter(headers_to_split_on=headers_to_split_on)

# docs_string = docs[0].page_content
# splits = text_splitter.split_text(docs_string)

# print("Length of splits: " + str(len(splits)))

In [5]:
# Filter the extracted images with a 16-pixel dimension threshold; the helper
# returns the collection counted (and later encoded) as `images`.
images = remove_small_images(image_path, image_dim_thres=16)

# Separate table elements from everything else by their "category" metadata.
tables = [element for element in docs if element.metadata["category"] == "Table"]
texts = [element for element in docs if element.metadata["category"] != "Table"]

print(f' # texts: {len(texts)} \n # tables: {len(tables)} \n # images: {len(images)}')

 # texts: 5 
 # tables: 0 
 # images: 6


### Summarize images

In [50]:
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI

# Deterministic (temperature 0) GPT-4o deployment used to describe the images.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    openai_api_version="2024-05-01-preview",
    max_tokens=max_tokens,
    temperature=0,
)

system_prompt = "You are an assistant tasked with describing table or image, specialized in Smartphone product."
system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)

# The human turn has two parts: the image as a base64 data URL, then the instruction.
image_part = {
    "type": "image_url",
    "image_url": {"url": "data:image/png;base64,{image_base64}"},
}
text_part = {
    "type": "text",
    "text": "Given image, give a concise summary in Korean. Don't insert any XML tag such as <text> and </text> when answering.",
}
human_prompt = [image_part, text_part]
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)

prompt = ChatPromptTemplate.from_messages([system_message_template, human_message_template])

# prompt -> model -> plain string
summarize_chain = prompt | llm | StrOutputParser()
# Alternative kept for reference:
#summarize_chain = {"image_base64": lambda x:x} | prompt | llm_text | StrOutputParser()

In [53]:
%%time
from util.preprocess import encode_image_base64
#images = glob(os.path.join(image_path, "*.jpg"))
# Encode every kept image, then summarise them with at most 3 requests in flight.
base64_images = list(map(encode_image_base64, images))
image_summaries = summarize_chain.batch(base64_images, config={"max_concurrency": 3})
# NOTE(review): this filter can drop entries, after which image_summaries no
# longer lines up index-for-index with `images`.
image_summaries = remove_short_sentences(image_summaries)

CPU times: user 67.8 ms, sys: 12.2 ms, total: 80 ms
Wall time: 5.91 s


In [54]:
# Display the filtered image summaries for a manual sanity check.
image_summaries

['이 이미지는 스마트폰의 SIM 카드 트레이에 SIM 카드를 삽입하고 트레이를 스마트폰에 다시 넣는 방법을 보여줍니다. 첫 번째 그림에서는 SIM 카드를 트레이에 넣는 방법을, 두 번째 그림에서는 트레이를 스마트폰에 삽입하는 방법을 설명하고 있습니다.',
 '이 이미지는 스마트폰의 SIM 카드 트레이를 꺼내는 방법을 보여줍니다. 첫 번째 단계에서는 핀 도구를 사용하여 트레이 옆의 작은 구멍에 삽입합니다. 두 번째 단계에서는 트레이가 튀어나오면 손으로 잡아 빼냅니다.',
 '이미지는 스마트폰의 SIM 카드 슬롯을 여는 방법을 보여줍니다. 핀 도구를 사용하여 슬롯 옆의 작은 구멍에 넣고 눌러서 슬롯을 엽니다.',
 '이 이미지는 스마트폰 또는 태블릿의 충전 포트에 충전 케이블을 연결하는 방법을 보여줍니다.',
 "이 이미지는 스마트폰의 하단 버튼을 설명하고 있습니다. 왼쪽부터 '최근 앱 버튼', '홈 버튼', '뒤로가기 버튼'이 있습니다."]

In [55]:
from util.preprocess import split_text_using_tiktoken

# Apply the same chunk size / overlap to the text elements and the image summaries.
texts_tiktoken, image_summaries_tiktoken = [
    split_text_using_tiktoken(items, chunk_size, chunk_overlap)
    for items in (texts, image_summaries)
]

## 2. Construct QnA Pairs
----

In [56]:
from langchain_openai import AzureChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import JsonOutputParser
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from util.qa_pair import get_qna_prompt_template, QAPair

# Same GPT-4o deployment and settings as before, rebound for Q&A generation.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    openai_api_version="2024-05-01-preview",
    max_tokens=1024,
    temperature=0,
)

# Alternative prompt kept for reference:
#prompt = get_qna_repair_cost_prompt_template()
prompt = get_qna_prompt_template()
# Parse the model's JSON answer, using QAPair as the pydantic object.
parser = JsonOutputParser(pydantic_object=QAPair)

chain = prompt | llm | parser

In [57]:
# Settings shared by every request; only the context changes.
qna_settings = {"domain": "Samsung Galaxy S series Smartphone", "num_questions": "3"}

# Text chunks first, then the image summaries
# (image_summaries_tiktoken could be used here instead of image_summaries).
input_batch = [
    {"context": context, **qna_settings}
    for context in [*texts_tiktoken, *image_summaries]
]


In [58]:
%%time
# Run the Q&A chain over every context, with at most 5 concurrent requests.
qa_pair = chain.batch(input_batch, config={"max_concurrency": 5})

CPU times: user 91.1 ms, sys: 9.84 ms, total: 101 ms
Wall time: 9.71 s


## 3. Save to jsonl for fine-tuning
---

In [None]:
import json
from util.common_utils import convert_to_oai_format, save_jsonl

# Destination folder for the fine-tuning dataset; created if missing.
output_dir = './dataset'
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): system_prompt_msg is defined here but never passed to anything
# in this cell -- check whether convert_to_oai_format is meant to receive it.
system_prompt_msg = """You are an AI assistant that provides guidance to help users self-service resolve abnormalities in their Galaxy mobile phone.\n
Please answer the questions accurately. If the question is in Korean, write your answer in Korean. If the question is in English, write your answer in English."""

save_filename = "cs-self-solve"
# Convert the generated Q&A pairs with the project's OpenAI-format helper.
oai_qa_pair = convert_to_oai_format(qa_pair)

# Optional: also save the raw pairs. (This line used to read `s#ave_jsonl(...)`,
# which evaluated a stray name `s` instead of being a comment.)
# save_jsonl(qa_pair, f"{output_dir}/{save_filename}.jsonl")
save_jsonl(oai_qa_pair, f"{output_dir}/{save_filename}-oai.jsonl")