# Generate QnA synthetic dataset from CSV, containing Image URL

This is another common case. If the source data includes image URLs, replace each URL with a summary of the image it points to.


## 1. Read & Preprocess CSV file
---

Read multiple csv files into a single dataframe.

In [1]:
from dotenv import load_dotenv
import os, shutil, random
from unstructured.cleaners.core import clean_bullets, clean_extra_whitespace, remove_punctuation
from langchain_community.document_loaders import UnstructuredFileLoader, UnstructuredMarkdownLoader, UnstructuredAPIFileLoader
from langchain_community.document_loaders.csv_loader import CSVLoader, UnstructuredCSVLoader
from util.preprocess import convert_html_to_md
import glob
import pandas as pd

# Pull environment variables from a .env file (presumably the Azure OpenAI
# settings used by later cells -- confirm against your .env).
load_dotenv()

raw_data_dir = "raw_data"
csv_path = f"{raw_data_dir}/cs-self-solve-web"

# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the row order of the combined dataframe is reproducible between runs.
all_files = sorted(glob.glob(os.path.join(csv_path, "*_modified.csv")))
if not all_files:
    # pd.concat on an empty sequence fails with an opaque
    # "No objects to concatenate" error; report the real problem instead.
    raise FileNotFoundError(f"No '*_modified.csv' files found in {csv_path!r}")

# Stack every CSV into one dataframe with a fresh 0..n-1 index.
df = pd.concat((pd.read_csv(f) for f in all_files), ignore_index=True)

In [3]:
# Preview the first two rows of the combined dataframe.
df.head(2)

Unnamed: 0,title,content,image_info,Content
0,갤럭시 S23 전화 통화 시 상대방 목소리가 작게 들리거나 내 목소리가 작게 전달...,갤럭시 스마트폰의 전화 통화 시 송화음 수화음이 잘 전달되지 않거나 들리지 않는 ...,{'갤럭시 S23/S23+ 마이크 및 에어 벤트 홈 위치': 'https://api...,title : 갤럭시 S23 전화 통화 시 상대방 목소리가 작게 들리거나 내 목소...
1,갤럭시 S24 시리즈 보호필름 부착 방법이 궁금합니다.,갤럭시 S24 시리즈의 보호 필름의 패키지 구성은 보호 필름(2매) 어플리케이터(...,{'S24 보호필름 박스 전면 이미지': 'https://api.samsungsvc...,"title : 갤럭시 S24 시리즈 보호필름 부착 방법이 궁금합니다., conte..."


Convert to a base64 image input format that can be recognized by multimodal models such as GPT-4o.

- Download the image (http://xyz.com/a.jpg)
- Convert the image to a base64-encoded string

In [10]:
import json
from util.preprocess import encode_url_image_base64

def encode_images(img_dict):
    """Return the encoded form of every image URL stored in ``img_dict``.

    Only the dict values (the URLs) are used; the keys are ignored. Each URL
    is passed to ``encode_url_image_base64`` and the results are returned as
    a list in the dict's iteration order.
    """
    return list(map(encode_url_image_base64, img_dict.values()))

import ast

# The CSV stores each image_info dict as its string repr. Parse it with
# ast.literal_eval instead of eval: it accepts only Python literals, so a
# crafted cell in the input file cannot execute arbitrary code.
img_dict = ast.literal_eval(df['image_info'][0])  # first row's dict, kept for ad-hoc inspection
df['image_info'] = df['image_info'].apply(ast.literal_eval)

# Encode every image referenced by a row; the result is one list per row.
df['image_base64'] = df['image_info'].apply(encode_images)

In [12]:
# Preview the first two rows again, now including the image_base64 column.
df.head(2)

Unnamed: 0,title,content,image_info,Content,image_base64
0,갤럭시 S23 전화 통화 시 상대방 목소리가 작게 들리거나 내 목소리가 작게 전달...,갤럭시 스마트폰의 전화 통화 시 송화음 수화음이 잘 전달되지 않거나 들리지 않는 ...,{'갤럭시 S23/S23+ 마이크 및 에어 벤트 홈 위치': 'https://api...,title : 갤럭시 S23 전화 통화 시 상대방 목소리가 작게 들리거나 내 목소...,[iVBORw0KGgoAAAANSUhEUgAAArwAAAINCAIAAABTTjvEA...
1,갤럭시 S24 시리즈 보호필름 부착 방법이 궁금합니다.,갤럭시 S24 시리즈의 보호 필름의 패키지 구성은 보호 필름(2매) 어플리케이터(...,{'S24 보호필름 박스 전면 이미지': 'https://api.samsungsvc...,"title : 갤럭시 S24 시리즈 보호필름 부착 방법이 궁금합니다., conte...",[iVBORw0KGgoAAAANSUhEUgAAAMgAAAG0CAYAAAB3z2iQA...


#### Image Summarization using GPT-4o

In [5]:
from langchain.schema.output_parser import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate, HumanMessagePromptTemplate, SystemMessagePromptTemplate
from langchain_openai import AzureChatOpenAI

# GPT-4o deployment on Azure OpenAI used for image description:
# temperature 0, replies capped at 700 tokens.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    openai_api_version="2024-05-01-preview",
    temperature=0,
    max_tokens=700,
)

system_prompt = "You are an AI assistant tasked with describing table or image, specialized in IT devices and mobile phone products."

# Multimodal human message: the image as a base64 data URL (the
# {image_base64} placeholder is filled per request), then the instruction.
human_prompt = [
    {
        "type": "image_url",
        "image_url": {"url": "data:image/png;base64,{image_base64}"},
    },
    {
        "type": "text",
        "text": "Given image, give a concise summary in Korean. Don't insert any XML tag such as <text> and </text> when answering.",
    },
]

system_message_template = SystemMessagePromptTemplate.from_template(system_prompt)
human_message_template = HumanMessagePromptTemplate.from_template(human_prompt)
prompt = ChatPromptTemplate.from_messages([system_message_template, human_message_template])

# prompt -> model -> plain-string output.
summarize_chain = prompt | llm | StrOutputParser()

In [23]:
# %pip install langchainhub

In [None]:
%%time
import ast

# image_base64 holds real lists when it was produced in this session, but may
# hold string reprs if the dataframe was reloaded from CSV. eval() raises
# TypeError when given a list, so parse only string cells (with
# ast.literal_eval, which cannot execute code) and pass lists through as-is.
# Each row's images go through the chain as one batch, max_concurrency=5.
df['image_summary'] = df['image_base64'].apply(
    lambda x: summarize_chain.batch(
        ast.literal_eval(x) if isinstance(x, str) else x,
        {"max_concurrency": 5},
    )
)

In [None]:
# Save the dataframe, including the image_summary column, as one CSV in the
# same directory as the source files (no index column). List/dict cells are
# written as text, which is why the reload cell parses them back.
df.to_csv(f"{csv_path}/cs-self-solve.csv", index=False)

In [None]:
import ast

# Reload the saved CSV; the list/dict columns come back as plain strings.
df = pd.read_csv(f"{csv_path}/cs-self-solve.csv")

# Turn those strings back into Python objects. ast.literal_eval accepts only
# literals, so unlike eval it cannot execute code embedded in the file.
for column in ('image_info', 'image_base64', 'image_summary'):
    df[column] = df[column].apply(ast.literal_eval)

In [None]:
import re
def get_final_context(x):
    """Build the context text for one row.

    Args:
        x: Mapping-like row with ``title``, ``content``, ``image_info``
            (a dict; only its keys are used, as image captions) and
            ``image_summary`` (one summary per image, in the same order).

    Returns:
        A string with title, context and image sections. Each image gets a
        numbered ``<image>...</image>`` line; captions and summaries are
        paired positionally and pairing stops at the shorter of the two.
        Runs of spaces are collapsed to a single space.
    """
    header = f"### Title:\n{x['title']}\n\n### Context:\n{x['content']}\n\n### Image:\n"

    image_lines = [
        f'<image>{position}번째 이미지 - {caption}: {summary}</image>\n'
        for position, (caption, summary) in enumerate(
            zip(x['image_info'].keys(), x['image_summary']), start=1
        )
    ]

    # Collapse repeated spaces only; newlines are left untouched.
    return re.sub(' +', ' ', header + ''.join(image_lines))

In [None]:
# Build one context string per row (axis=1 passes each row to
# get_final_context), then collect them into a plain Python list.
df['final_context'] = df.apply(get_final_context, axis=1)
preprocessed_docs = df['final_context'].to_list()


## 2. Construct QnA Pairs
---

In [None]:
from langchain_openai import AzureChatOpenAI
from langchain_core.runnables import RunnablePassthrough, RunnableLambda
from langchain_core.output_parsers import JsonOutputParser
from langchain.callbacks.streaming_stdout import StreamingStdOutCallbackHandler

from util.qa_pair import get_qna_repair_self_prompt_template, QAPair

# JSON output parser configured with the project's QAPair model.
parser = JsonOutputParser(pydantic_object=QAPair)

# Prompt template supplied by the project's util.qa_pair helper.
prompt = get_qna_repair_self_prompt_template()

# GPT-4o deployment for QnA generation: temperature 0.1, replies capped at
# 1500 tokens.
llm = AzureChatOpenAI(
    azure_deployment="gpt-4o",
    openai_api_version="2024-05-01-preview",
    max_tokens=1500,
    temperature=0.1,
)

# prompt -> model -> parsed JSON.
chain = prompt | llm | parser

In [None]:
# One request per preprocessed document; every request carries the same
# domain label and the same num_questions value ("4").
input_batch = [
    {"context": doc, "domain": "Samsung Galaxy S series Smartphone", "num_questions": "4"}
    for doc in preprocessed_docs
]

In [None]:
%%time
# Run the QnA chain over every input, with max_concurrency set to 5.
qa_pair = chain.batch(input_batch, {"max_concurrency": 5})

CPU times: user 302 ms, sys: 56.6 ms, total: 359 ms
Wall time: 1min 12s


## 3. Save to jsonl for fine-tuning
---

In [None]:
import json
from util.common_utils import convert_to_oai_format, save_jsonl

# Directory that receives the generated dataset files; created if missing.
output_dir = './dataset'
os.makedirs(output_dir, exist_ok=True)

# NOTE(review): system_prompt_msg is defined here but never used in this cell;
# convert_to_oai_format below is called without it. Confirm whether the helper
# should receive it or whether this variable is dead.
# The "\n" escape is followed by a real line break, so the string contains two
# consecutive newlines at that point.
system_prompt_msg = """You are an AI assistant that provides guidance to help users self-service resolve abnormalities in their Galaxy mobile phone.\n
Please answer the questions accurately. If the question is in Korean, write your answer in Korean. If the question is in English, write your answer in English."""

save_filename = "cs-self-solve"
oai_qa_pair = convert_to_oai_format(qa_pair)

# Write both the raw QnA pairs and the converted ("-oai") variant as JSONL.
save_jsonl(qa_pair, f"{output_dir}/{save_filename}.jsonl")
save_jsonl(oai_qa_pair, f"{output_dir}/{save_filename}-oai.jsonl")