In [2]:
from src.utils import *
import os
import tqdm
import re
import json

In [3]:
def get_fig_path(file_path, extensions=(".png", ".fig")):
    """Recursively collect image files below a directory.

    Args:
        file_path: Root directory to walk.
        extensions: File-name suffixes (including the leading dot) that
            identify an image. Matching is case-sensitive.

    Returns:
        A list of paths (root joined with file name) for every matching file,
        in the order produced by ``os.walk``.
    """
    # str.endswith accepts a tuple; the dotted suffixes prevent extension-less
    # names such as "config" from being mistaken for ".fig" images.
    suffixes = tuple(extensions)
    fig_path = []
    for root, _dirs, files in os.walk(file_path):
        for file in files:
            if file.endswith(suffixes):
                fig_path.append(os.path.join(root, file))
    return fig_path

def save_questions(content, file_path):
    """Split a numbered list of questions and append them to a JSONL file.

    The text is split on ``N)`` markers when "1)" occurs in it, otherwise on
    ``N.`` markers when "1." occurs in it. Each non-empty piece is wrapped as
    ``<s>[INST] question? [/INST] \\n`` (a trailing '?' is added when missing)
    and written as one ``{"text": ...}`` JSON object per line.

    Args:
        content: Raw text containing the numbered questions.
        file_path: Destination JSONL file. It is opened in append mode, so
            calling this twice for the same path duplicates the questions.

    Returns:
        None. When no numbering marker is found (or the content is empty) an
        error message is printed and nothing is written.
    """
    if content and "1)" in content:
        pieces = re.split(r'\d+\)', content)[1:]
    elif content and "1." in content:
        pieces = re.split(r'\d+\.', content)[1:]
    else:
        print("Error: No question number found in the content. PLEASE CHECK!")
        # Nothing to save; returning here avoids using an unbound variable.
        return
    # Drop surrounding whitespace and discard pieces that are empty.
    questions = [piece.strip() for piece in pieces if piece.strip()]
    with open(file_path, 'a', encoding='utf-8') as file:
        for question in questions:
            # Make sure every question ends with a question mark.
            if not question.endswith('?'):
                question = question + '?'
            formatted_question = f"<s>[INST] {question} [/INST] \n"
            # One JSON object per line.
            file.write(json.dumps({"text": formatted_question}) + '\n')

### Step-1: Question generation

In [4]:
# For every screenshot under ./input_img/, ask the vision model for likely
# novice-user questions and store them next to the image as <image stem>.jsonl.
fig_path = get_fig_path('./input_img/')
for img_path in tqdm.tqdm(fig_path):
    # NOTE(review): the data URL declares image/jpeg although the inputs are
    # .png/.fig files; confirm the model server ignores the MIME type.
    img_url = f"data:image/jpeg;base64,{encode_image(img_path)}"
    # Change the prompt according to your product
    messages = [{"role": "user","content": [{"type": "text", "text": "Assuming you are a novice user facing the webpage of an AI painting platform named 'Maze.Guru', please list the 8 most likely questions a novice user would ask about user operations or product information.",},
                                            {"type": "image_url","image_url": {"url": img_url}}]}]
    content = create_chat_completion("cogagent-vqa-18b", messages=messages, use_stream=False)
    # Save in the same directory. Swapping only the final extension keeps the
    # path correct even when a directory name contains "png" or "fig", and
    # guarantees save_path is always set for the current image.
    save_path = os.path.splitext(img_path)[0] + '.jsonl'
    # save_questions appends, so re-running this cell adds the questions again.
    save_questions(content, save_path)

  0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 8/8 [02:06<00:00, 15.76s/it]


### Step-2: Question answer generation

In [5]:
# For every screenshot, load the questions saved beside it, ask the vision
# model to answer each one against the image, and accumulate the completed
# QA records into ./output_dataset.jsonl.
fig_path = get_fig_path('./input_img/')
qa = []
for img_path in tqdm.tqdm(fig_path):
    img_url = f"data:image/jpeg;base64,{encode_image(img_path)}"
    # Swap only the final extension so directory names containing "png" or
    # "fig" cannot corrupt the derived path.
    question_path = os.path.splitext(img_path)[0] + '.jsonl'
    if not os.path.exists(question_path):
        print(f"Warning: no question file found for {img_path}, skipping.")
        continue
    questions = read_jsonl(question_path)
    answered = []
    for question in questions:
        # DOTALL lets a question that spans several lines still be extracted.
        match = re.search(r'\[INST\]\ (.*?)\ \[/INST\]', question['text'], flags=re.DOTALL)
        if match is None:
            print(f"Warning: malformed question entry in {question_path}, skipping.")
            continue
        messages = [{"role": "user","content": [{"type": "text", "text": match.group(1) + "\nIf unable to answer, please reply directly 'Sorry, I don't know.'",},
                                                {"type": "image_url","image_url": {"url": img_url}}]}]
        content = create_chat_completion("cogagent-vqa-18b", messages=messages, use_stream=False)
        # Append the answer and the end-of-sequence marker to the prompt text.
        question['text'] = question['text'] + " " + content + "</s>"
        answered.append(question)
    qa = qa + answered
    # Rewrite the dataset after every image so partial progress is kept if a
    # later request fails.
    write_jsonl("./output_dataset.jsonl", qa)


  0%|          | 0/8 [00:00<?, ?it/s]

100%|██████████| 8/8 [05:04<00:00, 38.07s/it]


### Step-3: Clean QA

In [6]:
# Reload the generated dataset, keep only the records whose text contains none
# of the "could not answer" phrases, and overwrite the dataset with the result.
qa = read_jsonl("./output_dataset.jsonl")
qa_new = []
for record in qa:
    text = record['text']
    # True when the record's text contains any of the refusal phrases.
    unanswered = "Sorry" in text or "image does not" in text or "image doesn't" in text
    if not unanswered:
        qa_new.append(record)
write_jsonl("./output_dataset.jsonl", qa_new)

### Step-4: Fine-tuning