## 1. Install Necessary Python Packages

In [1]:
!python -m pip install -q -U pip   # [optional] upgrade pip to the latest version
!pip install pandas==2.2.2 -q -U   # install pandas for handling CSV files
!pip install openai==1.43.0 -q -U  # install OpenAI library for generating question-answer pairs
!pip install datasets==3.2.0 -q -U # install Hugging Face datasets library

[0m

## 2. Restart Kernel to Activate Installed Packages

## 3. Define Utility Functions

In [1]:
import pandas as pd
from openai import OpenAI

# Define a function to display a progress bar during synthetic data generation
# e.g. [██████████████████████████████--------------------] 60.67%
def print_progress(cur_data, total_data):
    """Draw a one-line text progress bar on stdout.

    The bar is 50 characters wide and is followed by the completion
    percentage with two decimals. The line starts with a carriage return and
    has no trailing newline, so successive calls overwrite each other.

    Args:
        cur_data: number of items processed so far.
        total_data: total number of items to process.

    Returns:
        None. The bar is printed as a side effect.
    """
    total_bar = 50  # set total size of the progress bar to be 50

    if total_data <= 0:
        # nothing to process: show the bar as complete instead of dividing by zero
        fraction = 1.0
    else:
        # clamp to [0, 1] so an out-of-range counter cannot distort the bar width
        fraction = min(max(cur_data / total_data, 0.0), 1.0)

    cur_percent = fraction * 100
    cur_bar = int(fraction * total_bar)
    cur_bar_display = '█' * cur_bar + '-' * (total_bar - cur_bar)
    # for intermediate print, always start from linehead using \r and avoid moving to the next line using end=''
    print(f'\r[{cur_bar_display}] {cur_percent:.2f}%', end='')

# Define a function to calculate the cost of API calls based on the number of tokens used
def calculate_token_cost(model, input_token_count=0, output_token_count=0):
    """Estimate the dollar cost of API usage from token counts.

    Args:
        model: model name; 'gpt-4o' and 'gpt-4o-mini' have known prices,
            any other value is priced at 0.
        input_token_count: number of prompt (input) tokens.
        output_token_count: number of completion (output) tokens.

    Returns:
        The total cost in dollars, rounded to 2 decimals.
    """
    # (model, input price, output price) in dollars per million tokens, price as of Jan 2025
    pricing = (
        ('gpt-4o', 2.5, 10.0),
        ('gpt-4o-mini', 0.15, 0.6),
    )
    # models without a listed price fall back to a price of 0
    input_token_price, output_token_price = next(
        ((in_price, out_price) for name, in_price, out_price in pricing if model == name),
        (0, 0),
    )

    input_cost = input_token_count / 1_000_000 * input_token_price
    output_cost = output_token_count / 1_000_000 * output_token_price
    return round(input_cost + output_cost, 2)

## 4. Convert 'Raw Dataset' (raw_csv, containing exhibits' raw descriptions) into 'Synthetic Dataset' (synthetic_csv, containing exhibits-related question-answer pairs)

In [2]:
# Define a function to convert one exhibit's raw description into 15 question-answer pairs using OpenAI API
#
# Format of raw exhibit description:
#   "Object Details Physical Description Canard biplane with one 12-horsepower Wright horizontal four-cylinder engine driving two pusher propellers ..."
#
# Format of generated 15 question-answer pairs:
#   "[Visitor]: Can you describe this exhibit?  [Guide]: This exhibit showcases the remarkable 1903 Wright Flyer ...
#    [Visitor]: What can you tell me about this exhibit?  [Guide]: The exhibit features the iconic 1903 Wright Flyer ...
#    ...
#    [Visitor]: Can you summarize the significance of this exhibit?  [Guide]: The exhibit features the 1903 Wright Flyer, a groundbreaking ...
#   "
def generate_synthetic_data(raw_description):
    """Turn one exhibit's raw description into visitor/guide question-answer text.

    The OpenAI chat-completions API is called once per sampling temperature
    (0.4, 0.7 and 0.9). Each call asks for 5 question-answer pairs, and the
    replies are concatenated, so a fully successful run yields 15 pairs.
    A failed call is reported on stdout and skipped; the caller is expected
    to validate the number of pairs in the returned text.

    The API key is taken from the OPENAI_API_KEY environment variable so that
    it does not have to be stored in the notebook. If the variable is unset,
    the original placeholder string is used.

    Args:
        raw_description: raw description text of one exhibit.

    Returns:
        A tuple (result, input_tokens, output_tokens): the concatenated reply
        text, and the prompt / completion token counts summed over the calls
        that succeeded.
    """
    import os  # local import: this cell does not import os at module level

    # prefer the environment variable; otherwise replace the placeholder with your OpenAI API key
    api_key = os.environ.get("OPENAI_API_KEY") or "your OpenAI API key"
    client = OpenAI(api_key=api_key)
    questions_per_description_per_temperature = 5

    system_prompt = f'''
You are an expert in creating high-quality museum exhibit descriptions. You will be provided with a detailed description of an exhibition below, delimited with XML tags. Your task is to generate {questions_per_description_per_temperature} different descriptions that convey the same core information but are rewritten with varied wording, structure, and style. The new descriptions should be as informative and accurate as the original but should appear unique in phrasing to serve as synthetic data for fine-tuning a large language model. Ensure that the technical details are preserved and that each description is clear and concise, suitable for inclusion in a museum's official records. Each description should be in one paragraph and no more than 300 words.

In addition, prepend each description with a varied question that inquires about the exhibition, such as 'What can you tell me about this exhibit?', 'Can you describe this exhibit?', 'Could you provide an overview of what is on display here?', or other similar phrasings, assuming the visitor has no prior knowledge about the exhibition, so do not include the provided exhibition info in the question, to make it more like a visitor-guide chat.

Print out your answer in the following format: [Visitor]: Visitor's question\n[Guide]: Guide's description\n
    '''
    user_prompt = f"<exhibition>{raw_description}</exhibition>"

    # one request per temperature, to vary the wording of the generated pairs
    temperature_list = [0.4, 0.7, 0.9]
    result = ""
    input_tokens = 0
    output_tokens = 0

    for temperature in temperature_list:
        try:
            completion = client.chat.completions.create(
                model="gpt-4o-mini",
                messages=[
                    {"role": "system", "content": system_prompt},
                    {"role": "user", "content": user_prompt}
                ],
                temperature=temperature,
                max_tokens=3000,
                top_p=1
            )
            response_content = completion.choices[0].message.content
            result += response_content + '\n'
            # accumulate token usage for cost calculation
            input_tokens += completion.usage.prompt_tokens
            output_tokens += completion.usage.completion_tokens
        except Exception as e:
            # best effort: report the failure and continue with the remaining temperatures
            print(f"OpenAI API call failed at temperature: {temperature}, Error: {e}")

    return result, input_tokens, output_tokens

# Define a function to convert 'Raw Dataset' (raw_csv, containing all exhibits' raw descriptions) into 'Synthetic Dataset' (synthetic_csv, containing all exhibits' question-answer pairs)
#
# Format of raw_csv:
# |item_id|item_name        |link                           |train_images                       |validation_images|test_images     |description                                             |
# |0      |1903-wright-flyer|https://...1903-wright-flyer...|"1903-wright-flyer-1.jpg  a.jpg .."|"b.jpg c.jpg .." |"d.jpg e.jpg .."|"Object Details Physical Description Canard biplane ..."|
# ...
#
# Format of synthetic_csv:
# |item_id|item_name        |train_images                       |validation_images|test_images     |synthetic_dialogs                                                                          |
# |0      |1903-wright-flyer|"1903-wright-flyer-1.jpg  a.jpg .."|"b.jpg c.jpg .." |"d.jpg e.jpg .."|"[Visitor]: Q1? [Guide]: A1. [Visitor]: Q2? [Guide]: A2. ... [Visitor]: Q15? [Guide]: A15."|
# ...
def generate_synthetic_csv(raw_csv, synthetic_csv, max_attempts=10):
    """Convert the raw exhibit CSV into a CSV of synthetic question-answer dialogs.

    For every row of raw_csv, generate_synthetic_data is called until the
    returned text contains exactly 15 '[Visitor]' and 15 '[Guide]' markers
    (3 temperatures x 5 pairs). A progress bar and the running API cost are
    printed after each exhibit.

    Args:
        raw_csv: path of the input CSV; must provide the columns item_id,
            item_name, train_images, validation_images, test_images and
            description.
        synthetic_csv: path of the output CSV; it holds the same columns
            except description (and any other extra column), plus
            synthetic_dialogs.
        max_attempts: maximum number of generation attempts per exhibit.

    Raises:
        RuntimeError: if an exhibit still has no valid response after
            max_attempts attempts (for example because every API call fails).
            The output CSV is not written in that case.
    """
    raw_df = pd.read_csv(raw_csv)
    syn_df = raw_df[['item_id', 'item_name', 'train_images', 'validation_images', 'test_images']].copy()
    syn_df['synthetic_dialogs'] = None

    total_input_tokens = 0
    total_output_tokens = 0

    for idx, row in raw_df.iterrows():
        # generate synthetic question-answer pairs from the raw description
        generated_text = None
        for attempt in range(1, max_attempts + 1):
            candidate, input_tokens, output_tokens = generate_synthetic_data(row['description'])
            # rejected attempts consumed tokens too, so count every attempt towards the cost
            total_input_tokens += input_tokens
            total_output_tokens += output_tokens
            if candidate.count('[Visitor]') == 15 and candidate.count('[Guide]') == 15:
                generated_text = candidate
                break
            if attempt < max_attempts:
                print(f"\nInvalid response, retrying item: {row['item_name']} ...")

        if generated_text is None:
            # a bounded retry count avoids looping (and paying) forever on a persistent failure
            raise RuntimeError(
                f"No valid response for item: {row['item_name']} after {max_attempts} attempts"
            )
        syn_df.at[idx, 'synthetic_dialogs'] = generated_text.replace("’", "'")  # clean up formatting issues

        # display progress and the total cost of the API calls
        print_progress(idx + 1, len(raw_df))
        total_cost = calculate_token_cost('gpt-4o-mini', total_input_tokens, total_output_tokens)
        print(f' [gpt-4o-mini] total_tokens: {total_input_tokens}(input), {total_output_tokens}(output), total_cost: ${total_cost}', end='')

    # save the synthetic dataset to the specified CSV file
    syn_df.to_csv(synthetic_csv, index=False, encoding='utf-8')

In [3]:
# Convert 'Raw Dataset' into 'Synthetic Dataset'
# NOTE: this call sends OpenAI API requests for every row of the raw CSV (see generate_synthetic_data),
#       so running the cell incurs API cost; the running total is printed next to the progress bar
generate_synthetic_csv("./smithsonian_raw_data.csv", "./smithsonian_synthetic_data.csv")

[██████████████████████████████████████████████████] 100.00% [gpt-4o-mini] total_tokens: 136209(input), 216127(output), total_cost: $0.15

## 5. Convert 'Synthetic Dataset' (synthetic_csv, containing exhibits' question-answer pairs) into 'Synthetic Dataset Splits' (train_csv, validation_csv, test_csv)

In [4]:
# Define a function to apply Idefics2-style formatting on raw question-answer pair text
#
# Format of text_in:
#   [Visitor]: Can you describe this exhibit? [Guide]: This exhibit showcases xxx
#
# Format of text_out:
#   {"messages": [
#                   {"role": "user", "content": [{"type": "image"}, {"type": "text", "text": "Can you describe this exhibit?"} ]}, 
#                   {"role": "assistant", "content": [{"type": "text", "text": "This exhibit showcases xxx"} ]} 
#                ]
#   }
import re
def apply_idefict2_style(text_in):
    """Format one '[Visitor]: ... [Guide]: ...' pair as an Idefics2-style JSON string.

    The text is split on the '[Visitor]:' and '[Guide]:' markers. The segment
    after the first marker becomes the user question and the segment after
    the second marker becomes the assistant answer; both are stripped of
    surrounding whitespace. Anything before the first marker or after a
    third marker is ignored.

    The two texts are encoded with json.dumps so that quotes, backslashes,
    newlines and other control characters are escaped and the result is
    always valid JSON. Non-ASCII characters are kept as they are.

    Args:
        text_in: text containing one question-answer pair.

    Returns:
        A JSON string of the form {"messages": [user message, assistant message]},
        where the user message holds an image placeholder followed by the
        question text.

    Raises:
        IndexError: if text_in contains fewer than two markers.
    """
    import json  # local import: only `re` is imported at the top of this cell

    text_seg = re.split(r'\[Visitor\]:|\[Guide\]:', text_in)
    # json.dumps returns the text already quoted and escaped
    question_json = json.dumps(text_seg[1].strip(), ensure_ascii=False)
    answer_json = json.dumps(text_seg[2].strip(), ensure_ascii=False)

    text_out = '{"messages": [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": '
    text_out += question_json
    text_out += '} ]}, {"role": "assistant", "content": [{"type": "text", "text": '
    text_out += answer_json
    text_out += '} ]} ]}'
    return text_out

# Define a function to compose a 'Synthetic Dataset Split' from 'Synthetic Dataset' (synthetic_csv)
#
# Format of synthetic_csv:
# |item_id|item_name        |train_images                       |validation_images|test_images     |synthetic_dialogs                                                                          |
# |0      |1903-wright-flyer|"1903-wright-flyer-1.jpg  a.jpg .."|"b.jpg c.jpg .." |"d.jpg e.jpg .."|"[Visitor]: Q1? [Guide]: A1. [Visitor]: Q2? [Guide]: A2. ... [Visitor]: Q15? [Guide]: A15."| 
# ...
#
# Format of dataset-split (e.g. validation_csv):
# |data_id|item_name        |image_name             |file_name              |dialog                                                                                    |
# |0      |1903-wright-flyer|b.jpg                  |b.jpg                  |{"messages": [{"role": "user", "content": Q10?}, {"role": "assistant", "content": A10.}] }|
# |1      |1903-wright-flyer|c.jpg                  |c.jpg                  |{"messages": [{"role": "user", "content": Q11?}, {"role": "assistant", "content": A11.}] }|
# ...
def add_data_in_df(df, item, img_list, dialog_list):
    """Append one row per (image, dialog) pair of an exhibit to a dataset-split frame.

    Args:
        df: the split DataFrame built so far; its current length is used as
            the first data_id of the new rows.
        item: exhibit name stored in the item_name column.
        img_list: image file names; each one is written to both image_name
            and file_name.
        dialog_list: raw '[Visitor]: ... [Guide]: ...' texts, one per image;
            each is converted with apply_idefict2_style.

    Returns:
        A new DataFrame holding the rows of df followed by the new rows.

    Raises:
        ValueError: if img_list and dialog_list differ in length.
    """
    if len(img_list) != len(dialog_list):
        # raise instead of calling exit(): exit() is an interactive helper and
        # is not a reliable way to abort with a message inside a notebook kernel
        raise ValueError(f"Error: unmatched image count ({len(img_list)}) and dialog count ({len(dialog_list)}) when updating {item}")

    existing_data_count = len(df)
    new_rows = [
        {
            'data_id': existing_data_count + i,
            'item_name': item,
            'image_name': img_name,
            'file_name': img_name,
            'dialog': apply_idefict2_style(dialog),
        }
        for i, (img_name, dialog) in enumerate(zip(img_list, dialog_list))
    ]
    return pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)

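# Define a function to partition 'Synthetic Dataset' (synthetic_csv) into 'Synthetic Dataset Splits' (train_csv, validation_csv, test_csv)
# Note: for each exhibit the dialogs are handed out in the order test -> validation -> train (one dialog per image),
#       so the question numbers (Q1, Q10, Q13, ...) in the format examples below are only illustrative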
#
# Format of synthetic_csv:
# |item_id|item_name        |train_images                       |validation_images|test_images     |synthetic_dialogs                                                                          |
# |0      |1903-wright-flyer|"1903-wright-flyer-1.jpg  a.jpg .."|"b.jpg c.jpg .." |"d.jpg e.jpg .."|"[Visitor]: Q1? [Guide]: A1. [Visitor]: Q2? [Guide]: A2. ... [Visitor]: Q15? [Guide]: A15."|
# ...
#
# Format of dataset-split (train_csv):
# |data_id|item_name        |image_name             |file_name              |dialog                                                                                  |
# |0      |1903-wright-flyer|1903-wright-flyer-1.jpg|1903-wright-flyer-1.jpg|{"messages": [{"role": "user", "content": Q1?}, {"role": "assistant", "content": A1.}] }|
# |1      |1903-wright-flyer|a.jpg                  |a.jpg                  |{"messages": [{"role": "user", "content": Q2?}, {"role": "assistant", "content": A2.}] }|
# ...
#
# Format of dataset-split (validation_csv):
# |data_id|item_name        |image_name             |file_name              |dialog                                                                                    |
# |0      |1903-wright-flyer|b.jpg                  |b.jpg                  |{"messages": [{"role": "user", "content": Q10?}, {"role": "assistant", "content": A10.}] }|
# |1      |1903-wright-flyer|c.jpg                  |c.jpg                  |{"messages": [{"role": "user", "content": Q11?}, {"role": "assistant", "content": A11.}] }|
# ...
#
# Format of dataset-split (test_csv):
# |data_id|item_name        |image_name             |file_name              |dialog                                                                                    |
# |0      |1903-wright-flyer|d.jpg                  |d.jpg                  |{"messages": [{"role": "user", "content": Q13?}, {"role": "assistant", "content": A13.}] }|
# |1      |1903-wright-flyer|e.jpg                  |e.jpg                  |{"messages": [{"role": "user", "content": Q14?}, {"role": "assistant", "content": A14.}] }|
# ...
def generate_dataset(synthetic_csv, train_csv, validation_csv, test_csv):
    """Partition the synthetic dataset into train / validation / test CSV files.

    For each exhibit, the dialogs are split on the '[Visitor]' marker and
    paired one-to-one with images: the first dialogs go to the test images,
    the next ones to the validation images and the remaining ones to the
    train images. The pairing is delegated to add_data_in_df.

    Args:
        synthetic_csv: path of the CSV produced by generate_synthetic_csv.
        train_csv: output path of the train split.
        validation_csv: output path of the validation split.
        test_csv: output path of the test split.
    """
    def _image_names(cell):
        # image file names are separated by newlines; blank entries are dropped
        return [name.strip() for name in cell.split('\n') if name.strip()]

    syn_df = pd.read_csv(synthetic_csv)

    # 'file_name' is a reserved column name in Hugging Face used to link image data
    # https://huggingface.co/docs/datasets/main/en/image_dataset#imagefolder
    split_columns = ['data_id', 'item_name', 'image_name', 'file_name', 'dialog']
    test_df = pd.DataFrame(columns=split_columns)
    valid_df = pd.DataFrame(columns=split_columns)
    train_df = pd.DataFrame(columns=split_columns)

    for _, exhibit in syn_df.iterrows():
        # re-attach the marker removed by split() so each dialog is self-contained
        pieces = exhibit['synthetic_dialogs'].split('[Visitor]')
        dialogs = ['[Visitor]' + piece.strip() for piece in pieces if piece.strip()]

        test_imgs = _image_names(exhibit['test_images'])
        valid_imgs = _image_names(exhibit['validation_images'])
        train_imgs = _image_names(exhibit['train_images'])

        if not (test_imgs and valid_imgs and train_imgs):
            print("Error: img_list is empty.")

        # slice boundaries: [0, test_end) -> test, [test_end, valid_end) -> validation, rest -> train
        test_end = len(test_imgs)
        valid_end = test_end + len(valid_imgs)
        exhibit_name = exhibit['item_name']

        test_df = add_data_in_df(test_df, exhibit_name, test_imgs, dialogs[:test_end])
        valid_df = add_data_in_df(valid_df, exhibit_name, valid_imgs, dialogs[test_end:valid_end])
        train_df = add_data_in_df(train_df, exhibit_name, train_imgs, dialogs[valid_end:])

    for split_df, split_path in ((test_df, test_csv), (valid_df, validation_csv), (train_df, train_csv)):
        split_df.to_csv(split_path, index=False, encoding='utf-8')

In [5]:
# Convert 'Synthetic Dataset' into 'Synthetic Dataset Splits'
# Each split is written as metadata.csv under ./image/<split>/, the folder that section 6 loads with the "imagefolder" builder
# NOTE(review): presumably ./image/train, ./image/validation and ./image/test must already exist (with the images) — confirm
generate_dataset("./smithsonian_synthetic_data.csv", "./image/train/metadata.csv", "./image/validation/metadata.csv", "./image/test/metadata.csv")

## 6. Create Hugging Face Dataset from Synthetic Dataset Splits (train_csv, validation_csv, test_csv)

In [6]:
import os
from datasets import load_dataset
from huggingface_hub import HfApi, HfFolder

# Define a function to create a local Hugging Face dataset and then upload it to Hugging Face
# https://huggingface.co/docs/datasets/main/en/image_load#imagefolder
# https://huggingface.co/docs/huggingface_hub/en/package_reference/hf_api#huggingface_hub.HfApi.create_repo
# https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.push_to_hub
def create_and_upload_hf_dataset(local_dataset_dir, repo_id):
    """Build a Hugging Face dataset from the local image folder and push it to the Hub.

    The dataset is loaded with the "imagefolder" builder from
    <local_dataset_dir>/image, saved to <local_dataset_dir>/hf_dataset,
    printed for inspection, and then uploaded to the dataset repo repo_id.

    The access token is taken from the HF_TOKEN environment variable so that
    it does not have to be stored in the notebook. If the variable is unset,
    the original placeholder string is used.

    Args:
        local_dataset_dir: directory that contains the image/ folder.
        repo_id: target dataset repository on the Hugging Face Hub.
    """
    # create a local Hugging Face dataset
    dataset_hf = load_dataset("imagefolder", data_dir=os.path.join(local_dataset_dir, "image"))
    dataset_hf.save_to_disk(os.path.join(local_dataset_dir, "hf_dataset"))
    # inspect the dataset splits ('train', 'validation', 'test')
    print(dataset_hf)

    # upload local Hugging Face dataset to Hugging Face Hub
    # prefer the environment variable; otherwise replace the placeholder with your Hugging Face access token
    hf_token = os.environ.get("HF_TOKEN") or "your Hugging Face Access Token"
    HfFolder.save_token(hf_token)
    api = HfApi()
    # exist_ok=True keeps the cell re-runnable once the repo has been created
    api.create_repo(repo_id, repo_type="dataset", exist_ok=True)
    dataset_hf.push_to_hub(repo_id)

In [7]:
# Create and upload the Hugging Face dataset
# "./" is the directory that contains the image/ folder; "xugefu/MuseQuest" is the target dataset repo id on the Hub
create_and_upload_hf_dataset("./", "xugefu/MuseQuest")

Resolving data files:   0%|          | 0/820 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/83 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/75 [00:00<?, ?it/s]

Downloading data:   0%|          | 0/823 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/86 [00:00<?, ?files/s]

Downloading data:   0%|          | 0/78 [00:00<?, ?files/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/819 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/82 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/74 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'data_id', 'item_name', 'image_name', 'dialog'],
        num_rows: 819
    })
    validation: Dataset({
        features: ['image', 'data_id', 'item_name', 'image_name', 'dialog'],
        num_rows: 82
    })
    test: Dataset({
        features: ['image', 'data_id', 'item_name', 'image_name', 'dialog'],
        num_rows: 74
    })
})


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/819 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/9 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/82 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/74 [00:00<?, ? examples/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]