In [113]:
import os
import json
import re
from itertools import chain, groupby
import pandas as pd
import numpy as np
import random

In [None]:
def extract_grid_info(prompt) -> tuple[int, int]:
    """Return the first "<int> x <int>" pair found in ``prompt`` as two ints.

    The separator may be an ASCII ``x`` or ``⨉`` (U+2A09), with optional
    whitespace on either side. When the prompt contains no such pattern,
    ``(1, 1)`` is returned.
    """
    first_hit = re.search(r'(\d+)\s*[x⨉]\s*(\d+)', prompt)
    if first_hit is None:
        return (1, 1)

    first_number, second_number = first_hit.groups()
    return (int(first_number), int(second_number))

In [135]:
# Load the T2IS summary and the generated summary (one JSON object per line).
# The encoding is passed explicitly: without it open() falls back to the
# platform's locale encoding, and prompts may contain non-ASCII characters
# (e.g. the "⨉" grid separator) that would then fail to decode or be mangled.
# Blank lines (such as a trailing newline at end of file) are skipped instead
# of being fed to json.loads.
with open('dataset/T2IS/T2IS_data_summary.jsonl', 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]

with open('dataset/T2IS/generated_data_summary.jsonl', 'r', encoding='utf-8') as f:
    gen_data = [json.loads(line) for line in f if line.strip()]

In [157]:
# Convert every T2IS record into the unified schema of the extended summary.
kept_fields = ("prompt", "height", "width")
simplified_data = []
for position, record in enumerate(data):
    grid = extract_grid_info(record["prompt"])

    # The new idx is the record's position in the T2IS file, zero-padded to 4 digits.
    entry = {'idx': f"{position}".zfill(4)}
    # Copy the selected fields in the order they appear in the source record.
    for key, value in record.items():
        if key in kept_fields:
            entry[key] = value
    entry['layout'] = 'x'.join(str(number) for number in grid)
    entry['theme'] = record['instruction']
    for key in ('category', 'task_name', 'criteria'):
        entry[key] = record[key]
    entry['from'] = 'T2IS'
    # Keep the idx the record had in the T2IS file so it can be traced back.
    entry['original_idx'] = record['idx']
    simplified_data.append(entry)

In [158]:
# Append the generated records after the T2IS ones, continuing the global idx.
# The starting offset is captured once, before the loop: simplified_data grows
# by one on every append, so re-evaluating len(simplified_data) inside the loop
# would make `i - len(simplified_data)` equal 0 for every record.
# NOTE: this cell extends simplified_data in place; running it again without
# rebuilding simplified_data first appends the generated records a second time.
n_existing = len(simplified_data)
for i, item in enumerate(gen_data, n_existing):
    grid_info = item['layout'].split('x')
    # Image size is derived from the layout: 512 pixels per grid step.
    height = int(grid_info[0]) * 512
    width = int(grid_info[1]) * 512
    new_item: dict = {'idx': str(i).zfill(4)}
    new_item.update({
        k: item[k] for k in ['prompt']
    })
    new_item['height'] = height
    new_item['width'] = width
    new_item['layout'] = item['layout']
    new_item['theme'] = item['theme']
    new_item.update(
        {k: item[k] for k in ['category', 'task_name', 'criteria']}
    )
    new_item['from'] = 'Generated'
    # Position of the record inside gen_data, zero-padded to 4 digits.
    new_item['original_idx'] = str(i - n_existing).zfill(4)
    simplified_data.append(new_item)

In [159]:
# Persist the merged records as JSON Lines: one serialized object per line.
with open('dataset/T2IS/extended_data_summary.jsonl', 'w') as f:
    f.writelines(json.dumps(record) + '\n' for record in simplified_data)

In [None]:
# Reload the extended summary and build a lookup from a record's idx in the
# original T2IS file to its extended-schema record.
with open('dataset/T2IS/extended_data_summary.jsonl', 'r') as f:
    extended_records = [json.loads(line) for line in f]

# 'original_idx' is not unique across sources: generated records carry their
# own zero-padded counter, which can coincide with a T2IS idx, and in a plain
# dict build the later (generated) record would silently replace the T2IS one.
# Only T2IS-derived records are indexed.
# NOTE(review): assumes this lookup is only used for metadata that originates
# from the T2IS summary - confirm before using it with generated records.
loaded_data = {
    item['original_idx']: item
    for item in extended_records
    if item['from'] == 'T2IS'
}

In [170]:
# Rewrite an existing split's metadata files in place so that every record
# uses the extended-summary schema.
label = 'train_half_leq_6'
dataset_dir = f'dataset/T2IS/{label}'
for file in ['train_metadata.jsonl', 'test_metadata.jsonl']:
    metadata_path = os.path.join(dataset_dir, file)
    with open(metadata_path, 'r') as f:
        metadata = [json.loads(line) for line in f]

    # This cell overwrites its own input, so it must be safe to run twice.
    # A record that already carries 'original_idx' was converted by an earlier
    # run; its 'idx' is now the extended idx, and looking that up in
    # loaded_data (keyed by the original idx) would return a different record.
    # Such records are passed through unchanged.
    # NOTE(review): assumes unconverted metadata has no 'original_idx' field -
    # TODO confirm against the original split files.
    metadata = [
        item if 'original_idx' in item else loaded_data[item['idx']]
        for item in metadata
    ]

    with open(metadata_path, 'w') as f:
        for item in metadata:
            f.write(json.dumps(item) + '\n')

# Split into train/test sets

In [175]:
# Read the merged summary back from disk, one JSON object per line.
with open('dataset/T2IS/extended_data_summary.jsonl', 'r') as f:
    data_ext = list(map(json.loads, f))

In [185]:
# Keep only items whose pixel count (height * width) is at most
# `grid_limit` tiles of 512 x 512 pixels.
grid_limit = 5
resolution_limit = grid_limit * 512 * 512
data_ext = list(filter(
    lambda record: (record['height'] * record['width']) <= resolution_limit,
    data_ext,
))

# Of those, keep only the items whose layout is exactly '2x2'.
data_ext = list(filter(lambda record: record['layout'] == '2x2', data_ext))

In [186]:
seed = 42
ratio = 0.5
# Seed NumPy's global generator so the split is reproducible.
np.random.seed(seed)
# Sample `ratio` of the data for training and rest for testing
train_indices = np.random.choice(len(data_ext), size=int(len(data_ext) * ratio), replace=False)
# `i in ndarray` scans the whole array on every test; a set of plain ints keeps
# building the complement linear in the dataset size and yields the same split.
train_index_set = set(train_indices.tolist())
# Train items keep the sampled (shuffled) order; test items keep file order.
train_data = [data_ext[i] for i in train_indices]
test_data = [item for i, item in enumerate(data_ext) if i not in train_index_set]
# Every item must land in exactly one of the two splits.
assert len(train_data) + len(test_data) == len(data_ext)
train_set = pd.DataFrame(train_data)
test_set = pd.DataFrame(test_data)

In [187]:
# Bucket each split by (category, task_name, layout); every bucket maps the
# key tuple to that group's rows as a list of plain dicts.
group_keys = ['category', 'task_name', 'layout']

grouped_train = {}
for key, frame in train_set.groupby(group_keys):
    grouped_train[key] = frame.to_dict('records')

grouped_test = {}
for key, frame in test_set.groupby(group_keys):
    grouped_test[key] = frame.to_dict('records')

In [188]:
# Number of groups in the train split and in the test split.
tuple(len(groups) for groups in (grouped_train, grouped_test))

(27, 27)

In [189]:
# Distinct group sizes present in each split (train first, then test).
[
    np.unique(list(map(len, split.values())))
    for split in (grouped_train, grouped_test)
]

[array([ 1,  4,  5,  6,  7,  8,  9, 10, 11, 12, 14, 15, 16]),
 array([ 3,  5,  6,  7,  8,  9, 10, 11, 13, 14, 15, 17])]

In [None]:
# Write each split to <output_dir>/<split>_metadata.jsonl, one JSON object per
# line, emitting the records group by group.
# NOTE(review): the directory name encodes only `grid_limit`; the '2x2' layout
# filter applied to data_ext is not reflected in it - confirm the target
# directory before running so an existing split is not overwritten by mistake.
output_dir = f"dataset/T2IS/train_extended_half_leq_{grid_limit}"
os.makedirs(output_dir, exist_ok=True)
for filename, groups in (
    ("train_metadata.jsonl", grouped_train),
    ("test_metadata.jsonl", grouped_test),
):
    with open(os.path.join(output_dir, filename), "w") as f:
        for record in chain.from_iterable(groups.values()):
            f.write(json.dumps(record) + "\n")

# Check the number of criteria

In [206]:
def check_criteria_num(file):
    """Return the sorted distinct counts found in the ``criteria`` of a JSONL file.

    For each record, the number of entries in ``criteria`` and the length of
    every one of its values are collected. All of these numbers are pooled
    across records and de-duplicated with ``np.unique``.
    """
    with open(file, 'r') as f:
        records = list(map(json.loads, f))

    counts = []
    for record in records:
        criteria = record['criteria']
        # Top-level entry count first, then the size of each nested value.
        counts.append(len(criteria))
        counts.extend(len(value) for value in criteria.values())

    return np.unique(np.array(counts), axis=0)


# Report the criteria counts for every split directory under dataset/T2IS
# that contains a train and/or test metadata file.
root_dir = 'dataset/T2IS'
for directory in os.listdir(root_dir):
    for file in ['train_metadata.jsonl', 'test_metadata.jsonl']:
        candidate = os.path.join(root_dir, directory, file)
        if not os.path.isfile(candidate):
            continue
        # Print the label first so it is visible even if the check raises.
        print(f"{directory}/{file}: ", end='')
        print(check_criteria_num(candidate))

# check_criteria_num('dataset/T2IS/.jsonl')

train_half/train_metadata.jsonl: [2 3 4 5]
train_half/test_metadata.jsonl: [2 3 4 5]
train_half_leq_5/train_metadata.jsonl: [2 3 4 5]
train_half_leq_5/test_metadata.jsonl: [2 3 4 5]
train_half_leq_4/train_metadata.jsonl: [2 3 4]
train_half_leq_4/test_metadata.jsonl: [2 3 4 5]
train_extended_half_leq_5/train_metadata.jsonl: [2 3 4 5]
train_extended_half_leq_5/test_metadata.jsonl: [2 3 4 5]
train_half_2by2/train_metadata.jsonl: [2 3 4]
train_half_2by2/test_metadata.jsonl: [2 3 4 5]
train_half_leq_6/train_metadata.jsonl: [2 3 4 5]
train_half_leq_6/test_metadata.jsonl: [2 3 4 5]
train_all_2by2/train_metadata.jsonl: [2 3 4 5]
train_all_2by2/test_metadata.jsonl: [2 3 4]
train_extended_half_2by2/train_metadata.jsonl: [2 3 4]
train_extended_half_2by2/test_metadata.jsonl: [2 3 4 5]
train_all/train_metadata.jsonl: [2 3 4 5]
train_all/test_metadata.jsonl: [2 3 4 5]
