In [1]:
from datasets import load_dataset, load_from_disk, Dataset
from collections import Counter
import pandas as pd
from sklearn.model_selection import train_test_split

# Directory that holds the RLAIF-V parquet shards; shard files are numbered
# with a zero-padded three-digit suffix, 000 through 013.
data_root = "./data/RLAIF-V-Dataset"
data_file = [
    '{}/RLAIF-V-Dataset_{:03d}.parquet'.format(data_root, shard_idx)
    for shard_idx in range(14)
]

# Read every shard through the generic parquet loader and show the result.
data = load_dataset('parquet', data_files=data_file)
print(data)


Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['ds_name', 'image', 'question', 'chosen', 'rejected', 'origin_dataset', 'origin_split', 'idx', 'image_path'],
        num_rows: 83132
    })
})


In [2]:
# Count how many training rows come from each source dataset.
origin_dataset = data['train']['origin_dataset']
counts = Counter(origin_dataset)
print(counts)

# Turn the raw counts into fractions of the whole training split.
# NOTE(review): `propotions` is a misspelling of "proportions"; the name is
# left unchanged here in case other cells refer to it.
total_len = len(origin_dataset)
propotions = dict()
for source_name, row_count in counts.items():
    propotions[source_name] = row_count / total_len

# Print one "<source>: <percentage>" line per source, framed by separators.
separator = '=================================='
print(separator)
for source_name, share in propotions.items():
    print(f'{source_name}: {share:.2%}')
print(separator)

Counter({'LCS-558K': 15956, 'COCO': 15199, 'OK-VQA': 14802, 'VQAv2': 12942, 'GQA': 5411, 'TextVQA': 4740, 'OCR-VQA': 3025, 'sharegpt4v-wikiart': 1972, 'sharegpt4v-textvqa': 1966, 'sharegpt4v-web-landmark': 1918, 'sharegpt4v-web-celebrity': 1895, 'MovieNet': 1131, 'ART500K': 1096, 'Google-Landmark': 1079})
OK-VQA: 17.81%
TextVQA: 5.70%
COCO: 18.28%
LCS-558K: 19.19%
sharegpt4v-wikiart: 2.37%
VQAv2: 15.57%
sharegpt4v-textvqa: 2.36%
sharegpt4v-web-celebrity: 2.28%
sharegpt4v-web-landmark: 2.31%
GQA: 6.51%
OCR-VQA: 3.64%
MovieNet: 1.36%
Google-Landmark: 1.30%
ART500K: 1.32%


In [3]:
def _stratified_split(frame, holdout_size):
    """Split ``frame`` in two, stratified on its ``origin_dataset`` column.

    ``holdout_size`` is passed as ``test_size`` and the seed is fixed at 42,
    so repeated runs on the same frame give the same pair of frames.
    """
    return train_test_split(
        frame,
        test_size=holdout_size,
        stratify=frame['origin_dataset'],
        random_state=42,
    )


# Total number of rows to sample: four subsets of 2500 rows each.
sample_num = 2500 * 4
df = pd.DataFrame(data['train'])

# Draw the pool of `sample_num` rows, then halve it twice to obtain four
# equally sized subsets; every split is stratified on the source dataset.
_, df_sampled0 = _stratified_split(df, sample_num)
df_sampled01, df_sampled02 = _stratified_split(df_sampled0, sample_num // 2)
df_sampled1, df_sampled2 = _stratified_split(df_sampled01, sample_num // 4)
df_sampled3, df_sampled4 = _stratified_split(df_sampled02, sample_num // 4)

# Report the per-source share inside each subset.
banner = "====================================="
for df_sampled in (df_sampled1, df_sampled2, df_sampled3, df_sampled4):
    source_column = df_sampled['origin_dataset']
    counts = Counter(source_column)
    total_count = len(source_column)
    proportions = {}
    for source_name, hits in counts.items():
        proportions[source_name] = hits / total_count
    print(banner)
    print("Sampled dataset proportions: total count:", total_count)
    for source_name, share in proportions.items():
        print(f"{source_name}: {share:.2%}")
    print(banner)

Sampled dataset proportions: total count: 2500
TextVQA: 5.68%
OCR-VQA: 3.64%
sharegpt4v-textvqa: 2.36%
GQA: 6.52%
sharegpt4v-wikiart: 2.40%
LCS-558K: 19.16%
COCO: 18.28%
VQAv2: 15.60%
MovieNet: 1.36%
OK-VQA: 17.80%
sharegpt4v-web-celebrity: 2.28%
Google-Landmark: 1.32%
ART500K: 1.32%
sharegpt4v-web-landmark: 2.28%
Sampled dataset proportions: total count: 2500
ART500K: 1.32%
VQAv2: 15.56%
OCR-VQA: 3.64%
COCO: 18.28%
OK-VQA: 17.84%
GQA: 6.48%
LCS-558K: 19.20%
sharegpt4v-web-celebrity: 2.28%
sharegpt4v-wikiart: 2.36%
MovieNet: 1.36%
TextVQA: 5.72%
sharegpt4v-textvqa: 2.36%
Google-Landmark: 1.28%
sharegpt4v-web-landmark: 2.32%
Sampled dataset proportions: total count: 2500
LCS-558K: 19.20%
GQA: 6.52%
OK-VQA: 17.80%
VQAv2: 15.56%
COCO: 18.28%
sharegpt4v-wikiart: 2.36%
ART500K: 1.32%
sharegpt4v-web-celebrity: 2.28%
OCR-VQA: 3.64%
sharegpt4v-textvqa: 2.36%
sharegpt4v-web-landmark: 2.32%
Google-Landmark: 1.28%
TextVQA: 5.72%
MovieNet: 1.36%
Sampled dataset proportions: total count: 2500
ART50

In [4]:
# Destination directory for each sampled subset, keyed by the name of the
# DataFrame the subset is built from.
save_paths = {
    'df_sampled1': './data/RLAIF_Sample/subset1',
    'df_sampled2': './data/RLAIF_Sample/subset2',
    'df_sampled3': './data/RLAIF_Sample/subset3',
    'df_sampled4': './data/RLAIF_Sample/subset4',
}

# Pair every key with its frame explicitly, so the frame/path match does not
# depend on two parallel sequences happening to share the same order.
sampled_frames = {
    'df_sampled1': df_sampled1,
    'df_sampled2': df_sampled2,
    'df_sampled3': df_sampled3,
    'df_sampled4': df_sampled4,
}

# The loop variables deliberately avoid the name `df`, so the full DataFrame
# built in the sampling cell is not overwritten by running this cell.
for subset_name, save_path in save_paths.items():
    # The sampled frames still carry row labels from the full frame, not a
    # fresh 0..n-1 range. preserve_index=False stops that index from being
    # written into the saved dataset as an extra `__index_level_0__` column.
    dataset = Dataset.from_pandas(sampled_frames[subset_name], preserve_index=False)
    dataset.save_to_disk(save_path)

Saving the dataset (0/1 shards):   0%|          | 0/2500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2500 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2500 [00:00<?, ? examples/s]