In [1]:
import os
import unicodedata
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
def is_english_only(string):
    for s in string:
        cat = unicodedata.category(s)         
        if not cat in ['Ll', 'Lu', 'Nd', 'Po', 'Pd', 'Zs']:
            return False
    return True

In [3]:
df = pd.read_parquet('/kaggle/input/diffusiondb-metadata/metadata.parquet', columns=['image_name', 'prompt', 'width', 'height'])
df = df[(df['width'] == 512) & (df['height'] == 512)]
df['prompt'] = df['prompt'].str.strip()
df = df[df['prompt'].map(lambda x: len(x.split())) >= 5]
df = df[~df['prompt'].str.contains('^(?:\s*|NULL|null|NaN)$', na=True)]
df = df[df['prompt'].apply(is_english_only)]
df['head'] = df['prompt'].str[:15]
df['tail'] = df['prompt'].str[-15:]
df.drop_duplicates(subset='head', inplace=True)
df.drop_duplicates(subset='tail', inplace=True)
df.reset_index(drop=True, inplace=True)

for i in tqdm(range(1, 2000, 100)):
    image_dir = f'/kaggle/input/diffusiondb-2m-part-{str(i).zfill(4)}-to-{str(i+99).zfill(4)}-of-2000/'
    images = os.listdir(image_dir)
    df.loc[df['image_name'].isin(images), 'filepath'] = image_dir + df['image_name']

df = df[['filepath', 'prompt']].copy()
assert not df['filepath'].isnull().any()
df

  0%|          | 0/20 [00:00<?, ?it/s]

Unnamed: 0,filepath,prompt
0,/kaggle/input/diffusiondb-2m-part-0001-to-0100...,"a portrait of a female robot made from code, v..."
1,/kaggle/input/diffusiondb-2m-part-0001-to-0100...,dream swimming pool with nobody
2,/kaggle/input/diffusiondb-2m-part-0001-to-0100...,a beautiful paint of cultists dancing surround...
3,/kaggle/input/diffusiondb-2m-part-0001-to-0100...,"frontal portrait of ragged, worried twin women..."
4,/kaggle/input/diffusiondb-2m-part-0001-to-0100...,a stunning portrait of an asian samurai with l...
...,...,...
154315,/kaggle/input/diffusiondb-2m-part-1901-to-2000...,"obama transformed into a penguin, a combinatio..."
154316,/kaggle/input/diffusiondb-2m-part-1901-to-2000...,"new york invaded by nazis, concept art"
154317,/kaggle/input/diffusiondb-2m-part-1901-to-2000...,"a owlish, aquiline picture of an owl sitting o..."
154318,/kaggle/input/diffusiondb-2m-part-1901-to-2000...,"a owlish, elaborate painting of an owl sitting..."


In [4]:
df.to_csv('diffusiondb.csv', index=False)