In [1]:
import os
import unicodedata
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
def is_english_only(string):
    """Return True when every character of ``string`` has an allowed Unicode category.

    Allowed general categories are lowercase letter (Ll), uppercase letter
    (Lu), decimal digit (Nd), other punctuation (Po), dash punctuation (Pd)
    and space separator (Zs).  A single character of any other category
    (brackets, math symbols, control characters such as newlines, caseless
    letters, ...) makes the result False.

    The test looks at categories only, not at the script, so cased letters
    outside ASCII (for example accented Latin letters) are accepted too.
    An empty string yields True.
    """
    allowed = ('Ll', 'Lu', 'Nd', 'Po', 'Pd', 'Zs')
    return all(unicodedata.category(ch) in allowed for ch in string)

In [3]:
# Load only the metadata columns that the rest of the notebook uses.
metadata_path = '/kaggle/input/diffusiondb-metadata/metadata.parquet'
df = pd.read_parquet(
    metadata_path,
    columns=['image_name', 'prompt', 'width', 'height'],
)
print(df.shape)

# Keep rows whose image is square (width equals height).
is_square = df['width'] == df['height']
df = df[is_square]
print(df.shape)

# Trim surrounding whitespace, then keep prompts that contain at least
# five whitespace-separated tokens.
df['prompt'] = df['prompt'].str.strip()
token_counts = df['prompt'].map(lambda text: len(text.split()))
df = df[token_counts >= 5]
print(df.shape)

(2000000, 4)
(1122454, 4)
(1025543, 4)


In [4]:
# Drop prompts that are missing (na=True marks them as matches), blank, or a
# literal null marker.  The pattern is a raw string so that `\s` reaches the
# regex engine verbatim instead of being an invalid Python escape sequence.
is_placeholder = df['prompt'].str.contains(r'^(?:\s*|NULL|null|NaN)$', na=True)
df = df[~is_placeholder]
print(df.shape)

# Keep prompts in which every character passes is_english_only.
df = df[df['prompt'].apply(is_english_only)]
print(df.shape)

# Near-duplicate removal: keep the first prompt seen for each distinct
# 15-character suffix.  assign() returns a new frame, so no column is written
# into a filtered slice of an earlier frame.
df = df.assign(tail=df['prompt'].str[-15:])
df = df.drop_duplicates(subset='tail')
print(df.shape)

# Exact-duplicate removal on the full prompt text.
df = df.drop_duplicates(subset='prompt')
print(df.shape)

(1025543, 4)
(973295, 4)
(274789, 5)
(274789, 5)


In [5]:
# Renumber the rows 0..n-1 after the filtering done in the previous cells.
df = df.reset_index(drop=True)

# The image folders are named by part range: 0001-0100, 0101-0200, ...,
# 1901-2000 (20 folders in total).
for first_part in tqdm(range(1, 2000, 100)):
    last_part = first_part + 99
    image_dir = f'/kaggle/input/diffusiondb-2m-part-{first_part:04d}-to-{last_part:04d}-of-2000/'
    in_this_dir = df['image_name'].isin(os.listdir(image_dir))
    # Only rows whose image file is listed in this folder receive a path;
    # the assignment aligns on the index, so other rows are left untouched.
    df.loc[in_this_dir, 'filepath'] = image_dir + df['image_name']

# Keep just the two columns that are exported.
df = df[['filepath', 'prompt']].copy()

  0%|          | 0/20 [00:00<?, ?it/s]

In [6]:
# Write the current dataframe to CSV in the working directory; index=False
# leaves the row index out of the file.
df.to_csv('diffusiondb_80W.csv', index=False)

In [7]:
import sys
# Add the sentence-transformers source directory from the Kaggle input folder
# to the module search path so the import below can resolve it
# (presumably because the package is not installed in the kernel — confirm).
sys.path.append('/kaggle/input/sentence-transformers-222/sentence-transformers')
from sentence_transformers import SentenceTransformer

In [8]:
# Load the all-MiniLM-L6-v2 checkpoint from the local input directory and
# place the model on the GPU.
st_model = SentenceTransformer(
    '/kaggle/input/sentence-transformers-222/all-MiniLM-L6-v2',
    device='cuda'
)

# Hand encode() a plain list of strings rather than the pandas Series.
# encode() is documented to take a string or a list of strings; a Series is
# subscripted by index label, which only coincides with position while the
# index is exactly 0..n-1.  A list removes that hidden dependency.
# convert_to_tensor=True asks for a single torch tensor as the result.
embedding = st_model.encode(
    df['prompt'].tolist(),
    show_progress_bar=True,
    convert_to_tensor=True
)

Batches:   0%|          | 0/8588 [00:00<?, ?it/s]

In [9]:
import torch

In [10]:
# Serialize the prompt embeddings returned by st_model.encode() to disk.
# NOTE(review): the model was created with device='cuda', so this is presumably
# saved as a CUDA tensor; loading it on a CPU-only machine would then need
# torch.load(..., map_location='cpu') — confirm.
torch.save(embedding, 'diffusiondb_80W.pt')

In [11]:
# List the working directory in long format with human-readable file sizes.
!ls -lh

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
total 451M
---------- 1 root root  12K Mar 14 23:00 __notebook__.ipynb
-rw-r--r-- 1 root root  49M Mar 14 22:58 diffusiondb_80W.csv
-rw-r--r-- 1 root root 403M Mar 14 23:00 diffusiondb_80W.pt
