In [2]:
# install sacrebleu, datasets, evaluate

import time
from glob import iglob
import requests
from io import BytesIO

import numpy as np
from PIL import Image

import evaluate
import easyocr
from tqdm.auto import tqdm
from datasets import load_dataset

reader = easyocr.Reader(["en"], detect_network='craft', gpu=True)
chrf = evaluate.load("chrf")

  warn(f"Failed to load image Python extension: {e}")


In [6]:
# Do OCR
image_paths = iglob("/home/vlialin/data/TextCaps/train/*")

n = 50
start = time.time()
for i, image_path in tqdm(enumerate(image_paths), total=n):
    if i > n:
        break
    result = reader.readtext(
        image_path,
        batch_size=32,  # 6-8 examples/sec, GPU is unused most of the time
        # workers=8,  # 1.12 examples/sec
    )

print(f"Examples/sec: {n / (time.time() - start)}")

# Batched is much slower than unbatched, 5.0 examples/sec, GPU is unused most of the time
# batch = []
# batch_size = 32
# n = batch_size * 10
# start = time.time()
# for i, image_path in tqdm(enumerate(image_paths), total=n):
#     if i >= n:
#         break
#     batch.append(image_path)
#     if len(batch) == batch_size:
#         result = reader.readtext_batched(
#             batch,
#             # batch_size=batch_size,
#             # workers=8,
#             n_width=800, n_height=600
#         )
#         batch = []

# print(n / (time.time() - start))


  0%|          | 0/50 [00:00<?, ?it/s]

Examples/sec: 7.533760620233949


In [None]:
# watermarks = load_dataset("laion/laion2B-en-watermark")
# watermarks = watermarks.filter(lambda x: x["pwatermark"] > 0.5)
# watermars = dict(watermarks.hash)

In [23]:

# Download the dataset
dataset = load_dataset("laion/laion2B-en-joined", split="train", streaming=True)
dataset = dataset.shuffle(seed=834, buffer_size=10_000)

# dataset example:
# item: {'SAMPLE_ID': 2641080021034,
#  'URL': 'https://cdn.shopify.com/s/files/1/0017/3621/2538/products/blue-beach-umbrellas-point-of-rocks-crescent-beach-siesta-key-shawn-mcloughlin_32d72f5b-5e55-42f9-bfcf-d6fa8d239beb_300x300.jpg?v=1524171284',
#  'TEXT': 'Blue Beach Umbrellas, Point Of Rocks, Crescent Beach, Siesta Key - Spiral Notebook',
#  'HEIGHT': 231,
#  'WIDTH': 300,
#  'LICENSE': '?',
#  'NSFW': 'UNLIKELY',
#  'similarity': 0.3955616354942322}

Using custom data configuration laion--laion2B-en-joined-bc573946f750094d


In [24]:
next(iter(dataset))

{'URL': 'https://molinoproperty.com/wp-content/uploads/2017/06/29-4.jpg',
 'TEXT': 'Villa in Valtocado - Mijas for sale',
 'WIDTH': 1000,
 'HEIGHT': 667,
 'similarity': 0.3118513226509094,
 'hash': 7507156594793326694,
 'punsafe': 0.0034678280353546143,
 'pwatermark': 0.02891501970589161}

In [44]:
WATERMARKS = {"gettyimages"}

n = 1000
h_have_ocr = 0
n_errors = 0

start = time.time()
for i, item in tqdm(enumerate(dataset), total=n):
    if i >= n:
        break

    if item["punsafe"] > 0.5:
        continue

    if item["pwatermark"] > 0.5:
        continue

    if "porn" in item["TEXT"].lower():
        continue

    # download image from URL without causing an error, set timeout to 1 second

    try:
        response = requests.get(item["URL"], timeout=1)
    except requests.exceptions.RequestException as e:
        n_errors += 1
        continue
    if response.status_code != 200:
        n_errors += 1
        continue

    if i == 42:
        print(f"Loading imgage to PIL")

    try:
        image = Image.open(BytesIO(response.content))
    except OSError as e:
        n_errors += 1
        continue

    # image to numpy
    image = np.array(image)

    if i == 42:
        print(f"Starting OCR")

    # do OCR
    ocr_results = reader.readtext(
        image,
        batch_size=32,  # 6.3 examples/sec, GPU is unused most of the time
        # workers=8,
    )

    for ocr_item in ocr_results:
        ocr_text = ocr_item[1].lower()
        item_text = item["TEXT"].lower()
        ocr_confidence = ocr_item[2]

        for w in WATERMARKS:
            if w in ocr_text:
                continue

        if ocr_confidence < 0.8:
            continue

        similarity = chrf.compute(predictions=[ocr_text], references=[item_text])["score"]
        if similarity > 0.8:  # 0.8 is good
            h_have_ocr += 1
            break

        #     print(f"Image number: {i}")
        #     print(f"Image url: {item['URL']}")
        #     print(f"OCR: `{ocr_text}`, confidence: {ocr_confidence}, similarity: {similarity}")
        #     print(f"TEXT: {item['TEXT']}")
        #     print()

print(f"Examples/sec: {n / (time.time() - start)}")
print(f"Errors: {n_errors}")
print(f"Have OCR: {h_have_ocr/n}")

  0%|          | 0/1000 [00:00<?, ?it/s]

Loading imgage to PIL
Starting OCR
Examples/sec: 1.4908328861052205
Errors: 160
Have OCR: 0.201


In [3]:
dataset = load_dataset("laion/laion2B-en-joined", split="train", streaming=True)
dataset = dataset.shuffle(seed=84, buffer_size=10_000)
dataset = dataset.take(20_000_000)
dataset = dataset.filter(lambda x: x["punsafe"] is not None and x["punsafe"] < 0.5 and x["pwatermark"] is not None and x["pwatermark"] < 0.5)

Using custom data configuration laion--laion2B-en-joined-bc573946f750094d


In [4]:
%%time
dataset_list = list(dataset)

CPU times: user 3min 16s, sys: 10.2 s, total: 3min 26s
Wall time: 4min 18s


In [5]:
from datasets import Dataset
safe_dataset = Dataset.from_list(dataset_list)

In [6]:
safe_dataset

Dataset({
    features: ['URL', 'TEXT', 'WIDTH', 'HEIGHT', 'similarity', 'hash', 'punsafe', 'pwatermark'],
    num_rows: 14501921
})

In [7]:
safe_dataset.save_to_disk("/home/vlialin/data/text-laion-20M")

In [59]:
_1000_urls = [item["URL"] for item in dataset_list[:10_000]]

# save to csv
import csv
with open("1000_urls.csv", "w") as f:
    writer = csv.writer(f)
    writer.writerow(["url"])
    for url in _1000_urls:
        writer.writerow([url])
    
# with open("1000_urls.txt", "w") as f:
#     for url in _1000_urls:
#         f.write(f"{url}\n")

from img2dataset import download
import shutil
import os

output_dir = os.path.abspath("1000_images")

if os.path.exists(output_dir):
    shutil.rmtree(output_dir)

download(
    processes_count=16,
    thread_count=32,
    url_list="1000_urls.csv",
    image_size=512,
    resize_mode="keep_ratio",
    output_folder=output_dir,
    output_format="files",
    input_format="csv",
    enable_wandb=True,
    number_sample_per_shard=1000,
    distributor="multiprocessing",
)


Starting the downloading of this file
Sharding file number 1 of 1 called /home/vlialin/documents/random_notebooks/1000_urls.csv


0it [00:00, ?it/s]

File sharded in 10 shards
Downloading starting now, check your bandwidth speed (with bwm-ng)your cpu (with htop), and your disk usage (with iotop)!


10it [02:19, 13.99s/it]


In [None]:
img2dataset --url_list /home/vlialin/data/text--100k --input_format "arrow" \
        --url_col "URL" --caption_col "TEXT" --output_format webdataset \
        --output_folder text-laion7m-data --processes_count 16 --thread_count 128 --image_size 256 \
        --save_additional_columns "['WIDTH', 'HElaionIGHT', 'similarity', 'hash', 'punsafe', 'pwatermark']" --enable_wandb True

In [None]:
n = 1000
n_errors = 0

data = []

start = time.time()
for i, item in tqdm(enumerate(dataset), total=n):
    if i >= n:
        break

    if item["punsafe"] > 0.5:
        continue

    if item["pwatermark"] > 0.5:
        continue

    if "porn" in item["TEXT"].lower():
        continue

    data.append(item)


print(f"Examples/sec: {n / (time.time() - start)}")
print(f"Errors: {n_errors}")