# Sketch Quality Analysis (With CLIP)

The Quick, Draw! dataset includes some invalid images because it was crowd-sourced (anyone could contribute to the dataset).

https://colinmorris.github.io/blog/bad_flamingos

In [None]:
import os

# Step up one directory so the project-local imports in later cells resolve
# against the parent folder. The move is relative to the *current* working
# directory, so re-running this cell without restarting the kernel climbs one
# level further each time.
os.chdir(os.pardir)

In [2]:
from transformers import CLIPProcessor, CLIPModel

# Requires the Hugging Face `transformers` package:
#   pip install transformers

# Model card: https://huggingface.co/openai/clip-vit-large-patch14
# Keep the checkpoint id in one place so the model and its processor always
# come from the same checkpoint.
CLIP_CHECKPOINT = "openai/clip-vit-large-patch14"

model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

Skipping import of cpp extensions due to incompatible torch version 2.8.0+cu128 for torchao version 0.14.0         Please see GitHub issue #2919 for more info
W1103 13:48:22.240000 23184 site-packages\torch\distributed\elastic\multiprocessing\redirects.py:29] NOTE: Redirects are currently not supported in Windows or MacOs.


config.json: 0.00B [00:00, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [None]:
from raster_dataset import svg_rasterize
from IPython.display import display, HTML
from dataset import QuickDrawDataset
import torch

# Load the "cat" category and hand-pick three sketches to compare.
# NOTE(review): dataset items are assumed to be SVG markup strings, since they
# are embedded directly into HTML below — confirm against QuickDrawDataset.
dataset = QuickDrawDataset(["cat"], download=True)
svgs = [dataset[0], dataset[8138], dataset[1234]]
rasterized_svgs = [svg_rasterize(svg) for svg in svgs]

# Candidate descriptions that every sketch is scored against.
text_inputs = ["a cat sketch", "random scribbles"]
inputs = processor(
    text=text_inputs, images=rasterized_svgs, return_tensors="pt", padding=True
)

# Inference only: disable autograd so no computation graph is built and kept
# alive for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
# Softmax over dim=1 normalises each image's scores across the text prompts,
# so every row sums to 1.
probs = logits_per_image.softmax(dim=1)

# Rank images for each text label
for j, text in enumerate(text_inputs):
    scores = probs[:, j]
    # Plain Python ints are safe to use as list indices into `svgs`.
    ranking = torch.argsort(scores, descending=True).tolist()
    print(f"\nText: '{text}'")

    label_ranking = "".join(
        f"<div style='display:inline-block; width: 150px; background-color: white; margin-right:10px;'><b>Ranking {rank}, Probability: {scores[idx]:.2f}</b><br>{svgs[idx]}</div>"
        for rank, idx in enumerate(ranking, start=1)
    )

    display(HTML(label_ranking))

Downloading QuickDraw files: 100%|██████████| 1/1 [00:00<00:00, 5949.37it/s]
Loading QuickDraw files: 1it [00:03,  3.23s/it]



Text: 'a cat sketch'



Text: 'random scribbles'


In [None]:
# Show the first 20 sketches side by side, each labelled with its dataset
# index, so interesting examples can be picked out by eye.
svgs_inline = "".join(
    f'<div style="display:inline-block; width: 150px; background-color: white; margin-right:10px;"><b>Original {sketch_idx}</b><br>{dataset[sketch_idx]}</div>'
    for sketch_idx in range(20)
)

display(HTML(svgs_inline))

# What about sketches that are not scribbles, but not exactly cats?

`dataset[7], dataset[14]`

In [None]:
from raster_dataset import svg_rasterize
from IPython.display import display, HTML
from dataset import QuickDrawDataset
import torch

# Same three sketches as before, plus two (indices 7 and 14) that are not
# scribbles but not clearly cats either.
# NOTE(review): dataset items are assumed to be SVG markup strings, since they
# are embedded directly into HTML below — confirm against QuickDrawDataset.
dataset = QuickDrawDataset(["cat"], download=True)
svgs = [
    dataset[0],
    dataset[8138],
    dataset[1234],
    dataset[7],
    dataset[14],
]
rasterized_svgs = [svg_rasterize(svg) for svg in svgs]

# Candidate descriptions that every sketch is scored against.
text_inputs = ["a cat sketch", "random scribbles"]
inputs = processor(
    text=text_inputs, images=rasterized_svgs, return_tensors="pt", padding=True
)

# Inference only: disable autograd so no computation graph is built and kept
# alive for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
# Softmax over dim=1 normalises each image's scores across the text prompts,
# so every row sums to 1.
probs = logits_per_image.softmax(dim=1)

# Rank images for each text label
for j, text in enumerate(text_inputs):
    scores = probs[:, j]
    # Plain Python ints are safe to use as list indices into `svgs`.
    ranking = torch.argsort(scores, descending=True).tolist()
    print(f"\nText: '{text}'")

    label_ranking = "".join(
        f"<div style='display:inline-block; width: 150px; background-color: white; margin-right:10px;'><b>Ranking {rank}, Probability: {scores[idx]:.2f}</b><br>{svgs[idx]}</div>"
        for rank, idx in enumerate(ranking, start=1)
    )

    display(HTML(label_ranking))

Downloading QuickDraw files: 100%|██████████| 1/1 [00:00<00:00, 5511.57it/s]
Loading QuickDraw files: 1it [00:03,  3.38s/it]



Text: 'a cat sketch'



Text: 'random scribbles'


In [None]:
from raster_dataset import svg_rasterize
from IPython.display import display, HTML
from dataset import QuickDrawDataset
import torch

# Score the first 50 "cat" sketches against the two prompts.
# NOTE(review): dataset items are assumed to be SVG markup strings, since they
# are embedded directly into HTML below — confirm against QuickDrawDataset.
dataset = QuickDrawDataset(["cat"], download=True)
svgs = [dataset[i] for i in range(50)]
rasterized_svgs = [svg_rasterize(svg) for svg in svgs]

# Candidate descriptions that every sketch is scored against.
text_inputs = ["a cat sketch", "random scribbles"]
inputs = processor(
    text=text_inputs, images=rasterized_svgs, return_tensors="pt", padding=True
)

# Inference only: disable autograd so no computation graph is built and kept
# alive for the forward pass. This matters more here, with a 50-image batch.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
# Softmax over dim=1 normalises each image's scores across the text prompts,
# so every row sums to 1.
probs = logits_per_image.softmax(dim=1)

# Rank images for each text label
for j, text in enumerate(text_inputs):
    scores = probs[:, j]
    # Plain Python ints are safe to use as list indices into `svgs`.
    ranking = torch.argsort(scores, descending=True).tolist()
    print(f"\nText: '{text}'")

    label_ranking = "".join(
        f"<div style='display:inline-block; width: 150px; background-color: white; margin-right:10px;'><b>Ranking {rank}, Probability: {scores[idx]:.2f}</b><br>{svgs[idx]}</div>"
        for rank, idx in enumerate(ranking, start=1)
    )

    display(HTML(label_ranking))

Downloading QuickDraw files: 100%|██████████| 1/1 [00:00<00:00, 5315.97it/s]
Loading QuickDraw files: 1it [00:03,  3.28s/it]



Text: 'a cat sketch'



Text: 'random scribbles'


In [None]:
from raster_dataset import svg_rasterize
from IPython.display import display, HTML
from dataset import QuickDrawDataset
import torch

# Score the first 50 "cat" sketches against three prompts, separating
# "good" from "bad" cat sketches in addition to plain scribbles.
# NOTE(review): dataset items are assumed to be SVG markup strings, since they
# are embedded directly into HTML below — confirm against QuickDrawDataset.
dataset = QuickDrawDataset(["cat"], download=True)
svgs = [dataset[i] for i in range(50)]
rasterized_svgs = [svg_rasterize(svg) for svg in svgs]

# Debug aid: uncomment to inspect the rasterized inputs fed to the model.
# for raster in rasterized_svgs:
#     display(raster)

# Candidate descriptions that every sketch is scored against.
text_inputs = ["good cat sketch", "bad cat sketch", "random scribbles"]
inputs = processor(
    text=text_inputs, images=rasterized_svgs, return_tensors="pt", padding=True
)

# Inference only: disable autograd so no computation graph is built and kept
# alive for the forward pass. This matters more here, with a 50-image batch.
with torch.no_grad():
    outputs = model(**inputs)
logits_per_image = outputs.logits_per_image
# Softmax over dim=1 normalises each image's scores across the text prompts,
# so every row sums to 1.
probs = logits_per_image.softmax(dim=1)

# Rank images for each text label
for j, text in enumerate(text_inputs):
    scores = probs[:, j]
    # Plain Python ints are safe to use as list indices into `svgs`.
    ranking = torch.argsort(scores, descending=True).tolist()
    print(f"\nText: '{text}'")

    label_ranking = "".join(
        f"<div style='display:inline-block; width: 150px; background-color: white; margin-right:10px;'><b>Ranking {rank}, Probability: {scores[idx]:.2f}</b><br>{svgs[idx]}</div>"
        for rank, idx in enumerate(ranking, start=1)
    )

    display(HTML(label_ranking))

Downloading QuickDraw files: 100%|██████████| 1/1 [00:00<00:00, 1179.17it/s]
Loading QuickDraw files: 1it [00:03,  3.46s/it]



Text: 'good cat sketch'



Text: 'bad cat sketch'



Text: 'random scribbles'
