In [2]:
import pandas as pd
import seaborn as sns
import requests
from tqdm import tqdm
import numpy as np
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
import sys

# Make the parent of the current working directory importable so that the
# local `src` package can be resolved.
sys.path.append(str(Path("..").resolve()))
# NOTE(review): star import — `ObjaverseDataset3D` and `cprint`, used further
# down, presumably come from here; confirm against `src`.
from src import *
# Global seaborn appearance for any figure drawn in this notebook.
sns.set_style("darkgrid")
sns.set_context("notebook", font_scale=1.15)
# Register tqdm's pandas integration (progress-bar variants of apply).
tqdm.pandas()

# Check the availability of the thumbnails
##### ℹ️ This notebook requires `dataset/objaverse/1-annotations_filtered.parquet` and outputs `dataset/objaverse/2-annotations_filtered_by_thumbnails.parquet`
This notebook processes the thumbnails according to their current availability (HTTP 200) and selects the version with the highest resolution.

In [2]:
# Read the per-object thumbnail candidates and preview the first record.
thumbnails_parquet = '../dataset/objaverse/thumbnails.parquet'
annotations = pd.read_parquet(thumbnails_parquet)
annotations.head(n=1)

Unnamed: 0_level_0,thumbnails
uid,Unnamed: 1_level_1
94db219c315742909fee67deeeacae15,[https://media.sketchfab.com/models/94db219c31...


## Using ThreadPoolExecutor to speed up (from 2h:15m to 3m)
Here we select, for each object, the first thumbnail (trying the highest resolution first) that is reachable and larger than `MIN_SIZE` bytes, or `None` if no thumbnail qualifies.

In [None]:
MIN_SIZE = 20_000  # bytes


def check_thumbnails(row) -> str | None:
    def get_size(url) -> int:
        try:
            response = requests.head(url, timeout=5)
            if response.status_code == 200 and "Content-Length" in response.headers:
                return int(response.headers["Content-Length"])
            return 0
        except requests.RequestException:
            return None

    if not isinstance(row["thumbnails"], np.ndarray):
        return None

    for thumbnail in row["thumbnails"][::-1] if "x" in row["thumbnails"][0][-8] else row["thumbnails"]:
        if (size := get_size(thumbnail)) is not None and size > MIN_SIZE:
            return thumbnail
    return None

In [None]:
def parallel_apply(df, func, max_workers=8):
    """Apply ``func`` to every row of ``df`` on a thread pool.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows (as yielded by ``iterrows``) are passed to ``func``.
    func : callable
        Called once per row with that row as its only argument.
    max_workers : int, default 8
        Size of the thread pool.

    Returns
    -------
    list
        One result per row, in row order, with a progress bar shown while
        the results are collected.
    """
    records = [record for _, record in df.iterrows()]
    pool = ThreadPoolExecutor(max_workers=max_workers)
    with pool:
        progress = tqdm(pool.map(func, records), total=len(records), desc='Processing')
        outputs = [item for item in progress]
    return outputs

In [None]:
# Probe every row concurrently; the result is a list with one entry per row
# (the selected URL, or None when no candidate qualified).
thumbnails = parallel_apply(df=annotations, func=check_thumbnails, max_workers=64)

In [68]:
# Attach the selected URL per row and discard the raw candidate lists.
# A new frame is built instead of mutating in place, and a missing
# 'thumbnails' column is tolerated, so re-running this cell does not raise.
annotations = annotations.assign(thumbnail=thumbnails).drop(columns=['thumbnails'], errors='ignore')

In [None]:
# Keep only the objects for which a usable thumbnail was found. `.copy()`
# detaches the result from `annotations` so later changes never act on a view.
annotations_filtered = annotations.loc[annotations["thumbnail"].notna()].copy()
# `uid` may already be the index (as in the loaded parquet preview); only
# promote it when it is still an ordinary column, otherwise set_index raises.
if "uid" in annotations_filtered.columns:
    annotations_filtered = annotations_filtered.set_index("uid")
# The index carries the uid, so it must be written out with the data.
# NOTE(review): the following cell reads `thumbnails_checked.parquet`, not this
# `thumbnails_checked2.parquet` — confirm which file name is intended.
annotations_filtered.to_parquet("../dataset/objaverse/thumbnails_checked2.parquet", index=True)

In [None]:
# Preview the checked-thumbnail table stored on disk.
# NOTE(review): this reads `thumbnails_checked.parquet`, whereas the preceding
# cell writes `thumbnails_checked2.parquet` — confirm which file is intended.
pd.read_parquet('../dataset/objaverse/thumbnails_checked.parquet')

Unnamed: 0_level_0,thumbnail
uid,Unnamed: 1_level_1
94db219c315742909fee67deeeacae15,https://media.sketchfab.com/models/94db219c315...
fc1339e225b7408caec82681be2746c5,https://media.sketchfab.com/models/fc1339e225b...
7b56f2bdfd2845588f6bde0c5362fd0d,https://media.sketchfab.com/models/7b56f2bdfd2...
0712f63f7e714e0d8d1247a08ec1f7fe,https://media.sketchfab.com/models/0712f63f7e7...
963dca3a0a7b4d6caacab65165829470,https://media.sketchfab.com/models/963dca3a0a7...
...,...
df79117e60684785b1ea408cfa9cdbeb,https://media.sketchfab.com/models/df79117e606...
e18091821c7e475881e1d444a0bbbb0b,https://media.sketchfab.com/models/e18091821c7...
24997b2e226f4de2944f2f40846b80c8,https://media.sketchfab.com/models/24997b2e226...
365c21d8754b49fc9f236800a355021c,https://media.sketchfab.com/models/365c21d8754...


: 

## Check for corrupted downloaded thumbnails

In [3]:
from PIL import Image

# Collect every entry of the render directory that Pillow cannot validate.
corrupted = []
for file in tqdm((ObjaverseDataset3D.DATASET_DIR / "render").glob("*")):
    try:
        # Image.open is lazy (it only identifies the file), so verify() is
        # needed to detect broken data; the context manager closes the handle.
        with Image.open(file) as image:
            image.verify()
    except Exception:
        # Not a bare except: KeyboardInterrupt/SystemExit can still stop the scan.
        corrupted.append(file)

56586it [18:32, 50.88it/s]


In [None]:
# Report how many files failed the image check.
corrupted_count = len(corrupted)
cprint("You have", corrupted_count, "corrupted images")

You have [1m[34m4[0m corrupted images
