In [62]:
import pandas as pd
import seaborn as sns
import requests
from tqdm import tqdm
import numpy as np
from concurrent.futures import ThreadPoolExecutor

# Plot appearance: dark-grid theme with slightly enlarged notebook-context fonts.
sns.set_style(style="darkgrid")
sns.set_context(context="notebook", font_scale=1.15)
# Enable tqdm's pandas integration.
tqdm.pandas()

# Check the availability of the thumbnails
##### ℹ️ This notebook requires `data/1-annotations_filtered.parquet` and outputs `data/2-annotations_filtered_by_thumbnails.parquet`
This notebook checks which thumbnails are currently available (HTTP 200) and, for each object, keeps the highest-resolution version that is available. Objects without any available thumbnail are dropped.

In [None]:
# Read the annotations file this notebook requires and preview the first row.
annotations_path = '../data/1-annotations_filtered.parquet'
annotations = pd.read_parquet(annotations_path)
annotations.head(1)

Unnamed: 0,uid,name,likeCount,animationCount,commentCount,tags,categories,thumbnails,description,faceCount,createdAt,vertexCount,archives
0,11102f046e7846b8b4053bce5779d95c,Research>Mechanisms>Systems>Mark VI/VII Gate,2,0,0,"[gate, mark, metroid, vi, prime, 2, vii]","[architecture, electronics-gadgets]",[https://media.sketchfab.com/models/11102f046e...,Both 6 and 7 use the same model.\n\nMechanism:...,208,2022-05-06T03:49:10.436446,118,"{'extra': None, 'glb': {'faceCount': 208, 'siz..."


## ThreadPoolExecutor to speed up (from 2h:15m to 3m)
Here we select, for each object, the first thumbnail that is currently available, trying the higher-resolution versions first, or `None` if no thumbnail is available.

In [63]:
def check_thumbnails(row) -> str | None:
    def is_available(url) -> bool:
        try:
            response = requests.head(url, timeout=5)
            return response.status_code == 200
        except requests.RequestException:
            return False

    if not isinstance(row['thumbnails'], np.ndarray):
        return None

    for thumbnail in row['thumbnails'][::-1] if 'x' in row['thumbnails'][0][-8] else row['thumbnails']:
        if is_available(thumbnail):
            return thumbnail
    return None

In [64]:
def parallel_apply(df, func, max_workers=8):
    """Apply ``func`` to every row of ``df`` using a pool of worker threads.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose rows (as yielded by ``iterrows``) are passed to ``func``.
    func : callable
        Called once per row with the row as its only argument.
    max_workers : int, default 8
        Size of the thread pool.

    Returns
    -------
    list
        One result per row, in the same order as the rows of ``df``.
    """
    # Materialise the rows first so the progress bar knows the total.
    records = [record for _, record in df.iterrows()]
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        progress = tqdm(pool.map(func, records), total=len(records), desc='Processing')
        # Consume the lazy map inside the ``with`` so all work finishes here.
        return list(progress)

In [66]:
# Check every row's thumbnails concurrently; the work is network-bound, hence the large pool.
thumbnails = parallel_apply(df=annotations, func=check_thumbnails, max_workers=64)

Processing: 100%|██████████| 76580/76580 [02:57<00:00, 432.59it/s]


In [68]:
# Attach the selected URL as `thumbnail` and drop the raw candidate list.
# Built without in-place mutation, and with errors='ignore', so the cell can be
# re-run without raising KeyError once `thumbnails` has already been removed.
annotations = (
    annotations
    .assign(thumbnail=thumbnails)
    .drop(columns=['thumbnails'], errors='ignore')
)

In [None]:
# Keep only the objects for which an available thumbnail was found, keyed by `uid`.
# Chaining avoids the in-place `set_index` on a filtered slice of `annotations`.
annotations_filtered = annotations[annotations['thumbnail'].notna()].set_index('uid')
# `uid` now lives in the index, so the index has to be written: with index=False
# the identifier was silently left out of the output file.
annotations_filtered.to_parquet('../data/2-annotations_filtered_by_thumbnails.parquet', index=True)