In [6]:
import sys
from pathlib import Path
from tqdm import tqdm
import os
import pandas as pd

# Make the parent of the current working directory importable so that the
# project-local `src` package can be found.
# NOTE(review): this relies on the kernel's working directory being the
# notebook's own folder — confirm when running from elsewhere.
_repo_root = str(Path("..").resolve())
if _repo_root not in sys.path:  # re-running this cell must not pile up duplicate entries
    sys.path.append(_repo_root)

# Star import kept for compatibility with the rest of the notebook; the names
# `cprint` and `ObjaverseDataset3D` used further down presumably come from it.
from src import *

# Download Entire Objaverse Dataset
Use this notebook to download the entire dataset and index the downloaded files. The download might take a while.

First run [`download_objaverse.sh`](download_objaverse.sh). On Leonardo HPC, what I did was:
```bash
sbatch sh download_objaverse.sh 000 009
sbatch sh download_objaverse.sh 010 019
...
```
Hence, in the following I'll assume you have already downloaded a good share of the dataset. The entire dataset is about 10 TiB in size.

This notebook produces a dataframe of the following shape (`size` is the file size in bytes):
|uid|path|size|
|:--|:-:|:-:|
|7c2df01bd3174a71a7f6260d86b140de|/home/...|5843000|

In [3]:
# Scratch location, taken from the SCRATCH environment variable
# (a KeyError is raised here if the variable is not set).
SCRATCH_DIR = os.environ["SCRATCH"]

# Directories that are searched recursively for downloaded .glb files.
# The last entry is a relative path.
GLB_DIRS = [
    SCRATCH_DIR + "/objaverse/glbs",
    SCRATCH_DIR + "/glbs",
    "../dataset/objaverse/objects",
]

In [13]:
# Index every downloaded GLB by uid (the file stem), recording its path and size.
# The scan ran at roughly 6500 files/s when this notebook was last executed.
MIN_SIZE = 4_096  # bytes; files at or below this size are discarded as git lfs pointers

glbs = {}  # uid -> (resolved path as str, size in bytes)
for glb_dir in GLB_DIRS:  # not named `dir`, which would shadow the builtin for the whole kernel
    root = Path(glb_dir).resolve()
    # The rglob result is materialised into a list so tqdm knows the total up front.
    for glb in tqdm(list(root.rglob("*.glb"))):
        size = glb.stat().st_size  # stat each file once instead of twice
        if size > MIN_SIZE:
            # A uid present in several directories keeps the entry from the last one scanned.
            glbs[glb.stem] = (str(glb), size)

100%|██████████| 798759/798759 [02:02<00:00, 6498.68it/s]
100%|██████████| 42979/42979 [00:06<00:00, 6248.47it/s]
100%|██████████| 72601/72601 [00:12<00:00, 5910.56it/s]


In [14]:
# Summarise the index: number of GLBs kept and their combined size in TiB (bytes / 2**40).
# NOTE(review): the "blue:" / "red:" prefixes look like colour tags understood by cprint — confirm.
n_glbs = len(glbs)
total_bytes = sum(size for _path, size in glbs.values())
cprint(
    "You have a total of",
    f"blue:{n_glbs:,} GLBs",
    "totalling",
    f"red:{total_bytes / 2**40:.2f} TiB",
)

You have a total of [1m[34m289,942 GLBs[0m totalling [1m[31m2.49 TiB[0m


In [24]:
# Turn the uid -> (path, size) mapping into a table indexed by uid,
# then persist it as a parquet file inside the dataset directory.
df = pd.DataFrame.from_dict(
    glbs,
    orient="index",
    columns=["path", "size"],
).rename_axis("uid")

parquet_path = ObjaverseDataset3D.DATASET_DIR / "objaverse_glbs.parquet"
df.to_parquet(parquet_path)

In [21]:
# Preview the first five rows of the index table.
df.head(n=5)

Unnamed: 0_level_0,path,size
uid,Unnamed: 1_level_1,Unnamed: 2_level_1
7c2df01bd3174a71a7f6260d86b140de,/leonardo_scratch/large/userexternal/vmorelli/...,5843000
b7381a0363224c359542c9b712d062f8,/leonardo_scratch/large/userexternal/vmorelli/...,146288
b74934f4600741b291f57dfb2aa72ec7,/leonardo_scratch/large/userexternal/vmorelli/...,1053828
2679aacf1de9414fb4efd4e01757cc92,/leonardo_scratch/fast/IscrC_MACRO/Texture-Any...,8804
372ece96a80e448ca54978fb6b06c4ee,/leonardo_scratch/large/userexternal/vmorelli/...,2104196
