MIT License

Copyright (c) 2024-present K. S. Ernest (iFire) Lee
Copyright (c) 2024 Marcus Loren

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.


# Install the dependencies

python3 -m pip install --break-system-packages --user requests tqdm pygltflib pandas

### Step 1 - Get model sizes & paths

Option 1 - Extract manually:
1. Run "git clone https://huggingface.co/datasets/allenai/objaverse" and then abort the command when it starts to download the models.
2. This creates a git repo folder; you can then run "python dump_gitcommits.py > out.txt" to dump the entire commit history.
3. Then you call extract_models_from_dump("out.txt") to parse and get all the model paths and their sizes.

Option 2 - Use the pre-extracted json (model_sizes.json.gz)

In [None]:
import json 
import gzip

def extract_models_from_dump(file_path):
    """Parse a dumped git history and map each ``.glb`` path to its size.

    The dump is scanned line by line. A line containing ".glb" names a model:
    its last whitespace-separated token is taken as the path, and a leading
    "b/" (the git-diff destination prefix) is removed. The next line that
    contains "size" and ends in an integer supplies that model's size.

    Args:
        file_path: Path to the text dump to parse (e.g. "out.txt").

    Returns:
        dict mapping model path (str) to size (int).
    """
    model_sizes = {}
    current_model = None
    with open(file_path, 'r') as file:
        for line in file:
            tokens = line.split()
            if not tokens:
                continue
            if ".glb" in line:
                model_path = tokens[-1]
                # Only strip the leading diff prefix; a blanket replace would
                # also eat any "b/" occurring inside the path itself.
                if model_path.startswith("b/"):
                    model_path = model_path[2:]
                current_model = model_path
            elif current_model and "size" in line:
                try:
                    size = int(tokens[-1])
                except ValueError:
                    # The word "size" appeared in an unrelated line; keep
                    # waiting for the real size line of the current model.
                    continue
                model_sizes[current_model] = size
                current_model = None
    return model_sizes
 
 
 ## Option 1
# model_sizes = extract_models_from_dump("out.txt")


## Option 2
# Open the archive in text mode so the JSON is decoded as UTF-8 while reading.
with gzip.open("model_sizes.json.gz", 'rt', encoding='utf-8') as gzip_file:
    model_sizes = json.load(gzip_file)

# Report how many model entries were loaded.
print(len(model_sizes))

### Download the meshes within the specified size limits

In [None]:
import os
import requests
from tqdm import tqdm  
from concurrent.futures import ThreadPoolExecutor 

def download_model(model_url, save_path, timeout=60):
    """Download one model file to ``save_path``.

    The body is streamed into ``save_path + ".part"`` and moved into place
    only after the transfer finished, so an interrupted download never leaves
    a truncated file at ``save_path``. Failures are reported with ``print``
    and never raised (best-effort download).

    Args:
        model_url: Full URL of the file to fetch.
        save_path: Destination path; its directory must already exist.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the calling thread forever.
    """
    partial_path = save_path + ".part"
    try:
        with requests.get(model_url, stream=True, timeout=timeout) as response:
            if response.status_code != 200:
                print(f"Failed to download: {model_url}")
                return
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        # Publish the file only once it is complete.
        os.replace(partial_path, save_path)
    except Exception as e:
        print(f"Error downloading: {model_url}, {e}")
        # Do not leave a half-written temporary file behind.
        if os.path.exists(partial_path):
            os.remove(partial_path)

def download_filtered_models(model_sizes, base_url, save_dir, minKb, maxKb, num_threads = 6, maxDownloadedMeshes = 250000):
    """Download the models whose size lies strictly between two limits.

    Models already present under ``save_dir`` are skipped and do not count
    towards ``maxDownloadedMeshes``. Downloads run on a thread pool and a
    tqdm progress bar tracks their completion.

    Args:
        model_sizes: Mapping of model path (relative to ``base_url``) to size.
            The size is compared against ``minKb * 1024`` / ``maxKb * 1024``.
        base_url: URL prefix the model path is appended to.
        save_dir: Local root directory; the model's sub-folder is recreated
            beneath it.
        minKb: Exclusive lower size limit, in KiB.
        maxKb: Exclusive upper size limit, in KiB.
        num_threads: Number of worker threads.
        maxDownloadedMeshes: Maximum number of downloads to schedule.
    """
    # Convert both limits to the same unit as the stored sizes. Previously
    # only the upper limit was scaled, so the lower limit was compared as-is.
    min_size = minKb * 1024
    max_size = maxKb * 1024
    filtered_paths = [model_path for model_path, size in model_sizes.items() if min_size < size < max_size]

    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = []
        for model_path in filtered_paths:
            # One future per scheduled download, so its length is the count.
            if len(futures) >= maxDownloadedMeshes:
                break

            folder_name = os.path.dirname(model_path)
            sub_folder = os.path.join(save_dir, folder_name)
            os.makedirs(sub_folder, exist_ok=True)

            file_name = os.path.basename(model_path)
            save_path = os.path.join(sub_folder, file_name)

            if not os.path.exists(save_path):
                model_url = f"{base_url}/{model_path}?download=true"
                futures.append(executor.submit(download_model, model_url, save_path))

        # Wait for every scheduled download while showing progress.
        for future in tqdm(futures, total=len(futures)):
            future.result()
            
# Remote prefix the model paths are appended to, and the local download root.
base_url = "https://huggingface.co/datasets/allenai/objaverse/resolve/main"
save_dir = './objaverse'

os.makedirs(save_dir, exist_ok=True)

# Fetch a small sample of meshes within the configured size window.
download_filtered_models(
    model_sizes,
    base_url,
    save_dir,
    minKb=20,
    maxKb=10240,
    num_threads=6,
    maxDownloadedMeshes=10,
)

### Download metadata

In [None]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm
 
def download_file(url, folder_path, filename, timeout=60):
    """Download ``url`` (with "?download=true" appended) into ``folder_path``.

    The body is streamed into ``<filename>.part`` and moved to its final name
    only after the transfer completed, so a failed download never leaves a
    truncated file under the final name.

    Args:
        url: URL of the file, without the "?download=true" suffix.
        folder_path: Existing directory to store the file in.
        filename: Name of the file inside ``folder_path``.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled connection cannot block the calling thread forever.

    Returns:
        True when the file was downloaded, False on any error (the error is
        printed, not raised).
    """
    url = url + "?download=true"
    print(url)
    target_path = os.path.join(folder_path, filename)
    partial_path = target_path + ".part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()  # If the response was successful, no Exception will be raised
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=1 << 20):
                    f.write(chunk)
        # Publish the file only once it is complete.
        os.replace(partial_path, target_path)
        return True
    except Exception as err:
        print(f"Failed to download {filename}. Error: {err}")
        # Do not leave a half-written temporary file behind.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        return False

def download_metadata(base_url, save_dir, num_threads=6, first_index=0, file_count=160):
    """Download the metadata shards ``000-XXX.json.gz`` in parallel.

    Args:
        base_url: URL prefix (ending in "/") the shard file name is appended to.
        save_dir: Existing directory the shards are written to.
        num_threads: Number of worker threads.
        first_index: Number of the first shard to fetch.
        file_count: How many consecutive shards to fetch.

    Returns:
        List of shard file names that could not be downloaded (empty when
        everything succeeded).

    NOTE(review): the defaults cover 000-000 .. 000-159, which is believed to
    be the shard numbering published for Objaverse 1.0 (the previous range
    1..160 skipped 000-000 and requested 000-160) -- confirm against the
    dataset repository.
    """
    filenames = [f"000-{i:03d}.json.gz" for i in range(first_index, first_index + file_count)]
    failed = []
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = [
            (filename, executor.submit(download_file, base_url + filename, save_dir, filename))
            for filename in filenames
        ]

        for filename, future in tqdm(futures, total=len(futures)):
            if not future.result():
                failed.append(filename)

    # Surface a summary so missing shards are not silently ignored.
    if failed:
        print(f"Failed to download {len(failed)} metadata file(s): {', '.join(failed)}")
    return failed
            
# Local folder for the metadata shards and the remote prefix they are fetched from.
save_dir = './objaverse/metadata'
base_url = "https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/"

os.makedirs(save_dir, exist_ok=True)

# Fetch the metadata shards with the default number of worker threads.
download_metadata(base_url=base_url, save_dir=save_dir)

### Extract the metadata to JSON with only the relevant information, e.g. for the models you downloaded

In [None]:
import os
import glob
import gzip
import json
from pygltflib import GLTF2, BufferFormat
import pandas as pd
import time

# Annotation tables: semicolon-separated CSV files keyed by 'object_uid'.
captions_df = pd.read_csv('./objaverse_annotations/pali_captions.csv', sep=';')
material_annotations_df = pd.read_csv('./objaverse_annotations/pali_material_annotations.csv', sep=';')
type_annotations_df = pd.read_csv('./objaverse_annotations/pali_type_annotations.csv', sep=';')

# Turn each table into {object_uid: [remaining column values, in column order]}.
captions_dict = captions_df.set_index('object_uid').T.to_dict('list')
material_annotations_dict = material_annotations_df.set_index('object_uid').T.to_dict('list')
type_annotations_dict = type_annotations_df.set_index('object_uid').T.to_dict('list')

# Merge every gzipped metadata file in the folder into one dictionary.
metadata = {}
filtered_metadata = {}
metadata_path = './objaverse/metadata'
for file_name in os.listdir(metadata_path):
    if not file_name.endswith(".gz"):
        continue
    file_path = os.path.join(metadata_path, file_name)
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        file_metadata = json.load(f)
        metadata.update(file_metadata)

# Where the GLB inputs are read from and where the annotated glTF files go.
input_directory = './objaverse/glbs'
output_gltf_directory = './objaverse/gltf_xmp_json_ld'

os.makedirs(output_gltf_directory, exist_ok=True)

def convert_lists_to_ordered_xmp_format(data):
    """Wrap every list value of ``data`` as ``{'@list': value}``, in place.

    Nested dictionaries are processed recursively. Lists are wrapped as they
    are: their elements are not inspected. Nothing is returned.
    """
    for field, entry in tuple(data.items()):
        if isinstance(entry, dict):
            # Descend into nested objects.
            convert_lists_to_ordered_xmp_format(entry)
        elif isinstance(entry, list):
            # '@list' marks the sequence as ordered.
            data[field] = {'@list': entry}

def add_to_filtered_metadata(key, value):
    """Store ``value`` in the module-level ``filtered_metadata`` under "vsekai:<key>".

    ``None`` values are ignored; every other value (including falsy ones) is stored.
    """
    if value is None:
        return
    filtered_metadata["vsekai:{}".format(key)] = value

# Convert every downloaded GLB that has metadata into a .gltf file with
# embedded (data-URI) buffers and attach its metadata as a KHR_xmp_json_ld
# packet. Models whose license is not "by" are skipped.
#
# existing_models records every file found: base name without extension -> path.
existing_models = {}
for file_path in glob.iglob(input_directory + '/**/*', recursive=True):
    if not os.path.isfile(file_path):
        continue

    start_time = time.time()
    file_name, file_extension = os.path.splitext(file_path)
    model_uid = os.path.basename(file_name)
    existing_models[model_uid] = file_path

    if file_extension.lower() != ".glb" or model_uid not in metadata:
        continue

    data = metadata[model_uid]
    # Check the license before doing any work: previously the model was
    # converted and written to disk first and then deleted again.
    if data["license"] != "by":
        print("Skipping due to License: ", data["license"])
        continue

    # Re-save the binary GLB as a .gltf file whose buffers are embedded as data URIs.
    gltf_embedded = GLTF2().load(file_path)
    gltf_embedded.convert_buffers(BufferFormat.DATAURI)
    gltf_file_path = os.path.join(output_gltf_directory, model_uid + ".gltf")
    gltf_embedded.save(gltf_file_path)

    # Lists become {'@list': [...]} (mutates the entry inside `metadata`).
    convert_lists_to_ordered_xmp_format(data)

    # Must stay a module-level assignment: add_to_filtered_metadata() writes
    # into whatever the global name `filtered_metadata` currently refers to.
    filtered_metadata = {
        "@context": {
            "dc": "http://purl.org/dc/elements/1.1/",
            "vsekai": "http://v-sekai.org/vsekai/elements/0.2/"
        },
        "@id": data["uid"],
        "dc:title": data["name"],
        "dc:creator": {
            "@id": data["user"]["uid"],
            "dc:name": data["user"]["username"]
        },
        "dc:description": data["description"],
        "dc:date": data["createdAt"],
        "dc:identifier": data["uri"],
        "dc:source": data["viewerUrl"],
        "dc:rights": data["license"],
        "dc:subject": data["tags"],
        "dc:type": "3D Model",
        "dc:relation": data["user"]["profileUrl"],
        "vsekai:viewCount": data["viewCount"],
        "vsekai:likeCount": data["likeCount"],
        "vsekai:commentCount": data["commentCount"],
        "vsekai:isDownloadable": data["isDownloadable"],
        "vsekai:publishedAt": data["publishedAt"],
        "vsekai:faceCount": data["faceCount"],
        "vsekai:vertexCount": data["vertexCount"],
        "vsekai:isAgeRestricted": data["isAgeRestricted"]
    }

    # Each annotation table is expected to yield exactly two values per uid
    # (the annotation and its probability); the unpacking fails otherwise.
    if data["uid"] in captions_dict:
        caption_annotation, caption_annotation_probability = captions_dict[data["uid"]]
        add_to_filtered_metadata("captionAnnotation", caption_annotation)
        add_to_filtered_metadata("captionAnnotationProbability", caption_annotation_probability)

    if data["uid"] in material_annotations_dict:
        material_annotation, material_annotation_probability = material_annotations_dict[data["uid"]]
        add_to_filtered_metadata("materialAnnotation", material_annotation)
        add_to_filtered_metadata("materialAnnotationProbability", material_annotation_probability)

    if data["uid"] in type_annotations_dict:
        type_annotation, type_annotation_probability = type_annotations_dict[data["uid"]]
        add_to_filtered_metadata("typeAnnotation", type_annotation)
        add_to_filtered_metadata("typeAnnotationProbability", type_annotation_probability)

    # Copied only when present in the source metadata.
    optional_tags = ["animationCount", "staffpickedAt", "archives", "categories"]
    for tag in optional_tags:
        if tag in data:
            add_to_filtered_metadata(tag, data[tag])

    # Patch the file that was just written: reload it as plain JSON.
    with open(gltf_file_path, 'r') as f:
        gltf_json = json.load(f)

    # Append the packet to asset.extensions.KHR_xmp_json_ld.packets (creating
    # the containers as needed) and point `packet` at the appended entry.
    # NOTE(review): the KHR_xmp_json_ld specification appears to place
    # `packets` on the root-level `extensions` object, with `asset` holding
    # only the `packet` index -- confirm whether consumers of these files
    # rely on the layout written here before changing it.
    asset_extensions = gltf_json['asset'].setdefault('extensions', {})
    xmp_block = asset_extensions.setdefault('KHR_xmp_json_ld', {})
    xmp_block.setdefault('packets', []).append(filtered_metadata)
    xmp_block['packet'] = len(xmp_block['packets']) - 1

    # Declare the extension once in extensionsUsed.
    extensions_used = gltf_json.setdefault('extensionsUsed', [])
    if "KHR_xmp_json_ld" not in extensions_used:
        extensions_used.append("KHR_xmp_json_ld")

    with open(gltf_file_path, 'w') as f:
        json.dump(gltf_json, f, indent=4)

    # One JSON line per processed model, for progress tracking.
    print(json.dumps({
        "uid": data["uid"],
        "captionAnnotation": filtered_metadata.get("vsekai:captionAnnotation", ""),
        "timeTakenForThisIteration": time.time() - start_time
    }))