### Step 1 - Get model sizes & path

Option 1 - Extract manually:
1. Run "git clone https://huggingface.co/datasets/allenai/objaverse" and then abort the command when it starts to download the models.
2. This creates a git repository folder. Inside it, run "python dump_gitcommits.py > out.txt" to dump the entire commit history.
3. Then you call extract_models_from_dump("out.txt") to parse and get all the model paths and their sizes.

Option 2 - Use the pre-extracted json (model_sizes.json.gz)

In [None]:
import os,json
import requests
import gzip

def extract_models_from_dump(file_path):
    """Parse a git commit dump and map each ``.glb`` path to its size.

    The file is scanned line by line. A line containing ".glb" sets the
    current model to that line's last whitespace-separated token (with a
    leading "b/" removed). The next line containing "size" supplies the
    model's size as its last token, after which the current model is reset.

    Args:
        file_path: Path to the text dump (e.g. the "out.txt" produced by
            redirecting the dump script's output).

    Returns:
        dict mapping model path (str) to size (int).

    Raises:
        ValueError: If the last token of a matched "size" line is not an
            integer.
    """
    model_sizes = {}
    current_model = None
    with open(file_path, 'r') as file:
        for line in file:
            # Check if line contains ".glb"
            if ".glb" in line:
                # Extract model path (last token on the line)
                model_path = line.split()[-1].strip()
                # Strip only a *leading* "b/" (presumably the git-diff
                # destination prefix). A blanket replace would also eat
                # "b/" inside the path, e.g. ".../abcb/x.glb".
                if model_path.startswith("b/"):
                    model_path = model_path[2:]
                current_model = model_path
            # Check if line contains "size" and current_model is set
            elif current_model and "size" in line:
                # Extract size
                size = int(line.split()[-1].strip())
                # Store model size
                model_sizes[current_model] = size
                # Reset current_model so later "size" lines are ignored
                # until the next ".glb" line.
                current_model = None
    return model_sizes
 
 
 ## Option 1
#model_sizes = extract_models_from_dump("out.txt")  


## Option 2
# Read the gzip-compressed JSON shipped alongside this notebook and decode it
# into the model path -> size mapping used by the download step.
with gzip.open("model_sizes.json.gz", 'rb') as gzip_file:
    raw_json = gzip_file.read()
model_sizes = json.loads(raw_json.decode('utf-8'))

# Report how many models are known.
print(len(model_sizes))

### Download the meshes that fall within the specified size limits

In [None]:
import os
import requests
from tqdm import tqdm  
from concurrent.futures import ThreadPoolExecutor 

def download_model(model_url, save_path, timeout=60):
    """Download one model file to ``save_path``, reporting failures by print.

    The response body is first written to ``save_path + ".part"`` and then
    moved into place with ``os.replace``. ``save_path`` therefore only exists
    once the file is complete, so an interrupted write never leaves a
    truncated file behind.

    Args:
        model_url: URL to fetch.
        save_path: Destination file path; its directory must already exist.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the calling thread forever.

    Returns:
        None. Non-200 responses and exceptions are printed, not raised.
    """
    partial_path = save_path + ".part"
    try:
        response = requests.get(model_url, timeout=timeout)
        if response.status_code == 200:
            with open(partial_path, 'wb') as f:
                f.write(response.content)
            # Atomic rename: publish the file only after a complete write.
            os.replace(partial_path, save_path)
        else:
            print(f"Failed to download: {model_url}")
    except Exception as e:
        # Best-effort by design: one failed model must not abort the batch.
        print(f"Error downloading: {model_url}, {e}")
        # Remove any leftover partial file; ignore it if it was never created.
        try:
            os.remove(partial_path)
        except OSError:
            pass

def download_filtered_models(model_sizes, base_url, save_dir, minKb, maxKb,num_threads = 6):
    """Download every model whose size lies strictly between two limits.

    Args:
        model_sizes: Mapping of model path (relative to ``base_url``) to its
            size; sizes are compared against the limits in bytes.
        base_url: URL prefix; each file is fetched from
            ``{base_url}/{model_path}?download=true``.
        save_dir: Local root directory; the model's relative folder structure
            is recreated underneath it.
        minKb: Exclusive lower size limit, in KiB.
        maxKb: Exclusive upper size limit, in KiB.
        num_threads: Number of worker threads used for downloading.

    Returns:
        None. Files that already exist at their destination are skipped.
    """
    # Convert BOTH limits to bytes. Previously only maxKb was scaled, so
    # minKb was silently treated as a byte count.
    min_bytes = minKb * 1024
    max_bytes = maxKb * 1024
    filtered_models = [
        model_path
        for model_path, size in model_sizes.items()
        if min_bytes < size < max_bytes
    ]
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = []
        for model_path in filtered_models:
            folder_name = os.path.dirname(model_path)
            sub_folder = os.path.join(save_dir, folder_name)
            os.makedirs(sub_folder, exist_ok=True)
            file_name = os.path.basename(model_path)
            save_path = os.path.join(sub_folder, file_name)
            # Skip files already on disk so the download can be resumed.
            if not os.path.exists(save_path):
                model_url = f"{base_url}/{model_path}?download=true"
                futures.append(executor.submit(download_model, model_url, save_path))
        # Wait for completion in submission order, showing a progress bar.
        for future in tqdm(futures, total=len(futures)):
            future.result()
            
# Root URL under which the model paths from model_sizes are resolved.
base_url = "https://huggingface.co/datasets/allenai/objaverse/resolve/main"
# Local root; the models' relative folders are recreated beneath it.
save_dir = './objaverse'

os.makedirs(save_dir, exist_ok=True)
download_filtered_models(
    model_sizes,
    base_url,
    save_dir,
    minKb=2,
    maxKb=80,
    num_threads=6,
)

### Download metadata

In [None]:
import os
import requests
from concurrent.futures import ThreadPoolExecutor
from tqdm import tqdm

def download_file(url, folder_path, filename, timeout=60):
    """Download ``url`` (with "?download=true" appended) into ``folder_path``.

    The body is streamed in chunks to ``<filename>.part`` and renamed to
    ``filename`` only after the download finishes, so a partially written
    file is never left under the final name.

    Args:
        url: File URL without the query string.
        folder_path: Existing directory to save into.
        filename: Name of the file to create inside ``folder_path``.
        timeout: Seconds passed to ``requests.get`` so a stalled connection
            cannot block the calling thread forever.

    Returns:
        None. A non-200 response is reported with a printed message.

    Raises:
        requests.RequestException: On network errors (not caught here).
        OSError: If the file cannot be written or renamed.
    """
    url = url + "?download=true"
    print(url)
    target_path = os.path.join(folder_path, filename)
    partial_path = target_path + ".part"
    # The context manager closes the streamed response on every path
    # (including non-200), releasing the connection back to the pool.
    with requests.get(url, stream=True, timeout=timeout) as response:
        if response.status_code != 200:
            print(f"Failed to download {filename}")
            return
        with open(partial_path, 'wb') as f:
            # Write chunk by chunk instead of buffering the whole body.
            for chunk in response.iter_content(chunk_size=64 * 1024):
                f.write(chunk)
    # Publish under the final name only once the download is complete.
    os.replace(partial_path, target_path)
    #print(f"Downloaded {filename}")
 
def download_metadata(base_url, save_dir,  num_threads=6):
    """Download all metadata shards concurrently into ``save_dir``.

    Args:
        base_url: URL prefix (ending in "/") to which each shard filename is
            appended.
        save_dir: Existing directory that receives the ``.json.gz`` files.
        num_threads: Number of worker threads used for downloading.

    Returns:
        None.

    Raises:
        Exception: Any exception raised by a download task is re-raised when
            its future is awaited.
    """
    with ThreadPoolExecutor(max_workers=num_threads) as executor:
        futures = []
        # Shards are numbered 000-000 .. 000-159 (160 files), matching the
        # range(160) enumeration in the official objaverse loader. The former
        # range(1, 161) skipped 000-000 and requested a 000-160 shard.
        for i in range(160):
            filename = f"000-{i:03d}.json.gz"
            file_url = base_url + filename
            futures.append(executor.submit(download_file, file_url, save_dir, filename))

        # Wait for all tasks to complete
        for future in tqdm(futures, total=len(futures)):
            future.result()

# Fetch the metadata shards into a dedicated sub-folder of the dataset dir.
base_url = "https://huggingface.co/datasets/allenai/objaverse/resolve/main/metadata/"
save_dir = './objaverse/metadata'

# The target directory must exist before any worker writes into it.
os.makedirs(save_dir, exist_ok=True)
download_metadata(base_url, save_dir)


### Extract the metadata into a single JSON file containing only the relevant entries, i.e. the models you downloaded

In [None]:
import os
import glob

directory = './objaverse/glbs'

# Index every file found (recursively) under `directory`:
# key = file name without its extension, value = path as returned by glob.
existing_models = {}
for file_path in glob.iglob(directory + '/**/*', recursive=True):
    if not os.path.isfile(file_path):
        # Directories also match the pattern; only files are indexed.
        continue
    stem = os.path.splitext(os.path.basename(file_path))[0]
    existing_models[stem] = file_path


In [11]:
import os,gzip,json
metadata = {}
filtered_metadata = {}

metadata_path = './objaverse/metadata'
for file_name in os.listdir(metadata_path):
    # Only the gzip-compressed files are metadata shards.
    if not file_name.endswith(".gz"):
        continue
    file_path = os.path.join(metadata_path, file_name)
    with gzip.open(file_path, 'rt', encoding='utf-8') as f:
        metadata = json.load(f)
    # Keep only the entries whose key matches a file found on disk.
    for key in existing_models:
        if key in metadata:
            filtered_metadata[key] = metadata[key]
        

In [13]:
# Write the filtered metadata out as one plain (uncompressed) JSON file.
output_path = './objaverse/metadata.json'
with open(output_path, 'w') as out_file:
    json.dump(filtered_metadata, out_file)