In [1]:
import torch

# Local GPU Test

print(torch.__version__)
print(torch.cuda.is_available())
print(torch.cuda.get_device_name(0))

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

2.5.1
True
NVIDIA GeForce RTX 3090


In [120]:
import objaverse

uids = objaverse.load_uids() # each object has a unique ID (uid), then load it
print("all of objects length :", len(uids))

lvis_annotations = objaverse.load_lvis_annotations()

num_of_objects = 50
filtered_lvis_annotations = {key: value for key, value in lvis_annotations.items() if len(value) >= num_of_objects}
print(f"the number of classes having items more than {num_of_objects} : {len(filtered_lvis_annotations)}")

all of objects length : 798759
the number of classes having items more than 50 : 319


In [121]:
my_datasets = {}

for key, value in filtered_lvis_annotations.items():
    sampled_uids = value[:num_of_objects]
    my_datasets[key] = sampled_uids
    print(f"{key} has {len(sampled_uids)} items : {sampled_uids}")
print(f"This dataset consists of {len(my_datasets)} classes.")

Christmas_tree has 50 items : ['e283aa7835664a74a4ad39afb26d2ef3', 'ae0c3996ee0345aaa8bbb717db7ffb25', 'fe2b4a2708334ff880f62a947cfd3d50', '2da008c410be42d188a0c95aa1bca05e', '16fbadc669454c26aa0927445b3c6ffd', '0b1b92d6d5584284b59aaab57b1009a1', 'fe8eef087d6646fabb24833ea6af5550', '3c11fef580924d0a831e42db6e217579', 'b2a917737eab4866a8fda30fded72d1c', '281891d0dd6a4affbda7530bed83f846', '2826823d900347bd97b17708ee04c3ff', 'da2c426b7ec247d5aee0ab51a83f8e14', '3ca3558e2b594758ab50c8fff8ea0b12', 'feb96b6b2b534c58bbdf9e081f2f4e46', '3b38afc564c44f52afaca0acb9949c35', '8f4af0f494fb4a01924d53ee7c34fa35', '637cfe8730f44325a34cb8f34de5ced2', 'c6c2b4cc101c4ccda903bfe33f0639fc', '64a0364ffb424438b26ac3c846ed2186', '3229c85a80374129995bc89f9ccb4546', 'c1222de31eb6474abbdb6b636f8225a4', 'b6be1f59174d4a10825270fd805f21ba', '3d0f9198c9e64d77896bb0cd4a2bd87c', '48e999874aa94429b31412c9d8f8e6b7', '8c6c7a556b37410b94fd72dcb016326b', 'ce9e4f6890444e69bbe3e87c2dde4b81', '32fe939ce1544fa28a33bdfcf904a1ee

In [125]:
# once you download the datasets, you don't need to run it again.

import ssl
ssl._create_default_https_context = ssl._create_unverified_context

count = 1
for key, value in my_datasets.items():
    print(f"\nDownloading {key}... ({count} / {len(my_datasets)})")
    objaverse.load_objects(value, 1)
    print(f"Downloading {key} done.\n")
    count += 1

print("All downloading are done.")


Downloading Christmas_tree... (1 / 319)
Downloading Christmas_tree done.


Downloading Lego... (2 / 319)
Downloading Lego done.


Downloading airplane... (3 / 319)
Downloading airplane done.


Downloading alarm_clock... (4 / 319)
Downloading alarm_clock done.


Downloading alligator... (5 / 319)
Downloading alligator done.


Downloading antenna... (6 / 319)
Downloading antenna done.


Downloading apple... (7 / 319)
Downloading apple done.


Downloading armchair... (8 / 319)
Downloading armchair done.


Downloading armoire... (9 / 319)
Downloading armoire done.


Downloading armor... (10 / 319)
Downloading armor done.


Downloading army_tank... (11 / 319)
Downloading army_tank done.


Downloading avocado... (12 / 319)
Downloading avocado done.


Downloading award... (13 / 319)
Downloading award done.


Downloading awning... (14 / 319)
Downloading awning done.


Downloading ax... (15 / 319)
Downloading ax done.


Downloading backpack... (16 / 319)
Downloading backpack done.


Downloadin

In [138]:
import trimesh
import gzip
import pyrender
import numpy as np
import os
from PIL import Image

class Renderer:
    def __init__(self, path_file, output_path):
        self.my_datasets = my_datasets # key : a class, value : a list of uids
        self.base_path = os.path.join("D:\Sehyeon\Datasets\objaverse", "hf-objaverse-v1")
        with gzip.open(path_file, 'rt') as f:
            self.object_paths = json.load(f)
        self.output_path = output_path
        
    def load_mesh(self, uid):
        # load 3D .glb mesh from its UID
        mesh_path = os.path.join(self.base_path, self.object_paths[uid])
        if not mesh_path or not os.path.exists(mesh_path):
            print(f"Model file not found for UID: {uid}")
            return None
        
        # normalize the mesh to fit inside a 2*2*2 cube centered at (0,0,0)
        try:
            mesh = trimesh.load(mesh_path, force='mesh')
            centroid = mesh.bounds.mean(axis=0)        
            mesh.apply_translation(-centroid)
            
            max_extent = mesh.extents.max()
            scale_factor = 2.0 / max_extent
            
            mesh.apply_scale(scale_factor)
            
        except Exception as e:
            print(f"Error loading mesh: {e}")
            print(f"This mesh is not available 1(UID: {uid}), Error : {e}")
            return None
        
        return mesh
    
    def render_mesh(self, uid, mesh, camera_pos=[[2.5, 1.0, 2.5], [-2.5, 1.0, 2.5], [-2.5, 1.0, -2.5], [2.5, 1.0, -2.5]], 
                    image_size=(512, 512)):
        
        try:
            scene = pyrender.Scene()
            mesh = pyrender.Mesh.from_trimesh(mesh)
            scene.add(mesh)
            
            camera = pyrender.PerspectiveCamera(yfov=np.pi / 3.0, name="camera")
            light = pyrender.DirectionalLight(color=np.ones(3), intensity=3.0, name="light")
            
        except Exception as e:
            print(f"This mesh is not available 2(UID: {uid}), Error : {e}")
            return None    
        
        images = []
        
        for c_pos in camera_pos:
            # Set camera pose
            # Calculate camera pose to look at the origin
            eye = np.array(c_pos)
            target = np.array([0.0, 0.0, 0.0])  # Look at the origin
            up = np.array([0.0, 1.0, 0.0])  # Define the up direction

            # Compute the camera view matrix
            z_axis = (eye - target) / np.linalg.norm(eye - target) # Forward
            x_axis = np.cross(up, z_axis) / np.linalg.norm(np.cross(up, z_axis)) # Right
            y_axis = np.cross(z_axis, x_axis) # Up

            camera_pose = np.eye(4)
            camera_pose[:3, :3] = np.vstack([x_axis, y_axis, z_axis]).T
            camera_pose[:3, 3] = eye
            

            # Render the scene
            try:
                camera_node = scene.add(camera, pose=camera_pose)
                light_node = scene.add(light, pose=camera_pose)
                r = pyrender.OffscreenRenderer(*image_size)
                color, _ = r.render(scene)
                images.append(Image.fromarray(color))
                
                scene.remove_node(camera_node)
                scene.remove_node(light_node)
            except Exception as e:
                print(f"This mesh is not available 3(UID: {uid}), Error : {e}")
                return None
                
        return images
    
    def process_dataset(self):
        cls_count = 1
        for cls, uids in self.my_datasets.items():
            print(f"\nProcessing {cls}... {cls_count} / {len(my_datasets)}")
            
            class_dir = os.path.join(self.output_path, cls)
            os.makedirs(class_dir, exist_ok=True)
            
            for uid in uids:
                img_exists = True
                for i in range(4):
                    img_path = os.path.join(class_dir, f"{uid}_{i}.png") # class path + img name
                    if os.path.exists(img_path):
                        print(f"The file {img_path} already exists.")
                    else:
                        img_exists = False
                        break
                
                if img_exists is False:
                    mesh = self.load_mesh(uid)
                    if mesh is None:
                        continue
                    images = self.render_mesh(uid, mesh)
                    if images is None:
                        continue
                    
                    for i, img in enumerate(images):
                        img_path = os.path.join(class_dir, f"{uid}_{i}.png")
                        try:
                            img.save(img_path)
                            print(f"Saved {uid}_{i}.png")
                        except Exception as e:
                            print(f"Error saving image: {e}")
            
            cls_count += 1
        
        print("Rendering completed!")

In [6]:
# once you rendered the images, you don't need to run it again.

renderer = Renderer("D:\Sehyeon\Datasets\objaverse\hf-objaverse-v1\object-paths.json.gz", "D:\Sehyeon\Datasets\output")
renderer.process_dataset()

NameError: name 'Renderer' is not defined

In [39]:
import json
import os

img_base_path = "D:\Sehyeon\Datasets\output"
img_class_datasets = []

for cls, uids in my_datasets.items():
    for uid in uids:
        for i in range(4):
            img_path = os.path.join(img_base_path, cls, f"{uid}_{i}.png")
            if not os.path.exists(img_path):
                print(f"The file {img_path} doesn't exists.")
                break
            temp = {"image": img_path, "label": cls}
            img_class_datasets.append(temp)

# check whether the datasets is correctly constructed 
print(img_class_datasets[:100])

dataset_json_path = os.path.join(img_base_path, "datasets.json")
with open(dataset_json_path, "w", encoding="utf-8") as f:
    json.dump(img_class_datasets, f, indent=4)
    

The file D:\Sehyeon\Datasets\output\Ferris_wheel\5c1c07d9bf894fa4b05769bf73b596d3_0.png doesn't exists.
The file D:\Sehyeon\Datasets\output\Tabasco_sauce\3dc72225634f4806b2bf4e24e2c0da6f_0.png doesn't exists.
The file D:\Sehyeon\Datasets\output\alligator\da05f13eada54ab8922726d74ef89652_0.png doesn't exists.
The file D:\Sehyeon\Datasets\output\ambulance\3692f90ebefe487689ff9064e8167eb7_0.png doesn't exists.
The file D:\Sehyeon\Datasets\output\ax\921e4467c7904b21af139862e691eac0_0.png doesn't exists.
The file D:\Sehyeon\Datasets\output\banner\434b71cfd5bc4e3ab361f6c195abb182_0.png doesn't exists.
The file D:\Sehyeon\Datasets\output\barge\919af3940fe94ba6940897da72791990_0.png doesn't exists.
The file D:\Sehyeon\Datasets\output\barrel\09e0d00fe07542169f13fd46297a2f06_0.png doesn't exists.
The file D:\Sehyeon\Datasets\output\bath_mat\967b032e00a8455ba1103f7a05aa9632_0.png doesn't exists.
The file D:\Sehyeon\Datasets\output\beeper\5895cab07bed47ca9defeacd935c55ec_0.png doesn't exists.
The 

In [40]:
from datasets import load_dataset

dataset = load_dataset("json", data_files=dataset_json_path)

dataset = dataset["train"].train_test_split(train_size=0.9, shuffle=True, seed=42)
print(dataset["train"])
print(dataset["test"])

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['image', 'label'],
    num_rows: 37882
})
Dataset({
    features: ['image', 'label'],
    num_rows: 4210
})


In [41]:
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

In [42]:
from torch.utils.data import Dataset, DataLoader
from PIL import Image

class CustomDataset(Dataset):
    def __init__(self, dataset, clip_processor, is_train):
        self.dataset = dataset
        self.is_train = is_train
        self.clip_processor = clip_processor
        self.class_texts = [
            f"A photo of a {cls}." for cls in set(dataset["label"])
        ]
        
    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        image_path = self.dataset[idx]["image"]
        label = self.dataset[idx]["label"]

        image = Image.open(image_path).convert("RGB")
        text = f"A photo of a {label}."
        
        return {"image": image, "label": label, "text": text}

    def preprocess(self, batch):
        images = [data["image"] for data in batch]
        texts = [data["text"] for data in batch]
        labels = [data["label"] for data in batch]

        inputs = self.clip_processor(text=texts, images=images, return_tensors="pt", padding=True)

        return {
            "text": texts,
            "label": labels,
            **inputs,
        }

In [43]:
train_dataset = dataset["train"]
train_dataset = CustomDataset(train_dataset, clip_processor, is_train=True)
print(train_dataset.class_texts)

train_dataloader = DataLoader(
    train_dataset, 
    batch_size=64,
    shuffle=True,
    collate_fn=train_dataset.preprocess
)

test_dataset = dataset["test"]
test_dataset = CustomDataset(test_dataset, clip_processor, is_train=False)
print(test_dataset.class_texts)

test_dataloader = DataLoader(test_dataset, 
    batch_size=64,
    shuffle=True, 
    collate_fn=test_dataset.preprocess)

['A photo of a folding_chair.', 'A photo of a record_player.', 'A photo of a shopping_bag.', 'A photo of a pinwheel.', 'A photo of a cigarette.', 'A photo of a blouse.', 'A photo of a bear.', 'A photo of a boom_microphone.', 'A photo of a wagon.', 'A photo of a kitten.', 'A photo of a bulldozer.', 'A photo of a butterfly.', 'A photo of a calf.', 'A photo of a vulture.', 'A photo of a napkin.', 'A photo of a pocket_watch.', 'A photo of a pickle.', 'A photo of a race_car.', 'A photo of a hookah.', 'A photo of a Ferris_wheel.', 'A photo of a hot_sauce.', 'A photo of a goggles.', 'A photo of a toilet_tissue.', 'A photo of a sportswear.', 'A photo of a mascot.', 'A photo of a control.', 'A photo of a pegboard.', 'A photo of a card.', 'A photo of a dragonfly.', 'A photo of a date_(fruit).', 'A photo of a book.', 'A photo of a bowling_ball.', 'A photo of a wooden_leg.', 'A photo of a helmet.', 'A photo of a sock.', 'A photo of a broom.', 'A photo of a spear.', 'A photo of a ski_parka.', 'A ph

In [44]:
import torch.nn.functional as F

def loss_fn(logits_per_image, logits_per_text):
    assert logits_per_image.shape[0] == logits_per_image.shape[0] # logits' shape should be (nxn)
    assert logits_per_image.shape == logits_per_text.shape
    
    labels = torch.arange(logits_per_image.shape[0], device="cuda")
    loss_i = F.cross_entropy(logits_per_image, labels)
    loss_t = F.cross_entropy(logits_per_image, labels)
    loss = (loss_i + loss_t) / 2
    
    return loss

In [45]:
from torch.optim import AdamW
from tqdm import tqdm

clip_model.to("cuda")
optimizer = AdamW(clip_model.parameters(), lr=5e-6)
clip_model.train()

num_epochs = 5
for epoch in tqdm(range(1, num_epochs+1), position=0, desc="epoch"):
    for batch in tqdm(train_dataloader, position=0, desc="batch", leave=False):
        optimizer.zero_grad()
 
        outputs = clip_model(
            pixel_values=batch["pixel_values"].to("cuda"),
            input_ids=batch["input_ids"].to("cuda"),
            attention_mask=batch["attention_mask"].to("cuda"),
        )
 
        logits_per_image = outputs.logits_per_image
        logits_per_text = outputs.logits_per_text
        loss = loss_fn(logits_per_image, logits_per_text)
        loss.backward()
                
        optimizer.step()

    print(f"Epoch {epoch}, Loss: {loss:.4f}")

epoch:  20%|██        | 1/5 [05:09<20:38, 309.71s/it]   

Epoch 1, Loss: 0.6855


epoch:  40%|████      | 2/5 [10:20<15:31, 310.40s/it]   

Epoch 2, Loss: 1.0161


epoch:  60%|██████    | 3/5 [15:29<10:19, 309.85s/it]   

Epoch 3, Loss: 0.2241


epoch:  80%|████████  | 4/5 [20:40<05:10, 310.20s/it]   

Epoch 4, Loss: 0.2083


epoch: 100%|██████████| 5/5 [25:50<00:00, 310.06s/it]   

Epoch 5, Loss: 0.1133





In [46]:
import torch.nn.functional as F

all_class_texts = clip_processor.tokenizer(test_dataset.class_texts, padding=True)
all_class_texts = {k: torch.tensor(v, device="cuda") for k, v in all_class_texts.items()}
# print(all_class_texts)

class_to_idx = {}
for i, text in enumerate(test_dataset.class_texts):
    cls = text[len("A photo of a "):-1]
    class_to_idx[cls] = i

clip_model.eval()
correct_count = 0
ce_loss_sum = 0

with torch.no_grad():
    for batch in tqdm(test_dataloader):
        outputs = clip_model(
            pixel_values=batch["pixel_values"].to("cuda"),
            **all_class_texts,
        )
        
        probs = outputs.logits_per_image.cpu().softmax(dim=1)
        pred = probs.argmax(dim=1)
        # print("prediction :", pred)
        label = torch.tensor([class_to_idx[lbl] for lbl in batch["label"]])
        # print("labels :", label)
 
        correct_count += (pred == label).sum().item()
        ce_loss_sum += F.cross_entropy(probs, label).item()
    
accuracy = correct_count / len(test_dataloader.dataset)
ce_loss = ce_loss_sum / len(test_dataloader)
print(f"Test CE loss: {ce_loss:.4}, Test accuracy: {(accuracy * 100):.4}%")

100%|██████████| 66/66 [01:00<00:00,  1.09it/s]

Test CE loss: 6.498, Test accuracy: 51.24 %





In [60]:
import torch
import trimesh
import pyrender
import numpy as np
from PIL import Image

class RendererWithCLIP():
    def __init__(self, mesh_path, text_query, clip_model, clip_processor):
        self.mesh_path = mesh_path
        self.text_query = text_query
        self.clip_model = clip_model
        self.clip_processor = clip_processor
        
    def load_mesh(self):
        if not self.mesh_path or not os.path.exists(self.mesh_path):
            print(f"Model file not found")
            return None
        
        # normalize the mesh to fit inside a 2*2*2 cube centered at (0,0,0)
        try:
            mesh = trimesh.load(self.mesh_path, force='mesh')
            centroid = mesh.bounds.mean(axis=0)        
            mesh.apply_translation(-centroid)
            
            max_extent = mesh.extents.max()
            scale_factor = 2.0 / max_extent
            
            mesh.apply_scale(scale_factor)
            
        except Exception as e:
            print(f"Error loading mesh: {e}")
            return None
        
        return mesh
    
    def render_mesh(self, mesh, camera_pos=[2.5, 1.0, 2.5], image_size=(512, 512)):
        try:
            scene = pyrender.Scene()
            mesh = pyrender.Mesh.from_trimesh(mesh)
            scene.add(mesh)
            
            camera = pyrender.PerspectiveCamera(yfov=np.pi / 3.0, name="camera")
            light = pyrender.DirectionalLight(color=np.ones(3), intensity=3.0, name="light")
            
        except Exception as e:
            print(f"This mesh is not available 1")
            return None
        
        eye = np.array(camera_pos)
        target = np.array([0.0, 0.0, 0.0])
        up = np.array([0.0, 1.0, 0.0])

        # Compute the camera view matrix
        z_axis = (eye - target) / np.linalg.norm(eye - target) # Forward
        x_axis = np.cross(up, z_axis) / np.linalg.norm(np.cross(up, z_axis)) # Right
        y_axis = np.cross(z_axis, x_axis) # Up

        camera_pose_matrix = np.eye(4)
        camera_pose_matrix[:3, :3] = np.vstack([x_axis, y_axis, z_axis]).T
        camera_pose_matrix[:3, 3] = eye
        
        # Render the scene
        try:
            camera_node = scene.add(camera, pose=camera_pose_matrix)
            light_node = scene.add(light, pose=camera_pose_matrix)
            r = pyrender.OffscreenRenderer(*image_size)
            color, _ = r.render(scene)
            
            return Image.fromarray(color)
            
        except Exception as e:
            print(f"This mesh is not available 2")
            return None

    def predict_from_3d_model(self):
        # Load the 3D mesh
        mesh = self.load_mesh()
        if mesh is None:
            print(f"Failed to load mesh")
            return None

        # Render the mesh into multiple views
        image = self.render_mesh(mesh)
        if image is None:
            print(f"Failed to render images for UID: {uid}")
            return None

        # Prepare the images for CLIP
        inputs = self.clip_processor(text=self.text_query, images=image, return_tensors="pt", padding=True).to("cuda")

        # Predict with CLIP
        with torch.no_grad():
            outputs = self.clip_model(**inputs)
            logits_per_image = outputs.logits_per_image
            probs = logits_per_image.softmax(dim=1)
            
            predicted_class_idx = torch.argmax(probs[0]).item()
            predicted_text = self.text_query[predicted_class_idx]

        return predicted_text

In [96]:
import os

class_path = "D:\Sehyeon\Datasets\output"
text_query = []

# create list of all classes
for item in os.listdir(class_path):
    item_path = os.path.join(class_path, item)
    if os.path.isdir(item_path):
        text_query.append("A 3D model of a " + item + ".")
        
renderer_with_CLIP = RendererWithCLIP("D:\Sehyeon\Datasets\objaverse\hf-objaverse-v1\glbs\\000-008\\74b5c985f7a747fe9863f3b5f977a056.glb", text_query, clip_model, clip_processor)
clip_result = renderer_with_CLIP.predict_from_3d_model()
print("result :", clip_result)

result : A 3D model of a bowl.


In [119]:
from transformers import AutoTokenizer, AutoModelForCausalLM

llama_tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-3.2-1B")
llama_model = AutoModelForCausalLM.from_pretrained("meta-llama/Llama-3.2-1B")

llama_model.to("cuda")

clip_result_class_name = clip_result[len("A 3D model of a "):-len(".")]
prompt = f"Here is {clip_result}. Write only a perfectly objective explanation about what {clip_result_class_name} is, and its uses."

inputs = llama_tokenizer(prompt, return_tensors="pt").to("cuda")
output = llama_model.generate(inputs["input_ids"], max_length=100, num_return_sequences=1, temperature=0.7)

response = llama_tokenizer.decode(output[0], skip_special_tokens=True)
print("Generated Response:", response[len(prompt):])

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


KeyboardInterrupt: 