In [1]:
# Environment bootstrap cell. Target stack: GPU with CUDA 11.8 and PyTorch 2.0.1 on Kaggle.
# NOTE(review): CLIP is installed from the repository HEAD (unpinned), and it is installed
# before Torch is re-pinned below.
!pip install --upgrade pip
!pip install git+https://github.com/openai/CLIP.git
# 1) Remove any pre-existing Kaolin installation
!pip uninstall -y kaolin

# 2) Install exactly Torch 2.0.1 cu118 (the Kaggle environment is expected to be cu118 already; this re-pins it)
!pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 \
  -f https://download.pytorch.org/whl/cu118/torch_stable.html

# 3) Install the official Kaolin wheel for Torch 2.0.1+cu118
!pip install kaolin==0.17.0 \
  -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.0.1_cu118.html

!pip install tqdm pillow
# Start from a clean checkout and an empty output directory on every run
!rm -rf /kaggle/working/Affordance_Highlighting_Project_2024
!rm -rf /kaggle/working/output
!git clone https://github.com/MirkoDiMa/Affordance_Highlighting_Project_2024.git
# Work from inside the checkout so relative paths (e.g. "data/candle.obj") resolve
%cd Affordance_Highlighting_Project_2024
import sys
# Add the repository root to the module search path
# (presumably where Normalization, mesh, render and utils live — verify against the repo)
sys.path.append('/kaggle/working/Affordance_Highlighting_Project_2024')

Collecting pip
  Downloading pip-25.1.1-py3-none-any.whl.metadata (3.6 kB)
Downloading pip-25.1.1-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m27.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hInstalling collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.1.2
    Uninstalling pip-24.1.2:
      Successfully uninstalled pip-24.1.2
Successfully installed pip-25.1.1
Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-wnecqzee
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-wnecqzee
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting ftfy (from clip==1.0)
  Downloading ftfy-6.3.1-py3-none-any.whl.metadata (7.3 kB)
Collecting nvidia-cuda-nvrtc-cu12==1

In [2]:
# ─── EXPERIMENT CONFIG ──────────────────────────────────────────────────────────
# Single source of truth for one run. Keys are added section by section; the
# insertion order is kept because it later becomes the CSV column order.
exp_config = {}

# Text prompt that is encoded with CLIP and compared against the renders
exp_config.update({
    "prompt": "A 3D render of a gray candle with highlighted hat",
})

# Seed used for every random number generator (determinism)
exp_config.update({
    "seed": 45,
})

# Data and paths
exp_config.update({
    "obj_path": "data/candle.obj",
    "output_dir": "/kaggle/working/output",
})

# CLIP backbone name handed to clip.load
exp_config.update({
    "clip_model_name": "ViT-L/14",
})

# Highlighter MLP; "sigma" is only forwarded to the Fourier feature transform,
# which is only built when "positional_encoding" is True
exp_config.update({
    "mlp_input_dim": 3,
    "mlp_hidden_dim": 256,
    "mlp_num_layers": 8,
    "mlp_out_dim": 2,
    "positional_encoding": False,
    "sigma": 5.0,
})

# Training schedule
exp_config.update({
    "render_res": 224,
    "n_views": 8,
    "learning_rate": 1e-4,
    "n_iter": 2500,
    "n_augs": 5,
    "clipavg": "view",
})

# Augmentation. NOTE(review): "aug_type" is recorded here, but the visible
# training cell builds a RandomPerspective transform unconditionally.
exp_config.update({
    "aug_type": "RandomPerspective",
    "aug_params": {
        "distortion_scale": 0.5,
        "p": 0.8,
    },
})


In [3]:
import clip
import copy
import json
import kaolin as kal
import kaolin.ops.mesh
import numpy as np
import os
import random
import torch
import torch.nn as nn
import torchvision
import time

from itertools import permutations, product
from Normalization import MeshNormalizer
from mesh import Mesh
from pathlib import Path
from render import Renderer
from tqdm import tqdm
from torch.autograd import grad
from torchvision import transforms
from utils import device, color_mesh
from utils import FourierFeatureTransform

class NeuralHighlighter(nn.Module):
    """MLP that maps per-point features to a probability distribution over classes.

    The network is: an optional Fourier feature transform, an input block
    (Linear -> ReLU -> LayerNorm), ``depth`` hidden blocks of the same shape,
    and a final Linear followed by a softmax over the class axis.

    Args:
        depth: number of hidden Linear/ReLU/LayerNorm blocks after the input block.
        width: hidden feature size of every block.
        out_dim: number of output classes.
        input_dim: size of each input point (3 by default).
        positional_encoding: if True, prepend ``FourierFeatureTransform``.
        sigma: forwarded to ``FourierFeatureTransform`` when it is used.
    """

    def __init__(self, depth, width, out_dim, input_dim=3, positional_encoding=False, sigma=5.0):
        super().__init__()
        layers = []
        if positional_encoding:
            layers.append(FourierFeatureTransform(input_dim, width, sigma))
            # assumes the transform outputs `width * 2 + input_dim` features
            # (as the original code did) — TODO confirm against utils
            first_in_features = width * 2 + input_dim
        else:
            first_in_features = input_dim

        # Input block; layers are created in the same order as before so that
        # seeded parameter initialization is unchanged.
        layers += [nn.Linear(first_in_features, width), nn.ReLU(), nn.LayerNorm([width])]
        for _ in range(depth):
            layers += [nn.Linear(width, width), nn.ReLU(), nn.LayerNorm([width])]
        layers.append(nn.Linear(width, out_dim))
        # Normalize over the last (class) axis. For the usual (N, C) input this
        # equals dim=1, but it also works for a single unbatched point and for
        # batched (B, N, C) input, where dim=1 would fail or pick the wrong axis.
        layers.append(nn.Softmax(dim=-1))

        # Kept as a ModuleList named `mlp` so state_dict keys stay "mlp.<idx>.*".
        self.mlp = nn.ModuleList(layers)
        print(self.mlp)

    def forward(self, x):
        """Apply every layer in order and return class probabilities."""
        for layer in self.mlp:
            x = layer(x)
        return x

def get_clip_model(clipmodel):
    """Load the CLIP model called ``clipmodel`` onto ``device`` (non-JIT build).

    Returns:
        The ``(model, preprocess)`` pair produced by ``clip.load``.
    """
    clip_net, image_preprocess = clip.load(clipmodel, jit=False, device=device)
    return clip_net, image_preprocess

# ================== HELPER FUNCTIONS =============================
def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    """Export the hard (argmax) highlight as a colored PLY and a multi-view render.

    Args:
        log_dir: directory that receives ``<name>.ply`` and ``final_render.jpg``.
        name: file stem of the exported PLY.
        mesh: mesh object; it is recolored in place via ``color_mesh`` and exported.
        mlp: trained highlighter; switched to eval mode and left in eval mode.
        vertices: input fed to ``mlp`` (one row per vertex).
        colors: accepted for interface compatibility but not used; the export
            always uses the fixed highlight/gray palette defined below.
        render: renderer exposing ``render_views``.
        background: background color forwarded to the renderer.
    """
    mlp.eval()
    with torch.no_grad():
        probs = mlp(vertices)
        max_idx = torch.argmax(probs, 1, keepdim=True)

        # For renders: replace soft probabilities with a one-hot class assignment.
        one_hot = torch.zeros(probs.shape).to(device)
        one_hot = one_hot.scatter_(1, max_idx, 1)

        highlight = torch.tensor([204, 255, 0]).to(device)
        gray = torch.tensor([180, 180, 180]).to(device)
        # Local palette in [0, 1]; a separate name so the `colors` argument is
        # no longer silently overwritten.
        palette = torch.stack((highlight/255, gray/255)).to(device)
        color_mesh(one_hot, mesh, palette)
        rendered_images, _, _ = render.render_views(mesh, num_views=5,
                                                    show=False,
                                                    center_azim=0,
                                                    center_elev=0,
                                                    std=4,
                                                    return_views=True,
                                                    lighting=True,
                                                    background=background)

        # For the mesh file: class 0 gets the highlight color, everything else
        # gray (0-255 integer values, unlike the palette used for rendering).
        final_color = torch.where(max_idx==0, highlight, gray)
        mesh.export(os.path.join(log_dir, f"{name}.ply"), extension="ply", color=final_color)
        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')
def save_exp_config(config, output_dir):
    """Persist one experiment's configuration as JSON and append it to a CSV summary.

    Writes ``experiment_config.json`` (overwritten on every call) and appends one
    row to ``experiments_summary.csv`` inside ``output_dir``. The directory is
    created when it does not exist. A header row is written whenever the summary
    file is missing or empty.

    Args:
        config: mapping of setting name to value; must be JSON-serializable.
            Nested values are written to the CSV via their string form.
        output_dir: destination directory.

    Note:
        The header is taken from the first ``config`` written to the file; rows
        appended later with a different key set will not match that header.
    """
    # Local imports: `csv` is not imported at file level.
    import csv
    import json
    import os

    # Previously a missing directory raised FileNotFoundError on the first open().
    os.makedirs(output_dir, exist_ok=True)

    # JSON
    with open(os.path.join(output_dir, 'experiment_config.json'), 'w') as f:
        json.dump(config, f, indent=2)

    # CSV
    csv_path = os.path.join(output_dir, 'experiments_summary.csv')
    # An existing but empty file still needs its header row.
    write_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
    with open(csv_path, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(config.keys()))
        if write_header:
            writer.writeheader()
        writer.writerow(config)

def _negative_clip_similarity(image_features: torch.Tensor,
                              text_features: torch.Tensor,
                              clipavg: str) -> torch.Tensor:
    """Return the negated cosine similarity between normalized image and text features."""
    if clipavg == "view":
        if text_features.shape[0] > 1:
            # Several prompts: compare the mean image feature with the mean text feature.
            return -torch.cosine_similarity(image_features.mean(0),
                                            text_features.mean(0), dim=0)
        # Single prompt: keep the row axis so the result has shape (1,).
        return -torch.cosine_similarity(image_features.mean(0, keepdim=True),
                                        text_features, dim=1)
    # Any other mode: per-row similarity, then averaged.
    return -torch.mean(torch.cosine_similarity(image_features, text_features, dim=1))


def clip_loss(rendered_images: torch.Tensor,
              text_embedding: torch.Tensor,
              clip_model: nn.Module,
              clip_transform: transforms.Compose,
              augment_transform: transforms.Compose,
              n_augs: int,
              clipavg: str = "view") -> torch.Tensor:
    """Negative CLIP similarity between rendered views and the text embedding.

    Args:
        rendered_images: batch of rendered views (the original comments describe
            it as (V, 3, H, W)).
        text_embedding: text features with one row per prompt; re-normalized here.
        clip_model: object exposing ``encode_image``.
        clip_transform: transform applied once when ``n_augs == 0``.
        augment_transform: transform applied on each pass when ``n_augs > 0``.
        n_augs: number of augmented passes; 0 means a single un-augmented pass.
        clipavg: ``"view"`` averages image features over views before the cosine
            similarity; any other value averages the per-row similarities.

    Returns:
        The loss tensor. With ``n_augs > 0`` the per-pass losses are summed and
        deliberately not divided by ``n_augs``, so the magnitude can exceed 1.

    Raises:
        ValueError: if ``n_augs`` is negative (previously this silently returned
            the Python float ``0.0``, which has no ``backward``).
    """
    if n_augs < 0:
        raise ValueError(f"n_augs must be >= 0, got {n_augs}")

    # The normalized text features do not depend on the pass, so compute them once.
    txt = text_embedding / text_embedding.norm(dim=1, keepdim=True)

    if n_augs == 0:
        image_transform, n_passes = clip_transform, 1
    else:
        image_transform, n_passes = augment_transform, n_augs

    loss = None
    for _ in range(n_passes):
        # 1) transform (resize or augment) + normalize, 2) encode, 3) unit-normalize
        enc = clip_model.encode_image(image_transform(rendered_images))
        enc = enc / enc.norm(dim=1, keepdim=True)
        # 4) accumulate the negated similarity
        term = _negative_clip_similarity(enc, txt, clipavg)
        loss = term if loss is None else loss + term

    return loss


    
def save_renders(dir, i, rendered_images, name=None):
    """Write ``rendered_images`` as one image file inside ``dir``.

    The file is ``name`` when given, otherwise ``renders/iter_<i>.jpg``.
    """
    relative_path = name if name is not None else 'renders/iter_{}.jpg'.format(i)
    torchvision.utils.save_image(rendered_images, os.path.join(dir, relative_path))


Warp 1.8.0 initialized:
   CUDA Toolkit 12.8, Driver 12.6
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
     "cuda:1"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
   CUDA peer access:
     Supported fully (all-directional)
   Kernel cache:
     /root/.cache/warp/1.8.0


In [None]:
# Pin the seed of every random number generator used by this notebook.
# (Some torch backward functions within CLIP are non-deterministic, so runs
# may still differ slightly.)
seed = exp_config["seed"]
for _seed_rng in (torch.manual_seed,
                  torch.cuda.manual_seed,
                  torch.cuda.manual_seed_all,
                  random.seed,
                  np.random.seed):
    _seed_rng(seed)

# cuDNN: disable autotuning and request deterministic kernels.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# Unpack the settings used below from exp_config.
render_res = exp_config["render_res"]
learning_rate = exp_config["learning_rate"]
n_iter = exp_config["n_iter"]
# `res` reads the same key as `render_res`; both names are used below.
res = exp_config["render_res"]
obj_path = exp_config["obj_path"]
n_augs = exp_config["n_augs"]
output_dir = exp_config["output_dir"]
# `clip_model` holds the model NAME here and is rebound to the loaded model on the next statement.
clip_model = exp_config["clip_model_name"]

clip_model, preprocess = get_clip_model(clip_model)

# Create <output_dir>/renders (and any missing parents); save_renders writes per-iteration images there.
Path(os.path.join(output_dir, 'renders')).mkdir(parents=True, exist_ok=True)

# File stem of the mesh path, reused later to name the exported result.
objbase, extension = os.path.splitext(os.path.basename(obj_path))

render = Renderer(dim=(render_res, render_res))
mesh = Mesh(obj_path)
# presumably normalizes the mesh in place — TODO confirm in Normalization.py
MeshNormalizer(mesh)()

# Background passed to the renderer: (1, 1, 1), i.e. white if interpreted as RGB in [0, 1].
background = torch.tensor((1., 1., 1.)).to(device)

# Logs and results go to the same directory as the other outputs.
log_dir = output_dir

# CLIP and augmentation transforms
# Per-channel mean / std used for normalization
# (presumably the constants of CLIP's own preprocessing — TODO confirm).
clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711))
# Used by clip_loss only when n_augs == 0.
clip_transform = transforms.Compose([
    transforms.Resize((res, res), antialias=False),
    clip_normalizer
])
# Used by clip_loss on every pass when n_augs > 0.
# NOTE(review): exp_config["aug_type"] is not read here; RandomPerspective is always used.
augment_transform = transforms.Compose([
    transforms.RandomResizedCrop(res, scale=(1, 1), antialias=False),
    transforms.RandomPerspective(fill=1,
                                 distortion_scale=exp_config["aug_params"]["distortion_scale"],
                                 p=exp_config["aug_params"]["p"]),
    clip_normalizer
])

# Highlighter MLP and its optimizer (Adam over the MLP parameters only)
mlp = NeuralHighlighter(depth=exp_config["mlp_num_layers"],
    width=exp_config["mlp_hidden_dim"],
    out_dim=exp_config["mlp_out_dim"],
    input_dim=exp_config["mlp_input_dim"],
    positional_encoding=exp_config["positional_encoding"],
    sigma=exp_config["sigma"]).to(device)
optim = torch.optim.Adam(mlp.parameters(), learning_rate)

# Palette: row 0 is the highlighter color, row 1 is gray (values in [0, 1]).
# `rgb_to_color` and `color_to_rgb` are not referenced by the other visible cells.
rgb_to_color = {(204/255, 1., 0.): "highlighter", (180/255, 180/255, 180/255): "gray"}
color_to_rgb = {"highlighter": [204/255, 1., 0.], "gray": [180/255, 180/255, 180/255]}
full_colors = [[204/255, 1., 0.], [180/255, 180/255, 180/255]]
colors = torch.tensor(full_colors).to(device)


# --- Prompt ---
# Encode the prompt with CLIP once; it stays fixed for the whole optimization.
prompt = exp_config["prompt"]

with torch.no_grad():
    prompt_token = clip.tokenize([prompt]).to(device)
    encoded_text = clip_model.encode_text(prompt_token)
    # Scale each row to unit L2 norm (clip_loss normalizes it again).
    encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)

# Independent copy of the mesh vertices, used as the MLP input.
vertices = copy.deepcopy(mesh.vertices)
n_views = exp_config["n_views"]

# Best-checkpoint tracking: lowest loss seen, its iteration, and a model state snapshot.
best_loss = float('inf')
best_iter = -1
best_state = None

# Per-iteration loss history and wall-clock start of the optimization.
losses = []
start_time = time.time()
# Optimization loop: predict per-vertex class probabilities, color and render the
# mesh from several views, and minimize the negative CLIP similarity to the prompt.
for i in tqdm(range(n_iter)):
    optim.zero_grad()

    # predict highlight probabilities
    pred_class = mlp(vertices)

    # color and render mesh
    sampled_mesh = mesh
    color_mesh(pred_class, sampled_mesh, colors)
    rendered_images, elev, azim = render.render_views(sampled_mesh, num_views=n_views,
                                                            show=False,
                                                            center_azim=0,
                                                            center_elev=0,
                                                            std=4,
                                                            return_views=True,
                                                            lighting=True,
                                                            background=background)

    # Calculate CLIP Loss
    loss = clip_loss(rendered_images,
        encoded_text,
        clip_model,
        clip_transform,
        augment_transform,
        n_augs,
        clipavg = exp_config["clipavg"])
    loss.backward()

    # Read the loss value once and record it.
    loss_value = loss.item()
    losses.append(loss_value)

    # Track the best checkpoint BEFORE optim.step(): the current weights are the
    # ones that produced `loss`. Snapshotting after the step would store the
    # updated weights under the previous loss value.
    if loss_value < best_loss:
        best_loss  = loss_value
        best_iter  = i
        best_state = copy.deepcopy(mlp.state_dict())
        # optional: also save the renders of the best iteration right away
        #save_renders(log_dir, f"best_{best_iter}", rendered_images)

    optim.step()

    # report results
    if i % 100 == 0:
        recent_avg = np.mean(losses[-100:])
        print("Last 100 CLIP score: {}".format(recent_avg))
        save_renders(log_dir, i, rendered_images)
        with open(os.path.join(log_dir, "training_info.txt"), "a") as f:
            f.write(f"For iteration {i}... Prompt: {prompt}, Last 100 avg CLIP score: {recent_avg}, CLIP score {losses[-1]}\n")

# Metrics. Scores are the negated losses; because clip_loss sums over n_augs
# passes, their magnitude can exceed 1. Keys are appended to exp_config in this
# order, which is the column order written by save_exp_config.
final_loss       = losses[-1]
exp_config["final_clip_score"]       = -final_loss
exp_config["avg_clip_score_last100"] = -float(np.mean(losses[-100:]))
exp_config["runtime_seconds"]        = time.time() - start_time
# End of the loop: reload the model snapshot recorded at best_iter
mlp.load_state_dict(best_state)
exp_config["best_iter"] = best_iter
exp_config["best_clip_score"] = -best_loss
# Save the config (JSON) and append it to the CSV summary
save_exp_config(exp_config, output_dir)
# Save results: colored PLY plus the final multi-view render
save_final_results(log_dir, f"{objbase}_best_iter{best_iter}", mesh, mlp, vertices, colors, render, background)

# Save the prompt used for this run
with open(os.path.join(log_dir, "prompt.txt"), "w") as f:
    f.write(prompt)

100%|███████████████████████████████████████| 890M/890M [00:18<00:00, 51.4MiB/s]


ModuleList(
  (0): Linear(in_features=3, out_features=256, bias=True)
  (1): ReLU()
  (2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (3): Linear(in_features=256, out_features=256, bias=True)
  (4): ReLU()
  (5): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (6): Linear(in_features=256, out_features=256, bias=True)
  (7): ReLU()
  (8): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (9): Linear(in_features=256, out_features=256, bias=True)
  (10): ReLU()
  (11): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (12): Linear(in_features=256, out_features=256, bias=True)
  (13): ReLU()
  (14): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (15): Linear(in_features=256, out_features=256, bias=True)
  (16): ReLU()
  (17): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (18): Linear(in_features=256, out_features=256, bias=True)
  (19): ReLU()
  (20): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  (21): Linear(in_features=256, ou

  0%|          | 1/2500 [00:02<1:28:26,  2.12s/it]

Last 100 CLIP score: -1.57421875


  4%|▍         | 101/2500 [02:56<1:13:20,  1.83s/it]

Last 100 CLIP score: -1.6928515625


  8%|▊         | 201/2500 [06:02<1:12:24,  1.89s/it]

Last 100 CLIP score: -1.69857421875


 12%|█▏        | 301/2500 [09:09<1:08:00,  1.86s/it]

Last 100 CLIP score: -1.7078515625


 16%|█▌        | 401/2500 [12:14<1:05:19,  1.87s/it]

Last 100 CLIP score: -1.699345703125


 20%|██        | 501/2500 [15:21<1:02:09,  1.87s/it]

Last 100 CLIP score: -1.711455078125


 24%|██▍       | 601/2500 [18:27<59:08,  1.87s/it]  

Last 100 CLIP score: -1.7155078125


 27%|██▋       | 686/2500 [21:05<56:27,  1.87s/it]