**CELL 1 — Environment & repo**

In [1]:
# GPU: CUDA 11.8, PyTorch 2.0.1 on Kaggle
!pip install --upgrade pip
!pip install git+https://github.com/openai/CLIP.git
# 1) Remove any pre-existing Kaolin installation
!pip uninstall -y kaolin

# 2) Install exactly Torch 2.0.1 cu118 (the Kaggle environment is already cu118, but we realign it)
!pip install torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 \
  -f https://download.pytorch.org/whl/cu118/torch_stable.html

# 3) Install the official Kaolin wheel for Torch 2.0.1+cu118
!pip install kaolin==0.17.0 \
  -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.0.1_cu118.html

!pip install -q open3d==0.18.0 tqdm pillow
# Start from a clean slate: delete any previous clone and any previous outputs
!rm -rf /kaggle/working/Affordance_Highlighting_Project_2024
!rm -rf /kaggle/working/output
# The clone and the %cd below are relative to the current working directory,
# while the rm -rf lines and sys.path entry use absolute /kaggle/working paths.
!git clone https://github.com/MirkoDiMa/Affordance_Highlighting_Project_2024.git
%cd Affordance_Highlighting_Project_2024
import sys
# Add the repo root folder to sys.path so its modules can be imported
sys.path.append('/kaggle/working/Affordance_Highlighting_Project_2024')

Collecting git+https://github.com/openai/CLIP.git
  Cloning https://github.com/openai/CLIP.git to /tmp/pip-req-build-e43uf2qi
  Running command git clone --filter=blob:none --quiet https://github.com/openai/CLIP.git /tmp/pip-req-build-e43uf2qi
  Resolved https://github.com/openai/CLIP.git to commit dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
  Preparing metadata (setup.py) ... [?25l[?25hdone
Found existing installation: kaolin 0.17.0
Uninstalling kaolin-0.17.0:
  Successfully uninstalled kaolin-0.17.0
Looking in links: https://download.pytorch.org/whl/cu118/torch_stable.html
Looking in links: https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.0.1_cu118.html
Collecting kaolin==0.17.0
  Using cached https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.0.1_cu118/kaolin-0.17.0-cp311-cp311-linux_x86_64.whl (5.9 MB)
Installing collected packages: kaolin
Successfully installed kaolin-0.17.0
Cloning into 'Affordance_Highlighting_Project_2024'...
remote: Enumerating objects: 133, done.

**CELL 2 — Config**

In [2]:
# =========================
#         CONFIG
# =========================
import os, json, copy, time, gc, random, numpy as np
from pathlib import Path

# Keep CPU libs conservative to avoid crashes on Kaggle
# (each variable below is set to a single thread).
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

# Absolute locations of the cloned repository and of every artifact written by this notebook.
REPO_ROOT   = '/kaggle/working/Affordance_Highlighting_Project_2024'
OUTPUT_ROOT = '/kaggle/working/output'
Path(OUTPUT_ROOT).mkdir(parents=True, exist_ok=True)

# We only load point clouds from the repository (no mesh fallback)
# PLY_RELATIVE is joined onto REPO_ROOT by the loading cell.
PLY_RELATIVE   = "data/candle_15000pts.ply"   # << change if needed
USE_PLY_DIRECT = True                         # << must be True (asserted by the loading cell)

# Base OBJ kept only for naming; not used to sample PCD
OBJ_RELATIVE = "data/candle.obj"

SAFE_MODE = False  # Does not change hyperparameters; just controls n_iter below

# Single source of truth for the experiment; the training cell deep-copies it
# and adds run-specific keys (paths, scores, runtime) before saving.
exp_config = {
    # Text prompt encoded once by CLIP and compared against the renders.
    "prompt": "A 3D render of a gray candle with highlighted hat",
    # Passed to set_seed().
    "seed": 45,
    # Passed to clip.load() through get_clip_model().
    "clip_model_name": "ViT-L/14",
    # Side length used for the Renderer and for the resize / crop transforms.
    "render_res": 224,
    # Number of views requested from renderer.render_views() per iteration.
    "n_views": 5,
    # Adam learning rate.
    "learning_rate": 1e-5,
    # Number of optimisation steps: 400 in SAFE_MODE, 2500 otherwise.
    "n_iter": 400 if SAFE_MODE else 2500,
    # Number of augmented passes accumulated inside clip_loss() (0 disables augmentation).
    "n_augs": 5,
    # "view" averages the image embeddings before the similarity; any other value averages the similarities.
    "clipavg": "view",
    # NeuralHighlighter constructor arguments.
    "mlp_input_dim": 3,
    "mlp_hidden_dim": 256,
    "mlp_num_layers": 6,
    "mlp_out_dim": 2,
    # Fourier-feature input encoding switch and its sigma (only used when the encoding is enabled).
    "positional_encoding": False,
    "sigma": 5.0,
}


**CELL 3 — Imports & small utilities**

In [3]:
# =========================
#   IMPORTS & HELPERS
# =========================
import torch, torch.nn as nn, torchvision
import clip
import open3d as o3d
from torchvision import transforms
from tqdm import tqdm

from render import Renderer
from mesh import Mesh

# Normalizer may be named differently across repos
try:
    from Normalization import MeshNormalizer
except Exception:
    from MeshNormalizer import MeshNormalizer

from utils import device, color_mesh
try:
    from utils import FourierFeatureTransform
    HAS_FOURIER = True
except Exception:
    HAS_FOURIER = False


def set_seed(seed: int):
    """Seed the Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed: Integer handed unchanged to every seeding call below.

    Returns:
        None. Only global RNG state and cuDNN flags are modified.
    """
    # Host-side generators.
    random.seed(seed)
    np.random.seed(seed)

    # PyTorch generators (default generator plus the CUDA seeding entry points).
    for seed_fn in (torch.manual_seed, torch.cuda.manual_seed, torch.cuda.manual_seed_all):
        seed_fn(seed)

    # Ask cuDNN for deterministic kernels and switch off its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


class NeuralHighlighter(nn.Module):
    """MLP that maps each input row to a softmax distribution over `out_dim` classes.

    Layer layout (all inside ``self.mlp``, an ``nn.Sequential``):
      1. input stage: ``Linear(input_dim, width)``, or, when positional encoding
         is active, ``FourierFeatureTransform(input_dim, width, sigma)`` followed
         by ``Linear(width * 2 + input_dim, width)``;
      2. ``ReLU`` + ``LayerNorm([width])``;
      3. ``depth`` hidden blocks of ``Linear(width, width)`` + ``ReLU`` + ``LayerNorm([width])``;
      4. ``Linear(width, out_dim)`` + ``Softmax(dim=1)``.

    Because the softmax runs over ``dim=1``, inputs are expected to be 2-D,
    one row per point.

    Args:
        depth: Number of hidden blocks after the input stage.
        width: Hidden feature size.
        out_dim: Number of output classes.
        input_dim: Size of each input row (default 3).
        positional_encoding: Request the Fourier-feature input stage.
        sigma: Forwarded to ``FourierFeatureTransform`` when it is used.

    Warns:
        RuntimeWarning: if ``positional_encoding`` is truthy but
            ``FourierFeatureTransform`` could not be imported (``HAS_FOURIER`` is
            False). The plain linear input stage is used in that case, exactly
            as before, but no longer silently.
    """
    def __init__(self, depth, width, out_dim, input_dim=3, positional_encoding=False, sigma=5.0):
        super().__init__()

        # `and` short-circuits, so HAS_FOURIER is only consulted when the
        # encoding was actually requested.
        use_fourier = bool(positional_encoding and HAS_FOURIER)
        if positional_encoding and not use_fourier:
            import warnings
            warnings.warn(
                "positional_encoding=True was requested but FourierFeatureTransform "
                "is not available; falling back to a plain linear input layer.",
                RuntimeWarning,
                stacklevel=2,
            )

        layers = []
        if use_fourier:
            layers.append(FourierFeatureTransform(input_dim, width, sigma))
            layers.append(nn.Linear(width * 2 + input_dim, width))
        else:
            layers.append(nn.Linear(input_dim, width))
        layers += [nn.ReLU(), nn.LayerNorm([width])]

        for _ in range(depth):
            layers += [nn.Linear(width, width), nn.ReLU(), nn.LayerNorm([width])]

        layers += [nn.Linear(width, out_dim), nn.Softmax(dim=1)]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Return the per-row class probabilities produced by ``self.mlp``."""
        return self.mlp(x)


def get_clip_model(name: str):
    """Load the CLIP model called `name` on the notebook-wide `device`.

    Args:
        name: Model identifier forwarded to ``clip.load``.

    Returns:
        The ``(model, preprocess)`` pair produced by ``clip.load`` with
        ``jit=False``.
    """
    load_options = {"device": device, "jit": False}
    clip_net, preprocess_fn = clip.load(name, **load_options)
    return clip_net, preprocess_fn


def save_renders(dirpath, i, rendered_images, name=None):
    """Write `rendered_images` as one image file under `dirpath`.

    Args:
        dirpath: Base output directory.
        i: Iteration index, used only to build the default file name.
        rendered_images: Image tensor handed to ``torchvision.utils.save_image``.
        name: Optional path relative to `dirpath`; defaults to
            ``renders/iter_{i}.jpg``.

    The ``renders`` sub-directory is created on every call, including when a
    custom `name` is supplied.
    """
    Path(dirpath, 'renders').mkdir(parents=True, exist_ok=True)
    relative_target = f"renders/iter_{i}.jpg" if name is None else name
    target_path = os.path.join(dirpath, relative_target)
    torchvision.utils.save_image(rendered_images, target_path)


def save_exp_config(config, output_dir):
    """Persist an experiment configuration as JSON and append it to a CSV log.

    Args:
        config: Mapping of setting name to value. It must be JSON-serialisable.
        output_dir: Directory that receives both files; created if missing.

    Side effects:
        * ``experiment_config.json`` is overwritten with `config`.
        * One row is appended to ``experiments_summary.csv``. The header row is
          written when the file does not exist yet or exists but is empty.

    Note:
        The CSV columns come from the keys of the current `config`. Rows from a
        config with a different key set are appended under the header already
        in the file, without realignment.
    """
    import csv

    Path(output_dir).mkdir(parents=True, exist_ok=True)

    with open(os.path.join(output_dir, 'experiment_config.json'), 'w') as f:
        json.dump(config, f, indent=2)

    csv_path = os.path.join(output_dir, 'experiments_summary.csv')
    # An existing zero-byte file has no header either, so treat it like a new file.
    write_header = (not os.path.exists(csv_path)) or os.path.getsize(csv_path) == 0
    with open(csv_path, 'a', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=list(config.keys()))
        if write_header:
            writer.writeheader()
        writer.writerow(config)


def save_final_results(log_dir, name, mesh, mlp, vertices, colors, render, background):
    """Export the hard-labelled mesh as a coloured PLY plus a final render image.

    Args:
        log_dir: Directory receiving ``{name}.ply`` and ``final_render.jpg``.
        name: File stem of the exported PLY.
        mesh: Mesh object that is recoloured, rendered and exported.
        mlp: Trained network; switched to eval mode and left in that mode.
        vertices: Input passed to `mlp`, one row per vertex.
        colors: Accepted for interface compatibility; this function builds its
            own highlight/gray palette and does not read this argument.
        render: Renderer exposing ``render_views``.
        background: Background colour forwarded to ``render_views``.
    """
    mlp.eval()
    with torch.no_grad():
        class_probs = mlp(vertices)
        # Index of the winning class per row, kept as a column for scatter_/where.
        winner = torch.argmax(class_probs, 1, keepdim=True)
        hard_assignment = torch.zeros_like(class_probs).to(device).scatter_(1, winner, 1)

        # Integer 0-255 colours for the export, and their /255 versions for rendering.
        highlight_rgb = torch.tensor([204, 255, 0], device=device)
        gray_rgb = torch.tensor([180, 180, 180], device=device)
        unit_palette = torch.stack((highlight_rgb / 255, gray_rgb / 255)).to(device)

        color_mesh(hard_assignment, mesh, unit_palette)

        views = render.render_views(
            mesh, num_views=5, show=False, center_azim=0, center_elev=0,
            std=4, return_views=True, lighting=True, background=background)
        rendered_images, _, _ = views

        # Class 0 gets the highlight colour, every other class gets gray;
        # the column index broadcasts against the 3-element colour vectors.
        ply_path = os.path.join(log_dir, f"{name}.ply")
        vertex_rgb = torch.where(winner == 0, highlight_rgb, gray_rgb)
        mesh.export(ply_path, extension="ply", color=vertex_rgb)

        save_renders(log_dir, 0, rendered_images, name='final_render.jpg')


def clip_loss(rendered_images, text_embedding, clip_model, clip_transform, augment_transform, n_augs, clipavg="view"):
    """Negative CLIP cosine similarity between rendered views and a text embedding.

    Args:
        rendered_images: Batch of rendered views.
        text_embedding: Text embedding; normalised along ``dim=1`` here.
        clip_model: Object exposing ``encode_image``.
        clip_transform: Transform applied once when ``n_augs == 0``.
        augment_transform: Transform applied once per pass when ``n_augs > 0``.
        n_augs: Number of augmented passes. ``0`` means a single pass through
            `clip_transform`.
        clipavg: ``"view"`` averages the normalised image embeddings over the
            batch before the similarity; any other value averages the per-image
            similarities.

    Returns:
        For ``n_augs == 0`` the negative similarity of the single pass. Otherwise
        the SUM (not the mean) of the negative similarities over the `n_augs`
        passes, accumulated from a float ``0.0``.
    """
    def _negative_similarity(image_batch):
        # Encode, L2-normalise both sides, then compare according to `clipavg`.
        image_feats = clip_model.encode_image(image_batch)
        image_feats = image_feats / image_feats.norm(dim=1, keepdim=True)
        text_unit = text_embedding / text_embedding.norm(dim=1, keepdim=True)
        if clipavg == "view":
            pooled = image_feats.mean(0, keepdim=True)
            return -torch.cosine_similarity(pooled, text_unit, dim=1)
        return -torch.mean(torch.cosine_similarity(image_feats, text_unit, dim=1))

    if n_augs == 0:
        return _negative_similarity(clip_transform(rendered_images))

    total = 0.0
    for _ in range(n_augs):
        total = total + _negative_similarity(augment_transform(rendered_images))
    return total


Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.
Warp 1.8.1 initialized:
   CUDA Toolkit 12.8, Driver 12.6
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
     "cuda:1"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
   CUDA peer access:
     Not supported
   Kernel cache:
     /root/.cache/warp/1.8.1


**CELL 4 — Init CLIP & prompt**

In [4]:
# =========================
#   INIT CLIP & COLORS
# =========================
# Seed every RNG before the model and any tensors are created.
set_seed(exp_config["seed"])
# Only the model is kept; the preprocessing callable returned with it is discarded
# because the transforms below are applied to the rendered tensors instead.
clip_model, _ = get_clip_model(exp_config["clip_model_name"])

res = exp_config["render_res"]
# Per-channel mean / std applied to images before they are encoded.
# NOTE(review): presumably CLIP's published normalisation statistics — confirm.
clip_normalizer = transforms.Normalize(
    (0.48145466, 0.4578275, 0.40821073),
    (0.26862954, 0.26130258, 0.27577711)
)
# Deterministic path, used by clip_loss() when n_augs == 0: resize then normalise.
clip_transform = transforms.Compose([
    transforms.Resize((res, res), antialias=False),
    clip_normalizer
])
# Augmented path, used by clip_loss() once per augmentation pass when n_augs > 0.
# The crop scale range is pinned to (1, 1); the perspective warp is applied with
# probability 0.8 and fills exposed pixels with 1, the same value as the
# (1, 1, 1) background used by the training cell.
augment_transform = transforms.Compose([
    transforms.RandomResizedCrop(res, scale=(1, 1), antialias=False),
    transforms.RandomPerspective(fill=1, distortion_scale=0.5, p=0.8),
    clip_normalizer
])

# The prompt is encoded once, without gradients, and L2-normalised along dim=1.
# clip_loss() normalises the text embedding again on every call.
with torch.no_grad():
    prompt_token = clip.tokenize([exp_config["prompt"]]).to(device)
    encoded_text = clip_model.encode_text(prompt_token)
    encoded_text = encoded_text / encoded_text.norm(dim=1, keepdim=True)

# fixed 2-color palette (highlighter + gray)
# Row 0 is the highlight colour, row 1 the gray; both are expressed in [0, 1].
colors = torch.tensor([[204/255, 1.0, 0.0],
                       [180/255, 180/255, 180/255]], device=device)


**CELL 5 — Load PLY**

In [5]:
# =========================
#   LOAD PLY FROM REPO
# =========================
# Output folders: POINTCLOUD_DIR receives the normal-augmented copy of the input
# cloud, PCD_DIR receives the reconstructed mesh and all training artifacts.
POINTCLOUD_DIR = os.path.join(OUTPUT_ROOT, 'pointcloud')
PCD_DIR        = os.path.join(OUTPUT_ROOT, 'pcd')
Path(POINTCLOUD_DIR).mkdir(parents=True, exist_ok=True)
Path(PCD_DIR).mkdir(parents=True, exist_ok=True)

# Fail early if the config flag or the input file is not what this notebook expects.
assert USE_PLY_DIRECT, "This notebook expects PLYs to be provided in the repo."
ply_abs = os.path.join(REPO_ROOT, PLY_RELATIVE)
if not os.path.exists(ply_abs):
    raise FileNotFoundError(f"PLY not found: {ply_abs}")

pcd = o3d.io.read_point_cloud(ply_abs)
if len(pcd.points) == 0:
    raise RuntimeError(f"PLY is empty: {ply_abs}")

# Light-weight normals for robustness
# Normals are estimated from the 16 nearest neighbours, then their orientation
# is made consistent using k=24.
pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamKNN(knn=16))
pcd.orient_normals_consistent_tangent_plane(24)

# Save the cloud (now carrying normals) under the input file's stem; this copy
# is the file handed to reconstruct_mesh_bpa() in the reconstruction cell.
objbase = Path(ply_abs).stem
pcd_out = os.path.join(POINTCLOUD_DIR, f"{objbase}.ply")
o3d.io.write_point_cloud(pcd_out, pcd)
print(f"[PCD] Loaded: {ply_abs}  | N points = {len(pcd.points)}")


[PCD] Loaded: /kaggle/working/Affordance_Highlighting_Project_2024/data/candle_15000pts.ply  | N points = 15000


**CELL 6 — BPA reconstruction**

In [6]:
# =========================
#   BPA RECONSTRUCTION
# =========================
def reconstruct_mesh_bpa(ply_path: str, obj_output_path: str):
    """Reconstruct a triangle mesh from a point cloud with Ball Pivoting and save it.

    Steps: read the cloud, derive pivoting radii from the median
    nearest-neighbour distance, run BPA, clean the mesh, crop it to the cloud's
    slightly enlarged bounding box, smooth it, recompute normals and write it.

    Args:
        ply_path: Point cloud file to read.
        obj_output_path: Destination mesh file.

    Raises:
        RuntimeError: if the point cloud is empty, if the reconstruction yields
            no triangles, or if Open3D reports that the mesh could not be written.
    """
    pcd = o3d.io.read_point_cloud(ply_path)
    n_points = len(pcd.points)
    print(f"[RECON] Points: {n_points}")
    if n_points == 0:
        raise RuntimeError(f"PLY is empty: {ply_path}")

    # Ball pivoting works from oriented point normals. Estimate them here when the
    # file carries none, using the same parameters as the loading cell.
    if not pcd.has_normals():
        pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamKNN(knn=16))
        pcd.orient_normals_consistent_tangent_plane(24)

    # Radii based on median NN distance (dense multi-scale list helps close small gaps)
    dists = np.asarray(pcd.compute_nearest_neighbor_distance())
    med = float(np.median(dists)) if dists.size else 0.01
    if not med > 0:
        # Many duplicated points can drive the median to 0, which would make every
        # radius 0. Fall back to the median of the non-zero spacings, else 0.01.
        positive = dists[dists > 0]
        med = float(np.median(positive)) if positive.size else 0.01
    radii = o3d.utility.DoubleVector([med*1.2, med*1.6, med*2.0, med*2.4, med*3.0, med*3.8])

    mesh = o3d.geometry.TriangleMesh.create_from_point_cloud_ball_pivoting(pcd, radii)

    # Cleanup
    mesh.remove_unreferenced_vertices()
    mesh.remove_degenerate_triangles()
    mesh.remove_duplicated_vertices()
    mesh.remove_duplicated_triangles()
    mesh.remove_non_manifold_edges()

    # Crop to PCD AABB (slightly expanded) to remove any outer shell
    aabb = pcd.get_axis_aligned_bounding_box()
    bbox = aabb.scale(1.01, aabb.get_center())
    mesh = mesh.crop(bbox)

    if len(mesh.triangles) == 0:
        raise RuntimeError(f"BPA produced no triangles for: {ply_path}")

    # Gentle smoothing
    if hasattr(mesh, "filter_smooth_taubin"):
        mesh = mesh.filter_smooth_taubin(number_of_iterations=10)
    else:
        mesh = mesh.filter_smooth_simple(number_of_iterations=3)

    # Normals are computed after smoothing so they describe the final vertex positions.
    mesh.compute_vertex_normals()

    # write_triangle_mesh reports failure through its boolean return value.
    if not o3d.io.write_triangle_mesh(obj_output_path, mesh):
        raise RuntimeError(f"Failed to write mesh: {obj_output_path}")
    print(f"[RECON] BPA -> {obj_output_path}")

# Reconstruct from the copy written by the loading cell (pcd_out) and store the
# resulting OBJ in PCD_DIR under "<input stem>_frompc.obj".
recon_obj_abs = os.path.join(PCD_DIR, f"{objbase}_frompc.obj")
reconstruct_mesh_bpa(pcd_out, recon_obj_abs)

# Drop the notebook-level point cloud. gc.collect() is the last expression of
# the cell, so its integer return value is echoed as the cell output.
del pcd; gc.collect()


[RECON] Points: 15000
[RECON] BPA -> /kaggle/working/output/pcd/candle_15000pts_frompc.obj


192

**CELL 7 — Train 3D Highlighter on reconstructed mesh**

In [7]:
# =========================
#   TRAIN ON RECON MESH
# =========================
# Run-specific copy of the configuration; the shared exp_config stays untouched.
exp_config_pc = copy.deepcopy(exp_config)
exp_config_pc["obj_path"]   = recon_obj_abs
exp_config_pc["output_dir"] = PCD_DIR
Path(os.path.join(exp_config_pc["output_dir"], 'renders')).mkdir(parents=True, exist_ok=True)

# Renderer at the configured resolution, and the reconstructed mesh normalised in place.
renderer = Renderer(dim=(exp_config_pc["render_res"], exp_config_pc["render_res"]))
mesh_pc  = Mesh(exp_config_pc["obj_path"])
MeshNormalizer(mesh_pc)()

background  = torch.tensor((1., 1., 1.), device=device)
# Network input: a copy of the (normalised) mesh vertices.
vertices_pc = mesh_pc.vertices.clone()
n_views     = exp_config_pc["n_views"]

mlp = NeuralHighlighter(
    depth=exp_config_pc["mlp_num_layers"],
    width=exp_config_pc["mlp_hidden_dim"],
    out_dim=exp_config_pc["mlp_out_dim"],
    input_dim=exp_config_pc["mlp_input_dim"],
    positional_encoding=exp_config_pc["positional_encoding"],
    sigma=exp_config_pc["sigma"]
).to(device)

# Only the highlighter's parameters are optimised.
optim = torch.optim.Adam(mlp.parameters(), exp_config_pc["learning_rate"])

best_loss, best_iter, best_state = float('inf'), -1, None
losses = []
start_time = time.time()

for i in tqdm(range(exp_config_pc["n_iter"])):
    optim.zero_grad()

    # Predict per-vertex class probabilities and paint the mesh with them.
    pred_class = mlp(vertices_pc)
    color_mesh(pred_class, mesh_pc, colors)

    rendered_images, elev, azim = renderer.render_views(
        mesh_pc, num_views=n_views, show=False,
        center_azim=0, center_elev=0, std=4,
        return_views=True, lighting=True, background=background
    )

    loss = clip_loss(
        rendered_images, encoded_text, clip_model,
        clip_transform, augment_transform, exp_config_pc["n_augs"],
        clipavg=exp_config_pc["clipavg"]
    )
    (loss if torch.is_tensor(loss) else torch.tensor(loss, device=device)).mean().backward()

    # Record the loss and snapshot the weights BEFORE the optimizer step: this
    # loss was produced by the current parameters, so they are the ones to keep
    # when it is the best seen so far. Snapshotting after optim.step() would
    # store weights that are one update past the ones that earned the loss.
    with torch.no_grad():
        val = loss.item() if torch.is_tensor(loss) else float(loss)
        losses.append(val)
        if val < best_loss:
            best_loss  = val
            best_iter  = i
            best_state = copy.deepcopy(mlp.state_dict())

    optim.step()

    # Periodic housekeeping: free cached memory, log the running average of the
    # loss (negative similarity) and save the current renders.
    if i % 100 == 0:
        torch.cuda.empty_cache(); gc.collect()
        print(f"Last 100 CLIP score (pcd): {np.mean(losses[-100:])}")
        save_renders(exp_config_pc["output_dir"], i, rendered_images)
        with open(os.path.join(exp_config_pc["output_dir"], "training_info.txt"), "a") as f:
            f.write(f"[PCD] Iter {i} | Prompt: {exp_config_pc['prompt']} | Last100 avg: {np.mean(losses[-100:])} | Loss: {losses[-1]}\n")

# No snapshot exists when the loop never ran (n_iter == 0) or when no loss ever
# compared below +inf (e.g. every loss was NaN); fail with a clear message.
if best_state is None:
    raise RuntimeError("Training produced no usable loss value; there is no best state to restore.")

# Restore best and save artifacts
mlp.load_state_dict(best_state)
# Scores are stored as the negated loss values.
exp_config_pc["best_iter"]              = best_iter
exp_config_pc["best_clip_score"]        = -best_loss
exp_config_pc["final_clip_score"]       = -losses[-1]
exp_config_pc["avg_clip_score_last100"] = -float(np.mean(losses[-100:]))
exp_config_pc["runtime_seconds"]        = time.time() - start_time
save_exp_config(exp_config_pc, exp_config_pc["output_dir"])

objbase_pc = Path(exp_config_pc["obj_path"]).stem
save_final_results(
    exp_config_pc["output_dir"],
    f"{objbase_pc}_best_iter{best_iter}",
    mesh_pc, mlp, vertices_pc, colors, renderer, background
)

with open(os.path.join(exp_config_pc["output_dir"], "prompt.txt"), "w") as f:
    f.write(exp_config_pc["prompt"])

print(f"[DONE] Best iter: {best_iter}, best CLIP score: {-best_loss:.4f}")


  0%|          | 1/2500 [00:01<1:18:42,  1.89s/it]

Last 100 CLIP score (pcd): -1.556640625


  4%|▍         | 101/2500 [01:56<52:16,  1.31s/it]

Last 100 CLIP score (pcd): -1.666103515625


  8%|▊         | 201/2500 [03:56<51:07,  1.33s/it]

Last 100 CLIP score (pcd): -1.709755859375


 12%|█▏        | 301/2500 [05:57<49:26,  1.35s/it]

Last 100 CLIP score (pcd): -1.71279296875


 16%|█▌        | 401/2500 [08:00<47:11,  1.35s/it]

Last 100 CLIP score (pcd): -1.715419921875


 20%|██        | 501/2500 [10:02<44:57,  1.35s/it]

Last 100 CLIP score (pcd): -1.717099609375


 24%|██▍       | 601/2500 [12:04<42:42,  1.35s/it]

Last 100 CLIP score (pcd): -1.7183203125


 28%|██▊       | 701/2500 [14:06<40:37,  1.35s/it]

Last 100 CLIP score (pcd): -1.71568359375


 32%|███▏      | 801/2500 [16:08<38:12,  1.35s/it]

Last 100 CLIP score (pcd): -1.70615234375


 36%|███▌      | 901/2500 [18:11<36:03,  1.35s/it]

Last 100 CLIP score (pcd): -1.71875


 40%|████      | 1001/2500 [20:13<33:43,  1.35s/it]

Last 100 CLIP score (pcd): -1.718095703125


 44%|████▍     | 1101/2500 [22:15<31:26,  1.35s/it]

Last 100 CLIP score (pcd): -1.72951171875


 48%|████▊     | 1201/2500 [24:17<29:08,  1.35s/it]

Last 100 CLIP score (pcd): -1.7227734375


 52%|█████▏    | 1301/2500 [26:20<26:55,  1.35s/it]

Last 100 CLIP score (pcd): -1.72166015625


 56%|█████▌    | 1401/2500 [28:22<24:38,  1.35s/it]

Last 100 CLIP score (pcd): -1.72421875


 60%|██████    | 1501/2500 [30:24<22:26,  1.35s/it]

Last 100 CLIP score (pcd): -1.721533203125


 64%|██████▍   | 1601/2500 [32:26<20:07,  1.34s/it]

Last 100 CLIP score (pcd): -1.71845703125


 68%|██████▊   | 1701/2500 [34:28<17:57,  1.35s/it]

Last 100 CLIP score (pcd): -1.725615234375


 72%|███████▏  | 1801/2500 [36:30<15:45,  1.35s/it]

Last 100 CLIP score (pcd): -1.720625


 76%|███████▌  | 1901/2500 [38:33<13:26,  1.35s/it]

Last 100 CLIP score (pcd): -1.731025390625


 80%|████████  | 2001/2500 [40:35<11:12,  1.35s/it]

Last 100 CLIP score (pcd): -1.718046875


 84%|████████▍ | 2101/2500 [42:37<08:59,  1.35s/it]

Last 100 CLIP score (pcd): -1.724306640625


 88%|████████▊ | 2201/2500 [44:39<06:43,  1.35s/it]

Last 100 CLIP score (pcd): -1.73392578125


 92%|█████████▏| 2301/2500 [46:42<04:28,  1.35s/it]

Last 100 CLIP score (pcd): -1.732705078125


 96%|█████████▌| 2401/2500 [48:44<02:13,  1.35s/it]

Last 100 CLIP score (pcd): -1.716943359375


100%|██████████| 2500/2500 [50:44<00:00,  1.22s/it]


[DONE] Best iter: 1692, best CLIP score: 1.8750
