In [None]:
# GPU: CUDA 11.8, PyTorch 2.0.1 (Kaggle compatible)
# Environment bootstrap: upgrade pip, install CLIP from source, then pin the
# torch / kaolin / open3d stack used by the rest of the notebook.
!pip install --upgrade -q pip
!pip install -q git+https://github.com/openai/CLIP.git
# Remove any kaolin build already present before the pinned wheel is installed below.
!pip uninstall -y -q kaolin


# Pinned PyTorch stack, resolved against the cu118 wheel index.
# NOTE(review): the recorded cell output shows pip reporting that
# pytorch-lightning 2.5.2 requires torch>=2.1.0, which this pin does not satisfy.
!pip install -q torch==2.0.1+cu118 torchvision==0.15.2+cu118 torchaudio==2.0.2+cu118 \
  -f https://download.pytorch.org/whl/cu118/torch_stable.html


# kaolin wheel taken from the index built for torch 2.0.1 + cu118.
!pip install -q kaolin==0.17.0 \
  -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.0.1_cu118.html

!pip install -q open3d==0.18.0 tqdm pillow gdown transforms3d scipy


# Start from a clean checkout. This also deletes any previous output_PART3 results.
!rm -rf /kaggle/working/Affordance_Highlighting_Project_2024 /kaggle/working/output_PART3
!git clone -q https://github.com/MirkoDiMa/Affordance_Highlighting_Project_2024.git /kaggle/working/Affordance_Highlighting_Project_2024

import os
import sys

# Conservative CPU threading: one thread per numeric backend.
# These caps are typically read when the BLAS/OpenMP runtime is first loaded,
# so they are exported BEFORE numpy is imported; setting them afterwards may
# have no effect on an already-initialised thread pool.
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"

import json, random, time
import numpy as np

# Make the cloned project's modules importable. The membership check keeps
# sys.path free of duplicates when the cell is re-run.
if '/kaggle/working/Affordance_Highlighting_Project_2024' not in sys.path:
    sys.path.append('/kaggle/working/Affordance_Highlighting_Project_2024')


[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[33m  DEPRECATION: Building 'clip' using the legacy setup.py bdist_wheel mechanism, which will be removed in a future version. pip 25.3 will enforce this behaviour change. A possible replacement is to use the standardized build interface by setting the `--use-pep517` option, (possibly combined with `--no-build-isolation`), or adding a `pyproject.toml` file to the source tree of 'clip'. Discussion can be found at https://github.com/pypa/pip/issues/6334[0m[33m
[0m  Building wheel for clip (setup.py) ... [?25l[?25hdone
[0m[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
pytorch-lightning 2.5.2 requires torch>=2.1.0, but you have torch 2.0.1+cu118 which is incompa

In [None]:
from pathlib import Path
import random, numpy as np

# Filesystem layout: where the project was cloned and where this run writes its artefacts.
REPO_ROOT   = '/kaggle/working/Affordance_Highlighting_Project_2024'
OUTPUT_ROOT = '/kaggle/working/output_PART3'
Path(OUTPUT_ROOT).mkdir(parents=True, exist_ok=True)

# Single source of truth for every tunable of the experiment.
exp_config = {
    # --- what to highlight ---
    "target_category":   "Bottle",       # value matched against each item's "semantic class"
    "target_affordance": "wrap_grasp",   # key used to look up the per-point ground-truth labels
    "prompt_tpl":        "A 3D render of a gray {cat} with the grasped area highlighted",

    # --- CLIP backbone ---
    "clip_model_name": "ViT-B/32",

    # --- NeuralHighlighter MLP ---
    "mlp_input_dim":   3,      # one xyz coordinate per vertex
    "mlp_hidden_dim":  256,    # width of every hidden layer
    "mlp_num_layers":  6,      # hidden Linear/ReLU/LayerNorm blocks after the input block
    "mlp_out_dim":     2,      # two classes: highlight / background
    "positional_encoding": False,   # True requests a FourierFeatureTransform front-end
    "sigma":           5.0,    # forwarded to FourierFeatureTransform when it is used

    # --- rendering / optimisation ---
    "render_res":      224,    # square render size; also the CLIP input resolution
    "n_views":         8,      # views rendered per optimisation step
    "learning_rate":   1e-4,
    "n_iter_obj":      2500,   # optimisation steps per object
    "n_augs":          3,      # augmented CLIP passes per step (0 = plain resize only)
    "clipavg":         "view", # forwarded to clip_loss

    # --- point cloud -> mesh reconstruction ---
    "recon_mode":      "bare_poisson",   # "bare_poisson" or "bpa_clean"
    "poisson_depth":   9,                # depth passed to the Poisson reconstruction
}

# Seed Python and NumPy here; the torch RNGs are seeded later through set_seed().
seed = 45
random.seed(seed); np.random.seed(seed)


In [None]:
import torch, torch.nn as nn, torchvision
import clip
from torchvision import transforms
from tqdm import tqdm
import open3d as o3d

from render import Renderer
from mesh import Mesh

# Normalizer: the class is looked up under two module names; presumably the
# project ships it under one or the other depending on the revision.
# Only ImportError triggers the fallback, so a genuine failure inside the
# module (e.g. a broken dependency) is surfaced instead of being masked.
try:
    from Normalization import MeshNormalizer
except ImportError:
    from MeshNormalizer import MeshNormalizer

from utils import device, color_mesh

# FourierFeatureTransform is optional; HAS_FOURIER records whether it is available.
try:
    from utils import FourierFeatureTransform
    HAS_FOURIER = True
except ImportError:
    HAS_FOURIER = False

def set_seed(s):
    """Seed every RNG used by the notebook with `s` and make cuDNN deterministic.

    Covers torch (CPU, current CUDA device, all CUDA devices), NumPy's global
    RNG and Python's `random` module. cuDNN autotuning is switched off and
    deterministic kernels are requested.
    """
    seeders = (
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
        np.random.seed,
        random.seed,
    )
    for seed_fn in seeders:
        seed_fn(s)

    cudnn = torch.backends.cudnn
    cudnn.benchmark = False
    cudnn.deterministic = True

class NeuralHighlighter(nn.Module):
    """Two-class MLP (highlight / background) mapping per-vertex inputs to class probabilities.

    Architecture: an input block, `depth` hidden blocks of
    Linear -> ReLU -> LayerNorm, then a Linear head followed by a softmax over
    dim 1, so the input must be 2-D (one row per vertex).

    Args:
        depth: number of hidden blocks after the input block.
        width: hidden layer width.
        out_dim: number of output classes.
        input_dim: size of each input row (3 for xyz coordinates).
        positional_encoding: if True, prepend a FourierFeatureTransform.
            When that transform could not be imported, a RuntimeWarning is
            emitted and the raw coordinates are used instead.
        sigma: forwarded to FourierFeatureTransform.
    """
    def __init__(self, depth, width, out_dim, input_dim=3, positional_encoding=False, sigma=5.0):
        super().__init__()

        use_fourier = False
        if positional_encoding:
            if HAS_FOURIER:
                use_fourier = True
            else:
                # Previously this fell back silently, so a run configured with
                # positional encoding could train without it unnoticed.
                import warnings
                warnings.warn(
                    "positional_encoding=True was requested but FourierFeatureTransform "
                    "is not available; falling back to raw input coordinates.",
                    RuntimeWarning,
                    stacklevel=2,
                )

        layers = []
        if use_fourier:
            layers += [FourierFeatureTransform(input_dim, width, sigma),
                       nn.Linear(width * 2 + input_dim, width)]
        else:
            layers += [nn.Linear(input_dim, width)]
        layers += [nn.ReLU(), nn.LayerNorm([width])]
        for _ in range(depth):
            layers += [nn.Linear(width, width), nn.ReLU(), nn.LayerNorm([width])]
        layers += [nn.Linear(width, out_dim), nn.Softmax(dim=1)]
        self.mlp = nn.Sequential(*layers)

    def forward(self, x):
        """Return the per-row class probabilities for `x`."""
        return self.mlp(x)

def get_clip(name):
    """Load the CLIP checkpoint `name` on `device` (non-JIT) and return only the model.

    The second value returned by `clip.load` is discarded.
    """
    loaded = clip.load(name, device=device, jit=False)
    return loaded[0]

# CLIP transforms
# `res` is the square side length used both for rendering and as CLIP input size.
res = exp_config["render_res"]
# Per-channel mean / std handed to Normalize (presumably the statistics that
# CLIP's own preprocessing uses — verify against the CLIP package).
clip_normalizer = transforms.Normalize((0.48145466, 0.4578275, 0.40821073),
                                       (0.26862954, 0.26130258, 0.27577711))
# Deterministic path (used when no augmentation is requested): resize + normalise.
clip_transform = transforms.Compose([transforms.Resize((res,res), antialias=False),
                                     clip_normalizer])
# Stochastic path: a mild random crop and a mild random perspective warp, then normalise.
augment_transform = transforms.Compose([
    transforms.RandomResizedCrop(res, scale=(0.95, 1.0), antialias=False),  # less zoom
    transforms.RandomPerspective(fill=1, p=0.5, distortion_scale=0.2),      # less distortion
    clip_normalizer
])


def clip_loss(rendered_images, text_embedding, clip_model, n_augs, clipavg="view"):
    """Negative CLIP cosine similarity between rendered views and a text embedding.

    Args:
        rendered_images: batch of rendered views, one image per row.
        text_embedding: text features with one row per prompt; normalised here.
        clip_model: object exposing `encode_image`.
        n_augs: 0 encodes the views once through `clip_transform`; a positive
            value encodes them `n_augs` times through the random
            `augment_transform` and SUMS (does not average) the losses.
        clipavg: "view" (default) averages the image embeddings over the views
            before comparing them with the text. Any other value compares
            every view with the text and averages the similarities. This
            parameter was previously accepted but ignored; the default keeps
            the former behaviour.

    Returns:
        A tensor holding the negated similarity (lower is better). With a
        non-positive, non-zero `n_augs` the loop does not run and the float
        0.0 is returned, as before.
    """
    # The text embedding does not change between augmentations: normalise it once.
    txt = text_embedding / text_embedding.norm(dim=1, keepdim=True)

    def _neg_similarity(images):
        # Encode a batch of views and score it against the normalised text.
        enc = clip_model.encode_image(images)
        enc = enc / enc.norm(dim=1, keepdim=True)
        if clipavg == "view":
            return -torch.cosine_similarity(enc.mean(0, keepdim=True), txt, dim=1)
        # Per-view similarities, averaged; assumes a single text row (or one per view).
        return -torch.cosine_similarity(enc, txt, dim=1).mean(0, keepdim=True)

    if n_augs == 0:
        return _neg_similarity(clip_transform(rendered_images))

    loss = 0.0
    for _ in range(n_augs):
        loss = loss + _neg_similarity(augment_transform(rendered_images))
    return loss


Jupyter environment detected. Enabling Open3D WebVisualizer.
[Open3D INFO] WebRTC GUI backend enabled.
[Open3D INFO] WebRTCWindowSystem: HTTP handshake server disabled.
Warp 1.8.1 initialized:
   CUDA Toolkit 12.8, Driver 12.6
   Devices:
     "cpu"      : "x86_64"
     "cuda:0"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
     "cuda:1"   : "Tesla T4" (15 GiB, sm_75, mempool enabled)
   CUDA peer access:
     Not supported
   Kernel cache:
     /root/.cache/warp/1.8.1


In [None]:
import gdown, zipfile, os, pickle as pkl
from os.path import join as opj

# Directory the full-shape dataset archive is extracted into.
DATASET_DIR = '/kaggle/working/dataset_affnet'
os.makedirs(DATASET_DIR, exist_ok=True)

# Google Drive file id of the zipped dataset and the local path of the downloaded zip.
file_id = '1siZtGusB1LfQVapTvNOiYi8aeKKAgcDF'
zip_path = '/kaggle/working/full-shape.zip'
# Download only when the zip is not on disk yet, so re-running the cell skips the transfer.
if not os.path.exists(zip_path):
    gdown.download(f'https://drive.google.com/uc?export=download&id={file_id}', zip_path, quiet=False)

# Extraction is unconditional: it runs again on every execution of the cell.
with zipfile.ZipFile(zip_path, 'r') as zf:
    zf.extractall(DATASET_DIR)

def load_split(split='train'):
    """Unpickle `full_shape_{split}_data.pkl` from DATASET_DIR and return its content.

    The file is decoded with the latin1 encoding.
    NOTE(review): unpickling executes code embedded in the file, so this must
    only be pointed at a trusted dataset archive.
    """
    pkl_path = opj(DATASET_DIR, f'full_shape_{split}_data.pkl')
    with open(pkl_path, 'rb') as handle:
        return pkl.load(handle, encoding="latin1")

# Load both splits into memory once; later cells index into these sequences.
train_data = load_split('train')
val_data   = load_split('val')
# Quick sanity check on the number of items per split.
print(f"Train items: {len(train_data)} | Val items: {len(val_data)}")

# Utility
def select_indices_by_category(data, category, k=3):
    """Return up to `k` randomly chosen indices of items whose "semantic class" equals `category`.

    The candidates are shuffled with Python's global `random` state, so the
    selection depends on the seed and on every earlier draw from that RNG.
    """
    matches = []
    for position, entry in enumerate(data):
        if entry["semantic class"] == category:
            matches.append(position)
    random.shuffle(matches)
    return matches[:k]


Downloading...
From (original): https://drive.google.com/uc?export=download&id=1siZtGusB1LfQVapTvNOiYi8aeKKAgcDF
From (redirected): https://drive.google.com/uc?export=download&id=1siZtGusB1LfQVapTvNOiYi8aeKKAgcDF&confirm=t&uuid=085f6d7e-d9a7-4891-8d6c-3267631ae38d
To: /kaggle/working/full-shape.zip
100%|██████████| 558M/558M [00:02<00:00, 270MB/s] 


Train items: 16082 | Val items: 2285


In [None]:
import open3d as o3d
import numpy as np
from scipy.spatial import cKDTree

def _cleanup_mesh(mesh):
    """Run Open3D's in-place topological cleanup passes on `mesh`."""
    mesh.remove_unreferenced_vertices()
    mesh.remove_degenerate_triangles()
    mesh.remove_duplicated_vertices()
    mesh.remove_duplicated_triangles()
    mesh.remove_non_manifold_edges()


def _crop_to_cloud_bbox(mesh, pcd, margin=1.01):
    """Crop `mesh` to the point cloud's axis-aligned bounding box scaled by `margin`."""
    aabb = pcd.get_axis_aligned_bounding_box()
    bbox = aabb.scale(margin, aabb.get_center())
    return mesh.crop(bbox)


def _poisson_mesh(pcd, depth):
    """Poisson reconstruction at `depth`, cropped to the cloud's box and cleaned."""
    mesh, _ = o3d.geometry.TriangleMesh.create_from_point_cloud_poisson(
        pcd, depth=int(depth)
    )
    mesh = _crop_to_cloud_bbox(mesh, pcd)
    _cleanup_mesh(mesh)
    return mesh


def _bpa_clean_mesh(pcd):
    """Multi-scale Ball Pivoting, then cleanup, crop, vertex normals and smoothing."""
    # Ball radii are multiples of the median nearest-neighbour spacing.
    dists = np.asarray(pcd.compute_nearest_neighbor_distance())
    med = float(np.median(dists)) if dists.size else 0.01
    if not med > 0:
        # Mostly duplicated points give a zero spacing and therefore zero radii;
        # reuse the same default as for an empty distance array.
        med = 0.01
    radii = o3d.utility.DoubleVector([med*1.2, med*1.6, med*2.0, med*2.4, med*3.0, med*3.8])
    mesh = o3d.geometry.TriangleMesh.create_from_point_cloud_ball_pivoting(pcd, radii)

    _cleanup_mesh(mesh)
    mesh = _crop_to_cloud_bbox(mesh, pcd)

    mesh.compute_vertex_normals()
    # Prefer Taubin smoothing when this Open3D build provides it.
    if hasattr(mesh, "filter_smooth_taubin"):
        mesh = mesh.filter_smooth_taubin(number_of_iterations=10)
    else:
        mesh = mesh.filter_smooth_simple(number_of_iterations=3)
    return mesh


def reconstruct_mesh_from_points(points_xyz: np.ndarray,
                                 out_obj: str,
                                 mode="bpa_clean",
                                 poisson_depth=9,
                                 fallback_bpa=True):
    """Reconstruct a triangle mesh from a point cloud and write it to `out_obj`.

    Args:
        points_xyz: non-empty (N, 3) array of point coordinates.
        out_obj: path of the mesh file to write.
        mode: "bpa_clean" — multi-scale Ball Pivoting + cleanup + crop to the
            cloud's bounding box + smoothing; "bare_poisson" — Poisson
            reconstruction at `poisson_depth`, cropped and cleaned.
        poisson_depth: depth passed to the Poisson reconstruction.
        fallback_bpa: when True and the Poisson result has no triangles left,
            the "bpa_clean" pipeline is run instead. This flag was previously
            accepted but never read.

    Returns:
        The reconstructed Open3D mesh (also written to `out_obj`).

    Raises:
        ValueError: unknown `mode`, or `points_xyz` is not a non-empty (N, 3) array.
    """
    # Validate cheap things first, before any expensive geometry work.
    if mode not in ("bare_poisson", "bpa_clean"):
        raise ValueError(f"Unknown recon_mode: {mode}")
    pts = np.asarray(points_xyz, dtype=np.float64)
    if pts.ndim != 2 or pts.shape[1] != 3 or pts.shape[0] == 0:
        raise ValueError(
            f"points_xyz must be a non-empty (N, 3) array, got shape {pts.shape}"
        )

    # 1) Open3D point cloud with estimated, consistently oriented normals.
    pcd = o3d.geometry.PointCloud()
    pcd.points = o3d.utility.Vector3dVector(pts)
    pcd.estimate_normals(search_param=o3d.geometry.KDTreeSearchParamKNN(knn=16))
    try:
        pcd.orient_normals_consistent_tangent_plane(24)
    except Exception:
        # Best effort: if consistent orientation fails, orient towards a fixed camera.
        pcd.orient_normals_towards_camera_location(np.array([0, 0, 3.0]))

    # 2) Surface reconstruction.
    if mode == "bare_poisson":
        mesh = _poisson_mesh(pcd, poisson_depth)
        if fallback_bpa and len(mesh.triangles) == 0:
            mesh = _bpa_clean_mesh(pcd)
    else:
        mesh = _bpa_clean_mesh(pcd)

    # 3) Save the mesh.
    o3d.io.write_triangle_mesh(out_obj, mesh)
    return mesh



def map_vertices_to_points(verts: np.ndarray, points_xyz: np.ndarray):
    """For every row of `verts`, return the index of its nearest row in `points_xyz`."""
    nearest = cKDTree(points_xyz).query(verts, k=1)
    return nearest[1]
    
def pc_normalize_np(pc: np.ndarray):
    """Centre a point cloud on its mean and scale it by its largest point norm.

    Returns:
        (normalised points, centroid with the leading axis squeezed, radius).
        The radius falls back to 1.0 when it is not strictly positive.
    """
    centroid = pc.mean(axis=0, keepdims=True)
    centered = pc - centroid
    radius = np.linalg.norm(centered, axis=1).max()
    if not radius > 0:
        radius = 1.0
    return centered / radius, centroid.squeeze(0), radius


In [None]:
# Seed before loading CLIP so everything that follows starts from the same RNG state.
set_seed(seed)
clip_model = get_clip(exp_config["clip_model_name"])

# RGB colours in [0, 1], one row per output class:
# row 0 = highlight (204, 255, 0), row 1 = neutral gray (180, 180, 180).
# Handed to color_mesh together with the per-vertex class scores.
palette = torch.tensor([[204/255, 1.0, 0.0],    
                        [180/255, 180/255, 180/255]], device=device)

def train_on_mesh(obj_path, prompt, out_dir,
                  n_iter=2500, n_views=5, n_augs=5, lr=1e-4,
                  area_target=0.75, area_weight=0.02):
    """Optimise a NeuralHighlighter on one mesh so its renders match `prompt` under CLIP.

    Args:
        obj_path: path of the mesh file to load.
        prompt: text prompt encoded once with CLIP.
        out_dir: directory receiving `iter_{i}.jpg` snapshots (every 100
            steps), `final_render.jpg` and `<mesh basename>.ply`.
        n_iter: number of optimisation steps.
        n_views: views rendered per step.
        n_augs: augmentation passes forwarded to `clip_loss`.
        lr: Adam learning rate.
        area_target: desired mean highlight probability over the vertices.
        area_weight: weight of the squared deviation from `area_target`.

    Returns:
        The trained network in eval mode, holding the weights that produced
        the lowest recorded loss.
    """
    os.makedirs(out_dir, exist_ok=True)

    renderer = Renderer(dim=(exp_config["render_res"], exp_config["render_res"]))
    mesh     = Mesh(obj_path)
    MeshNormalizer(mesh)()
    vertices = mesh.vertices.clone()
    background = torch.tensor((1., 1., 1.), device=device)

    # Text CLIP: the prompt embedding is fixed, so it is encoded once without gradients.
    with torch.no_grad():
        tok = clip.tokenize([prompt]).to(device)
        txt = clip_model.encode_text(tok)
        txt = txt / txt.norm(dim=1, keepdim=True)

    mlp = NeuralHighlighter(
        depth=exp_config["mlp_num_layers"],
        width=exp_config["mlp_hidden_dim"],
        out_dim=exp_config["mlp_out_dim"],
        input_dim=exp_config["mlp_input_dim"],
        positional_encoding=exp_config["positional_encoding"],
        sigma=exp_config["sigma"]
    ).to(device)

    optim = torch.optim.Adam(mlp.parameters(), lr)
    best_loss, best_state = float('inf'), None

    for i in tqdm(range(n_iter), desc=f"train {os.path.basename(obj_path)}", leave=False):
        optim.zero_grad()
        pred = mlp(vertices)  # (V, 2) class probabilities; column 0 = highlight

        # Soft prior pulling the mean highlight probability towards area_target.
        area_prior = (pred[:, 0].mean() - area_target).pow(2) * area_weight

        color_mesh(pred, mesh, palette)
        imgs, _, _ = renderer.render_views(
            mesh, num_views=n_views, show=False,
            center_azim=0, center_elev=0, std=4,
            return_views=True, lighting=True,
            background=background
        )

        loss = clip_loss(imgs, txt, clip_model, n_augs, clipavg=exp_config["clipavg"])
        loss = loss.mean() + area_prior
        loss.backward()

        # Snapshot BEFORE optim.step(): `val` was produced by the current
        # weights, so these are the weights that belong to the best loss.
        # Snapshotting after the step would store the already-updated weights.
        val = loss.item()
        if val < best_loss:
            best_loss = val
            best_state = {k: v.detach().clone() for k, v in mlp.state_dict().items()}

        optim.step()

        if i % 100 == 0:
            torchvision.utils.save_image(imgs, os.path.join(out_dir, f"iter_{i}.jpg"))

    if best_state is not None:
        mlp.load_state_dict(best_state)

    # Renders + PLY with hard (argmax) labels.
    mlp.eval()
    with torch.no_grad():
        probs   = mlp(vertices)
        max_idx = torch.argmax(probs, 1, keepdim=True)
        one_hot = torch.zeros_like(probs).scatter_(1, max_idx, 1)
        color_mesh(one_hot, mesh, palette)
        imgs, _, _ = renderer.render_views(
            mesh, num_views=5, show=False, center_azim=0, center_elev=0,
            std=4, return_views=True, lighting=True,
            background=background
        )
        torchvision.utils.save_image(imgs, os.path.join(out_dir, "final_render.jpg"))
        # (V, 1) class index broadcast against an RGB triple gives one colour per vertex.
        final_color = torch.where(max_idx == 0,
                                  torch.tensor([204, 255, 0], device=device),
                                  torch.tensor([180, 180, 180], device=device))
        base = os.path.splitext(os.path.basename(obj_path))[0]
        mesh.export(os.path.join(out_dir, f"{base}.ply"), extension="ply", color=final_color)

    return mlp

def miou_from_mesh_preds(mlp, mesh_vertices, points_xyz, gt_binary):
    """IoU of the highlight class after transferring mesh predictions to the point cloud.

    Each point of the cloud receives the label of its nearest mesh vertex,
    which guarantees one prediction per point and avoids 'holes' due to
    scattered voting.

    Args:
        mlp: callable returning (V, 2) class scores for `mesh_vertices`;
            column 0 is the highlight class.
        mesh_vertices: torch tensor of already normalised mesh vertices.
        points_xyz: (N, 3) raw point coordinates. They are centred on their
            mean and scaled by their largest norm before the lookup.
            NOTE(review): this assumes the mesh was normalised the same way —
            confirm against MeshNormalizer.
        gt_binary: per-point ground truth. Any shape with N elements is
            accepted and flattened; values > 0.5 count as positive.

    Returns:
        intersection / union of predicted and ground-truth highlight masks, or
        0.0 when both masks are empty.

    Raises:
        ValueError: `gt_binary` does not hold exactly one value per point.
    """
    import numpy as np
    from scipy.spatial import cKDTree

    pts = np.asarray(points_xyz, dtype=np.float32)
    # Flatten first: an (N, 1) mask would otherwise broadcast against the (N,)
    # prediction into an (N, N) matrix and silently yield a wrong IoU.
    gt_mask = np.asarray(gt_binary).reshape(-1) > 0.5
    if gt_mask.shape[0] != pts.shape[0]:
        raise ValueError(
            f"gt_binary has {gt_mask.shape[0]} values but points_xyz has {pts.shape[0]} points"
        )

    # 1) per-vertex prediction (class 0 = highlight)
    with torch.no_grad():
        probs = mlp(mesh_vertices).detach().cpu().numpy()   # (V, 2)
    vert_highlight = probs.argmax(axis=1) == 0

    # 2) centre the points and scale them to the unit sphere
    centered = pts - pts.mean(axis=0, keepdims=True)
    radius = np.linalg.norm(centered, axis=1).max()
    pts_norm = centered / (radius if radius > 0 else 1.0)

    # 3) nearest mesh vertex for every point
    verts_np = mesh_vertices.detach().cpu().numpy()
    _, nn_vert = cKDTree(verts_np).query(pts_norm, k=1)     # (N,)

    # 4) predicted mask per point
    pred_mask = vert_highlight[nn_vert]

    # 5) IoU
    inter = np.logical_and(pred_mask, gt_mask).sum()
    union = np.logical_or(pred_mask, gt_mask).sum()
    return (inter / union) if union > 0 else 0.0




100%|████████████████████████████████████████| 338M/338M [00:03<00:00, 112MiB/s]


In [None]:
from pathlib import Path
import os, numpy as np, json

# Resolve the experiment's target category / affordance and build the CLIP prompt.
target_cat = exp_config["target_category"]
target_aff = exp_config["target_affordance"]
# The current template only contains a {cat} placeholder; str.format ignores
# the extra `aff` keyword.
prompt     = exp_config["prompt_tpl"].format(cat=target_cat.lower(), aff=target_aff)

# Reconstructed meshes and per-object training artefacts live in separate subfolders.
out_mesh_dir = os.path.join(OUTPUT_ROOT, "meshes")
out_res_dir  = os.path.join(OUTPUT_ROOT, "results")
Path(out_mesh_dir).mkdir(parents=True, exist_ok=True)
Path(out_res_dir).mkdir(parents=True, exist_ok=True)

def get_points_and_gt(item, aff_name):
    """Extract the point coordinates (as float32) and the flattened labels of `aff_name` from `item`."""
    shape_entry = item["full_shape"]
    xyz = shape_entry["coordinate"].astype(np.float32)      # (N,3)
    aff_labels = shape_entry["label"][aff_name].flatten()   # (N,)
    return xyz, aff_labels

def run_split(data, split_name, max_items=3):
    """Reconstruct, train and score up to `max_items` objects of the target category.

    For every selected item: rebuild a mesh from its point cloud, optimise a
    highlighter on that mesh, then transfer the predictions back to the
    points and compute the IoU against the ground-truth affordance labels.

    Returns:
        A list of (shape_id, IoU as float) tuples, one per processed item.
    """
    scores = []
    for sample_idx in select_indices_by_category(data, target_cat, k=max_items):
        sample = data[sample_idx]
        cloud, gt_labels = get_points_and_gt(sample, target_aff)
        model_id = sample["shape_id"]

        # 1) mesh reconstruction (the mesh is re-read from disk below, so the
        #    returned object is not kept)
        obj_path = os.path.join(out_mesh_dir, f"{split_name}_{target_cat}_{model_id}.obj")
        reconstruct_mesh_from_points(
            cloud, obj_path,
            mode=exp_config["recon_mode"],
            poisson_depth=exp_config["poisson_depth"],
            fallback_bpa=True
        )

        # 2) train the 3D highlighter on the reconstructed mesh
        result_dir = os.path.join(out_res_dir, f"{split_name}_{target_cat}_{model_id}")
        highlighter = train_on_mesh(
            obj_path, prompt, result_dir,
            n_iter=exp_config["n_iter_obj"],
            n_views=exp_config["n_views"],
            n_augs=exp_config["n_augs"],
            lr=exp_config["learning_rate"]
        )

        # 3) IoU: mesh predictions -> points, on a freshly loaded, normalised mesh
        eval_mesh = Mesh(obj_path)
        MeshNormalizer(eval_mesh)()
        miou = miou_from_mesh_preds(highlighter, eval_mesh.vertices, cloud, gt_labels)

        scores.append((model_id, float(miou)))
        print(f"[{split_name}] {model_id}  mIOU={miou:.4f}")
    return scores

# Re-seed right before the runs so the shuffled item selection inside run_split
# does not depend on how many random draws earlier cells consumed.
set_seed(seed)
train_results = run_split(train_data, "train", max_items=3)
val_results   = run_split(val_data,   "val",   max_items=3)

# Persist the (shape_id, IoU) pairs; json serialises each tuple as a 2-element list.
with open(os.path.join(OUTPUT_ROOT, 'optimization_results.json'), 'w') as f:
    json.dump({"train_results": train_results, "val_results": val_results}, f, indent=2)

print("TRAIN:", train_results)
print("VAL  :", val_results)


                                                                                                            

[train] 6e57c665a47d7d7730612f5c0ef21eb8  mIOU=0.2170


train train_Bottle_40e5d2c6e9e9cbbf5cafd3b1501bc74.obj:   2%|▏         | 58/2500 [00:10<07:32,  5.39it/s]