In [None]:
# Fetch the project sources (the `extensions` branch) into the current working directory.
!git clone --branch extensions https://github.com/meldashti/Affordance3DHighlighter.git

In [None]:
!git pull origin extensions

In [None]:
import os

# Make the cloned repository the working directory so that the `src.*` imports and
# the relative output paths used in later cells resolve against it.
os.chdir('/kaggle/working/Affordance3DHighlighter')

In [None]:
!pip install gdown
!gdown --id 1siZtGusB1LfQVapTvNOiYi8aeKKAgcDF
!unzip full-shape.zip -d /kaggle/working/Affordance3DHighlighter/data/

In [None]:
import pickle
from collections import defaultdict

# Read the pickled training split.
# NOTE(review): pickle.load can execute arbitrary code; only use it on a trusted download.
with open('/kaggle/working/Affordance3DHighlighter/data/full_shape_train_data.pkl', 'rb') as train_file:
    dataset = pickle.load(train_file)

# The affordance names are taken from the first record only.
all_affordances = dataset[0]['affordance']

# One pass over the records: tally how many belong to each semantic class.
# The set of class names falls out of the tally's keys.
tally = defaultdict(int)
for record in dataset:
    tally[record['semantic class']] += 1
class_counts = dict(tally)
semantic_classes = set(class_counts)

# Semantic class names, alphabetically
print("Unique semantic classes in the dataset:")
for class_name in sorted(semantic_classes):
    print(f"- {class_name}")

# Affordance names, alphabetically
print("Unique Affordances in the dataset:")
for affordance_name in sorted(all_affordances):
    print(f"- {affordance_name}")

# Totals
print(f"\nTotal number of unique semantic classes: {len(semantic_classes)}")
print(f"\nTotal number of unique Affordances: {len(all_affordances)}")

# Per-class record counts
print("\nNumber of items per semantic class:")
for class_name, n_items in sorted(class_counts.items()):
    print(f"- {class_name}: {n_items} items")


In [None]:
# CLIP is installed straight from the OpenAI GitHub repository.
!pip install git+https://github.com/openai/CLIP.git
# kaolin 0.17.0 is resolved against the wheel index published for torch 2.5.1 + CUDA 12.1.
# NOTE(review): presumably the runtime's torch/CUDA build has to match this index — confirm.
!pip install kaolin==0.17.0 -f https://nvidia-kaolin.s3.us-east-2.amazonaws.com/torch-2.5.1_cu121.html

In [None]:

import sys
import torch

# Install PyTorch3D only if it is not importable yet: first try a prebuilt wheel
# that matches this runtime, then fall back to building from source.
need_pytorch3d = False
try:
    import pytorch3d
except ModuleNotFoundError:
    need_pytorch3d = True
if need_pytorch3d:
    # Torch version without any "+cuXYZ" suffix and without dots.
    pyt_version_str = torch.__version__.split("+")[0].replace(".", "")
    # Wheel directory name: py3<minor>_cu<cuda version, no dots>_pyt<torch version, no dots>
    version_str = "".join([
        f"py3{sys.version_info.minor}_cu",
        torch.version.cuda.replace(".", ""),
        f"_pyt{pyt_version_str}"
    ])
    !pip install iopath
    if sys.platform.startswith("linux"):
        print("Trying to install wheel for PyTorch3D")
        # --no-index restricts pip to the wheel page given with -f.
        !pip install --no-index --no-cache-dir pytorch3d -f https://dl.fbaipublicfiles.com/pytorch3d/packaging/wheels/{version_str}/download.html
        # Re-check through `pip freeze` whether the wheel install actually succeeded.
        pip_list = !pip freeze
        need_pytorch3d = not any(i.startswith("pytorch3d==") for i in pip_list)
    if need_pytorch3d:
        print(f"failed to find/install wheel for {version_str}")
# Still missing (no matching wheel, or not on Linux): build the stable tag from source.
if need_pytorch3d:
    print("Installing PyTorch3D from source")
    !pip install ninja
    !pip install 'git+https://github.com/facebookresearch/pytorch3d.git@stable'

In [None]:
# --ignore-installed makes pip install open3d (and its dependencies) afresh instead of
# reusing or uninstalling copies that are already present in the environment.
!pip install --ignore-installed open3d

In [None]:

from src.mesh import Mesh
from pytorch3d.structures import Pointclouds

from src.convertor import obj_to_pointcloud


def bounding_sphere_normalize(points: torch.Tensor) -> torch.Tensor:
    """
    Centre a point set on its centroid and scale it to fit the unit sphere.

    Args:
        points: (N,3) floating-point tensor of point coords.

    Returns:
        A new tensor of the same shape whose centroid is the origin and whose
        farthest point lies at distance 1 from it. If every point coincides
        (including the single-point case) the result is all zeros rather than
        NaN. An empty input is returned as an empty copy.
    """
    # Nothing to normalise; `.max()` below would raise on an empty tensor.
    if points.shape[0] == 0:
        return points.clone()

    centered = points - points.mean(dim=0, keepdim=True)
    max_dist = centered.norm(p=2, dim=1).max()

    # Clamp the radius to the smallest positive normal float so a degenerate
    # cloud (radius 0) yields zeros instead of 0/0 = NaN. Any real radius is
    # far above this bound, so ordinary inputs are unaffected.
    smallest = torch.finfo(centered.dtype).tiny
    return centered / max_dist.clamp_min(smallest)


def load_3d_data(file_path, num_points=10000, device="cuda", do_normalize=True):
    """
    Loads 3D data as PyTorch3D Pointclouds from either NPZ point cloud or OBJ mesh.

    Args:
        file_path: Path to either .npz point cloud or .obj mesh file. The format
            is chosen from the text after the last '.', case-insensitively.
        num_points: For .npz, the maximum number of points kept (a random subset
            is drawn when the file holds more). For .obj, the number of points
            requested from ``obj_to_pointcloud``.
        device: Device the tensors are placed on, for both file formats.
        do_normalize: When True, the points are passed through
            ``bounding_sphere_normalize`` before being returned.

    Returns:
        A tuple ``(points, point_cloud)``: the point coordinate tensor and a
        ``Pointclouds`` object built from those points and their colour features.

    Raises:
        ValueError: If the file extension is neither ``npz`` nor ``obj``.
    """
    file_ext = file_path.split('.')[-1].lower()

    if file_ext == 'npz':
        # Imported here so the function does not depend on a later notebook cell
        # having already imported numpy.
        import numpy as np

        # NpzFile keeps the archive open until closed; the context manager
        # releases the file handle once both arrays have been copied into tensors.
        with np.load(file_path) as pointcloud:
            verts = torch.Tensor(pointcloud['verts']).to(device)
            rgb = torch.Tensor(pointcloud['rgb']).to(device)

        print("lenght of the data")
        print(len(verts))

        # Subsample if needed; the same random indices are applied to both
        # arrays so each kept point retains its colour.
        if len(verts) > num_points:
            idx = torch.randperm(len(verts))[:num_points]
            verts = verts[idx]
            rgb = rgb[idx]

        if do_normalize:
            verts = bounding_sphere_normalize(verts)

        point_cloud = Pointclouds(points=[verts], features=[rgb])
        return verts, point_cloud

    elif file_ext == 'obj':
        # Honour the caller's device instead of always forcing "cuda".
        points, point_cloud = obj_to_pointcloud(
            file_path,
            num_points=num_points,
            device=device
        )
        if do_normalize:
            points = bounding_sphere_normalize(points)
            # Rebuild the Pointclouds object so it carries the normalised
            # coordinates together with the original per-point features.
            rgb = point_cloud.features_packed()
            point_cloud = Pointclouds(points=[points], features=[rgb])
        return points, point_cloud

    else:
        raise ValueError(f"Unsupported file format: {file_ext}. Only .npz and .obj are supported.")



In [38]:
def save_experiment_report(background_info, shape_results, global_miou, output_dir):
    """
    Write one experiment's results to disk, as JSON and as a plain-text summary.

    Both files go into ``<output_dir>/reports`` (created on demand) and share a
    second-resolution timestamp in their names: ``report_<ts>.json`` and
    ``summary_<ts>.txt``.

    Args:
        background_info (str): Description of background used
        shape_results (list): Per-shape result dictionaries; the summary reads the
            keys ``shape_class``, ``shape_id``, ``shape_miou`` and ``affordance_ious``
        global_miou (float): Overall mIoU across all shapes
        output_dir (str): Directory under which the ``reports`` folder lives
    """
    import json
    from datetime import datetime

    reports_root = os.path.join(output_dir, "reports")
    os.makedirs(reports_root, exist_ok=True)

    # A single timestamp ties the JSON file and the text summary together.
    stamp = datetime.now().strftime("%Y%m%d_%H%M%S")

    payload = {
        "timestamp": stamp,
        "background": background_info,
        "global_miou": float(global_miou),
        "shape_results": shape_results,
    }

    # Machine-readable report
    json_path = os.path.join(reports_root, f"report_{stamp}.json")
    with open(json_path, 'w') as json_file:
        json.dump(payload, json_file, indent=4)

    def summary_chunks():
        # Produced lazily so text is written piece by piece, in order.
        yield "Experiment Report\n"
        yield "================\n"
        yield f"Date: {stamp}\n"
        yield f"Background: {background_info}\n"
        yield f"Global mIoU: {global_miou:.4f}\n\n"
        yield "Per-Shape Results:\n"
        for entry in shape_results:
            yield f"\nShape {entry['shape_class']} (ID: {entry['shape_id']}):\n"
            yield f"  Shape mIoU: {entry['shape_miou']:.4f}\n"
            yield "  Per-Affordance IoU:\n"
            for aff_name, aff_iou in entry['affordance_ious'].items():
                yield f"    - {aff_name}: {aff_iou:.4f}\n"

    # Human-readable summary
    text_path = os.path.join(reports_root, f"summary_{stamp}.txt")
    with open(text_path, 'w') as text_file:
        text_file.writelines(summary_chunks())

In [46]:
from src.utils import compute_mIoU
from src.prompt_strategies import generate_affordance_prompt
from src.data_loader_fullshape import FullShapeDataset
from src.render.cloud_point_renderer import MultiViewPointCloudRenderer
from src.save_results import save_renders, save_results
from src.neural_highlighter import NeuralHighlighter
from src.Clip.loss_function import clip_loss
from src.Clip.clip_model import get_clip_model, encode_text, setup_clip_transforms

import torch
import numpy as np
import random
from tqdm import tqdm

# Pin every random number generator used here so that runs are repeatable.
# (Some torch backward functions within CLIP are still non-deterministic.)
seed = 0  # any integer value works

# Apply the same seed to torch (CPU, current GPU, all GPUs), Python's `random`
# and NumPy's legacy global generator, in that order.
for apply_seed in (
    torch.manual_seed,
    torch.cuda.manual_seed,
    torch.cuda.manual_seed_all,
    random.seed,
    np.random.seed,
):
    apply_seed(seed)

# Ask cuDNN for deterministic kernels and switch off its autotuner.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


def optimize_point_cloud(points, clip_model,clip_transform, augment_transform, renderer, encoded_text, log_dir: str,background_path=None, **kwargs):
    """
    Train a NeuralHighlighter on one point cloud by rendering its predicted
    colouring from several views and minimising a CLIP-based loss against a
    text embedding.

    Args:
        points: Point coordinates fed to the network and to the renderer.
        clip_model: CLIP model handed to ``clip_loss``.
        clip_transform: Transform handed to ``clip_loss``.
        augment_transform: Augmentation transform handed to ``clip_loss``.
        renderer: Object providing ``create_point_cloud(points, colors)`` and
            ``render_all_views(point_cloud=..., n_views=..., background_path=...)``;
            the latter must return a dict of image tensors.
        encoded_text: Text embedding handed to ``clip_loss``.
        log_dir: Directory passed to ``save_renders`` every 100 iterations.
        background_path: Optional background image path forwarded to the renderer.
        **kwargs: Optional settings (unknown keys are ignored):
            num_iterations (default 1000), learning_rate (default 1e-4),
            network_depth (default 5; the legacy key ``depth`` is still accepted),
            network_width (default 256), n_views (default 4), n_augs (default 1),
            clipavg (default 'view'), device (default 'cuda').

    Returns:
        The trained NeuralHighlighter instance.
    """
    num_iterations = kwargs.get('num_iterations', 1000)
    learning_rate = kwargs.get('learning_rate', 1e-4)
    # Callers in this notebook pass `network_depth` (matching `network_width`);
    # previously only `depth` was read, so the requested depth was silently
    # ignored. `depth` stays supported as a fallback.
    depth = kwargs.get('network_depth', kwargs.get('depth', 5))
    width = kwargs.get('network_width', 256)
    n_views = kwargs.get("n_views", 4)
    n_augs = kwargs.get('n_augs', 1)
    clipavg = kwargs.get('clipavg', 'view')
    device = kwargs.get('device', 'cuda')

    # Network maps a 3D coordinate to two outputs (highlight / no-highlight weights).
    net = NeuralHighlighter(
        depth=depth,
        width=width,
        out_dim=2,
        input_dim=3,
        positional_encoding=False
    ).to(device)
    optimizer = torch.optim.Adam(net.parameters(), lr=learning_rate)

    # The two blend colours never change, so build them once instead of per iteration.
    highlight_color = torch.tensor([204 / 255, 1.0, 0.0], device=device)
    base_color = torch.tensor([180 / 255, 180 / 255, 180 / 255], device=device)

    for i in tqdm(range(num_iterations)):
        optimizer.zero_grad()

        pred_class = net(points)

        # Per-point colour: output column 0 weights the highlight colour,
        # column 1 weights the base grey.
        colors = pred_class[:, 0:1] * highlight_color + pred_class[:, 1:2] * base_color

        point_cloud = renderer.create_point_cloud(points, colors)
        views = renderer.render_all_views(
            point_cloud=point_cloud,
            n_views=n_views,
            background_path=background_path
        )

        # Stack the per-view images into one batch and move channels first:
        # [B, H, W, C] -> [B, C, H, W]
        rendered_images = torch.stack(
            [img.to(device) for img in views.values()]
        ).permute(0, 3, 1, 2)

        loss = clip_loss(
            rendered_images=rendered_images,
            encoded_text=encoded_text,
            clip_transform=clip_transform,
            augment_transform=augment_transform,
            clip_model=clip_model,
            n_augs=n_augs,
            clipavg=clipavg
        )
        loss.backward()
        optimizer.step()

        # Periodic progress log plus a snapshot of the current renders.
        if i % 100 == 0:
            print(f"Iteration {i}, Loss: {loss.item():.4f}")
            save_renders(log_dir, i, rendered_images)

    return net


def main(input_path,target_classes=['Bowl'],target_affordances=['contain'],prompt_strategy="affordance_specific",iou_threshold = 0.15, **kwargs):
    """
    Run the CLIP-guided highlighting experiment on shapes from a pickled dataset
    and report IoU per affordance, per shape and per background.

    For every augmentation label and every background, a highlighter network is
    trained for each affordance of up to the first three shapes in the dataset,
    its prediction is thresholded and scored against the ground-truth labels, and
    a report is written through ``save_experiment_report``.

    Args:
        input_path: Path to the dataset file; must have a ``pkl`` extension.
        target_classes: Passed to ``FullShapeDataset`` as ``target_classes``.
            (The default list is shared between calls; it is not modified here.)
        target_affordances: Passed to ``FullShapeDataset`` as ``target_affordances``.
        prompt_strategy: Forwarded to ``generate_affordance_prompt`` as ``strategy``.
        iou_threshold: A point counts as highlighted when its highlight score is
            greater than or equal to this value.
        **kwargs: Read directly here:
                device: Device to run on (default: "cuda")
                output_dir: Directory for outputs (default: "./output")
                background_paths: List of background image paths, ``None``
                    meaning no background (default: [None])
            The complete kwargs dict is also forwarded to ``optimize_point_cloud``,
            which reads its own options (e.g. num_iterations, learning_rate,
            network_width, n_views, n_augs, clipavg).

    Raises:
        ValueError: If ``input_path`` is not a ``.pkl`` file, or if the dataset
            contains no shapes to evaluate.
    """
    # Only options used in this function are unpacked; everything else in kwargs
    # travels untouched to optimize_point_cloud.
    device = kwargs.get("device", "cuda")
    output_dir = kwargs.get("output_dir", "./output")
    background_paths = kwargs.get("background_paths", [None])

    try:

        # LOAD AffordanceNet Dataset
        file_type = input_path.split(".")[-1]
        print("Loading AffordanceNet Dataset...")
        if file_type != "pkl":
            raise ValueError(f"Invalid file format. Expected .pkl file,got: {file_type}")
        dataset = FullShapeDataset(
            input_path,
            target_classes=target_classes,
            target_affordances=target_affordances,
            device=device
        )

        # Evaluate on (at most) the first three shapes of the dataset.
        val_indices = list(range(min(3, len(dataset))))
        if not val_indices:
            # Fail fast: without this the run would set up CLIP and the renderer
            # and only then crash with a ZeroDivisionError when averaging.
            raise ValueError(
                "No shapes found for the requested classes/affordances; nothing to evaluate."
            )

        # Create output directory if it doesn't exist
        os.makedirs(output_dir, exist_ok=True)

        # Setup CLIP model (the other two returned values are not needed here)
        print("Setting up CLIP model...")
        clip_model, _, _ = get_clip_model()

        # Initialize renderer
        print("Setting up renderer...")
        renderer = MultiViewPointCloudRenderer(
            camera_type="perspective",
            image_size=512,
            base_dist=2.5,
            base_elev=10,
            base_azim=45,
            device=device,
            point_radius=0.008
        )

        transforms_list = ['default','balanced','viewpoint','lighting']

        for aug_type in transforms_list:

            print(50*"=")
            print(f"using augmentation type: {aug_type}")
            print(50*"=")
            # NOTE(review): `aug_type` only names the output folder and the log
            # lines; setup_clip_transforms() is called without it, so every pass
            # appears to use the same transforms — confirm whether it should
            # receive `aug_type`.
            clip_transform, augment_transform = setup_clip_transforms()

            aug_dir = os.path.join(output_dir, f"{aug_type}")
            os.makedirs(aug_dir, exist_ok=True)

            print("Loading 3D data from AffordanceNet...")
            background_results = []
            for background_path in background_paths:

                # Label used for logs, the report and the output folder name.
                if background_path is None:
                    background_info = "No background"
                else:
                    background_info = background_path.split("/")[-1]

                print(f"\nTraining with background: {background_info}")

                bg_dir = os.path.join(aug_dir, f"{background_info}")
                os.makedirs(bg_dir, exist_ok=True)

                global_mIou = 0
                shape_results = []
                for idx in val_indices:

                    shape_entry = dataset[idx]
                    shape_class = shape_entry["shape_class"]
                    affordances = shape_entry["affordances"]
                    label_dict = shape_entry["labels_dict"]

                    # Convert coords to tensor if not already
                    points = shape_entry["coords"]
                    if not isinstance(points, torch.Tensor):
                        points = torch.tensor(points, device=device)

                    print(f"Loaded {len(points)} points")
                    shape_subdir = os.path.join(bg_dir, f"shape_{shape_class}")
                    os.makedirs(shape_subdir, exist_ok=True)
                    shape_obj  = os.path.join(shape_subdir, f"{shape_class}_{idx}")
                    os.makedirs(shape_obj, exist_ok=True)

                    shape_mIOU = 0
                    affordance_ious = {}
                    for affordance in affordances:

                        affordance_subdir = os.path.join(shape_obj, f"affordance_{affordance}")
                        os.makedirs(affordance_subdir, exist_ok=True)

                        # Create and encode prompt
                        prompt = generate_affordance_prompt(shape_class, affordance, strategy=prompt_strategy)
                        print(f"Using prompt: {prompt}")
                        text_features = encode_text(clip_model, prompt, device)

                        # A fresh network is trained for every (shape, affordance) pair.
                        print("Starting optimization...")
                        net = optimize_point_cloud(
                            points=points,
                            renderer=renderer,
                            clip_model=clip_model,
                            clip_transform=clip_transform,
                            augment_transform=augment_transform,
                            encoded_text=text_features,
                            log_dir=affordance_subdir,
                            background_path=background_path,
                            **kwargs
                        )

                        # Compute IoU for *this* affordance
                        with torch.no_grad():
                            pred_class = net(points)
                            # Column 0 is the highlight weight used for colouring during training.
                            highlight_scores = pred_class[:, 0]

                            # Any positive ground-truth label counts as part of the region.
                            gt_bin = (label_dict[affordance] > 0.0).long()
                            bin_pred = (highlight_scores >= iou_threshold).long()
                            iou_val = compute_mIoU(bin_pred, gt_bin)
                            print(f"IoU: {iou_val}")
                            affordance_ious[affordance] = float(iou_val)
                            shape_mIOU += iou_val

                    shape_mIOU = shape_mIOU / len(affordances)
                    print(f"shape mIOU: {shape_mIOU}")
                    global_mIou += shape_mIOU
                    # Store results for this shape
                    shape_results.append({
                        "shape_id": idx,
                        "shape_class": shape_class,
                        "shape_miou": float(shape_mIOU),
                        "affordance_ious": affordance_ious
                    })

                global_mIou = global_mIou / len(val_indices)
                print(f"global mIOU: {global_mIou}")

                # Save report for this background
                save_experiment_report(
                    background_info=background_info,
                    shape_results=shape_results,
                    global_miou=global_mIou,
                    output_dir=output_dir
                )

                # Store results for final comparison
                background_results.append({
                    "background": background_info,
                    "global_miou": global_mIou
                })

                print(50 *"=")
                print(50 *"=")
                print(50 *"=")

            # Print final comparison
            print("\nFinal Results Comparison:")
            print(f"\n Augumentation Type: {aug_type}")
            print("========================")
            for result in background_results:
                print(f"Background: {result['background']:<20} mIoU: {result['global_miou']:.4f}")

    except Exception as e:
        # Log the failure for the notebook output, then let it propagate.
        print(f"Error in processing: {str(e)}")
        raise



In [47]:

# Backgrounds to try behind the renders; the None entry is the run labelled
# "No background".
background_paths = [
    None,
    "/kaggle/working/Affordance3DHighlighter/data/background.jpg",
    "/kaggle/working/Affordance3DHighlighter/data/background2.jpg",
    "/kaggle/working/Affordance3DHighlighter/data/kitchen1.jpg",
    "/kaggle/working/Affordance3DHighlighter/data/kitchen2.jpg"
]

# All settings for this run, gathered in one place and expanded into the call below.
run_config = dict(
    # what to evaluate
    input_path="/kaggle/working/Affordance3DHighlighter/data/full_shape_train_data.pkl",
    target_classes=['Knife'],
    target_affordances=['cut'],
    prompt_strategy="basic",
    iou_threshold=0.01,
    # rendering / CLIP options
    n_views=2,
    n_augs=3,
    clipavg="view",
    # network and optimisation options
    network_depth=4,
    network_width=256,
    learning_rate=1e-4,
    num_iterations=301,
    num_points=100000,
    # runtime
    device="cuda",
    output_dir="./output",
    background_paths=background_paths,
)
main(**run_config)

Loading AffordanceNet Dataset...
Found 129 valid ['Bowl'] objects with all affordances ['contain']
Setting up CLIP model...
Setting up renderer...
Loading 3D data from /kaggle/working/Affordance3DHighlighter/data/full_shape_train_data.pkl...

Training with background: No background
Loaded 2048 points
Using prompt: A 3D render of a gray Bowl emphasizing the main storage compartment, internal volume, and any additional pockets or compartments designed to hold and organize items
Starting optimization...


  1%|          | 2/200 [00:00<00:15, 12.64it/s]

Iteration 0, Loss: -0.2401


 52%|█████▏    | 104/200 [00:06<00:05, 16.54it/s]

Iteration 100, Loss: -0.2917


100%|██████████| 200/200 [00:11<00:00, 16.84it/s]


IoU: 0.62451171875
shape mIOU: 0.62451171875
Loaded 2048 points
Using prompt: A 3D render of a gray Bowl emphasizing the main storage compartment, internal volume, and any additional pockets or compartments designed to hold and organize items
Starting optimization...


  1%|          | 2/200 [00:00<00:11, 16.64it/s]

Iteration 0, Loss: -0.2194


 52%|█████▏    | 104/200 [00:06<00:05, 17.14it/s]

Iteration 100, Loss: -0.2542


100%|██████████| 200/200 [00:11<00:00, 17.07it/s]


IoU: 0.6435546875
shape mIOU: 0.6435546875
Loaded 2048 points
Using prompt: A 3D render of a gray Bowl emphasizing the main storage compartment, internal volume, and any additional pockets or compartments designed to hold and organize items
Starting optimization...


  1%|          | 2/200 [00:00<00:11, 16.66it/s]

Iteration 0, Loss: -0.2362


 52%|█████▏    | 104/200 [00:06<00:05, 16.96it/s]

Iteration 100, Loss: -0.2671


100%|██████████| 200/200 [00:11<00:00, 16.92it/s]


IoU: 0.70361328125
shape mIOU: 0.70361328125
global mIOU: 0.6572265625

Training with background: background.jpg
Loaded 2048 points
Using prompt: A 3D render of a gray Bowl emphasizing the main storage compartment, internal volume, and any additional pockets or compartments designed to hold and organize items
Starting optimization...


  1%|          | 2/200 [00:00<00:15, 12.74it/s]

Iteration 0, Loss: -0.1842


 51%|█████     | 102/200 [00:07<00:07, 12.88it/s]

Iteration 100, Loss: -0.2407


100%|██████████| 200/200 [00:15<00:00, 13.01it/s]


IoU: 0.935546875
shape mIOU: 0.935546875
Loaded 2048 points
Using prompt: A 3D render of a gray Bowl emphasizing the main storage compartment, internal volume, and any additional pockets or compartments designed to hold and organize items
Starting optimization...


  1%|          | 2/200 [00:00<00:15, 12.98it/s]

Iteration 0, Loss: -0.2012


 51%|█████     | 102/200 [00:07<00:07, 13.08it/s]

Iteration 100, Loss: -0.2279


100%|██████████| 200/200 [00:15<00:00, 12.91it/s]


IoU: 0.623046875
shape mIOU: 0.623046875
Loaded 2048 points
Using prompt: A 3D render of a gray Bowl emphasizing the main storage compartment, internal volume, and any additional pockets or compartments designed to hold and organize items
Starting optimization...


  1%|          | 2/200 [00:00<00:17, 11.58it/s]

Iteration 0, Loss: -0.2195


 51%|█████     | 102/200 [00:07<00:07, 12.82it/s]

Iteration 100, Loss: -0.2281


100%|██████████| 200/200 [00:15<00:00, 12.96it/s]


IoU: 0.91162109375
shape mIOU: 0.91162109375
global mIOU: 0.8234049479166666

Training with background: background2.jpg
Loaded 2048 points
Using prompt: A 3D render of a gray Bowl emphasizing the main storage compartment, internal volume, and any additional pockets or compartments designed to hold and organize items
Starting optimization...


  0%|          | 1/200 [00:00<00:55,  3.61it/s]

Iteration 0, Loss: -0.1172


 50%|█████     | 101/200 [00:29<00:29,  3.38it/s]

Iteration 100, Loss: -0.1902


100%|██████████| 200/200 [00:58<00:00,  3.40it/s]


IoU: 0.60986328125
shape mIOU: 0.60986328125
Loaded 2048 points
Using prompt: A 3D render of a gray Bowl emphasizing the main storage compartment, internal volume, and any additional pockets or compartments designed to hold and organize items
Starting optimization...


  0%|          | 1/200 [00:00<00:59,  3.33it/s]

Iteration 0, Loss: -0.2059


 50%|█████     | 101/200 [00:29<00:29,  3.37it/s]

Iteration 100, Loss: -0.2383


100%|██████████| 200/200 [00:59<00:00,  3.39it/s]


IoU: 0.74169921875
shape mIOU: 0.74169921875
Loaded 2048 points
Using prompt: A 3D render of a gray Bowl emphasizing the main storage compartment, internal volume, and any additional pockets or compartments designed to hold and organize items
Starting optimization...


  0%|          | 1/200 [00:00<00:59,  3.36it/s]

Iteration 0, Loss: -0.2155


 50%|█████     | 101/200 [00:29<00:29,  3.35it/s]

Iteration 100, Loss: -0.2212


100%|██████████| 200/200 [00:58<00:00,  3.41it/s]

IoU: 0.6962890625
shape mIOU: 0.6962890625
global mIOU: 0.6826171875

Final Results Comparison:
Background: No background        mIoU: 0.6572
Background: background.jpg       mIoU: 0.8234
Background: background2.jpg      mIoU: 0.6826



