In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

# 1. Create dataset
Use SAM to extract proposals from the input images (100 images). Proposals that cover the same region as a ground-truth mask are treated as positive samples for the templates (the 42 BlenderProc templates); all other proposals are treated as negative samples.

Run through all 100 images, then return the positive proposals and the negative proposals.

Use IoU > 0.5 to select the positive proposals.

In [139]:
import glob
import logging
import os

import numpy as np
from PIL import Image

from src.model.sam import CustomSamAutomaticMaskGenerator, load_sam

# Notebook-wide logging: INFO and above, each record prefixed with
# timestamp, level and logger name.
log = logging.getLogger(__name__)
logging.basicConfig(
    format='[%(asctime)s] [%(levelname)s] %(name)s: %(message)s',
    level=logging.INFO,
)


def extract_object_by_mask(image, mask, width: int = 512):
    """Black out everything outside ``mask`` and crop the result.

    Args:
        image: PIL image the object is cut from.
        mask: Array passed to ``Image.fromarray`` and used as the composite
            mask (the caller passes a uint8 array with values 0 / 255).
        width: Currently unused; kept so existing callers keep working.

    Returns:
        The image composited onto a black background, cropped to the
        bounding box reported by ``getbbox()`` on the composited image.
    """
    pil_mask = Image.fromarray(mask)
    black_background = Image.new("RGB", image.size, (0, 0, 0))
    # Pixels where the mask is set come from `image`, the rest are black.
    composited = Image.composite(image, black_background, pil_mask)
    bounding_box = composited.getbbox()
    return composited.crop(bounding_box)


def calculate_iou(ground_truth, prediction):
    """Compute the intersection-over-union (IoU) of two masks.

    Both inputs are combined with numpy logical operations, so any non-zero
    element counts as foreground.

    Args:
        ground_truth: Array-like reference mask.
        prediction: Array-like predicted mask, broadcast-compatible with
            ``ground_truth``.

    Returns:
        ``|intersection| / |union|``. When both masks are empty the union has
        no elements and ``0.0`` is returned instead of dividing by zero
        (which would yield NaN and a RuntimeWarning).
    """
    intersection = np.logical_and(ground_truth, prediction)
    union = np.logical_or(ground_truth, prediction)
    union_area = np.sum(union)
    if union_area == 0:
        # Nothing to overlap with: define the IoU of two empty masks as 0.
        return 0.0
    return np.sum(intersection) / union_area


def _match_gt_to_proposals(masks_gt, masks_pred, iou_threshold):
    """Return the index of the best-IoU proposal for every ground-truth mask matched above the threshold."""
    matched_indices = []
    for gt_i, gt in enumerate(masks_gt):
        best_iou = 0
        best_mask_index = -1
        for i, mask in enumerate(masks_pred):
            iou = calculate_iou(gt, mask)
            if iou > best_iou:
                best_iou = iou
                best_mask_index = i
        # best_mask_index stays -1 when no proposal overlaps this ground truth;
        # never record that sentinel as a match.
        if best_mask_index >= 0 and best_iou > iou_threshold:
            matched_indices.append(best_mask_index)
        log.info(f"The best for {gt_i}th mask is at index {best_mask_index} with an IoU of {best_iou}")
    return matched_indices


def extract_dataset(dataset="icbin", data_type="test", scene_id=1, iou_threshold=0.5):
    """Split SAM proposals of every frame of a scene into positives and negatives.

    For each ``rgb/*.png`` frame of the scene, proposals are generated with
    SAM and compared against the frame's ``mask_visib`` ground-truth masks.
    For every ground-truth mask, the proposal with the highest IoU is a
    positive if that IoU exceeds ``iou_threshold``; all remaining proposals
    of the frame are negatives.

    Args:
        dataset: Dataset folder name under ``datasets/bop23_challenge/datasets``.
        data_type: Split folder name ("test" or "train").
        scene_id: Scene number, zero-padded to six digits in the path.
        iou_threshold: Minimum IoU (exclusive) for a proposal to be positive.

    Returns:
        ``(pos_proposals, neg_proposals)``: two lists of cropped proposal
        images accumulated over all frames. Both are empty when no frame is
        found.
    """
    scene_dir = f"datasets/bop23_challenge/datasets/{dataset}/{data_type}/{scene_id:06d}"
    # glob returns paths in arbitrary order; sort for reproducible output.
    frame_paths = sorted(glob.glob(f"{scene_dir}/rgb/*.png"))

    pos_proposals, neg_proposals = [], []
    if not frame_paths:
        # Skip the expensive model load when there is nothing to process.
        log.warning(f"No frames found under {scene_dir}/rgb")
        return pos_proposals, neg_proposals

    model_type = "vit_h"
    checkpoint_dir = "datasets/bop23_challenge/pretrained/segment-anything"
    log.info("loading sam")
    sam_model = load_sam(model_type, checkpoint_dir)
    custom_sam_model = CustomSamAutomaticMaskGenerator(sam=sam_model)
    custom_sam_model.predictor.model.to("cuda")

    for frame_path in frame_paths:
        # convert() yields an in-memory copy, so the file can be closed right away.
        with Image.open(frame_path) as frame:
            rgb = frame.convert("RGB")
        # NOTE(review): assumes generate_masks returns an object that supports
        # .cpu() and ["masks"] indexing -- confirm against src.model.sam.
        detections = custom_sam_model.generate_masks(np.array(rgb)).cpu()  # Include masks and bboxes

        masked_images = []
        for mask in detections["masks"]:
            binary_mask = (np.array(mask) * 255).astype(np.uint8)
            masked_images.append(extract_object_by_mask(rgb, binary_mask))

        # Use os.path so the frame id is extracted correctly on any platform.
        frame_id = os.path.splitext(os.path.basename(frame_path))[0]
        mask_paths = sorted(glob.glob(f"{scene_dir}/mask_visib/{frame_id}_*.png"))
        masks_gt = []
        for mask_path in mask_paths:
            with Image.open(mask_path) as gt_image:
                masks_gt.append((np.array(gt_image.convert("L")) > 0).astype(int))
        masks_pred = [np.array(mask.cpu()).astype(int) for mask in detections["masks"]]

        best_mask_indices = _match_gt_to_proposals(masks_gt, masks_pred, iou_threshold)
        positive_index_set = set(best_mask_indices)  # O(1) membership for the negative filter

        # Accumulate over all frames instead of returning after the first one.
        pos_proposals.extend(masked_images[i] for i in best_mask_indices)
        neg_proposals.extend(
            proposal
            for j, proposal in enumerate(masked_images)
            if j not in positive_index_set
        )

    return pos_proposals, neg_proposals
    

# Keep the proposals in the notebook namespace so later cells can use them.
# Guard against a missing result (nothing returned) so unpacking never fails.
dataset_result = extract_dataset()
pos_proposals, neg_proposals = dataset_result if dataset_result is not None else ([], [])

In [114]:
import glob



In [126]:
import numpy as np





In [136]:
# np.unique(np.array(pos_proposals[0])/255.0)

# 2. Process dataset
Organize your images into classes. You'll need positive pairs (same class) and negative pairs (different classes) for training.
Resize/pad each image to 224×224, scale the pixel values to [0, 1] by dividing by 255.0,
then normalize the image with the ImageNet mean and std.

# 3. Design your network architecture


# 4. Implement the contrastive loss function


# 5. Create data pairs for training


# 6. Train the model