In [12]:
import os
import time
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
from os import listdir
import pandas as pd
import numpy as np
import glob
import cv2
import json
from os.path import expanduser
import splitfolders
import shutil
from define_path import Def_Path

from tqdm import tqdm

import torch 
import torchvision
from torchvision import models
from torchvision.models.detection.rpn import AnchorGenerator
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn 
import torchvision.transforms as T
from torchvision.transforms import functional as F
from torchsummary import summary
from sklearn.model_selection import train_test_split

import albumentations as A # Library for augmentations

import matplotlib.pyplot as plt 
from PIL import Image

import transforms, utils, engine, train
from utils import collate_fn
from engine import train_one_epoch, evaluate

t = torch.cuda.get_device_properties(0).total_memory
print(t)
torch.cuda.empty_cache()

r = torch.cuda.memory_reserved(0)
print(r)
a = torch.cuda.memory_allocated(0)
print(a)
# f = r-a  # free inside reserved

weights_path = '/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_weights_sim_b1_e25_v0.pth'

16908615680
346030080
308891136


In [13]:
# to generalize home directory. User can change their parent path without entering their home directory
path = Def_Path()

parent_path =  path.home + "/Pictures/" + "Data/"

root_dir = parent_path + path.year + "-" + path.month + "-" + path.day + "/"

In [14]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# torch.cuda.set_per_process_memory_fraction(0.9, 0)
print(device)

cuda


In [15]:
def train_transform():
    return A.Compose([
        A.Sequential([
            A.RandomRotate90(p=1), # Random rotation of an image by 90 degrees zero or more times
            A.RandomBrightnessContrast(brightness_limit=0.2, contrast_limit=0.3, brightness_by_max=True, always_apply=False, p=1), # Random change of brightness & contrast
        ], p=1)
#         A.Resize(640, 480)  # Resize all images to be 640x480
    ],
    keypoint_params=A.KeypointParams(format='xy'), # More about keypoint formats used in albumentations library read at https://albumentations.ai/docs/getting_started/keypoints_augmentation/
    bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bboxes_labels']) # Bboxes should have labels, read more at https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/
    )

In [16]:
def train_test_split(src_dir):
    dst_dir_img = src_dir + "images"
    dst_dir_anno = src_dir + "annotations"
    
    if os.path.exists(dst_dir_img) and os.path.exists(dst_dir_anno):
        print("folders exist")
    else:
        os.mkdir(dst_dir_img)
        os.mkdir(dst_dir_anno)
        
    for jpgfile in glob.iglob(os.path.join(src_dir, "*.jpg")):
        shutil.copy(jpgfile, dst_dir_img)

    for jsonfile in glob.iglob(os.path.join(src_dir, "*.json")):
        shutil.copy(jsonfile, dst_dir_anno)
        
    output = parent_path + "split_folder_output" + "-" + path.year + "-" + path.month + "-" + path.day 
    
    splitfolders.ratio(src_dir, # The location of dataset
                   output=output, # The output location
                   seed=42, # The number of seed
                   ratio=(.7, .2, .1), # The ratio of split dataset
                   group_prefix=None, # If your dataset contains more than one file like ".jpg", ".pdf", etc
                   move=False # If you choose to move, turn this into True
                   )
    
    shutil.rmtree(dst_dir_img)
    shutil.rmtree(dst_dir_anno)
    
    return output  
    

In [17]:
class KPDataset(Dataset):
    def __init__(self, root, transform=None, demo=False):                
        self.root = root
        self.transform = transform
        self.demo = demo # Use demo=True if you need transformed and original images (for example, for visualization purposes)
        self.imgs_files = sorted(os.listdir(os.path.join(root, "images")))
        self.annotations_files = sorted(os.listdir(os.path.join(root, "annotations")))
    
    def __getitem__(self, idx):
        img_file = self.imgs_files[idx]
        img_path = os.path.join(self.root, "images", self.imgs_files[idx])
        annotations_path = os.path.join(self.root, "annotations", self.annotations_files[idx])

        img_original = cv2.imread(img_path)
        img_original = cv2.cvtColor(img_original, cv2.COLOR_BGR2RGB)
        
        with open(annotations_path) as f:
            data = json.load(f)
            bboxes_original = data['bboxes']
            keypoints_original = data['keypoints']
            
            # All objects are keypoints on the robot
            bboxes_labels_original = [] 
            bboxes_labels_original.append('base_joint')
            bboxes_labels_original.append('joint2')
            bboxes_labels_original.append('joint3')
            bboxes_labels_original.append('joint4')
            bboxes_labels_original.append('joint5')
            bboxes_labels_original.append('joint6')  

        if self.transform:   
            # Converting keypoints from [x,y,visibility]-format to [x, y]-format + Flattening nested list of keypoints            
            # For example, if we have the following list of keypoints for three objects (each object has two keypoints):
            # [[obj1_kp1, obj1_kp2], [obj2_kp1, obj2_kp2], [obj3_kp1, obj3_kp2]], where each keypoint is in [x, y]-format            
            # Then we need to convert it to the following list:
            # [obj1_kp1, obj1_kp2, obj2_kp1, obj2_kp2, obj3_kp1, obj3_kp2]
            keypoints_original_flattened = [el[0:2] for kp in keypoints_original for el in kp]
            
            # Apply augmentations
            transformed = self.transform(image=img_original, bboxes=bboxes_original, bboxes_labels=bboxes_labels_original, keypoints=keypoints_original_flattened)
            img = transformed['image']
            bboxes = transformed['bboxes']
            # Unflattening list transformed['keypoints']
            # For example, if we have the following list of keypoints for three objects (each object has two keypoints):
            # [obj1_kp1, obj1_kp2, obj2_kp1, obj2_kp2, obj3_kp1, obj3_kp2], where each keypoint is in [x, y]-format
            # Then we need to convert it to the following list:
            # [[obj1_kp1, obj1_kp2], [obj2_kp1, obj2_kp2], [obj3_kp1, obj3_kp2]]
            keypoints_transformed_unflattened = np.reshape(np.array(transformed['keypoints']), (-1,1,2)).tolist()

            # Converting transformed keypoints from [x, y]-format to [x,y,visibility]-format by appending original visibilities to transformed coordinates of keypoints
            keypoints = []
            for o_idx, obj in enumerate(keypoints_transformed_unflattened):
#                 print("object", obj)
#                 print(" obj index", o_idx)# Iterating over objects
                obj_keypoints = []
                for k_idx, kp in enumerate(obj): # Iterating over keypoints in each object
                    obj_keypoints.append(kp + [keypoints_original[o_idx][k_idx][2]])
                keypoints.append(obj_keypoints)
        
        else:
            img, bboxes, keypoints = img_original, bboxes_original, keypoints_original        
        
        # Convert everything into a torch tensor        
        bboxes = torch.as_tensor(bboxes, dtype=torch.float32)       
        target = {}
        labels = [1, 2, 3, 4, 5, 6]            
        target["boxes"] = bboxes
        target["labels"] = torch.as_tensor(labels, dtype=torch.int64) # all objects are joint positions
        target["image_id"] = torch.tensor([idx])
        target["area"] = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0])
        target["iscrowd"] = torch.zeros(len(bboxes), dtype=torch.int64)
        target["keypoints"] = torch.as_tensor(keypoints, dtype=torch.float32)
        img = F.to_tensor(img)        
        bboxes_original = torch.as_tensor(bboxes_original, dtype=torch.float32)
        target_original = {}
        target_original["boxes"] = bboxes_original
        target_original["labels"] = torch.as_tensor(labels, dtype=torch.int64) # all objects are glue tubes
        target_original["image_id"] = torch.tensor([idx])
        target_original["area"] = (bboxes_original[:, 3] - bboxes_original[:, 1]) * (bboxes_original[:, 2] - bboxes_original[:, 0])
        target_original["iscrowd"] = torch.zeros(len(bboxes_original), dtype=torch.int64)
        target_original["keypoints"] = torch.as_tensor(keypoints_original, dtype=torch.float32)        
        img_original = F.to_tensor(img_original)

        if self.demo:
            return img, target, img_original, target_original, img_file
        else:
            return img, target, img_file
    
    def __len__(self):
        return len(self.imgs_files)

In [18]:
def get_model(num_keypoints, weights_path=None):
    
    anchor_generator = AnchorGenerator(sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0))
    model = torchvision.models.detection.keypointrcnn_resnet50_fpn(pretrained=False,
                                                                   pretrained_backbone=True,
                                                                   num_keypoints=num_keypoints,
                                                                   num_classes = 7, # Background is the first class, object is the second class
                                                                   rpn_anchor_generator=anchor_generator)

    if weights_path:
        state_dict = torch.load(weights_path)
        model.load_state_dict(state_dict)        
        
    return model

In [32]:
import torch
import torch.nn as nn
import torch.nn.functional as func
import networkx as nx


class GNNEncoder(nn.Module):
    def __init__(self, vertices_dim=5, hidden_dim=128, num_vertices=6):
        super(GNNEncoder, self).__init__()
        self.f_enc = nn.Linear(vertices_dim, hidden_dim)
        self.f_e1 = nn.Linear((hidden_dim * 2)+2, hidden_dim)
        self.f_v = nn.Linear(hidden_dim, hidden_dim)
        self.f_e2 = nn.Linear((hidden_dim * 2)+2, 4)
        self.num_vertices = num_vertices        
    
    def get_node_features(self, vertices):
        print("Vertices in node features", vertices)
        node_features = []
        for keypoint in vertices:
            x, y, confidence, visibility, label = keypoint
            node_features.append([x, y, confidence, visibility, label])        
        nodes = torch.tensor(node_features, dtype=torch.float).to(device)
        print(nodes)
        return nodes

    def get_edge_features(self, vertices):
        edges = [(0,1), (1, 2), (2, 3), (3, 4), (4, 5), (5, 0)]
        edge_features = []
        for edge in edges:
            k1, k2 = vertices[edge[0]][:2], vertices[edge[1]][:2]
            distance = torch.norm(k1 - k2)
            angle = torch.atan2(k2[1] - k1[1], k2[0] - k1[0])
            edge_features.append([distance.item(), angle.item()])
            
        edges = torch.tensor(edges, dtype=torch.long).to(device)
        edge_features = torch.tensor(edge_features, dtype=torch.float).to(device)
        return edges, edge_features

    def forward(self, vertices):
        nodes = self.get_node_features(vertices)
        edges, edge_features = self.get_edge_features(vertices)
        h1 = self.f_enc(nodes)
        h1_source = h1[edges[:, 0]]
        h1_target = h1[edges[:, 1]]
        h_e1 = self.f_e1(torch.cat((h1_source, h1_target, edge_features), dim=1))  # Include edge feature in the input
        h_j_2 = self.f_v(h_e1)
        h2_source = h_j_2[edges[:, 0]]
        h2_target = h_j_2[edges[:, 1]]
        h_e2 = self.f_e2(torch.cat((h1_source, h1_target, edge_features), dim=1))  # Include edge feature in the input
        h_e2_prob = torch.sigmoid(h_e2)
        return vertices, h_e2_prob, edges, edge_features

class GNNDecoder(nn.Module):
    def __init__(self, vertices_dim=5, hidden_dim=128, num_vertices=6):
        super(GNNDecoder, self).__init__()
        self.f_e = nn.Linear((vertices_dim * 2)+2, 4)  # Concatenate two vertices features
        self.f_h = nn.Linear(4, vertices_dim)  # Transform h_ij to the same dimension as vertices
        self.f_v = nn.Linear(vertices_dim, vertices_dim)  # Update vertex feature
    
    def forward(self, vertices, h_e2_prob, edges, edge_features):
        h_source = vertices[edges[:, 0]]
        h_target = vertices[edges[:, 1]]
        h = torch.zeros_like(vertices)

        for idx, (i, j) in enumerate(edges):  # Iterate over edges
            print(i, j, idx)
            single_edge_features = edge_features[idx].unsqueeze(0)    
            h_ij = h_e2_prob[idx] * self.f_e(torch.cat((h_source[idx].unsqueeze(0), h_target[idx].unsqueeze(0), single_edge_features), dim=1))  # Include edge weights in the input
            h_ij_transformed = self.f_h(h_ij.squeeze())  # Transform h_ij to the same dimension as vertices
            h[j] += h_ij_transformed  # Accumulate edge features to the target vertex

        h_transformed = self.f_v(h.view(-1, vertices.shape[1]))  # Transform h
        h_transformed = h_transformed.view(vertices.shape)  # Reshape back to original shape
        vertices_g = vertices + h_transformed  # Update vertex features

        return vertices_g  # Return vertices_g as the prediction and vertices_g itself as the mean for Gaussian distribution



In [33]:
class OccludedKeyPointLoss(nn.Module):
    def __init__(self, delta=1.0):
        super().__init__()
        self.delta = delta

    def forward(self, vertices_pred, vertices_gt):
#         vertices_gt = vertices_gt.squeeze()
        visibility = vertices_gt[:, 3].unsqueeze(1)  # Extracting the visibility
        vertices_pred = vertices_pred[:, :3]  # Considering only x, y coordinates, confidence_score
        vertices_gt = vertices_gt[:, :3]  # Considering only x, y coordinates, confidence_score
        # Compute differences
        diff = (vertices_gt - vertices_pred).abs()
        # Compute Huber loss
        huber_loss = torch.where(diff < self.delta, 0.5 * diff**2, self.delta * (diff - 0.5 * self.delta))
        return huber_loss.mean()
    
def visibility_loss (vertices_pred, vertices_gt):    
    return func.cross_entropy(vertices_pred[:, 3], vertices_gt[:, 3])  # Loss based on visibility of keypoints

def edge_loss(edges_prob, edges_gt):
#     edges_gt_expanded = torch.zeros(edges_prob.shape, dtype=torch.float32)
    # Compute the cross-entropy loss
    loss = -torch.sum(edges_gt.to(device) * torch.log(torch.clamp(edges_prob, min=1e-7)))
                      
    return loss

def temporal_consistency_loss(y_true_sequence, y_pred_sequence):
    loss = 0
    for t in range(1, len(y_true_sequence)):
        # Selecting the x, y coordinates and visibility for true and predicted sequences
        true_diff = y_true_sequence[t, :, :3] - y_true_sequence[t-1, :, :3]
        pred_diff = y_pred_sequence[t, :, :3] - y_pred_sequence[t-1, :, :3]
        loss += torch.mean(torch.abs(true_diff - pred_diff))
    return loss

In [34]:
class KeypointPipeline(nn.Module):
    def __init__(self, weights_path, num_vertices):
        super().__init__()

        self.keypoint_model = torch.load(weights_path).to(device)
        self.num_vertices = num_vertices
        self.gnn_encoder = GNNEncoder()
        self.gnn_decoder = GNNDecoder()

    def process_model_output(self, output):
        scores = output[0]['scores'].detach().cpu().numpy()
        high_scores_idxs = np.where(scores > 0.7)[0].tolist()

        post_nms_idxs = torchvision.ops.nms(output[0]['boxes'][high_scores_idxs], 
                                            output[0]['scores'][high_scores_idxs], 0.3).cpu().numpy()

        confidence = output[0]['scores'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
        labels = output[0]['labels'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()
        keypoints = []
        for idx, kps in enumerate(output[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy()):
            # Setting t_i = 1 because label is found
            keypoints.append(list(map(int, kps[0,0:2])) + [confidence[idx]] + [1] + [labels[idx]])

        # Create a dictionary where the key is the label and the value is the keypoint
        label_to_keypoint = {}
        for keypoint in keypoints:
            label = keypoint[-1]
            if label not in label_to_keypoint or label_to_keypoint[label][-2] < keypoint[-2]:
                label_to_keypoint[label] = keypoint

        # Use a dictionary to keep track of all possible keypoints and their locations.
        # Initialize with placeholders for missing keypoints.
        all_keypoints = {i: [0, 0, 0, 0, i] for i in range(1, self.num_vertices+1)}  # added another 0 for t_i

        for label, keypoint in label_to_keypoint.items():
            all_keypoints[label] = keypoint

        # Convert the dictionary values back into a list
        keypoints = list(all_keypoints.values())
        keypoints = torch.stack([torch.tensor(kp) for kp in keypoints]).float().to(device)
        visibility = keypoints[:, 3].unsqueeze(1)  # Extracting the visibility
        keypoints_visible = keypoints * visibility  # Predicted visible vertices
#         keypoints_occluded = keypoints * (1 - visibility)  # Predicted occluded vertices
        print("Vertices as encoder input", keypoints_visible)
        vertices, self.enc_e, self.edges, self.edge_features = self.gnn_encoder(keypoints_visible)  # Adjust here to include edge weights
        vertices_pred = self.gnn_decoder(vertices, self.enc_e, self.edges, self.edge_features)  # Adjust here to pass edge weights
#         vertices_pred_occluded = torch.cat((vertices_pred, keypoints_visible[:, 3].unsqueeze(1)), dim=1)
#         nonzero_indices = keypoints_occluded.nonzero(as_tuple=True)
#         if nonzero_indices[0].size()[0] > 0:  # Check if there are any non-zero elements
#             keypoints_occluded[nonzero_indices] = vertices_pred_occluded[nonzero_indices]
        return vertices_pred

    def process_image(self, img):
        img = img.unsqueeze(0).to(device)
        # Temporarily set the keypoint model to evaluation mode
        keypoint_model_training = self.keypoint_model.training  # Save the current mode
        self.keypoint_model.eval()
        with torch.no_grad():
            output = self.keypoint_model(img)
        # Set the keypoint model back to its previous mode
        self.keypoint_model.train(keypoint_model_training)
        img = (img[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
        labeled_keypoints = self.process_model_output(output)

        return labeled_keypoints

    def forward(self, imgs):
        outputs = []

        for i in range(imgs.shape[0]):
            labeled_keypoints = self.process_image(imgs[i])
            outputs.append(labeled_keypoints)
            
        print(outputs)

        return outputs
    

In [35]:
# Define the model
model = KeypointPipeline(weights_path, num_vertices=6)
model = model.to(device)

# Define the loss
criterion = OccludedKeyPointLoss()

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

num_epochs = 25  # Define your number of epochs
batch_size = 16

KEYPOINTS_FOLDER_TRAIN = train_test_split(root_dir) +"/train" #train_test_split(root_dir) +"/train"
KEYPOINTS_FOLDER_VAL = train_test_split(root_dir) +"/val"
KEYPOINTS_FOLDER_TEST = train_test_split(root_dir) +"/test"

dataset_train = KPDataset(KEYPOINTS_FOLDER_TRAIN, transform=None, demo=False)
dataset_val = KPDataset(KEYPOINTS_FOLDER_VAL, transform=None, demo=False)
dataset_test = KPDataset(KEYPOINTS_FOLDER_TEST, transform=None, demo=False)

data_loader_train = DataLoader(dataset_train, batch_size=batch_size, shuffle=True, collate_fn=collate_fn)
data_loader_val = DataLoader(dataset_val, batch_size=1, shuffle=False, collate_fn=collate_fn)
data_loader_test = DataLoader(dataset_test, batch_size=1, shuffle=False, collate_fn=collate_fn)

v = 5

# Initialize sequences for true and predicted keypoints
y_true_sequence = []
y_pred_sequence = []

model.train()
for epoch in range(num_epochs):
    start_time = time.time()
    for i, batch in enumerate(data_loader_train):
        img_tuple, target_dict_tuple, img_files = batch
        print(f"Processing batch {i+1} with images:", img_files)
        
        imgs = [img.to(device) for img in img_tuple]  # Create list of images

        # Process each image individually
        losses = []
        for i in range(len(imgs)):
            img = imgs[i].unsqueeze(0)  # Unsqueeze to add batch dimension

            # Prepare ground truth vertices for the image
            keypoints = target_dict_tuple[i]['keypoints'].to(device)
            visibility = torch.ones((keypoints.shape[0], keypoints.shape[1], 1)).to(device)
            vertices_gt = torch.cat((keypoints, visibility), dim=2).unsqueeze(0)  # Unsqueeze to add batch dimension
            vertices_gt = vertices_gt.squeeze()            
            y_true_sequence.append(vertices_gt)

            # Forward pass
            output = model(img)
            vertices_pred = output[0]
            y_pred_sequence.append(vertices_pred)
            
            edges_prob = model.enc_e
            edges = model.edges
            edge_features = model.edge_features
            edges_gt = torch.cat((edges, edge_features), dim=1) 

            # Compute loss for the image
            huber_loss = criterion(vertices_pred, vertices_gt)
            ce_loss = edge_loss(edges_prob, edges_gt)
            vis_loss = visibility_loss(vertices_pred, vertices_gt)

            loss = huber_loss + ce_loss + vis_loss
            losses.append(loss)  # Store loss for the image
            
        # Convert true and predicted sequences to tensors
        y_true_tensor = torch.stack(y_true_sequence)
        y_pred_tensor = torch.stack(y_pred_sequence)
        
        # Compute temporal consistency loss
        temporal_loss = temporal_consistency_loss(y_true_tensor, y_pred_tensor)

        # Average loss over all images in the batch
        other_losses = torch.mean(torch.stack(losses))
        
        # Combine temporal loss with other losses
        total_loss = other_losses + temporal_loss

        # Backward pass and optimization
        optimizer.zero_grad()
        total_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        # Clear the sequences for the next batch
        y_true_sequence.clear()
        y_pred_sequence.clear()

    end_time = time.time()
    epoch_time = end_time - start_time
    eta = epoch_time * (num_epochs - epoch - 1)
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}, ETA: {eta} seconds')

model_save_path = f"/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_weights_occ_b{batch_size}_e{num_epochs}_v{v}.pth"

torch.save(model, model_save_path)
    
# Save the state dict of the model, not the entire model
# torch.save(model.state_dict(), model_save_path)
    
torch.save(model, model_save_path)



Copying files: 2662 files [00:00, 20517.09 files/s]
Copying files: 2662 files [00:00, 20655.71 files/s]
Copying files: 2662 files [00:00, 20243.75 files/s]


Processing batch 1 with images: ('001174.rgb.jpg', '001184.rgb.jpg', '000835.rgb.jpg', '001241.rgb.jpg', '000032.rgb.jpg', '000579.rgb.jpg', '000187.rgb.jpg', '000005.rgb.jpg', '000525.rgb.jpg', '000345.rgb.jpg', '001172.rgb.jpg', '000199.rgb.jpg', '001035.rgb.jpg', '000431.rgb.jpg', '001202.rgb.jpg', '000482.rgb.jpg')
Vertices as encoder input tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [304.0000, 218.0000,   0.9997,   1.0000,   3.0000],
        [321.0000, 230.0000,   0.9997,   1.0000,   4.0000],
        [381.0000, 308.0000,   0.9991,   1.0000,   5.0000],
        [398.0000, 324.0000,   0.9972,   1.0000,   6.0000]], device='cuda:0')
Vertices in node features tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [304.0000, 218.0000,   0.9997,   1.0000,   3.0000],
        [321.0000, 230.0000,   0.9997,   1.0000,   4.0000],
        [

Vertices as encoder input tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   1.0000,   1.0000,   2.0000],
        [211.0000, 218.0000,   0.9998,   1.0000,   3.0000],
        [229.0000, 206.0000,   0.9997,   1.0000,   4.0000],
        [216.0000, 107.0000,   0.9977,   1.0000,   5.0000],
        [208.0000,  86.0000,   0.9988,   1.0000,   6.0000]], device='cuda:0')
Vertices in node features tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   1.0000,   1.0000,   2.0000],
        [211.0000, 218.0000,   0.9998,   1.0000,   3.0000],
        [229.0000, 206.0000,   0.9997,   1.0000,   4.0000],
        [216.0000, 107.0000,   0.9977,   1.0000,   5.0000],
        [208.0000,  86.0000,   0.9988,   1.0000,   6.0000]], device='cuda:0')
tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   1.0000,   1.0000,   2.0000],
        [211.0000, 218.0000,   0.9998,   1.0000,   3.0000],
        [229

Vertices as encoder input tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [178.0000, 276.0000,   0.9998,   1.0000,   3.0000],
        [180.0000, 255.0000,   0.9997,   1.0000,   4.0000],
        [209.0000, 161.0000,   0.9991,   1.0000,   5.0000],
        [230.0000, 167.0000,   0.9983,   1.0000,   6.0000]], device='cuda:0')
Vertices in node features tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [178.0000, 276.0000,   0.9998,   1.0000,   3.0000],
        [180.0000, 255.0000,   0.9997,   1.0000,   4.0000],
        [209.0000, 161.0000,   0.9991,   1.0000,   5.0000],
        [230.0000, 167.0000,   0.9983,   1.0000,   6.0000]], device='cuda:0')
tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [178.0000, 276.0000,   0.9998,   1.0000,   3.0000],
        [180

Vertices as encoder input tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [310.0000, 223.0000,   0.9994,   1.0000,   3.0000],
        [326.0000, 237.0000,   0.9998,   1.0000,   4.0000],
        [407.0000, 292.0000,   0.9990,   1.0000,   5.0000],
        [425.0000, 306.0000,   0.9966,   1.0000,   6.0000]], device='cuda:0')
Vertices in node features tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [310.0000, 223.0000,   0.9994,   1.0000,   3.0000],
        [326.0000, 237.0000,   0.9998,   1.0000,   4.0000],
        [407.0000, 292.0000,   0.9990,   1.0000,   5.0000],
        [425.0000, 306.0000,   0.9966,   1.0000,   6.0000]], device='cuda:0')
tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [310.0000, 223.0000,   0.9994,   1.0000,   3.0000],
        [326

Vertices as encoder input tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   1.0000,   1.0000,   2.0000],
        [211.0000, 218.0000,   0.9996,   1.0000,   3.0000],
        [228.0000, 206.0000,   0.9996,   1.0000,   4.0000],
        [191.0000, 114.0000,   0.9982,   1.0000,   5.0000],
        [202.0000,  96.0000,   0.9966,   1.0000,   6.0000]], device='cuda:0')
Vertices in node features tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   1.0000,   1.0000,   2.0000],
        [211.0000, 218.0000,   0.9996,   1.0000,   3.0000],
        [228.0000, 206.0000,   0.9996,   1.0000,   4.0000],
        [191.0000, 114.0000,   0.9982,   1.0000,   5.0000],
        [202.0000,  96.0000,   0.9966,   1.0000,   6.0000]], device='cuda:0')
tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   1.0000,   1.0000,   2.0000],
        [211.0000, 218.0000,   0.9996,   1.0000,   3.0000],
        [228

Vertices as encoder input tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [281.0000, 207.0000,   0.9998,   1.0000,   3.0000],
        [301.0000, 213.0000,   0.9998,   1.0000,   4.0000],
        [386.0000, 266.0000,   0.9979,   1.0000,   5.0000],
        [407.0000, 264.0000,   0.9985,   1.0000,   6.0000]], device='cuda:0')
Vertices in node features tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [281.0000, 207.0000,   0.9998,   1.0000,   3.0000],
        [301.0000, 213.0000,   0.9998,   1.0000,   4.0000],
        [386.0000, 266.0000,   0.9979,   1.0000,   5.0000],
        [407.0000, 264.0000,   0.9985,   1.0000,   6.0000]], device='cuda:0')
tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [281.0000, 207.0000,   0.9998,   1.0000,   3.0000],
        [301

Vertices as encoder input tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [282.0000, 207.0000,   0.9999,   1.0000,   3.0000],
        [302.0000, 214.0000,   0.9999,   1.0000,   4.0000],
        [391.0000, 170.0000,   0.9983,   1.0000,   5.0000],
        [413.0000, 169.0000,   0.9981,   1.0000,   6.0000]], device='cuda:0')
Vertices in node features tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [282.0000, 207.0000,   0.9999,   1.0000,   3.0000],
        [302.0000, 214.0000,   0.9999,   1.0000,   4.0000],
        [391.0000, 170.0000,   0.9983,   1.0000,   5.0000],
        [413.0000, 169.0000,   0.9981,   1.0000,   6.0000]], device='cuda:0')
tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [282.0000, 207.0000,   0.9999,   1.0000,   3.0000],
        [302

Vertices as encoder input tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [296.0000, 213.0000,   0.9997,   1.0000,   3.0000],
        [314.0000, 223.0000,   0.9998,   1.0000,   4.0000],
        [366.0000, 308.0000,   0.9989,   1.0000,   5.0000],
        [358.0000, 324.0000,   0.9908,   1.0000,   6.0000]], device='cuda:0')
Vertices in node features tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [296.0000, 213.0000,   0.9997,   1.0000,   3.0000],
        [314.0000, 223.0000,   0.9998,   1.0000,   4.0000],
        [366.0000, 308.0000,   0.9989,   1.0000,   5.0000],
        [358.0000, 324.0000,   0.9908,   1.0000,   6.0000]], device='cuda:0')
tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [296.0000, 213.0000,   0.9997,   1.0000,   3.0000],
        [314

KeyboardInterrupt: 

In [40]:
def visualize_and_save(img, vertices, filename):
    print("type of image befor conversion",type(img))    
    print("type of vertices before conversion", type(vertices))
    print(img)
    img = (img.permute(1,2,0).cpu().numpy() * 255).astype(np.uint8)
#     img = (img * 255).astype(np.uint8)  # Convert back from [0, 1] range to [0, 255]
    img = cv2.cvtColor(img, cv2.COLOR_RGB2BGR)
    vertices = vertices.cpu().numpy()

    print(f"Image shape before saving: {img.shape}")  # print the image shape
    print("type of vertices", type(vertices))
#     print("entered vertices", vertices)
#     print("entered image", img)

    # Convert grayscale to BGR if necessary
    if len(img.shape) == 2:
        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
        
    for i in range(vertices.shape[0]):
        img = cv2.circle(img, (int(vertices[i, 0]), int(vertices[i, 1])), radius=2, color=(0, 0, 255), thickness=-1)
        
    result = cv2.imwrite(filename, img)
    print(f"Image saved at {filename}: {result}")  # print if save was successful

    # If the image didn't save correctly, save the image data to a text file for examination
    if not result:
        with open(filename + ".txt", "w") as f:
            np.savetxt(f, img.flatten())

In [41]:
def test_and_save_model(model, data_loader_test):
    model.eval()  # Set the model to evaluation mode
    total_loss = 0.0
    total_trifocal_loss = 0.0
    total_ce_loss = 0.0
    total_vis_loss = 0.0
    num_batches = 0

    y_true_sequence = []
    y_pred_sequence = []

    # We don't need to track gradients during evaluation
    with torch.no_grad():
        for idx, batch in enumerate(data_loader_test):
            img_tuple, target_dict_tuple, img_files = batch

            total_batch_loss = 0.0
            total_batch_trifocal_loss = 0.0
            total_batch_ce_loss = 0.0
            total_batch_vis_loss = 0.0

            # Process each image individually
            for i in range(len(img_tuple)):
                img = img_tuple[i].to(device)
                target = target_dict_tuple[i]

                # Prepare ground truth vertices for the image
                keypoints = target['keypoints'].to(device)
                visibility = torch.ones((keypoints.shape[0], keypoints.shape[1], 1)).to(device)
                vertices_gt = torch.cat((keypoints, visibility), dim=2).unsqueeze(0)  # Unsqueeze to add batch dimension
                vertices_gt = vertices_gt.squeeze()
                y_true_sequence.append(vertices_gt)

                # Forward pass
                output = model(img.unsqueeze(0))
                vertices_pred = output[0]
                y_pred_sequence.append(vertices_pred)

                edges_prob = model.enc_e
                edges = model.edges
                edge_features = model.edge_features
                edges_gt = torch.cat((edges, edge_features), dim=1) 

                trifocal_loss = criterion(vertices_pred, vertices_gt)
                ce_loss = edge_loss(edges_prob, edges_gt)
                vis_loss = visibility_loss(vertices_pred, vertices_gt)
                loss = trifocal_loss + ce_loss + vis_loss

                total_batch_loss += loss.item()
                total_batch_trifocal_loss += trifocal_loss.item()
                total_batch_ce_loss += ce_loss.item()
                total_batch_vis_loss += vis_loss.item()

                # Visualize and save the prediction
                filename = f'/home/jc-merlab/Pictures/Data/occ_vis_data/image_{idx}_{i}.jpg'
                visualize_and_save(img, vertices_pred, filename)
                print(f"Image saved at {filename}")  # Print statement to confirm image save

            # Convert true and predicted sequences to tensors
            y_true_tensor = torch.stack(y_true_sequence)
            y_pred_tensor = torch.stack(y_pred_sequence)

            # Compute temporal consistency loss
            temporal_loss = temporal_consistency_loss(y_true_tensor, y_pred_tensor)

            total_loss += (total_batch_loss + temporal_loss.item()) / len(img_tuple)
            total_trifocal_loss += total_batch_trifocal_loss / len(img_tuple)
            total_ce_loss += total_batch_ce_loss / len(img_tuple)
            num_batches += 1

            # Clear the sequences for the next batch
            y_true_sequence.clear()
            y_pred_sequence.clear()
    
    # Average the loss over all batches
    avg_loss = total_loss / num_batches
    avg_trifocal_loss = total_trifocal_loss / num_batches
    avg_ce_loss = total_ce_loss / num_batches
    
    print(f'Avg. Test Loss: {avg_loss}, Avg. Trifocal Loss: {avg_trifocal_loss}, Avg. Cross Entropy Loss: {avg_ce_loss}')
    return avg_loss, avg_trifocal_loss, avg_ce_loss

In [42]:
# avg_loss, avg_trifocal_loss, avg_ce_loss, all_preds = test_and_save_model(model, data_loader_test)

avg_loss, avg_trifocal_loss, avg_ce_loss = test_and_save_model(model, data_loader_test)

Vertices as encoder input tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [179.0000, 298.0000,   0.9992,   1.0000,   3.0000],
        [176.0000, 278.0000,   0.9997,   1.0000,   4.0000],
        [230.0000, 195.0000,   0.9985,   1.0000,   5.0000],
        [249.0000, 203.0000,   0.9990,   1.0000,   6.0000]], device='cuda:0')
Vertices in node features tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [179.0000, 298.0000,   0.9992,   1.0000,   3.0000],
        [176.0000, 278.0000,   0.9997,   1.0000,   4.0000],
        [230.0000, 195.0000,   0.9985,   1.0000,   5.0000],
        [249.0000, 203.0000,   0.9990,   1.0000,   6.0000]], device='cuda:0')
tensor([[258.0000, 367.0000,   1.0000,   1.0000,   1.0000],
        [258.0000, 283.0000,   0.9999,   1.0000,   2.0000],
        [179.0000, 298.0000,   0.9992,   1.0000,   3.0000],
        [176

AttributeError: 'int' object has no attribute 'item'

In [None]:
import cv2
import os

# Directory containing images
dir_path = '/home/jc-merlab/Pictures/Data/occ_vis_data/'
images = []

# Ensure the images are sorted by name
for f in sorted(os.listdir(dir_path)):
    if f.endswith('.jpg') or f.endswith('.png'):  # Check for image file extension
        images.append(f)

# Determine the width and height from the first image
image_path = os.path.join(dir_path, images[0])
frame = cv2.imread(image_path)
cv2.imshow('video',frame)
height, width, channels = frame.shape

# Define the codec and create a VideoWriter object
fourcc = cv2.VideoWriter_fourcc(*'mp4v')  # Be sure to use the correct codec
video_filename = 'output.mp4'
video = cv2.VideoWriter(video_filename, fourcc, 3.0, (width, height))

for image in images:
    image_path = os.path.join(dir_path, image)
    frame = cv2.imread(image_path)
    video.write(frame)  # Write out frame to video

# Release everything when job is finished
video.release()
cv2.destroyAllWindows()

print("The output video is", video_filename)

In [23]:
model_path = '/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_weights_occ_b8_e25_v4.pth'

model = torch.load(model_path).to(device)


image = Image.open("/home/jc-merlab/Pictures/Data/planar_occluded/002626.rgb.jpg")
print(type(image))

img = F.to_tensor(image).to(device)
img.unsqueeze_(0)
# print(image.shape)
# image = list(image)
# print(type(images))
# images = list(image.to(device) for image in images)

with torch.no_grad():
    model.to(device)
    model.eval()
    output = model(img)
    
keypoints = output[0]

print(keypoints)
plt.imshow(image)

# Assuming each keypoint is a tensor representing (x, y)
for i, keypoint in enumerate(keypoints):
    print(f'Key point {i}: {keypoint}')
    keypoint = keypoint.cpu().numpy()
    plt.plot(keypoint[0], keypoint[1], 'ro')
plt.show()

# Plotting the image

# plt.imshow(image)

# for keypoint in output[0]:
#     plt.plot(keypoint[0], keypoint[1], 'ro')

# plt.show()

<class 'PIL.JpegImagePlugin.JpegImageFile'>
h1 shape torch.Size([6, 128])
value of edges tensor([[1, 2],
        [2, 3],
        [4, 5]], device='cuda:0')
h1_source shape:  torch.Size([3, 128])
h1_target shape:  torch.Size([3, 128])
edges_weights shape:  torch.Size([3])


../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [64,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [65,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [66,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [67,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [68,0,0] Assertion `index >= -sizes[i] && index < sizes[i] && "index out of bounds"` failed.
../aten/src/ATen/native/cuda/IndexKernel.cu:92: operator(): block: [0,0,0], thread: [69,0,0] Assertion `index >= -s

RuntimeError: CUDA error: device-side assert triggered
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.
