In [56]:
import os
from os import listdir
import pandas as pd
import numpy as np
import glob
import cv2
import json
from os.path import expanduser
import splitfolders
import shutil
from define_path import Def_Path

from tqdm import tqdm

import torch 
import torchvision
from torchvision import models
from torchvision.models.detection.rpn import AnchorGenerator
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn 
import torchvision.transforms as T
from torchvision.transforms import functional as F
from torchsummary import summary

from sklearn.model_selection import train_test_split

import albumentations as A # Library for augmentations

import matplotlib.pyplot as plt 
from PIL import Image

import transforms, utils, engine, train
from utils import collate_fn
from engine import train_one_epoch, evaluate


t = torch.cuda.get_device_properties(0).total_memory
print(t)
torch.cuda.empty_cache()

r = torch.cuda.memory_reserved(0)
print(r)
a = torch.cuda.memory_allocated(0)
print(a)
# f = r-a  # free inside reserved

weights_path = '/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_weights_sim_b1_e25_v0.pth'

16908615680
1006632960
976208896


In [57]:
# to generalize home directory. User can change their parent path without entering their home directory
path = Def_Path()

parent_path =  path.home + "/Pictures/" + "Data/"

root_dir = parent_path + path.year + "-" + path.month + "-" + path.day + "/"

In [58]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# torch.cuda.set_per_process_memory_fraction(0.9, 0)
print(device)

cuda


In [59]:
def train_transform():
    return A.Compose([
        A.Sequential([
            A.RandomRotate90(p=1), # Random rotation of an image by 90 degrees zero or more times
            A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, brightness_by_max=True, always_apply=False, p=1), # Random change of brightness & contrast
        ], p=1)
    ],
    keypoint_params=A.KeypointParams(format='xy'), # More about keypoint formats used in albumentations library read at https://albumentations.ai/docs/getting_started/keypoints_augmentation/
    bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bboxes_labels']) # Bboxes should have labels, read more at https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/
    )

In [60]:
def train_test_split(src_dir):
    dst_dir_img = src_dir + "images"
    dst_dir_anno = src_dir + "annotations"
    
    if os.path.exists(dst_dir_img) and os.path.exists(dst_dir_anno):
        print("folders exist")
    else:
        os.mkdir(dst_dir_img)
        os.mkdir(dst_dir_anno)
        
    for jpgfile in glob.iglob(os.path.join(src_dir, "*.jpg")):
        shutil.copy(jpgfile, dst_dir_img)

    for jsonfile in glob.iglob(os.path.join(src_dir, "*.json")):
        shutil.copy(jsonfile, dst_dir_anno)
        
    output = parent_path + "split_folder_output" + "-" + path.year + "-" + path.month + "-" + path.day 
    
    print(type(output))
    
    splitfolders.ratio(src_dir, # The location of dataset
                   output=output, # The output location
                   seed=42, # The number of seed
                   ratio=(.7, .2, .1), # The ratio of split dataset
                   group_prefix=None, # If your dataset contains more than one file like ".jpg", ".pdf", etc
                   move=False # If you choose to move, turn this into True
                   )
    
    shutil.rmtree(dst_dir_img)
    shutil.rmtree(dst_dir_anno)
    
    return output  
    

In [61]:
class KPDataset(Dataset):
    def __init__(self, root, transform=None, demo=False):                
        self.root = root
        self.transform = transform
        self.demo = demo # Use demo=True if you need transformed and original images (for example, for visualization purposes)
        self.imgs_files = sorted(os.listdir(os.path.join(root, "images")))
        self.annotations_files = sorted(os.listdir(os.path.join(root, "annotations")))
    
    def __getitem__(self, idx):
        img_path = os.path.join(self.root, "images", self.imgs_files[idx])
        annotations_path = os.path.join(self.root, "annotations", self.annotations_files[idx])

        img_original = cv2.imread(img_path)
        img_original = cv2.cvtColor(img_original, cv2.COLOR_BGR2RGB)
        print(img_original.shape)
        
        with open(annotations_path) as f:
            data = json.load(f)
            bboxes_original = data['bboxes']
            keypoints_original = data['keypoints']
            
            # All objects are keypoints on the robot
            bboxes_labels_original = [] 
            bboxes_labels_original.append('base_joint')
            bboxes_labels_original.append('joint2')
            bboxes_labels_original.append('joint3')
            bboxes_labels_original.append('joint4')
            bboxes_labels_original.append('joint5')
            bboxes_labels_original.append('joint6')  

        if self.transform:   
            # Converting keypoints from [x,y,visibility]-format to [x, y]-format + Flattening nested list of keypoints            
            # For example, if we have the following list of keypoints for three objects (each object has two keypoints):
            # [[obj1_kp1, obj1_kp2], [obj2_kp1, obj2_kp2], [obj3_kp1, obj3_kp2]], where each keypoint is in [x, y]-format            
            # Then we need to convert it to the following list:
            # [obj1_kp1, obj1_kp2, obj2_kp1, obj2_kp2, obj3_kp1, obj3_kp2]
            keypoints_original_flattened = [el[0:2] for kp in keypoints_original for el in kp]
            
            # Apply augmentations
            transformed = self.transform(image=img_original, bboxes=bboxes_original, bboxes_labels=bboxes_labels_original, keypoints=keypoints_original_flattened)
            img = transformed['image']
            bboxes = transformed['bboxes']
            # Unflattening list transformed['keypoints']
            # For example, if we have the following list of keypoints for three objects (each object has two keypoints):
            # [obj1_kp1, obj1_kp2, obj2_kp1, obj2_kp2, obj3_kp1, obj3_kp2], where each keypoint is in [x, y]-format
            # Then we need to convert it to the following list:
            # [[obj1_kp1, obj1_kp2], [obj2_kp1, obj2_kp2], [obj3_kp1, obj3_kp2]]
            keypoints_transformed_unflattened = np.reshape(np.array(transformed['keypoints']), (-1,1,2)).tolist()

            # Converting transformed keypoints from [x, y]-format to [x,y,visibility]-format by appending original visibilities to transformed coordinates of keypoints
            keypoints = []
            for o_idx, obj in enumerate(keypoints_transformed_unflattened):
#                 print("object", obj)
#                 print(" obj index", o_idx)# Iterating over objects
                obj_keypoints = []
                for k_idx, kp in enumerate(obj): # Iterating over keypoints in each object
                    obj_keypoints.append(kp + [keypoints_original[o_idx][k_idx][2]])
                keypoints.append(obj_keypoints)
        
        else:
            img, bboxes, keypoints = img_original, bboxes_original, keypoints_original        
        
        # Convert everything into a torch tensor        
        bboxes = torch.as_tensor(bboxes, dtype=torch.float32)       
        target = {}
        labels = [1, 2, 3, 4, 5, 6]            
        target["boxes"] = bboxes
        target["labels"] = torch.as_tensor(labels, dtype=torch.int64) # all objects are joint positions
        target["image_id"] = torch.tensor([idx])
        target["area"] = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0])
        target["iscrowd"] = torch.zeros(len(bboxes), dtype=torch.int64)
        target["keypoints"] = torch.as_tensor(keypoints, dtype=torch.float32)
        img = F.to_tensor(img)        
        bboxes_original = torch.as_tensor(bboxes_original, dtype=torch.float32)
        target_original = {}
        target_original["boxes"] = bboxes_original
        target_original["labels"] = torch.as_tensor(labels, dtype=torch.int64) # all objects are glue tubes
        target_original["image_id"] = torch.tensor([idx])
        target_original["area"] = (bboxes_original[:, 3] - bboxes_original[:, 1]) * (bboxes_original[:, 2] - bboxes_original[:, 0])
        target_original["iscrowd"] = torch.zeros(len(bboxes_original), dtype=torch.int64)
        target_original["keypoints"] = torch.as_tensor(keypoints_original, dtype=torch.float32)        
        img_original = F.to_tensor(img_original)

        if self.demo:
            return img, target, img_original, target_original
        else:
            return img, target
    
    def __len__(self):
        return len(self.imgs_files)

In [62]:
# def get_model(num_keypoints, weights_path=None):
    
#     anchor_generator = AnchorGenerator(sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0))
#     model = torchvision.models.detection.keypointrcnn_resnet50_fpn(pretrained=False,
#                                                                    pretrained_backbone=True,
#                                                                    num_keypoints=num_keypoints,
#                                                                    num_classes = 7, # Background is the first class, object is the second class
#                                                                    rpn_anchor_generator=anchor_generator)

#     if weights_path:
# #         state_dict = torch.load(weights_path)
# #         model.load_state_dict(state_dict)      
#         model = torch.load(weights_path)
        
        
#     return model

In [63]:
import torch
import torch.nn as nn
import torch.nn.functional as functional

# class GNNEncoder(nn.Module):
#     def __init__(self, graph_edges, vertices_dim=3, edges_dim=1, hidden_dim=128):
#         super(GNNEncoder, self).__init__()
#         # Define your layers here, e.g., 
#         self.f_enc = nn.Linear(3, 128)  # Assuming 3 features per vertex as per the paper
#         self.f_e1 = nn.Linear(256, 128)  # Concatenate two vertices
#         self.f_v = nn.Linear(128, 128)  # Hidden layer for vertices
#         self.f_e2 = nn.Linear(256, 128)  # Concatenate updated vertices

#         # Define the graph edges
#         self.graph_edges = torch.tensor([[1,2], [2,3], [3,4], [4,5], [5,6]]) - 1  # Subtract 1 for zero-based indexing
#         self.graph_edges = torch.cat((self.graph_edges, torch.flip(self.graph_edges, [1])), dim=0)  # Make graph bidirectional

#     def forward(self, vertices):
#         h1 = self.f_enc(vertices)
#         h1_expand_1 = h1[self.graph_edges[:,0]]
#         h1_expand_2 = h1[self.graph_edges[:,1]]
#         h_e1 = self.f_e1(torch.cat((h1_expand_1, h1_expand_2), dim=-1))  # Concatenate along the last dimension
#         h2 = self.f_v(h_e1.sum(dim=0))  # e->v
#         print("h2 shape:", h2.shape)
#         h2_prob = functional.log_softmax(h2, dim=-1) # vertices probability
#         h2_expand_1 = h2[self.graph_edges[:,0]]
#         h2_expand_2 = h2[self.graph_edges[:,1]]
#         print("h1 shape:", h1.shape)
#         print("h1_expand_1 shape:", h1_expand_1.shape)
#         print("h1_expand_2 shape:", h1_expand_2.shape)
#         print("h_e1 shape:", h_e1.shape)        
#         print("h2_expand_1 shape:", h2_expand_1.shape)
#         print("h2_expand_2 shape:", h2_expand_2.shape)
#         h_e2 = self.f_e2(torch.cat((h2_expand_1, h2_expand_2), dim=-1))  # v->e
        
#         return h2_prob, h_e2


class GNNEncoder(nn.Module):
    def __init__(self, graph_edges, vertices_dim=3, edges_dim=1, hidden_dim=128):
        super(GNNEncoder, self).__init__()
        # Define your layers here, e.g., 
        self.f_enc = nn.Linear(vertices_dim, hidden_dim)  # Assuming 3 features per vertex as per the paper
        self.f_e1 = nn.Linear(hidden_dim * 2, hidden_dim)  # Concatenate two vertices features
        self.f_v = nn.Linear(hidden_dim, hidden_dim)  # Hidden layer for vertices
        self.f_e2 = nn.Linear(hidden_dim * 2, hidden_dim)  # Concatenate updated vertices features

        # Define the graph edges
        self.graph_edges = graph_edges.to(device)

    def forward(self, vertices):
        h1 = self.f_enc(vertices)
        print("h1 shape:", h1.shape)
        h1_expand_1 = h1[self.graph_edges[:,0]]
        h1_expand_2 = h1[self.graph_edges[:,1]]
        h_e1 = self.f_e1(torch.cat((h1_expand_1, h1_expand_2), dim=-1))  # Concatenate along the last dimension
        print("h1_expand_1 shape:", h1_expand_1.shape)
        print("h1_expand_2 shape:", h1_expand_2.shape)
        print("h_e1 shape:", h_e1.shape)
        h2 = torch.tanh(h1)
        print("h2 shape:", h2)
        h2_prob = torch.sigmoid(h2)  # vertices probability
        h2_expand_1 = h2[self.graph_edges[:,0]]
        h2_expand_2 = h2[self.graph_edges[:,1]]
        print("h2_expand_1 shape:", h2_expand_1.shape)
        print("h2_expand_2 shape:", h2_expand_2.shape)
        h_e2 = self.f_e2(torch.cat((h2_expand_1, h2_expand_2), dim=-1))  # v->e
        print("h_e2 shape:", h_e2.shape)
        
        return h2_prob, h_e2

# class GNNDecoder(nn.Module):
#     def __init__(self):
#         super().__init__()
#         # Define your layers here, e.g., 
#         self.f_ep = nn.ModuleList([nn.Linear(6, 128) for _ in range(3)])  # Assuming 3 features per vertex as per the paper
#         self.f_v = nn.Linear(128, 3)  # To update vertices

#     def forward(self, vertices, edges):
#         h = sum(edge*f_ep(torch.cat((vertices[None, :], vertices[:, None]), dim=-1)) for edge, f_ep in zip(edges, self.f_ep))  # v->e
#         updated_vertices = vertices + self.f_v(h.sum(dim=1))  # e->v

class GNNDecoder(nn.Module):
    def __init__(self, graph_edges, hidden_dim=128, vertices_dim=3, num_classes=6):
        super().__init__()
        # Define your layers here
        self.f_ep = nn.Linear(2 * hidden_dim, hidden_dim)
        self.f_v = nn.Linear(hidden_dim, vertices_dim + num_classes - 1)  # To update vertices + classes

        # Define the graph edges
        self.graph_edges = torch.tensor(graph_edges) - 1  # Subtract 1 for zero-based indexing
        self.graph_edges = torch.cat((self.graph_edges, torch.flip(self.graph_edges, [1])), dim=0)  # Make graph bidirectional

    def forward(self, vertices, edges):
        vertices_expand_1 = vertices[self.graph_edges[:,0]]
        vertices_expand_2 = vertices[self.graph_edges[:,1]]
        print("Edges shape", edges.shape)
        h = self.f_ep(torch.cat((vertices_expand_1, vertices_expand_2), dim=-1))  # v->e
        print("h shape", h.shape)
        edges_expand = edges.unsqueeze(-1).unsqueeze(-1).expand_as(h)  # Expand edges to match the shape of h
        h = h * edges_expand  # apply edge weights
#         h = h * edges.unsqueeze(-1)  # apply edge weights

        h = h.view(-1, edges.size(1), h.size(-1))  # Reshape h to (num_edges, num_vertices, hidden_dim)
        
        h_summed = h.sum(dim=0)  # Sum over the edges
        updated_vertices_classes = self.f_v(h_summed)  # e->v

        updated_vertices = torch.tanh(updated_vertices_classes[:, :2])  # For x, y coordinates
        updated_classes = F.softmax(updated_vertices_classes[:, 2:], dim=1)  # For class probabilities

        updated_vertices = torch.cat((updated_vertices, updated_classes), dim=1)  # Concatenate along the last dimension

        return updated_vertices
  


In [64]:
# import torch
# import torch.nn as nn
# import torch.nn.functional as functional

# class GNNEncoder(nn.Module):
#     def __init__(self):
#         super().__init__()
#         # Define your layers here, e.g., 
#         self.f_enc = nn.Linear(3, 128)  # Assuming 3 features per vertex as per the paper
#         self.f_e1 = nn.Linear(256, 128)  # Concatenate two vertices
#         self.f_v = nn.Linear(128, 128)  # Hidden layer for vertices
#         self.f_e2 = nn.Linear(256, 128)  # Concatenate updated vertices

#     def forward(self, vertices):
#         h1 = self.f_enc(vertices)
#         N = h1.shape[0]
#         h1_expand_1 = h1.unsqueeze(1).expand(-1, h1.size(0), -1)  # Now the size is (N, N, F)
#         h1_expand_2 = h1.unsqueeze(0).expand(h1.size(0), -1, -1)  # Now the size is (N, N, F)
#         h_e1 = self.f_e1(torch.cat((h1_expand_1, h1_expand_2), dim=-1))  # Concatenate along the last dimension
# #         h_e1 = self.f_e1(torch.cat((h1[None, :], h1[:, None]), dim=-1))  # v->e
#         h2 = self.f_v(h_e1.sum(dim=1))  # e->v
#         h2_prob = functional.log_softmax(h2, dim=-1) # vertices probability
#         h2_expand_1 = h2.repeat(1, N).view(N, N, -1)
#         h2_expand_2 = h2.repeat(N, 1).view(N, N, -1)
#         h_e2 = self.f_e2(torch.cat((h2_expand_1, h2_expand_2), dim=-1))  # v->e
#         return h2_prob, h_e2 

# class GNNDecoder(nn.Module):
#     def __init__(self):
#         super().__init__()
#         # Define your layers here, e.g.,
#         self.f_ep = nn.ModuleList([nn.Linear(256, 128) for _ in range(3)])  # Assuming 3 features per vertex as per the paper
#         self.f_v = nn.Linear(128, 3)  # To update vertices

#     def forward(self, vertices, edges):
#         edges = edges.long()  # Convert edges to integer tensor
#         N = vertices.size(0)
#         expanded_vertices = vertices.unsqueeze(1).expand(-1, N, -1)  # Expand vertices tensor to match the shape of edges
#         h = sum(edge * f_ep(torch.cat((expanded_vertices, expanded_vertices.transpose(0, 1)), dim=-1)) for edge, f_ep in zip(edges, self.f_ep))  # v->e
#         updated_vertices = vertices + self.f_v(h.sum(dim=1).reshape(vertices.shape))  # e->v (reshape h)
#         return updated_vertices

# class GNNDecoder(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.f_ep = nn.ModuleList([nn.Linear(256, 128) for _ in range(3)])  # Assuming 3 features per vertex as per the paper
#         self.f_v = nn.Linear(128, 3)  # To update vertices

#     def forward(self, vertices, edges):
#         N = vertices.size(0)
#         expanded_vertices = vertices.unsqueeze(1).expand(-1, N, -1)  # Expand vertices tensor to match the shape of edges
#         h = sum(edge * f_ep(torch.cat((expanded_vertices, expanded_vertices.transpose(0, 1)), dim=-1)) for edge, f_ep in zip(edges, self.f_ep))  # v->e
#         updated_vertices = vertices + self.f_v(h.view(-1, 128)).view(vertices.size())  # e->v
#         return updated_vertices

# class GNNDecoder(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.f_ep = nn.ModuleList([nn.Linear(256, 128) for _ in range(3)])  # Assuming 3 features per vertex as per the paper
#         self.f_v = nn.Linear(128, 3)  # To update vertices

#     def forward(self, vertices, edges):
#         N = vertices.size(0)
#         expanded_vertices = vertices.unsqueeze(1).expand(-1, N, -1)  # Expand vertices tensor to match the shape of edges
#         h = sum(edge * f_ep(torch.cat((expanded_vertices, expanded_vertices.transpose(0, 1)), dim=-1)) for edge, f_ep in zip(edges, self.f_ep))  # v->e
#         updated_vertices = vertices + self.f_v(h.view(-1, 128)).view(vertices.size(0), -1)  # e->v
#         return updated_vertices

# class GNNDecoder(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.f_ep = nn.ModuleList([nn.Linear(256, 128) for _ in range(3)])  # Assuming 3 features per vertex as per the paper
#         self.f_v = nn.Linear(128, 18)  # To update vertices (assuming 6 keypoints with 3 dimensions each)

#     def forward(self, vertices, edges):
#         N = vertices.size(0)
#         expanded_vertices = vertices.unsqueeze(1).expand(-1, N, -1)  # Expand vertices tensor to match the shape of edges
#         h = sum(edge * f_ep(torch.cat((expanded_vertices, expanded_vertices.transpose(0, 1)), dim=-1)) for edge, f_ep in zip(edges, self.f_ep))  # v->e
#         updated_vertices = vertices + self.f_v(h.view(vertices.size(0), -1))  # e->v
#         return updated_vertices

# class GNNDecoder(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.f_ep = nn.ModuleList([nn.Linear(256, 128) for _ in range(3)])  # Assuming 3 features per vertex as per the paper
#         self.f_v = nn.Linear(128, 18)  # To update vertices (assuming 6 keypoints with 3 dimensions each)

#     def forward(self, vertices, edges):
#         N = vertices.size(0)
#         expanded_vertices = vertices.unsqueeze(1).expand(-1, N, -1)  # Expand vertices tensor to match the shape of edges
#         h = sum(edge * f_ep(torch.cat((expanded_vertices, expanded_vertices.transpose(0, 1)), dim=-1)) for edge, f_ep in zip(edges, self.f_ep))  # v->e
#         updated_vertices = vertices + self.f_v(h.view(vertices.size(0), -1)).view(vertices.size())  # e->v
#         return updated_vertices

# class GNNDecoder(nn.Module):
#     def __init__(self):
#         super().__init__()
#         self.f_ep = nn.ModuleList([nn.Linear(256, 128) for _ in range(3)])  # Assuming 3 features per vertex as per the paper
#         self.f_v = nn.Linear(128, 18)  # To update vertices (assuming 6 keypoints with 3 dimensions each)

#     def forward(self, vertices, edges):
#         N = vertices.size(0)
#         expanded_vertices = vertices.unsqueeze(1).expand(-1, N, -1)  # Expand vertices tensor to match the shape of edges
#         h = sum(edge * f_ep(torch.cat((expanded_vertices, expanded_vertices.transpose(0, 1)), dim=-1)) for edge, f_ep in zip(edges, self.f_ep))  # v->e
#         updated_vertices = vertices + self.f_v(h.view(vertices.size(0), -1)).view(vertices.size(0), -1)  # e->v
#         return updated_vertices

class TrifocalLoss(nn.Module):
    def __init__(self):
        super().__init__()

    def forward(self, vertices_pred, vertices_gt):
        loss = (vertices_gt - vertices_pred).pow(2).mean()  # Changed from sum() to mean()
        return loss
    
def cross_entropy_loss_func(vertices_prob, vertices_gt):
    return -torch.sum(vertices_gt * vertices_prob)

def one_hot_encode(vertices_gt, num_classes):
    """
    One-hot encode the ground truth vertices.
    :param vertices_gt: tensor of shape (N, K, D), where N is the number of samples, K is the number of keypoints, and D is the number of dimensions per keypoint.
    :param num_classes: the total number of keypoints types (classes).
    :return: one-hot encoded vertices of shape (N, K, num_classes).
    """
    # Subtract 1 for zero-based indexing
    vertices_gt = vertices_gt.long() - 1

    # Create a tensor of zeros of size (N, K, num_classes)
    one_hot = torch.zeros(vertices_gt.size(0), vertices_gt.size(1), num_classes).to(device)

    # Fill the appropriate elements with ones
    one_hot.scatter_(2, vertices_gt.unsqueeze(-1), 1)

    return one_hot

In [65]:
class KeypointPipeline(nn.Module):
    def __init__(self):
        super().__init__()
        self.keypoint_model = torch.load(weights_path).to(device)
#         self.graph_edges = torch.tensor([[0, 1], [1, 2], [2, 3], [3, 4], [4, 5]], dtype=torch.long)
        # Define edges 
        edges = [(1,2), (2,3), (3,4), (4,5), (5,6)]
        # Convert to tensor and subtract 1 for zero-based indexing
        self.graph_edges = torch.tensor(edges, dtype=torch.long) - 1
        self.gnn_encoder = GNNEncoder(self.graph_edges.to(device))
        self.gnn_decoder = GNNDecoder(self.graph_edges.to(device))

    def forward(self, img):
        print("image in keypoints eval phase", img.shape)
#         img = F.to_tensor(img).to(device)
        img.unsqueeze_(0)
        img = list(img)
        with torch.no_grad():
            self.keypoint_model.to(device)
            self.keypoint_model.eval()
            output = self.keypoint_model(img)
            
        img = (img[0].permute(1,2,0).detach().cpu().numpy() * 255).astype(np.uint8)
        scores = output[0]['scores'].detach().cpu().numpy()
        high_scores_idxs = np.where(scores > 0.7)[0].tolist() # Indexes of boxes with scores > 0.7
        post_nms_idxs = torchvision.ops.nms(output[0]['boxes'][high_scores_idxs], \
            output[0]['scores'][high_scores_idxs], 0.3).cpu().numpy() # Indexes of boxes left after applying NMS (iou_threshold=0.3)
        
        keypoints = []
        key_points = []
        for kps in output[0]['keypoints'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
            keypoints.append(list(map(int, kps[0,0:2])))
            key_points.append([list(map(int, kp[:2])) for kp in kps])

        labels = []
        for label in output[0]['labels'][high_scores_idxs][post_nms_idxs].detach().cpu().numpy():
            labels.append(label)
#         keypoints_ = [(x,_) for _,x in sorted(zip(labels,keypoints))]
#         keypoints_ = [list(x,_) for (x, _) in sorted(zip(keypoints, labels))]
        keypoints_ = [list(x) + [y] for (x, y) in sorted(zip(keypoints, labels))]
        
        print("keypoints_",keypoints_)
        
        keypoints = torch.stack([torch.tensor(kp) for kp in keypoints_]).float().to(device)

        print("keypoints", keypoints)
        print(keypoints.shape)
#         keypoints = keypoints_
        vertices_prob, edges = self.gnn_encoder(keypoints)
#         print(vertices_prob)
        print(vertices_prob.shape, edges.shape)
        vertices = self.gnn_decoder(vertices_prob, edges)
        
        return vertices, vertices_prob



In [66]:
# Define the model
model = KeypointPipeline()
model = model.to(device)

# Define the loss
criterion = TrifocalLoss()

# Define the optimizer
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Load the trained model
model.keypoint_model = torch.load(weights_path).to(device)

num_epochs = 10  # Define your number of epochs

KEYPOINTS_FOLDER_TRAIN = train_test_split(root_dir) +"/train" #train_test_split(root_dir) +"/train"
KEYPOINTS_FOLDER_VAL = train_test_split(root_dir) +"/val"
KEYPOINTS_FOLDER_TEST = train_test_split(root_dir) +"/test"

dataset_train = KPDataset(KEYPOINTS_FOLDER_TRAIN, transform=train_transform(), demo=False)
dataset_val = KPDataset(KEYPOINTS_FOLDER_VAL, transform=None, demo=False)
dataset_test = KPDataset(KEYPOINTS_FOLDER_TEST, transform=None, demo=False)

data_loader_train = DataLoader(dataset_train, batch_size=1, shuffle=True, collate_fn=collate_fn)
data_loader_val = DataLoader(dataset_val, batch_size=1, shuffle=False, collate_fn=collate_fn)
data_loader_test = DataLoader(dataset_test, batch_size=1, shuffle=False, collate_fn=collate_fn)


model.train()
# Epoch loop
for epoch in range(num_epochs):
    # For each batch in your training data
    for batch in data_loader_train:
        img_tuple, target_dict_tuple = batch
        img = img_tuple[0]
        print(img.shape)
        target = target_dict_tuple[0]

        img = img.to(device)
        vertices_gt = target['keypoints'].to(device)
        num_vertices = vertices_gt.shape[0]
        vertices_gt[:, :, 2] = torch.arange(1, num_vertices+1).unsqueeze(1).to(device)

#         vertices_gt = one_hot_encode(target['keypoints'].to(device), num_classes=6)

        
        print("ground_truth_vertex", vertices_gt)

        # Forward pass
        vertices_pred, vertices_prob = model(img)
        
        print(vertices_prob.shape)

        # Compute the losses
        trifocal_loss = criterion(vertices_pred, vertices_gt)
        ce_loss = cross_entropy_loss_func(vertices_prob, vertices_gt)

        # Combined loss
        loss = trifocal_loss + ce_loss

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Print loss for each epoch
    print(f'Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}')

  self.graph_edges = torch.tensor(graph_edges) - 1  # Subtract 1 for zero-based indexing


<class 'str'>


Copying files: 2662 files [00:00, 21202.74 files/s]


<class 'str'>


Copying files: 2662 files [00:00, 20683.99 files/s]


<class 'str'>


Copying files: 2662 files [00:00, 20460.73 files/s]

(480, 640, 3)
torch.Size([3, 640, 480])
ground_truth_vertex tensor([[[112.0801, 257.9522,   1.0000]],

        [[195.9869, 257.9597,   2.0000]],

        [[271.7243, 282.4390,   3.0000]],

        [[265.3522, 302.1998,   4.0000]],

        [[330.3802, 376.7305,   5.0000]],

        [[338.7143, 397.0408,   6.0000]]], device='cuda:0')
image in keypoints eval phase torch.Size([3, 640, 480])





keypoints_ [[112, 258, 1], [196, 258, 2], [265, 302, 4], [271, 282, 3], [330, 377, 5], [339, 397, 6]]
keypoints tensor([[112., 258.,   1.],
        [196., 258.,   2.],
        [265., 302.,   4.],
        [271., 282.,   3.],
        [330., 377.,   5.],
        [339., 397.,   6.]], device='cuda:0')
torch.Size([6, 3])
h1 shape: torch.Size([6, 128])
h1_expand_1 shape: torch.Size([5, 128])
h1_expand_2 shape: torch.Size([5, 128])
h_e1 shape: torch.Size([5, 128])
h2 shape: tensor([[ 1.0000,  1.0000, -1.0000,  1.0000, -1.0000, -1.0000, -1.0000,  1.0000,
         -1.0000,  1.0000, -1.0000,  1.0000, -1.0000,  1.0000,  1.0000,  1.0000,
         -1.0000,  1.0000, -1.0000,  1.0000,  0.9999, -1.0000, -1.0000,  1.0000,
         -1.0000, -1.0000,  1.0000, -1.0000,  1.0000,  1.0000, -1.0000, -1.0000,
          1.0000, -1.0000, -0.6751,  1.0000,  1.0000,  1.0000,  1.0000,  1.0000,
          1.0000, -1.0000, -1.0000,  1.0000,  1.0000, -1.0000, -1.0000, -1.0000,
         -1.0000, -0.9769, -1.0000,  1.0000

RuntimeError: expand(torch.cuda.FloatTensor{[5, 128, 1, 1]}, size=[10, 128]): the number of sizes provided (2) must be greater or equal to the number of dimensions in the tensor (4)

In [None]:
Please go through the following notebook and tell where I am going wrong . I am going to submit the code in two parts                                                                                                                                                  Part1:                                                                                                                                                                                      weights_path = '/home/jc-merlab/Pictures/Data/trained_models/keypointsrcnn_weights_ld_b1_e25_v2.pth'                                                      # to generalize home directory. User can change their parent path without entering their home directory
path = Def_Path()

parent_path =  path.home + "/Pictures/" + "Data/"

root_dir = parent_path + path.year + "-" + path.month + "-" + path.day + "/"                                                                                device = 'cuda' if torch.cuda.is_available() else 'cpu'
def train_transform():
    return A.Compose([
        A.Sequential([
            A.RandomRotate90(p=1), # Random rotation of an image by 90 degrees zero or more times
            A.RandomBrightnessContrast(brightness_limit=0.3, contrast_limit=0.3, brightness_by_max=True, always_apply=False, p=1), # Random change of brightness & contrast
        ], p=1)
    ],
    keypoint_params=A.KeypointParams(format='xy'), # More about keypoint formats used in albumentations library read at https://albumentations.ai/docs/getting_started/keypoints_augmentation/
    bbox_params=A.BboxParams(format='pascal_voc', label_fields=['bboxes_labels']) # Bboxes should have labels, read more at https://albumentations.ai/docs/getting_started/bounding_boxes_augmentation/
    )                                                                                                                                                                                                                  def train_test_split(src_dir):
    dst_dir_img = src_dir + "images"
    dst_dir_anno = src_dir + "annotations"
    
    if os.path.exists(dst_dir_img) and os.path.exists(dst_dir_anno):
        print("folders exist")
    else:
        os.mkdir(dst_dir_img)
        os.mkdir(dst_dir_anno)
        
    for jpgfile in glob.iglob(os.path.join(src_dir, "*.jpg")):
        shutil.copy(jpgfile, dst_dir_img)

    for jsonfile in glob.iglob(os.path.join(src_dir, "*.json")):
        shutil.copy(jsonfile, dst_dir_anno)
        
    output = parent_path + "split_folder_output" + "-" + path.year + "-" + path.month + "-" + path.day 
    
    print(type(output))
    
    splitfolders.ratio(src_dir, # The location of dataset
                   output=output, # The output location
                   seed=42, # The number of seed
                   ratio=(.7, .2, .1), # The ratio of split dataset
                   group_prefix=None, # If your dataset contains more than one file like ".jpg", ".pdf", etc
                   move=False # If you choose to move, turn this into True
                   )
    
    shutil.rmtree(dst_dir_img)
    shutil.rmtree(dst_dir_anno)
    
    return output                                                                                                                                                                                                          class KPDataset(Dataset):
    def __init__(self, root, transform=None, demo=False):                
        self.root = root
        self.transform = transform
        self.demo = demo # Use demo=True if you need transformed and original images (for example, for visualization purposes)
        self.imgs_files = sorted(os.listdir(os.path.join(root, "images")))
        self.annotations_files = sorted(os.listdir(os.path.join(root, "annotations")))
#         self.imgs_files = [file for file in sorted(os.listdir(root)) if file.endswith(".jpg")]
#         self.annotations_files = [file for file in sorted(os.listdir(root)) if file.endswith(".json")]
    
    def __getitem__(self, idx):
#         img_path = os.path.join(self.root, self.imgs_files[idx])
#         annotations_path = os.path.join(self.root, self.annotations_files[idx])
        img_path = os.path.join(self.root, "images", self.imgs_files[idx])
        annotations_path = os.path.join(self.root, "annotations", self.annotations_files[idx])

        img_original = cv2.imread(img_path)
        img_original = cv2.cvtColor(img_original, cv2.COLOR_BGR2RGB)        
        
        with open(annotations_path) as f:
            data = json.load(f)
            bboxes_original = data['bboxes'][:6]
#             print("bounding boxes", bboxes_original)
            keypoints_original = data['keypoints'][:6]
#             print("original keypoints", np.array(keypoints_original))
#             print("original keypoints shape", (np.array(keypoints_original)).shape)
            
            # All objects are keypoints on the robot
            bboxes_labels_original = [] 
            bboxes_labels_original.append('base_joint')
            bboxes_labels_original.append('joint2')
            bboxes_labels_original.append('joint3')
            bboxes_labels_original.append('joint4')
            bboxes_labels_original.append('joint5')
            bboxes_labels_original.append('joint6')  

        if self.transform:   
            # Converting keypoints from [x,y,visibility]-format to [x, y]-format + Flattening nested list of keypoints            
            # For example, if we have the following list of keypoints for three objects (each object has two keypoints):
            # [[obj1_kp1, obj1_kp2], [obj2_kp1, obj2_kp2], [obj3_kp1, obj3_kp2]], where each keypoint is in [x, y]-format            
            # Then we need to convert it to the following list:
            # [obj1_kp1, obj1_kp2, obj2_kp1, obj2_kp2, obj3_kp1, obj3_kp2]
            keypoints_original_flattened = [el[0:2] for kp in keypoints_original for el in kp]
            
            # Apply augmentations
            transformed = self.transform(image=img_original, bboxes=bboxes_original, bboxes_labels=bboxes_labels_original, keypoints=keypoints_original_flattened)
            img = transformed['image']
            bboxes = transformed['bboxes']
            # Unflattening list transformed['keypoints']
            # For example, if we have the following list of keypoints for three objects (each object has two keypoints):
            # [obj1_kp1, obj1_kp2, obj2_kp1, obj2_kp2, obj3_kp1, obj3_kp2], where each keypoint is in [x, y]-format
            # Then we need to convert it to the following list:
            # [[obj1_kp1, obj1_kp2], [obj2_kp1, obj2_kp2], [obj3_kp1, obj3_kp2]]
            keypoints_transformed_unflattened = np.reshape(np.array(transformed['keypoints']), (-1,1,2)).tolist()

            # Converting transformed keypoints from [x, y]-format to [x,y,visibility]-format by appending original visibilities to transformed coordinates of keypoints
            keypoints = []
            for o_idx, obj in enumerate(keypoints_transformed_unflattened):
#                 print("object", obj)
#                 print(" obj index", o_idx)# Iterating over objects
                obj_keypoints = []
                for k_idx, kp in enumerate(obj): # Iterating over keypoints in each object
#                     print("kp index", k_idx)
#                     print("key points",kp)
#                     print("keypoints original second iter", [keypoints_original[0][o_idx][k_idx]],
#                           [keypoints_original[o_idx][k_idx][0]], [keypoints_original[o_idx][k_idx][1]], \
#                          [keypoints_original[o_idx][k_idx][2]], [keypoints_original[o_idx][k_idx][3]])
                    # kp - coordinates of keypoint
                    # keypoints_original[o_idx][k_idx][2] - original visibility of keypoint
                    obj_keypoints.append(kp + [keypoints_original[o_idx][k_idx][2]])
                keypoints.append(obj_keypoints)
#             print(keypoints)
        
        else:
            img, bboxes, keypoints = img_original, bboxes_original, keypoints_original        
        
        # Convert everything into a torch tensor        
        bboxes = torch.as_tensor(bboxes, dtype=torch.float32)       
        target = {}
        labels = [1, 2, 3, 4, 5, 6]            
        target["boxes"] = bboxes
        target["labels"] = torch.as_tensor(labels, dtype=torch.int64) # all objects are joint positions
        target["image_id"] = torch.tensor([idx])
        target["area"] = (bboxes[:, 3] - bboxes[:, 1]) * (bboxes[:, 2] - bboxes[:, 0])
        target["iscrowd"] = torch.zeros(len(bboxes), dtype=torch.int64)
        target["keypoints"] = torch.as_tensor(keypoints, dtype=torch.float32)
        img = F.to_tensor(img)        
        bboxes_original = torch.as_tensor(bboxes_original, dtype=torch.float32)
        target_original = {}
        target_original["boxes"] = bboxes_original
        target_original["labels"] = torch.as_tensor(labels, dtype=torch.int64) # all objects are glue tubes
        target_original["image_id"] = torch.tensor([idx])
        target_original["area"] = (bboxes_original[:, 3] - bboxes_original[:, 1]) * (bboxes_original[:, 2] - bboxes_original[:, 0])
        target_original["iscrowd"] = torch.zeros(len(bboxes_original), dtype=torch.int64)
        target_original["keypoints"] = torch.as_tensor(keypoints_original, dtype=torch.float32)        
        img_original = F.to_tensor(img_original)

        if self.demo:
            return img, target, img_original, target_original
        else:
            return img, target
    
    def __len__(self):
        return len(self.imgs_files)                                                                                                                                                          def get_model(num_keypoints, weights_path=None):
    
    anchor_generator = AnchorGenerator(sizes=(32, 64, 128, 256, 512), aspect_ratios=(0.25, 0.5, 0.75, 1.0, 2.0, 3.0, 4.0))
    model = torchvision.models.detection.keypointrcnn_resnet50_fpn(pretrained=False,
                                                                   pretrained_backbone=True,
                                                                   num_keypoints=num_keypoints,
                                                                   num_classes = 7, # Background is the first class, object is the second class
                                                                   rpn_anchor_generator=anchor_generator)

    if weights_path:
        state_dict = torch.load(weights_path)
        model.load_state_dict(state_dict)        
        
    return model         