In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import json
import os
from typing import List

import matplotlib.image as mpimg
import matplotlib.patches as patches
import numpy as np
import torch
import torchvision
from matplotlib import pyplot as plt
from PIL import Image
from pytorch_metric_learning.distances import CosineSimilarity
from pytorch_metric_learning.losses import ContrastiveLoss, LiftedStructureLoss
from torch import nn
from torch.utils.data import DataLoader
# from base import Scene

# Print small floats in fixed-point notation rather than scientific notation.
np.set_printoptions(suppress=True)
# %matplotlib ipympl

In [None]:
from mmdet.apis import init_detector, inference_detector, show_result_pyplot
from torchvision.models import resnet18, ResNet18_Weights
from torchvision.models import resnet50, ResNet50_Weights
from torchvision.models import resnet101, ResNet101_Weights

# mmdetection config and checkpoint for the person detector used below
# (per the file names: Mask R-CNN with a Swin-T backbone trained on COCO).
config_file = '../../Projects/mmdetection/configs/swin/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco.py'
checkpoint_file = '../../Projects/mmdetection/checkpoints/mask_rcnn_swin-t-p4-w7_fpn_fp16_ms-crop-3x_coco_20210908_165006-90a4008c.pth'

# Build the detector from the config file and the checkpoint file; it is placed on the first GPU.
model = init_detector(config_file, checkpoint_file, device='cuda:0')

# Data locations.
# NOTE(review): these are absolute, machine-specific paths; adjust them before
# running the notebook anywhere else.
# folder_dir   - root holding `sequence.<n>/step0.frame_data.json` plus the captured images.
# training_dir - where the per-label crops are written and later read by ImageFolder.
# testing_dir  - not referenced by any cell visible in this notebook.
folder_dir = 'C:/Users/Leonard/AppData/LocalLow/DefaultCompany/Perception2/solo_17'
training_dir = 'C:/Users/Leonard/data'
testing_dir = 'C:/Users/Leonard/data_test'

In [None]:

# Build the metric-learning training set: cut every ground-truth 2D box out of
# sequences 0..99 and store the crop in a folder named after its label, which is
# the directory layout torchvision's ImageFolder reads in the training cell.
for i in range(100):
    print(f'\r{i}', end='')
    f_dir = f'{folder_dir}/sequence.{i}'
    with open(f'{f_dir}/step0.frame_data.json') as json_file:
        data = json.load(json_file)
    captures = data['captures']

    for j, capture in enumerate(captures):
        filename = f'{f_dir}/{capture["filename"]}'
        image = Image.open(filename).convert('RGB')
        annotations = capture['annotations']
        # Take the values of the first annotation whose id mentions "2D".
        bboxes = [anno['values'] for anno in annotations if "2D" in anno['id']][0]

        for z, bbox in enumerate(bboxes):
            # `origin` is the top-left corner and `dimension` the full extent,
            # so half the extent added to the origin gives the box centre.
            size = np.array([dim/2 for dim in bbox['dimension']])
            center = np.array(bbox['origin']) + size

            tl, br = center - size, center + size
            crop = image.crop((*tl, *br))

            # Create the label folder on first use so the export also works on
            # an empty training directory.
            label_dir = f'{training_dir}/{bbox["labelName"]}'
            os.makedirs(label_dir, exist_ok=True)
            # Include the box index `z` in the name: without it, two boxes with
            # the same label in one capture would silently overwrite each other.
            crop.save(f'{label_dir}/{i}_{j}_{z}.png')

In [None]:
# Fine-tune an ImageNet-pretrained ResNet-50 as an appearance embedding with a
# metric-learning loss. After every epoch, embed the people detected in
# sequence.100 and print how differently the first two camera_0 detections score
# against the detections of the other camera.
weights = ResNet50_Weights.IMAGENET1K_V2
resnet = resnet50(weights=weights)
# weights = ResNet101_Weights.IMAGENET1K_V2
# resnet = resnet101(weights=weights)
# Drop the last child module (the classification layer); the remaining network's
# output is indexed as (N, C, 1, 1) below.
resnet = nn.Sequential(*list(resnet.children())[:-1])
resnet = resnet.cuda()
preprocess = weights.transforms()

train_ds = torchvision.datasets.ImageFolder(training_dir, transform=preprocess)
train_loader = DataLoader(train_ds, batch_size=32, pin_memory=True, num_workers=0, shuffle=True)

torch.manual_seed(222)
distance = CosineSimilarity()
# loss_fnc = ContrastiveLoss(pos_margin=0.9, neg_margin=0.1, distance=distance)
# loss_fnc = ContrastiveLoss(pos_margin=0.1, neg_margin=0.9)
# loss_fnc = LiftedStructureLoss(pos_margin=0.9, neg_margin=0.1, distance=distance)
loss_fnc = LiftedStructureLoss(pos_margin=0.1, neg_margin=0.9)
optimizer = torch.optim.Adam(resnet.parameters(), lr=5e-5)

# Mixed-precision training: the scaler rescales the loss before backward and
# unscales inside `scaler.step`.
scaler = torch.cuda.amp.GradScaler()
step = 0
for epoch in range(3):
    # The per-epoch check below switches to eval mode, so re-enable training
    # mode at the start of every epoch.
    resnet.train()
    losses = []
    for images, labels in train_loader:
        step += 1
        optimizer.zero_grad()

        with torch.autocast(device_type='cuda', dtype=torch.float16):
            features = resnet(images.cuda())[:, :, 0, 0]
            loss = loss_fnc(features, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        losses.append(loss.item())
        # Running mean of the loss over the current epoch.
        print(f'\r{epoch} {step} {np.mean(losses)}', end='')
    print()

    # ---- per-epoch sanity check on a held-out sequence ----
    # Run inference in eval mode. In training mode every single-crop forward
    # pass would normalise with batch statistics and also update the BatchNorm
    # running statistics, which then end up in the saved weights.
    resnet.eval()

    f_dir = f'{folder_dir}/sequence.100'
    with open(f'{f_dir}/step0.frame_data.json') as json_file:
        data = json.load(json_file)
    captures = data['captures']

    # capture id -> list of {'instanceId', 'image', 'features'} per detection.
    instances = {}

    for capture in captures:
        filename = f'{f_dir}/{capture["filename"]}'
        image = Image.open(filename).convert('RGB')
        result = inference_detector(model, filename)
        # result[0][0]: rows of the first class of the first result element;
        # the last entry of a row is compared against the 0.9 score threshold.
        human_pred = [ret for ret in result[0][0] if ret[-1] > 0.9]

        for pred in human_pred:
            center = np.array([(pred[0] + pred[2])/2, (pred[1] + pred[3])/2])
            size = np.array([(pred[2]-pred[0])/2, (pred[3]-pred[1])/2])

            tl, br = center - size, center + size
            crop = image.crop((*tl, *br))
            with torch.no_grad():
                feature = resnet(preprocess(crop).unsqueeze(0).cuda()).cpu()[0, :, 0, 0]
            instances.setdefault(capture["id"], []).append({
                'instanceId': None,
                'image': crop,
                'features': feature
            })

    instances_0 = instances.get("camera_0", [])
    instances_1 = instances.get("camera", [])

    # The report compares the first two camera_0 detections against at least
    # one detection of the other camera; skip it instead of crashing the
    # training run when the detector found fewer.
    if len(instances_0) < 2 or not instances_1:
        print('not enough detections in sequence.100 to compare cameras')
        continue

    features_1 = torch.stack([instance["features"] for instance in instances_1], dim=0)

    distance = CosineSimilarity()
    # One similarity row per camera_0 detection, one column per detection of
    # the other camera.
    ds = [distance(instance["features"].unsqueeze(0), features_1) for instance in instances_0]
    # Element-wise gap between the two rows.
    print(abs(ds[1] - ds[0]))


tensor([[0.1167, 0.0438]])  
tensor([[0.2519, 0.1248]])  
tensor([[0.3222, 0.1715]])  
tensor([[0.3459, 0.2030]])  
tensor([[0.3637, 0.2037]])  
tensor([[0.3626, 0.1916]])  
tensor([[0.3517, 0.1961]])  
tensor([[0.3379, 0.1876]])  
tensor([[0.3295, 0.1583]])  
tensor([[0.3229, 0.1450]])  

In [None]:
# Scratch note left in a code cell; as written it evaluates to the tuple (218.972, 0.2).
# NOTE(review): presumably shorthand for "seed 222, 3 epochs, loss ~0.028, similarity
# gap ~0.20" from the training cell - confirm, and move it to a markdown cell.
222-3-.028, 0.20

In [None]:
# Persist the current weights of `resnet` to ./best.pth (relative to the
# notebook's working directory).
torch.save(resnet.state_dict(), './best.pth')

In [None]:
# Build a fresh ImageNet-pretrained ResNet-50 feature extractor (last child
# module removed) on the GPU, together with its matching preprocessing.
# NOTE(review): this replaces the `resnet` object trained earlier with stock
# ImageNet weights and does not load './best.pth'. Confirm whether the
# evaluation that follows is meant to score the baseline or the fine-tuned model.
weights = ResNet50_Weights.IMAGENET1K_V2
preprocess = weights.transforms()

resnet = nn.Sequential(*list(resnet50(weights=weights).children())[:-1]).cuda()

In [None]:
# Cross-camera re-identification check on sequences 100..149.
# For each sequence: embed the ground-truth boxes, run the detector, attach each
# detection to the ground-truth instance it overlaps most (IoU), then test
# whether the embeddings of instances 1 and 2 in 'camera_0' pair up with the
# same instances in 'camera'.
folder_dir = 'C:/Users/Leonard/AppData/LocalLow/DefaultCompany/Perception2/solo_17'
testing_dir = 'C:/Users/Leonard/data_test'

distance = CosineSimilarity()
# Placeholder embedding for an instance without a matched detection. It must be
# defined before the loop because the accuracy checks compare against it.
zero = torch.zeros(2048)


def _box_iou(tl_a, br_a, tl_b, br_b):
    """Return the IoU of two axis-aligned boxes given by their corner arrays.

    The overlap extents are clamped at zero so disjoint boxes yield an IoU of 0
    instead of a spurious positive value from multiplying two negative extents.
    """
    overlap = np.clip(np.minimum(br_a, br_b) - np.maximum(tl_a, tl_b), 0, None)
    intersection = overlap[0] * overlap[1]
    union = np.prod(br_a - tl_a) + np.prod(br_b - tl_b) - intersection
    return intersection / (union + 1e-8)


def _pred_features(views, camera_id):
    """Return the embedding of the detection matched in `camera_id`, or `zero` if there is none."""
    return views.get(camera_id, {}).get('pred', {'features': zero})['features']


acc = 0    # matches counted as correct
count = 0  # sequences-per-camera in which both instances were detected
n_obj = 0  # sum over sequences of the smallest per-camera detection count

resnet.eval()
for i in range(100, 150):
    f_dir = f'{folder_dir}/sequence.{i}'
    with open(f'{f_dir}/step0.frame_data.json') as json_file:
        data = json.load(json_file)
    captures = data['captures']

    # instanceId -> camera id -> {'gt': {...}, 'pred': {...}}
    bbox_dict = {}
    # Start above any plausible detection count; reduced to the per-camera minimum.
    det_obj = 100
    for capture in captures:
        filename = f'{f_dir}/{capture["filename"]}'
        image = Image.open(filename).convert('RGB')
        camera_id = capture['id']

        # Ground-truth boxes and instance ids.
        annotations = capture['annotations']
        bboxes = [anno['values'] for anno in annotations if "2D" in anno['id']][0]
        for bbox in bboxes:
            size = np.array([dim/2 for dim in bbox['dimension']])
            center = np.array(bbox['origin']) + size
            tl, br = center - size, center + size
            crop = image.crop((*tl, *br))
            with torch.no_grad():
                feature_temp = resnet(preprocess(crop).unsqueeze(0).cuda()).cpu()[0, :, 0, 0]
            bbox_dict.setdefault(bbox['instanceId'], {})[camera_id] = {
                'gt': {
                    'center': center,
                    'size': size,
                    'features': feature_temp,
                    'image': crop,
                }
            }

        # Detector predictions: keep rows of the first class whose last entry
        # (compared as a score) exceeds 0.9.
        result = inference_detector(model, filename)
        human_pred = [ret for ret in result[0][0] if ret[-1] > 0.9]
        det_obj = min(len(human_pred), det_obj)

        for pred in human_pred:
            center = np.array([(pred[0] + pred[2])/2, (pred[1] + pred[3])/2])
            size = np.array([(pred[2]-pred[0])/2, (pred[3]-pred[1])/2])
            tl, br = center - size, center + size

            # Find the ground-truth instance in this camera with the highest IoU.
            max_iou = 0
            key_idx = None
            for key, views in bbox_dict.items():
                if camera_id not in views:
                    # Instance annotated in another camera only.
                    continue
                gt = views[camera_id]['gt']
                iou = _box_iou(tl, br, gt['center'] - gt['size'], gt['center'] + gt['size'])
                if iou > max_iou:
                    key_idx = key
                    max_iou = iou

            if key_idx is None:
                # Detection overlaps no annotated instance; nothing to attach it to.
                continue

            crop = image.crop((*tl, *br))
            with torch.no_grad():
                feature_temp = resnet(preprocess(crop).unsqueeze(0).cuda()).cpu()[0, :, 0, 0]
            # A later detection matched to the same instance replaces an earlier one.
            bbox_dict[key_idx][camera_id]['pred'] = {
                'center': center,
                'size': size,
                'features': feature_temp,
            }
    n_obj += det_obj

    # Embeddings of instances 1 and 2 as detected in each camera.
    f_1_0 = _pred_features(bbox_dict.get(1, {}), 'camera_0')
    f_1_1 = _pred_features(bbox_dict.get(1, {}), 'camera')

    f_2_0 = _pred_features(bbox_dict.get(2, {}), 'camera_0')
    f_2_1 = _pred_features(bbox_dict.get(2, {}), 'camera')

    # Rows: camera_0 instances 1 and 2; columns: 'camera' instances 1 and 2.
    fs = torch.stack([f_1_1, f_2_1], dim=0)
    s_12 = distance(f_1_0.unsqueeze(0), fs)
    s_22 = distance(f_2_0.unsqueeze(0), fs)

    # For each column, the row index with the highest similarity.
    a, b = torch.concat([s_12, s_22], dim=0).argmax(0).tolist()
    print(i, a, b)
    if a == 0 and not f_1_0.eq(zero).all(): acc += 1
    if b == 1 and not f_2_0.eq(zero).all(): acc += 1

    if not (f_1_0.eq(zero).all() or f_2_0.eq(zero).all()):
        count += 1
    if not (f_1_1.eq(zero).all() or f_2_1.eq(zero).all()):
        count += 1

print()
print(acc)
print(count, n_obj)

In [None]:
# Scratch arithmetic left in a code cell; evaluates to about 0.571.
# NOTE(review): presumably the ratio of correct matches to a total from an
# evaluation run - confirm which totals these are.
52/91

In [None]:
# Inspect what was recorded for instance 1 in the 'camera' view of the last
# sequence processed by the evaluation loop.
bbox_dict[1]['camera']

In [None]:
# Similarity breakdown for whatever sequence `bbox_dict` currently holds:
# detection embeddings of instances 1 and 2 in 'camera_0' versus 'camera'.
(f_1_0, f_1_1), (f_2_0, f_2_1) = (
    tuple(
        bbox_dict[instance_id][camera]['pred']['features']
        for camera in ('camera_0', 'camera')
    )
    for instance_id in (1, 2)
)

distance = CosineSimilarity()

# Reference set: both instances as seen by 'camera'.
fs = torch.stack((f_1_1, f_2_1))
s_12 = distance(f_1_0[None], fs)
s_22 = distance(f_2_0[None], fs)

print(s_12, s_22, '-' * 30, sep='\n')

# Rows: camera_0 instances; columns: 'camera' instances.
similarity = torch.concat((s_12, s_22))
print(similarity)

# Per column, the best-scoring row; printed 1-based so it reads as an instance id.
i, j = similarity.argmax(0).tolist()
print(i + 1, j + 1)


In [None]:
# Shape probe: run the most recent `crop` through the truncated ResNet and show
# the raw output shape before the `[0, :, 0, 0]` indexing used elsewhere.
resnet(preprocess(crop).unsqueeze(0).cuda()).cpu().shape

In [None]:


    # max_iou = 0
    # for j, capture in enumerate(captures):
    #     tl_, br_ = center - size, center + size
    #     t_, l_ = tl_
    #     b_, r_ = br_

    #     o = np.array([max(t, t_), max(l, l_)])
    #     u = np.array([min(b, b_), min(r, r_)])

    #     w = u[0] - o[0]
    #     h = u[1] - o[1]
    #     intersection = w * h
    #     union = ((b-t)*(r-l)) + ((b_-t_)*(r_-l_)) - intersection
    #     iou = intersection / (union + 1e-8)
    #     max_iou = max(max_iou, iou)

    # instances = {}

    # for j, capture in enumerate(captures):
    #     filename = f'{f_dir}/{capture["filename"]}'
    #     image = Image.open(filename).convert('RGB')
    #     annotations = capture['annotations']
    #     result = inference_detector(model, filename)
    #     human_pred = [ret for ret in result[0][0] if ret[-1] > 0.9]
        
    #     for pred in human_pred:
    #         center=np.array([(pred[0] + pred[2])/2, (pred[1] + pred[3])/2])
    #         size=np.array([(pred[2]-pred[0])/2, (pred[3]-pred[1])/2])
            
    #     # bboxes = [anno['values'] for anno in annotations if "2D" in anno['id']][0]
        
    #     # for z, bbox in enumerate(bboxes):
    #     #     size = np.array([dim/2 for dim in bbox['dimension']])
    #     #     center = np.array(bbox['origin']) + size

    #         tl, br = center - size, center + size
    #         crop = image.crop((*tl, *br))
            
    #         with torch.no_grad():
    #             feature = resnet(preprocess(crop).unsqueeze(0).cuda()).cpu()[0, :, 0, 0]
    #         feature_list.append(feature)
    #         obj = {
    #             'instanceId': None,
    #             'image': crop,
    #             'features': feature
    #         }
    #         if instances.get(capture["id"], None) is None:
    #             instances[capture["id"]] = [obj]
    #         else:
    #             instances[capture["id"]].append(obj)

    # instances_0 = instances["camera_0"]
    # instances_1 = instances["camera"]

    # features_1 = torch.stack([instance["features"] for instance in instances_1], dim=0)

    # distance = CosineSimilarity()
    # for instance in instances_0:
    #     f = instance["features"].unsqueeze(0)

    #     d = distance(f, features_1)
    #     print(d)

    # break

In [None]:
# Print, for every detection stored under 'camera_0', its cosine similarity to
# all detections stored under 'camera'. Relies on `instances` left behind by the
# training cell.
instance_names = ['camera_0', 'camera']

instances_0, instances_1 = (instances[name] for name in instance_names)

features_1 = torch.stack([entry["features"] for entry in instances_1])

distance = CosineSimilarity()
for instance in instances_0:
    # Add a leading batch axis so the query is a one-row matrix.
    features = instance["features"][None]

    d = distance(features, features_1)
    print(d)

In [None]:
# Peek at the first 10 embedding components of every 'camera' detection.
features_1[:, :10]