In [1]:
%load_ext autoreload
%autoreload 2
import os

import torch
from torch.utils.data import DataLoader
from datasets import TransformedRoboEireanData, RoboEireanDataWithEncoder
import utils
from pytorch_lightning.callbacks import RichProgressBar
import pytorch_lightning as pl
from models import MultiClassJetNet
import torchvision.transforms as T
torch.manual_seed(2)

<torch._C.Generator at 0x7fc408bfa190>

In [2]:
# # Default scalings for the default boxes determined by k-means clustering
# default_box_scalings = torch.tensor(
#     [
#         [0.06549374, 0.12928654],
#         [0.11965626, 0.26605093],
#         [0.20708716, 0.38876095],
#         [0.31018215, 0.47485098],
#         [0.415882, 0.8048184],
#         [0.7293086, 0.8216225],
#     ]
# )

default_box_scalings = torch.tensor(
    [
       [ 49.858948,  42.32408 ],
       [ 79.69058 ,  96.98148 ],
       [162.30188 , 125.641266],
       [229.88889 , 248.09436 ],
       [251.65385 , 434.61536 ],
       [626.4     , 461.2     ]
    ]
)

classes = ["robot"]
encoder = utils.Encoder(default_box_scalings, ["robot"])
transformed_train_data = TransformedRoboEireanData(
    os.path.join("data", "transformed", "train"), encoder
)
transformed_val_data = TransformedRoboEireanData(
    os.path.join(
        "data",
        "transformed",
        "val",
    ),
    encoder,
)


image_transforms = T.Compose(
            [
                T.Grayscale(),
                T.PILToTensor(),
                T.ConvertImageDtype(torch.float32),
                T.Resize((60, 80)),
            ]
        )
bounding_box_transforms = T.Compose([])

raw_train_data = RoboEireanDataWithEncoder(os.path.join("data", "coco_ball_nao", "train"),
                                           encoder,
                                           ["robot"], image_transforms=image_transforms, bounding_box_transforms=bounding_box_transforms)

raw_val_data = RoboEireanDataWithEncoder(os.path.join("data", "coco_ball_nao", "val"),
                                           encoder,
                                           ["robot"], image_transforms=image_transforms, bounding_box_transforms=bounding_box_transforms)


In [16]:
for i in range(1000):
    image, encoded_bounding_boxes, target_masks, encoded_target_classes = transformed_train_data[i]
    image, encoded_bounding_boxes, target_masks, encoded_target_classes = raw_train_data[i]
    print(image)

tensor([[[0.2588, 0.2578, 0.2549,  ..., 0.5049, 0.5059, 0.5059],
         [0.2510, 0.2569, 0.2588,  ..., 0.4980, 0.5000, 0.5020],
         [0.2431, 0.2529, 0.2490,  ..., 0.4941, 0.4941, 0.4941],
         ...,
         [0.2833, 0.3275, 0.3029,  ..., 0.1324, 0.1363, 0.1412],
         [0.3284, 0.3069, 0.3127,  ..., 0.1529, 0.1186, 0.1294],
         [0.3569, 0.3294, 0.3314,  ..., 0.1549, 0.1304, 0.1480]]])
tensor([[[0.4745, 0.4745, 0.4824,  ..., 0.3176, 0.3853, 0.8275],
         [0.4716, 0.4735, 0.4745,  ..., 0.6912, 1.0000, 0.5578],
         [0.4706, 0.4667, 0.4676,  ..., 0.7578, 0.4373, 0.3598],
         ...,
         [0.1588, 0.1431, 0.1598,  ..., 0.1422, 0.1529, 0.1471],
         [0.1402, 0.1696, 0.1618,  ..., 0.1784, 0.1686, 0.1784],
         [0.1833, 0.1324, 0.1382,  ..., 0.1539, 0.1333, 0.1716]]])
tensor([[[0.4284, 0.3206, 0.2235,  ..., 0.5529, 0.5333, 0.5206],
         [0.3843, 0.3422, 0.2196,  ..., 0.5343, 0.5206, 0.5118],
         [0.3696, 0.3578, 0.2157,  ..., 0.5206, 0.5098, 0.

In [14]:
train_loader.dataset

<datasets.RoboEireanDataWithEncoder at 0x7fc40773bee0>

In [10]:
train_loader.dataset.dataset.images

['camera_1001.png',
 'camera_1005.png',
 'camera_101.png',
 'camera_1011.png',
 'camera_1013.png',
 'camera_1025.png',
 'camera_1033.png',
 'camera_1047.png',
 'camera_105.png',
 'camera_1053.png',
 'camera_1057.png',
 'camera_1059.png',
 'camera_1067.png',
 'camera_1081.png',
 'camera_1085.png',
 'camera_1087.png',
 'camera_1089.png',
 'camera_109.png',
 'camera_1095.png',
 'camera_11.png',
 'camera_1109.png',
 'camera_111.png',
 'camera_1113.png',
 'camera_1119.png',
 'camera_1123.png',
 'camera_1125.png',
 'camera_1129.png',
 'camera_1131.png',
 'camera_1139.png',
 'camera_1141.png',
 'camera_1143.png',
 'camera_1147.png',
 'camera_115.png',
 'camera_1153.png',
 'camera_1157.png',
 'camera_1161.png',
 'camera_1163.png',
 'camera_1167.png',
 'camera_1173.png',
 'camera_1181.png',
 'camera_1185.png',
 'camera_1191.png',
 'camera_1197.png',
 'camera_1201.png',
 'camera_1205.png',
 'camera_1207.png',
 'camera_121.png',
 'camera_1213.png',
 'camera_1219.png',
 'camera_1223.png',
 'camera

In [11]:
train_loader.dataset.dataset.labels

['camera_1001.txt',
 'camera_1005.txt',
 'camera_101.txt',
 'camera_1011.txt',
 'camera_1013.txt',
 'camera_1025.txt',
 'camera_1033.txt',
 'camera_1047.txt',
 'camera_105.txt',
 'camera_1053.txt',
 'camera_1057.txt',
 'camera_1059.txt',
 'camera_1067.txt',
 'camera_1081.txt',
 'camera_1085.txt',
 'camera_1087.txt',
 'camera_1089.txt',
 'camera_109.txt',
 'camera_1095.txt',
 'camera_11.txt',
 'camera_1109.txt',
 'camera_111.txt',
 'camera_1113.txt',
 'camera_1119.txt',
 'camera_1123.txt',
 'camera_1125.txt',
 'camera_1129.txt',
 'camera_1131.txt',
 'camera_1139.txt',
 'camera_1141.txt',
 'camera_1143.txt',
 'camera_1147.txt',
 'camera_115.txt',
 'camera_1153.txt',
 'camera_1157.txt',
 'camera_1161.txt',
 'camera_1163.txt',
 'camera_1167.txt',
 'camera_1173.txt',
 'camera_1181.txt',
 'camera_1185.txt',
 'camera_1191.txt',
 'camera_1197.txt',
 'camera_1201.txt',
 'camera_1205.txt',
 'camera_1207.txt',
 'camera_121.txt',
 'camera_1213.txt',
 'camera_1219.txt',
 'camera_1223.txt',
 'camera

In [33]:
learning_rate = 1e-3
train_loader = DataLoader(
    raw_train_data, batch_size=32, shuffle=True, num_workers=1
)
val_loader = DataLoader(
    raw_val_data, batch_size=32, shuffle=False, num_workers=1
)


In [34]:

pl_model = MultiClassJetNet(len(classes), default_box_scalings.size(0), learning_rate)
trainer = pl.Trainer(
    limit_predict_batches=32, max_epochs=6, callbacks=[RichProgressBar()]
)
trainer.fit(model=pl_model, train_dataloaders=train_loader, val_dataloaders=val_loader)

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

`Trainer.fit` stopped: `max_epochs=6` reached.


In [52]:
pl_model.eval()
batch = next(iter(val_loader))




In [56]:
with torch.no_grad():
    predictions_single_batch = pl_model(batch[0])

In [57]:
predictions_single_batch

(tensor([[[-0.3069, -0.7462,  0.4994, -0.9650],
          [-0.2458,  0.5230, -0.6875,  0.5082],
          [-0.3937,  0.0277, -0.7175,  0.1454],
          ...,
          [ 0.2929,  0.2251, -0.5001, -0.3213],
          [ 0.2434,  0.2460, -0.7069, -1.3225],
          [ 0.2634,  0.2760, -1.3202, -1.0819]],
 
         [[-0.3617, -0.8689,  0.2454, -0.7514],
          [-0.1356,  0.6806, -0.3882,  0.2912],
          [-0.2571,  0.1707, -0.6621,  0.0355],
          ...,
          [ 0.2406,  0.2547, -0.4921, -0.2255],
          [ 0.2290,  0.3037, -0.4800, -1.1284],
          [ 0.2825,  0.2943, -1.1103, -1.0122]],
 
         [[-0.4670, -0.7941,  0.2590, -0.7851],
          [-0.0815,  0.7304, -0.5485,  0.2802],
          [-0.3659,  0.0858, -0.6070,  0.0679],
          ...,
          [ 0.0973,  0.0660, -0.3651,  0.1085],
          [ 0.2926,  0.2398,  0.0473, -0.2801],
          [ 0.4115,  0.3258, -0.6346, -0.8199]],
 
         ...,
 
         [[-0.3106, -0.7788,  0.3305, -0.8348],
          [-0.1083

In [62]:
predictions_all_batches = trainer.predict(pl_model, dataloaders=train_loader)

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(
  rank_zero_warn(


Output()



AttributeError: 'list' object has no attribute 'dim'

In [40]:
trainer.validate(model=pl_model, dataloaders=train_loader)

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

[{'val/accuracy/no_object': 1.0,
  'val/accuracy/robot': 0.0,
  'val/loss/classification': 0.6157800555229187,
  'val/loss/location': 0.25374796986579895}]

In [41]:
trainer.validate(model=pl_model, dataloaders=val_loader)

You are using a CUDA device ('NVIDIA GeForce RTX 3090') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Output()

[{'val/accuracy/no_object': 1.0,
  'val/accuracy/robot': 0.0,
  'val/loss/classification': 0.6160872578620911,
  'val/loss/location': 0.558981716632843}]

In [37]:
# from visualize import draw_model_output, image_grid
# import utils


# grid_size = 1000
# image_list = []
# for i in range(grid_size * grid_size):
#     image, encoded_bounding_boxes, target_masks, encoded_target_classes = raw_train_data[i]
#     predicted_boxes, predicted_class_logits = pl_model(image.unsqueeze(0))
#     predicted_classes = utils.calculate_predicted_classes(predicted_class_logits).squeeze()
#     print(predicted_classes)
#     decoded_boxes = encoder.decode_model_output(predicted_boxes, predicted_classes)
#     image_list.append(
#         draw_model_output(
#             image,
#             decoded_boxes,
#             predicted_classes,
#             torch.tensor([0, 1]),
#         )
#     )
# image_grid(image_list, grid_size, grid_size)
