In [32]:
import os
import torch
import torch.utils.data
import torchvision
from PIL import Image
from pycocotools.coco import COCO
import json

In [87]:
def create_train_validation_test_set(image_path, merged_coco_ann_path, batch_size, transform_fn, video_file_id_map_path, train_validation_test_split):
    with open(video_file_id_map_path, "r") as f:
        video_filename_list = json.load(f)["filenames"]       # list of video names (without .mp4)

    train_pct, valid_pct = train_validation_test_split
    train_idx = int(len(video_filename_list) * train_pct)
    valid_idx = int(len(video_filename_list) * valid_pct)

    train_video_filenames = video_filename_list[:train_idx]
    valid_video_filenames = video_filename_list[train_idx : train_idx + valid_idx]
    test_video_filenames = video_filename_list[train_idx + valid_idx:]

    print(len(train_video_filenames))
    print(len(valid_video_filenames))
    print(len(test_video_filenames))

    coco = COCO(merged_coco_ann_path)

    train_dataset_key, valid_dataset_key, test_dataset_key = filter_keys(train_video_filenames, valid_video_filenames, test_video_filenames, coco.imgs)

    # combined_key = train_dataset_key + valid_dataset_key + test_dataset_key

    # combined_key.sort()

    # print(combined_key == list(range(9246)))

    train_dataset = CustomCocoDataset(image_path, merged_coco_ann_path, sorted(train_dataset_key), transform_fn)
    valid_dataset = CustomCocoDataset(image_path, merged_coco_ann_path, sorted(valid_dataset_key), transform_fn)
    test_dataset = CustomCocoDataset(image_path, merged_coco_ann_path, sorted(test_dataset_key), transform_fn)

    return  create_dataloader(train_dataset, batch_size), create_dataloader(valid_dataset, batch_size), create_dataloader(test_dataset, batch_size)


def create_dataloader(dataset, batch_size):
    # collate_fn needs for batch
    def collate_fn(batch):
        return tuple(zip(*batch))

    # own DataLoader
    # solve the issue of 
    return torch.utils.data.DataLoader(dataset,
                                        batch_size=batch_size,
                                        shuffle=True,
                                        num_workers=0,
                                        collate_fn=collate_fn)


def filter_keys(train_video_filename_list, valid_video_filename_list, test_video_filename_list, coco_img_dict):
    """
    if it belongs to its dataset
        we get the file id index
        then we would know how much (essentially, we are trying to get the right video for each dataset)

    to access sth in the dataset, it's using the key
    0: {'id': 0,
  'width': 3840,
  'height': 2160,
  'file_name': '20200805_OneSharkSUPFollowsKids_00000.jpg',
  'license': 0,
  'flickr_url': '',
  'coco_url': '',
  'date_captured': '04/30/2022'},
 1: {'id': 1,
  'width': 3840,
  'height': 2160,
  'file_name': '20200805_OneSharkSUPFollowsKids_00001.jpg',
  'license': 0,
  'flickr_url': '',
  'coco_url': '',
  'date_captured': '04/30/2022'},

  we essentially could just check if the video file name is in the file_name
    """
    train_dataset_key = []
    valid_dataset_key = []
    test_dataset_key = []

    for key, value in coco_img_dict.items():
        video_filename_split = value['file_name'].split("_")

        if "_2" in value['file_name']:
            video_filename_split = video_filename_split[:-2]
        else:
            video_filename_split = video_filename_split[:-1]
        
        video_filename = video_filename_split[0]
        for video_filename_chuck in video_filename_split[1:]:
            video_filename += "_" + video_filename_chuck
        
        if video_filename in train_video_filename_list:
            train_dataset_key.append(key)
        elif video_filename in valid_video_filename_list:
            valid_dataset_key.append(key)
        elif video_filename in test_video_filename_list:
            test_dataset_key.append(key)
        else:
            print(f"ERROR: filename = {video_filename} does not belong to any dataset")

    return train_dataset_key, valid_dataset_key, test_dataset_key


In [88]:
class CustomCocoDataset(torch.utils.data.Dataset):
    def __init__(self, root, annotation, img_ids=None, transforms=None):
        self.root = root
        self.transforms = transforms
        self.coco = COCO(annotation)
        if img_ids == None:
            self.ids = list(sorted(self.coco.imgs.keys()))
        else:
            self.ids = img_ids

    def __getitem__(self, index):
        # Own coco file
        coco = self.coco
        # Image ID
        img_id = self.ids[index]
        # List: get annotation id from coco
        ann_ids = coco.getAnnIds(imgIds=img_id)
        # Dictionary: target coco_annotation file for an image
        coco_annotation = coco.loadAnns(ann_ids)
        # path for input image
        path = coco.loadImgs(img_id)[0]['file_name']
        # open the input image
        img = Image.open(os.path.join(self.root, path))

        # number of objects in the image
        num_objs = len(coco_annotation)

        # Bounding boxes for objects
        # In coco format, bbox = [xmin, ymin, width, height]
        # In pytorch, the input should be [xmin, ymin, xmax, ymax]
        boxes = []
        for i in range(num_objs):
            xmin = coco_annotation[i]['bbox'][0]
            ymin = coco_annotation[i]['bbox'][1]
            xmax = xmin + coco_annotation[i]['bbox'][2]
            ymax = ymin + coco_annotation[i]['bbox'][3]
            boxes.append([xmin, ymin, xmax, ymax])
        boxes = torch.as_tensor(boxes, dtype=torch.float32)
        # Labels (In my case, I only one class: target class or background)
        labels = torch.ones((num_objs,), dtype=torch.int64)
        # Tensorise img_id
        img_id = torch.tensor([img_id])
        # Size of bbox (Rectangular)
        areas = []
        for i in range(num_objs):
            areas.append(coco_annotation[i]['area'])
        areas = torch.as_tensor(areas, dtype=torch.float32)
        # Iscrowd
        iscrowd = torch.zeros((num_objs,), dtype=torch.int64)

        # Annotation is in dictionary format
        my_annotation = {}
        my_annotation["boxes"] = boxes
        my_annotation["labels"] = labels
        my_annotation["image_id"] = img_id
        my_annotation["area"] = areas
        my_annotation["iscrowd"] = iscrowd

        if self.transforms is not None:
            img = self.transforms(img)

        return img, my_annotation

    def __len__(self):
        return len(self.ids)

In [97]:
len(train_dataloader.dataset.ids)

5116

In [95]:
# (image_path, merged_coco_ann_path, batch_size, video_file_id_map_path, train_validation_test_split)
train_dataloader, valid_dataloader, test_dataloader = create_train_validation_test_set( "./frames/",
                                                                                        "merged_coco_annotation/merged_coco.json", 
                                                                                        4,
                                                                                        torchvision.transforms.Compose([torchvision.transforms.ToTensor()]),
                                                                                        "./video_file_id_map.json",                        
                                                                                        train_validation_test_split=(0.6, 0.2))

12
4
4
loading annotations into memory...
Done (t=1.02s)
creating index...
index created!
loading annotations into memory...
Done (t=0.47s)
creating index...
index created!
loading annotations into memory...
Done (t=0.18s)
creating index...
index created!
loading annotations into memory...
Done (t=0.53s)
creating index...
index created!


In [98]:
from torchvision.models.detection.faster_rcnn import FastRCNNPredictor

def get_model_instance_segmentation(num_classes):
    # load an instance segmentation model pre-trained pre-trained on COCO
    model = torchvision.models.detection.fasterrcnn_resnet50_fpn(pretrained=True)
    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features
    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = FastRCNNPredictor(in_features, num_classes)

    return model
    

# 2 classes; Only target class or background
num_classes = 2
num_epochs = 5
model = get_model_instance_segmentation(num_classes)

# move model to the right device
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)
model.to(device)
    
# parameters
params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.SGD(params, lr=0.005, momentum=0.9, weight_decay=0.0005)

len_dataloader = len(train_dataloader)

for epoch in range(num_epochs):
    model.train()
    i = 0    
    for imgs, annotations in train_dataloader:
        i += 1
        print(f"Starting iteration: {i}")
        imgs = list(img for img in imgs)
        annotations = [{k: v for k, v in t.items()} for t in annotations]
        loss_dict = model(imgs, annotations)
        losses = sum(loss for loss in loss_dict.values())

        optimizer.zero_grad()
        losses.backward()
        optimizer.step()

        print(f'Iteration: {i}/{len_dataloader}, Loss: {losses}')

cpu
Starting iteration: 1


KeyboardInterrupt: 

In [75]:
# path to your own data and coco file
train_data_dir = './frames/'
train_coco = './merged_coco_annotation/merged_coco.json'

# create own Dataset
my_dataset = CustomCocoDataset(root=train_data_dir,
                                annotation=train_coco,
                                
                          )

# collate_fn needs for batch
def collate_fn(batch):
    return tuple(zip(*batch))

# Batch size
train_batch_size = 4

# own DataLoader
# solve the issue of 
data_loader = torch.utils.data.DataLoader(my_dataset,
                                          batch_size=train_batch_size,
                                          shuffle=True,
                                          num_workers=0,
                                          collate_fn=collate_fn)

loading annotations into memory...
Done (t=0.29s)
creating index...
index created!


In [80]:
data_loader

<torch.utils.data.dataloader.DataLoader at 0x7f86611798b0>

In [77]:
list(sorted(my_dataset.coco.imgs.keys()))[:10]

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

In [78]:
# select device (whether GPU or CPU)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(device)

# # DataLoader is iterable over Dataset
# for imgs, annotations in data_loader:
#     imgs = list(img.to(device) for img in imgs)
#     annotations = [{k: v.to(device) for k, v in t.items()} for t in annotations]
#     print(annotations)

train_features, train_labels = next(iter(data_loader))
print(train_features)
print(train_labels)


cpu
(<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=3840x2160 at 0x7F8660DB63D0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=3840x2160 at 0x7F8660DB6760>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=3840x2160 at 0x7F8660DB64F0>, <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=3840x2160 at 0x7F86A846E160>)
({'boxes': tensor([[2141.1350,  848.9080, 2197.7290,  952.6640],
        [1245.0660, 1405.4150, 1311.0920, 1509.1700]]), 'labels': tensor([1, 1]), 'image_id': tensor([7800]), 'area': tensor([5871.9673, 6850.5278]), 'iscrowd': tensor([0, 0])}, {'boxes': tensor([]), 'labels': tensor([], dtype=torch.int64), 'image_id': tensor([9015]), 'area': tensor([]), 'iscrowd': tensor([], dtype=torch.int64)}, {'boxes': tensor([[1825.7520,  476.1460, 1929.8101,  536.0580],
        [ 892.3800,  561.2850,  977.5180,  693.7230],
        [ 845.0800,  829.3140,  983.8250,  942.8320],
        [ 917.6060, 1343.2990, 1084.7300, 1400.0580],
        [1239.2410, 1690.1610, 1

In [79]:
test = '20200805_OneSharkSUPFollowsKids_00001.jpg'
test.split("_")

['20200805', 'OneSharkSUPFollowsKids', '00001.jpg']