In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from pathlib import Path
from json import loads
import sys, os
import yaml

sys.path.append("../")
from reef.plot import stack_images, plot_sample, plot_stack
from reef.ops import coco2yolo

%load_ext autoreload
%autoreload 2

In [2]:
def get_bboxes(annotations):
    """Turn a list of annotation dicts into a list of bounding boxes.

    Args:
        annotations: iterable of dicts, each holding the keys
            ``'x'``, ``'y'``, ``'width'`` and ``'height'``.

    Returns:
        A list with one ``[x, y, width, height]`` list per annotation,
        in the same order as the input (empty list for empty input).

    Raises:
        KeyError: if an annotation lacks one of the four keys.
    """
    box_keys = ('x', 'y', 'width', 'height')
    return [[entry[key] for key in box_keys] for entry in annotations]

In [3]:
# Location of the raw data, relative to the notebook's working directory.
data_dir = Path("../../data/raw")
# Folder holding the per-video frame directories.
images_dir = data_dir.joinpath("train_images")

In [4]:
# Load the frame-level metadata and derive helper columns from it.
df_train = pd.read_csv(data_dir / 'train.csv')

# Absolute POSIX path of each frame: <images_dir>/video_<video_id>/<video_frame>.jpg
df_train['path'] = (
    ('video_' + df_train['video_id'].astype(str) + '/' + df_train['video_frame'].astype(str) + '.jpg')
    .map(lambda relative: (images_dir / relative).resolve().as_posix())
)

# The annotations column is stored as text with single quotes; swap them for
# double quotes so the JSON parser accepts it.
df_train['annotations'] = df_train['annotations'].map(lambda raw: loads(raw.replace("'", '"')))

# Boxes as [x, y, width, height] lists, plus the number of boxes per frame.
df_train['bboxes'] = df_train['annotations'].map(get_bboxes)
df_train['n_bboxes'] = df_train['bboxes'].map(len)
df_train

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,path,bboxes,n_bboxes
0,0,40258,0,0,0-0,[],/home/isabella/code/competitions/great_barrier...,[],0
1,0,40258,1,1,0-1,[],/home/isabella/code/competitions/great_barrier...,[],0
2,0,40258,2,2,0-2,[],/home/isabella/code/competitions/great_barrier...,[],0
3,0,40258,3,3,0-3,[],/home/isabella/code/competitions/great_barrier...,[],0
4,0,40258,4,4,0-4,[],/home/isabella/code/competitions/great_barrier...,[],0
...,...,...,...,...,...,...,...,...,...
23496,2,29859,10755,2983,2-10755,[],/home/isabella/code/competitions/great_barrier...,[],0
23497,2,29859,10756,2984,2-10756,[],/home/isabella/code/competitions/great_barrier...,[],0
23498,2,29859,10757,2985,2-10757,[],/home/isabella/code/competitions/great_barrier...,[],0
23499,2,29859,10758,2986,2-10758,[],/home/isabella/code/competitions/great_barrier...,[],0


## Train/val split

In [5]:
import numpy as np


# Mean number of annotations per frame, computed separately for every sequence.
sequence_target_rate = (
    df_train
    .groupby('sequence')
    .agg(
        target_rate = ('annotations', lambda frames: np.mean([len(frame) for frame in frames]))
    )
    .reset_index()
)

# Sequences with at least one annotation vs. sequences without any.
non_zero_sequences = sequence_target_rate.loc[
    sequence_target_rate['target_rate'] > 0, 'sequence'
].tolist()
zero_sequences = sequence_target_rate.loc[
    sequence_target_rate['target_rate'] == 0, 'sequence'
].tolist()

print(f"Number of sequences with nonzero target rate: {len(non_zero_sequences)}")
print(f"Number of sequences with zero target rate: {len(zero_sequences)}")

Number of sequences with nonzero target rate: 17
Number of sequences with zero target rate: 3


In [6]:
from sklearn.model_selection import train_test_split

# Split whole sequences (not individual frames) between the two subsets.
# Annotated and empty sequences are split separately, then merged.
train_sequences, test_sequences = train_test_split(
    non_zero_sequences, train_size=13, random_state=42
)
zero_train_sequences, zero_test_sequences = train_test_split(
    zero_sequences, train_size=2, random_state=42
)

train_sequences = [*train_sequences, *zero_train_sequences]
test_sequences = [*test_sequences, *zero_test_sequences]

In [7]:
# Frames from a train sequence are labelled 'train'; all remaining frames are 'valid'.
df_train['split'] = np.where(
    df_train['sequence'].isin(train_sequences),
    'train',
    'valid',
)
df_train

Unnamed: 0,video_id,sequence,video_frame,sequence_frame,image_id,annotations,path,bboxes,n_bboxes,split
0,0,40258,0,0,0-0,[],/home/isabella/code/competitions/great_barrier...,[],0,train
1,0,40258,1,1,0-1,[],/home/isabella/code/competitions/great_barrier...,[],0,train
2,0,40258,2,2,0-2,[],/home/isabella/code/competitions/great_barrier...,[],0,train
3,0,40258,3,3,0-3,[],/home/isabella/code/competitions/great_barrier...,[],0,train
4,0,40258,4,4,0-4,[],/home/isabella/code/competitions/great_barrier...,[],0,train
...,...,...,...,...,...,...,...,...,...,...
23496,2,29859,10755,2983,2-10755,[],/home/isabella/code/competitions/great_barrier...,[],0,train
23497,2,29859,10756,2984,2-10756,[],/home/isabella/code/competitions/great_barrier...,[],0,train
23498,2,29859,10757,2985,2-10757,[],/home/isabella/code/competitions/great_barrier...,[],0,train
23499,2,29859,10758,2986,2-10758,[],/home/isabella/code/competitions/great_barrier...,[],0,train


In [8]:
# Mean number of annotations per frame in each split.
df_train.groupby("split").agg(
    target_rate = ('annotations', lambda frames: np.mean([len(frame) for frame in frames]))
)

Unnamed: 0_level_0,target_rate
split,Unnamed: 1_level_1
train,0.53185
valid,0.372346


## Data files

### Train/val paths

### Create directory with symlinks to images

In [9]:
# Root directory of this prepared split; image symlinks go under `images/`.
prepared_data_dir = Path("../../data/splits/1/")
# parents=True: the intermediate `splits` directory may not exist yet (e.g. on a
# fresh checkout); without it mkdir raises FileNotFoundError.
prepared_data_dir.mkdir(parents=True, exist_ok=True)

images_splits_dir = prepared_data_dir / "images"

In [10]:
# Index of train-split frames that have no bounding boxes.
no_labels_train_idx = df_train.query("split == 'train' and n_bboxes == 0").index
print(f"Got {len(no_labels_train_idx)} images with no labels in train set")

# Randomly keep a subset of those empty frames whose size is 10% of the number
# of train-split frames that do have bounding boxes.
n_images_with_labels_train = len(df_train.query("split == 'train' and n_bboxes > 0"))
no_labels_train_idx_left, _ = train_test_split(
    no_labels_train_idx,
    train_size=int(n_images_with_labels_train * 0.1),
    random_state=42,
)
print(f"Sample {len(no_labels_train_idx_left)} images from that subset")

# Everything except the empty train frames, plus the sampled empty train frames.
filtered_idx = df_train.index.difference(no_labels_train_idx).append(no_labels_train_idx_left)
print(f"Remains: {len(filtered_idx)}")

Got 15771 images with no labels in train set
Sample 396 images from that subset
Remains: 8126


In [11]:
# Create the per-split image directories and fill them with symlinks to the raw frames.
train_images_dir = images_splits_dir / "train"
train_images_dir.mkdir(exist_ok=True, parents=True)

valid_images_dir = images_splits_dir / "valid"
valid_images_dir.mkdir(exist_ok=True, parents=True)

src2dest = dict()

df = df_train.loc[filtered_idx]

# Resolve the directory only. Resolving the full link path would follow a link
# left by a previous run and return the source image path instead of the link
# location.
images_splits_root = images_splits_dir.resolve()

for src_path, image_name, split in zip(df['path'], df['image_id'], df['split']):
    dest_path = (images_splits_root / split / f"{image_name}.jpg").as_posix()

    # Make the cell re-runnable: replace a link created by an earlier run.
    # Only symlinks are removed; a regular file at this path still makes
    # os.symlink raise FileExistsError instead of being deleted silently.
    if os.path.islink(dest_path):
        os.unlink(dest_path)
    os.symlink(src_path, dest_path)
    src2dest[src_path] = dest_path

# Reverse lookup: symlink path -> original image path.
dest2src = {v: k for k, v in src2dest.items()}

### Create train/valid/data files

In [12]:
# One image-list file per split; every line is the symlinked path of one image.
for split_name in ('train', 'valid'):
    with open(prepared_data_dir / f'{split_name}.txt', 'w') as f:
        f.writelines(
            src2dest[path] + '\n'
            for path in df.loc[df['split'] == split_name, 'path']
        )

# Dataset description: absolute paths of the two list files, number of classes
# and the class names.
data = {
    'train': (prepared_data_dir / 'train.txt').resolve().as_posix(),
    'val': (prepared_data_dir / 'valid.txt').resolve().as_posix(),
    'nc': 1,
    'names': ['starfish'],
}

with open(prepared_data_dir / 'data.yaml', 'w') as f:
    yaml.dump(data, f, default_flow_style=False)

### Labels

Для каждого изображения, на котором есть хотя бы один объект, необходимо создать файл с расположением этих объектов в следующем формате:

```bash
> cat image_id.txt
  obj_id_1 x_1 y_1 width_1 height_1
  obj_id_2 x_2 y_2 width_2 height_2
```

x, y (координаты центра рамки), width, height должны быть указаны в формате YOLO, т.е. в единицах относительно ширины и высоты изображения (значения от 0 до 1)

In [13]:
# Label files are stored under labels/<split>/, one directory per split.
train_labels_dir = prepared_data_dir.joinpath("labels", "train")
train_labels_dir.mkdir(parents=True, exist_ok=True)

val_labels_dir = prepared_data_dir.joinpath("labels", "valid")
val_labels_dir.mkdir(parents=True, exist_ok=True)

In [14]:
# Write one label file per annotated image: `<class_id> <4 box values>` per line.

# Frame size passed to coco2yolo; loop-invariant, so defined once.
# NOTE(review): assumes every frame is 1280x720 — confirm against the images.
image_height = 720
image_width = 1280

# Collects every converted box across all images.
all_boxes = []

for i, row in df.iterrows():
    num_bbox = len(row.bboxes)
    if num_bbox == 0:
        # Images without objects get no label file.
        continue

    image_name = row.image_id
    split = row.split
    labels = [0] * num_bbox  # single class, id 0

    bboxes_coco = np.array(row.bboxes).astype(np.float32).copy()
    bboxes_yolo = coco2yolo(image_height, image_width, bboxes_coco)
    bboxes_yolo = np.clip(bboxes_yolo, 0, 1)

    all_boxes.extend(bboxes_yolo)

    filename = prepared_data_dir / f"labels/{split}/{image_name}.txt"

    with open(filename, 'w') as f:
        for label, bbox in zip(labels, bboxes_yolo):
            # Format each value explicitly instead of slicing str(ndarray): the
            # array repr depends on numpy print options and may contain padding
            # or scientific notation.
            coords = ' '.join(f"{value:.6f}" for value in bbox)
            f.write(f"{label} {coords}\n")

## Train model

In [15]:
# Training configuration: YOLOv5 checkout location, dataset directory,
# image size, batch size and number of epochs.
# NOTE(review): all five constants are assigned again in the cell that builds
# `run_cmd`; PYTHONPATH is a relative path here and an absolute one there.
PYTHONPATH = "../../modules/yolov5"
DATAPATH = prepared_data_dir.resolve().as_posix()
IMG_SIZE = 1280
BATCH = 4
EPOCHS = 1

In [16]:
# Display the resolved absolute path of the prepared dataset directory.
DATAPATH

'/home/isabella/code/competitions/great_barrier_reef/data/splits/1'

In [18]:
# Absolute paths, so the printed command can be run from any working directory.
PYTHONPATH = Path("../../modules/yolov5").resolve().as_posix()
DATAPATH = prepared_data_dir.resolve().as_posix()
IMG_SIZE = 1280
BATCH = 4
EPOCHS = 1

# Shell command for the training script: one option per line, joined with
# backslash line continuations and wrapped in a leading and trailing newline.
run_cmd = "\n" + " \\\n".join([
    f"PYTHONPATH={PYTHONPATH} python {PYTHONPATH}/train.py",
    f"    --img {IMG_SIZE}",
    f"    --batch {BATCH}",
    f"    --epochs {EPOCHS}",
    f"    --data {DATAPATH}/data.yaml",
    "    --weights yolov5s.pt",
]) + "\n"
print(run_cmd)


PYTHONPATH=/home/isabella/code/competitions/great_barrier_reef/modules/yolov5 python /home/isabella/code/competitions/great_barrier_reef/modules/yolov5/train.py \
    --img 1280 \
    --batch 4 \
    --epochs 1 \
    --data /home/isabella/code/competitions/great_barrier_reef/data/splits/1/data.yaml \
    --weights yolov5s.pt

