# Dataset preparation

In [2]:
from pathlib import Path
import json
import requests

from shutil import copyfile
from tqdm import tqdm_notebook
import xmltodict

## Convert RectLabel format

In [3]:
alphabet = list('abcdefghijklmnopqrstuvwxyz') + ['na']

In [4]:
alphabet[:10]

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

In [5]:
atoi = {l: c for c, l in enumerate(alphabet)}

In [6]:
atoi['na']

26

In [9]:
PATH = Path('../data/')

In [11]:
YOLO_OUTPATH = PATH/'yolo'
YOLO_OUTPATH.mkdir(exist_ok=True)

In [12]:
VID_IDS = ['33qU9tQlEa4', '6iP3g_qSf28', 'EBH3wdRtZyg', 'pAz_mIjHglw']

In [13]:
# keep_ids = set(['h', 'e', 'a', 'l', 't', 'h', 'h', 'a', 'c', 'k'])

In [20]:
for vid_id in tqdm_notebook(VID_IDS):
    VID_PATH = PATH/'annotations'/vid_id
    for item in VID_PATH.iterdir():
        with open(item) as fd:
            doc = xmltodict.parse(fd.read())
            
        if 'object' not in doc['annotation']:
            continue
            
        class_name = doc['annotation']['object']['name']
            
        img_width = int(doc['annotation']['size']['width'])
        img_height = int(doc['annotation']['size']['height'])
        
        bnd_box = doc['annotation']['object']['bndbox']
        
        x_min, y_min, x_max, y_max = (
            int(bnd_box['xmin']), int(bnd_box['ymin']), int(bnd_box['xmax']), int(bnd_box['ymax']))
        width = x_max - x_min
        height = y_max - y_min

        class_id = atoi[class_name]
        
        video_file_id = str(item).split('/')[-1].split('.')[-2]
        copyfile(PATH/f'images/{vid_id}/{video_file_id}.jpg', YOLO_OUTPATH/f'{video_file_id}.jpg')
        with open(YOLO_OUTPATH/f'{video_file_id}.txt', 'w') as fh:
            fh.write(f'{class_id} {x_min / img_width} {y_min / img_height} {width / img_width} {height / img_height}')

HBox(children=(IntProgress(value=0, max=4), HTML(value='')))




## Convert DataTurks format

In [21]:
JSON_VID_IDS = ['ZO8Npgp4xvw', '_5NbYyUlcHU']

In [23]:
for vid_id in tqdm_notebook(JSON_VID_IDS):
    for line in open(PATH/'annotations'/f'{vid_id}.json'):
        line = json.loads(line)
        anno = line['annotation'][0]

        xmin = min(
            anno['points'][0][0], anno['points'][1][0], anno['points'][2][0], anno['points'][3][0])

        ymin = min(
            anno['points'][0][1], anno['points'][1][1],
            anno['points'][2][1], anno['points'][3][1])

        xmax = max(
            anno['points'][0][0], anno['points'][1][0],
            anno['points'][2][0], anno['points'][3][0])
        ymax = max(
            anno['points'][0][1], anno['points'][1][1],
            anno['points'][2][1], anno['points'][3][1])
        
        width = xmax - xmin
        height = ymax - ymin
        
        class_id = atoi[anno['label']]
            
        url = line['content']
        
        video_file_id = url.split('.')[-2].split('__')[-1]

        response = requests.get(url)
        if response.status_code == 200:
            with open(YOLO_OUTPATH/f'{video_file_id}.jpg', 'wb') as f:
                f.write(response.content)
                print('Saved', YOLO_OUTPATH/f'{video_file_id}.jpg')

        with open(YOLO_OUTPATH/f'{video_file_id}.txt', 'w') as fh:
            fh.write(f'{class_id} {xmin} {ymin} {width} {height}')
            print('Saved', YOLO_OUTPATH/f'{video_file_id}.txt')

HBox(children=(IntProgress(value=0, max=2), HTML(value='')))

Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1367.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1367.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-262.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-262.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-504.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-504.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-2.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-2.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-699.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-699.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-114.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-114.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1213.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1213.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-316.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-316.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-459.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-459.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-3

Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-969.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-969.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-799.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-799.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-174.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-174.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-148.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-148.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-389.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-389.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-362.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-362.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-410.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-410.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-439.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-439.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-411.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-411.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1

Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-752.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-752.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1119.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1119.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1125.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1125.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-949.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-949.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-550.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-550.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-49.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-49.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-593.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-593.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-59.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-59.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-568.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-568.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-9

Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-16.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-16.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1344.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1344.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-251.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-251.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-912.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-912.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1195.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-1195.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-721.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-721.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-899.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-899.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-324.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-324.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-132.jpg
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame-132.txt
Saved ../data/yolo/_images_ZO8Npgp4xvw-frame

In [66]:
url

'http://com.dataturks.a96-i23.open.s3.amazonaws.com/2c9fafb065aeb6960165db44c029022c/57b4baaa-bb31-4444-8c47-556de7f29246____5NbYyUlcHU__5NbYyUlcHU-frame-102.jpg'