# Dataset preparation

In [68]:
from pathlib import Path
import json
import requests

from shutil import copyfile
from tqdm import tqdm_notebook
import xmltodict

## Convert RectLabel XML annotations to YOLO label files

In [11]:
# Class vocabulary: the lowercase letters in order, with 'na' as the last entry.
alphabet = [*'abcdefghijklmnopqrstuvwxyz', 'na']

In [12]:
alphabet[:10]

['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j']

In [13]:
# Map every class name in `alphabet` to its position, used as the numeric class id.
atoi = dict(zip(alphabet, range(len(alphabet))))

In [14]:
atoi['na']

26

In [15]:
# Root data directory; annotations are read from PATH/'annotations' and the
# converted dataset is written below it.
PATH = Path('./data/')

In [16]:
# Output directory that receives the copied/downloaded frames and their label files.
YOLO_OUTPATH = PATH/'yolo-filtered'
# parents=True so a fresh checkout without ./data does not fail with FileNotFoundError.
YOLO_OUTPATH.mkdir(parents=True, exist_ok=True)

In [17]:
# Video ids (presumably YouTube ids - confirm) whose per-frame XML annotations
# live in PATH/'annotations'/<vid_id>/ and whose frames live in ./data/images/<vid_id>/.
VID_IDS = ['33qU9tQlEa4', '6iP3g_qSf28', 'EBH3wdRtZyg', 'pAz_mIjHglw']

In [24]:
# Class names to keep: the distinct letters of "healthhack"; every other label is dropped.
keep_ids = {'h', 'e', 'a', 'l', 't', 'c', 'k'}

In [25]:
# Convert the per-frame XML annotations of every video in VID_IDS into label files.
#
# For each annotation file that contains at least one object whose class is in
# `keep_ids`, the matching frame image is copied into YOLO_OUTPATH and a
# `<frame>.txt` file is written next to it with one line per kept object:
#
#     <class_id> <x_min / img_width> <y_min / img_height> <w / img_width> <h / img_height>
#
# NOTE(review): x/y are the box's top-left corner, not its centre. Darknet-style
# YOLO labels conventionally store the centre - confirm what the consumer of
# YOLO_OUTPATH expects before changing this; the DataTurks conversion uses the
# same corner-based layout.
for vid_id in tqdm_notebook(VID_IDS):
    VID_PATH = PATH/'annotations'/vid_id
    for item in VID_PATH.iterdir():
        # Ignore stray entries (e.g. .DS_Store, sub-directories) that would
        # otherwise crash the XML parser.
        if item.suffix.lower() != '.xml' or not item.is_file():
            continue

        with open(item) as fd:
            doc = xmltodict.parse(fd.read())

        annotation = doc['annotation']
        if 'object' not in annotation:
            continue

        # xmltodict returns a dict for a single <object> element but a list when
        # the element is repeated; normalise so both shapes are handled.
        objects = annotation['object']
        if not isinstance(objects, list):
            objects = [objects]

        kept_objects = [obj for obj in objects if obj['name'] in keep_ids]
        if not kept_objects:
            continue

        img_width = int(annotation['size']['width'])
        img_height = int(annotation['size']['height'])

        label_lines = []
        for obj in kept_objects:
            bnd_box = obj['bndbox']
            x_min, y_min, x_max, y_max = (
                int(bnd_box['xmin']), int(bnd_box['ymin']), int(bnd_box['xmax']), int(bnd_box['ymax']))
            width = x_max - x_min
            height = y_max - y_min

            class_id = atoi[obj['name']]
            label_lines.append(
                f'{class_id} {x_min / img_width} {y_min / img_height} {width / img_width} {height / img_height}')

        # Path.stem is separator-independent and keeps dotted frame names intact.
        video_file_id = item.stem
        copyfile(PATH/'images'/vid_id/f'{video_file_id}.jpg', YOLO_OUTPATH/f'{video_file_id}.jpg')
        with open(YOLO_OUTPATH/f'{video_file_id}.txt', 'w') as fh:
            fh.write('\n'.join(label_lines))




## Convert DataTurks JSON annotations to YOLO label files

In [63]:
# Video ids whose annotations are stored as one JSON record per line in
# PATH/'annotations'/<vid_id>.json.
JSON_VID_IDS = ['ZO8Npgp4xvw', '_5NbYyUlcHU']

In [67]:
# Convert the JSON-lines annotation export of every video in JSON_VID_IDS.
#
# Each line of PATH/'annotations'/<vid_id>.json is one JSON record with the image
# URL in 'content' and a list of annotations in 'annotation'. Only the first
# annotation of a record is used, and only if its label is in `keep_ids`.
# The image is downloaded into YOLO_OUTPATH and a `<frame>.txt` label file with
# `<class_id> <xmin> <ymin> <width> <height>` is written next to it.
#
# NOTE(review): the point coordinates are written as-is (no division by the
# image size) - this assumes the export already stores normalised coordinates;
# confirm against the annotation files.
for vid_id in tqdm_notebook(JSON_VID_IDS):
    # Context manager so the annotation file is closed deterministically.
    with open(PATH/'annotations'/f'{vid_id}.json') as annotations_file:
        for raw_line in annotations_file:
            if not raw_line.strip():
                continue  # tolerate blank lines, e.g. a trailing newline

            record = json.loads(raw_line)

            # Records without any annotation carry null / an empty list here.
            annotations = record['annotation']
            if not annotations:
                continue
            anno = annotations[0]

            if anno['label'] not in keep_ids:
                continue

            # Axis-aligned bounding box around the annotated polygon points.
            xs = [point[0] for point in anno['points']]
            ys = [point[1] for point in anno['points']]
            xmin, xmax = min(xs), max(xs)
            ymin, ymax = min(ys), max(ys)

            width = xmax - xmin
            height = ymax - ymin

            class_id = atoi[anno['label']]

            url = record['content']

            # Frame name = text after the last '__' in the URL, minus the extension.
            video_file_id = url.split('.')[-2].split('__')[-1]

            # A timeout keeps a stalled connection from hanging the whole run.
            response = requests.get(url, timeout=30)
            if response.status_code != 200:
                # Without the image the label would be an orphan, so skip both.
                print('Failed to download', url, '- HTTP status', response.status_code)
                continue

            with open(YOLO_OUTPATH/f'{video_file_id}.jpg', 'wb') as f:
                f.write(response.content)
                print('Saved', YOLO_OUTPATH/f'{video_file_id}.jpg')

            with open(YOLO_OUTPATH/f'{video_file_id}.txt', 'w') as fh:
                fh.write(f'{class_id} {xmin} {ymin} {width} {height}')
                print('Saved', YOLO_OUTPATH/f'{video_file_id}.txt')

Saved data/yolo-filtered/5NbYyUlcHU-frame-468.jpg
Saved data/yolo-filtered/5NbYyUlcHU-frame-468.txt
Saved data/yolo-filtered/5NbYyUlcHU-frame-323.jpg
Saved data/yolo-filtered/5NbYyUlcHU-frame-323.txt
Saved data/yolo-filtered/5NbYyUlcHU-frame-347.jpg
Saved data/yolo-filtered/5NbYyUlcHU-frame-347.txt
Saved data/yolo-filtered/5NbYyUlcHU-frame-207.jpg
Saved data/yolo-filtered/5NbYyUlcHU-frame-207.txt
Saved data/yolo-filtered/5NbYyUlcHU-frame-43.jpg
Saved data/yolo-filtered/5NbYyUlcHU-frame-43.txt
Saved data/yolo-filtered/5NbYyUlcHU-frame-842.jpg
Saved data/yolo-filtered/5NbYyUlcHU-frame-842.txt
Saved data/yolo-filtered/5NbYyUlcHU-frame-843.jpg
Saved data/yolo-filtered/5NbYyUlcHU-frame-843.txt
Saved data/yolo-filtered/5NbYyUlcHU-frame-116.jpg
Saved data/yolo-filtered/5NbYyUlcHU-frame-116.txt
Saved data/yolo-filtered/5NbYyUlcHU-frame-102.jpg
Saved data/yolo-filtered/5NbYyUlcHU-frame-102.txt


In [66]:
url

'http://com.dataturks.a96-i23.open.s3.amazonaws.com/2c9fafb065aeb6960165db44c029022c/57b4baaa-bb31-4444-8c47-556de7f29246____5NbYyUlcHU__5NbYyUlcHU-frame-102.jpg'