# Constants & settings

In [53]:
DATA_DIR = '/raid/ruslan_bazhenov/projects/xray/cargoxray/data'

# Change workdir and import libraries

In [54]:
import json
import logging
import multiprocessing as mp
import os
import pathlib
import shutil
from hashlib import md5
from pathlib import Path
from typing import Any, Dict, Union

import pandas as pd
import tqdm
from IPython.display import display
from PIL import Image, UnidentifiedImageError


In [55]:
os.chdir(DATA_DIR)
os.getcwd()

'/raid/ruslan_bazhenov/projects/xray/cargoxray/data'

# Preview files counts

In [56]:
sufcnt = {}
for f in Path().glob('images/**/*'):
    if f.is_file():
        try:
            sufcnt[f.suffix] += 1
        except KeyError:
            sufcnt[f.suffix] = 1

for k, v in sufcnt.items():
    print(f'{k.strip("."):>10} : {v}')

       tif : 17922
      json : 135
           : 18
        db : 11
       rar : 3
    160521 : 1
       jpg : 861
       txt : 860
       lnk : 1


# Import images

## Load existsing images databases if exist

In [57]:
try:
    images = pd.read_json('images.json.gz',
                          orient='records',
                          typ='frame',
                          compression='gzip')
    images = images.set_index('image_id')
    
    next_id = images.index.max() + 1
except:
    images = pd.DataFrame()
    images.index.name = 'image_id'
    next_id = 0

images

Unnamed: 0_level_0,md5,size,width,height,filepath
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3ab70521b3fdcd223949fda05b60f2f0,15229794,4399,1154,images/Нурик1092/Ainur2308/327010D012019090600...
1,4873919d851f8bcc8421e3dd6a8c7781,14862814,4293,1154,images/Нурик1092/Ainur2308/327010D012019091000...
2,b1ddc51d2a1cf08e7d619bb9e4bda828,13287580,3838,1154,images/Нурик1092/Ainur2308/327010D012019091000...
3,187a39ed319c8506d959906f3b510abe,13291042,3839,1154,images/Нурик1092/Ainur2308/327010D012019091200...
4,26594213171935bb519219fc95ff6b89,13332586,3851,1154,images/Нурик1092/Ainur2308/327010D012019090300...
...,...,...,...,...,...
18081,f844d08dd67d2fd5c241b78b44672447,15285186,4415,1154,images/1960completed/1960completed/327010D0120...
18082,3f50aa80f62b295d8eff0b78fb359dc8,15257490,4407,1154,images/1960completed/1960completed/327010D0120...
18083,97ac32a9ad2385d8957a01edf106ba20,15285186,4415,1154,images/1960completed/1960completed/327010D0120...
18084,be241569eeea93e7369d686cd889a5bd,15257490,4407,1154,images/1960completed/1960completed/327010D0120...


## Scan & import image files

In [58]:
image_path: Path

for image_path in tqdm.tqdm(list(Path().glob('images/**/*.tif')), desc='Images'):

    # Skip already imported images
    if len(images.loc[images['filepath'] == image_path.as_posix()]) > 0:
        continue

    try:
        image = Image.open(image_path)
    except UnidentifiedImageError:
        logging.error(f"Corrupted image \"{image_path}\"")
        continue

    image_info = pd.Series(name=next_id, data={

        'md5': md5(image_path.read_bytes()).hexdigest(),
        'size': image_path.stat().st_size,
        'width': image.width,
        'height': image.height,
        'filepath': image_path.as_posix(),

    })

    next_id += 1

    image.close()

    images = images.append(image_info)


Images:  71%|███████   | 12726/17922 [00:14<00:05, 890.67it/s]ERROR:root:Corrupted image "images/2109new files/2109new files/(856).tif"
Images: 100%|██████████| 17922/17922 [00:20<00:00, 881.20it/s]


## Map duplicate images to the same location

In [59]:
images_filtered = images \
    .drop(columns=['filepath']) \
    .drop_duplicates('md5') \
    .reset_index() \
    .merge(images[['md5',
                   'filepath']],
           on='md5') \
    .set_index('image_id') \
    .drop_duplicates()


In [60]:
images_filtered

Unnamed: 0_level_0,md5,size,width,height,filepath
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,3ab70521b3fdcd223949fda05b60f2f0,15229794,4399,1154,images/Нурик1092/Ainur2308/327010D012019090600...
1,4873919d851f8bcc8421e3dd6a8c7781,14862814,4293,1154,images/Нурик1092/Ainur2308/327010D012019091000...
2,b1ddc51d2a1cf08e7d619bb9e4bda828,13287580,3838,1154,images/Нурик1092/Ainur2308/327010D012019091000...
3,187a39ed319c8506d959906f3b510abe,13291042,3839,1154,images/Нурик1092/Ainur2308/327010D012019091200...
4,26594213171935bb519219fc95ff6b89,13332586,3851,1154,images/Нурик1092/Ainur2308/327010D012019090300...
...,...,...,...,...,...
18081,f844d08dd67d2fd5c241b78b44672447,15285186,4415,1154,images/1960completed/1960completed/327010D0120...
18082,3f50aa80f62b295d8eff0b78fb359dc8,15257490,4407,1154,images/1960completed/1960completed/327010D0120...
18083,97ac32a9ad2385d8957a01edf106ba20,15285186,4415,1154,images/1960completed/1960completed/327010D0120...
18084,be241569eeea93e7369d686cd889a5bd,15257490,4407,1154,images/1960completed/1960completed/327010D0120...


In [61]:
images = images_filtered

# Import annotations

## Load existing annotations files

In [62]:
try:
    annotations = pd.read_json('annotations.json.gz',
                               orient='records',
                               typ='frame',
                               compression='gzip')
    annotations = annotations.set_index('bbox_id')

    next_id = annotations.index.max() + 1

except:
    
    annotations = pd.DataFrame()
    annotations.index.name = 'bbox_id'
    next_id = 0

annotations

Unnamed: 0_level_0,image_id,x,y,width,height,label
bbox_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,15430,0.310262,0.585355,0.406157,0.612652,textiles
1,15430,0.724743,0.517331,0.277309,0.663778,textiles
2,15198,0.315017,0.487868,0.324232,0.752166,textiles
3,15198,0.750057,0.517764,0.423891,0.824090,textiles
4,15472,0.270107,0.512998,0.426293,0.850953,textiles
...,...,...,...,...,...,...
40427,18078,0.146565,0.912478,0.167989,0.103986,equipment
40428,18078,0.545908,0.845321,0.045341,0.097920,equipment
40429,18078,0.826230,0.666378,0.046021,0.405546,scooter
40430,18078,0.684085,0.488302,0.328950,0.803293,unknown


## Scan & import JSON files

In [63]:

images = images.assign(
    filename=images['filepath'].apply(lambda x: Path(x).name))

for src in tqdm.tqdm(list(Path().glob('images/**/*.json')),
                     desc='JSONs'):

    # Annotations loaded
    with src.open('rb') as fs:
        anns = json.load(fs)

    # For each group of annotations for one image
    ann: Dict[str, Any]
    for ref, ann in anns.items():

        # Try to find the corresponding image
        sel = images.loc[(images['filename'] == ann['filename'])
                         & (images['size'] == ann['size'])]

        sel = sel.drop_duplicates('md5')

        # Corresponding image not found, skip the annotation
        if len(sel) == 0:
            continue
        # More than one image found, warning
        elif len(sel) > 1:
            logging.warning(sel['filepath'].to_string())

        # In some JSON files regions are dict, in some are lists
        if isinstance(ann['regions'], dict):
            regions = list(ann['regions'].values())
        else:
            regions = ann['regions']

        # For each bbox
        for reg in regions:

            # Skip missing bboxes
            if reg is None:
                continue

            # Try to indentify bbox shape
            if reg['shape_attributes']['name'] == 'polyline' \
                    or reg['shape_attributes']['name'] == 'polygon':
                x_points = reg['shape_attributes']['all_points_x']
                y_points = reg['shape_attributes']['all_points_y']

            elif reg['shape_attributes']['name'] == 'rect':
                x_points = reg['shape_attributes']['x']
                y_points = reg['shape_attributes']['y']
                w = reg['shape_attributes']['width']
                h = reg['shape_attributes']['height']

                x_points = [x_points, x_points + w, x_points + w, x_points]
                y_points = [y_points, y_points, y_points + h, y_points + h]
            else:
                logging.warn(
                    f'Unexpected annotation shape in '
                    f'"{src}", "{ann["filename"]}{ann["size"]}". '
                    f'Found {reg["shape_attributes"]["name"]}')
                continue

            x = (max(x_points) + min(x_points)) / 2 \
                / sel.iloc[0]['width']
            w = (max(x_points) - min(x_points)) / sel.iloc[0]['width']

            y = (max(y_points) + min(y_points)) / 2 \
                / sel.iloc[0]['height']
            h = (max(y_points) - min(y_points)) / sel.iloc[0]['height']

            # Case for missing labels

            try:
                label = reg['region_attributes']['class name'].lower()
            except KeyError:
                label = None

            bbox_info = pd.Series(name=next_id,
                                  data={
                                      'image_id': sel.iloc[0].name,
                                      'x': x,
                                      'y': y,
                                      'width': w,
                                      'height': h,
                                      'label': label
                                  })
            next_id += 1

            annotations = annotations.append(bbox_info)

images = images.drop(columns='filename')


annotations['label'] = annotations['label'].apply(
    lambda x: x.strip().lower() if isinstance(x, str) else pd.NA)

label_mappings_fix = pd.read_csv(
    '/raid/ruslan_bazhenov/projects/xray/cargoxray/utils/label_mappings_fix.csv',
    names=['original', 'typos', 'merge']
).set_index('original')

annotations = annotations.replace(label_mappings_fix['typos'].to_dict())

annotations = annotations\
    .drop_duplicates()


  logging.warn(
JSONs: 100%|██████████| 135/135 [03:09<00:00,  1.40s/it]


In [68]:
annotations.drop_duplicates(['x', 'y', 'width', 'height'])

Unnamed: 0_level_0,image_id,x,y,width,height,label
bbox_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,15430.0,0.310262,0.585355,0.406157,0.612652,textile
1,15430.0,0.724743,0.517331,0.277309,0.663778,textile
2,15198.0,0.315017,0.487868,0.324232,0.752166,textile
3,15198.0,0.750057,0.517764,0.423891,0.824090,textile
4,15472.0,0.270107,0.512998,0.426293,0.850953,textile
...,...,...,...,...,...,...
80859,18078.0,0.146565,0.912478,0.167989,0.103986,equipment
80860,18078.0,0.545908,0.845321,0.045341,0.097920,equipment
80861,18078.0,0.826230,0.666378,0.046021,0.405546,scooter
80862,18078.0,0.684085,0.488302,0.328950,0.803293,unknown


# Dump annotations and images

In [65]:
images.reset_index().to_json('images.json.gz',
                             orient='records',
                             compression='gzip',
                             default_handler=str)

In [66]:


annotations.reset_index().to_json('annotations.json.gz',
                                  orient='records',
                                  compression='gzip',
                                  default_handler=str)