In [1]:
import json
import multiprocessing as mp
import pathlib
import shutil
from hashlib import md5
from pathlib import Path
from typing import Any, Dict, Union

import pandas as pd
import tqdm
from IPython.display import display
from PIL import Image


In [2]:
IMPORT_DIR = 'raw_backup'
DESTINATION_DIR = 'data_new'

In [3]:
indir = Path(IMPORT_DIR)
outdir = Path(DESTINATION_DIR)

In [4]:
sufcnt = {}
for f in indir.glob('**/*'):
    if f.is_file():
        try:
            sufcnt[f.suffix] += 1
        except KeyError:
            sufcnt[f.suffix] = 1
sufcnt

{'': 19,
 '.tif': 15198,
 '.json': 133,
 '.db': 10,
 '.rar': 3,
 '.160521': 1,
 '.jpg': 861,
 '.txt': 860,
 '.lnk': 1}

Load existing images database

In [5]:
try:
    images = pd.read_json(outdir / 'images.json.gz',
                          orient='records',
                          typ='frame',
                          compression='gzip')
    images = images.set_index('image_id')
    
    next_id = images.index.max() + 1
except:
    images = pd.DataFrame()
    images.index.name = 'image_id'
    next_id = 0

images

Import new images

In [6]:
for image_path in tqdm.tqdm(list(indir.glob('**/*.tif')), desc='Images'):

    image = Image.open(image_path)

    image_info = pd.Series(name=next_id, data={

        'filepath': '{}/images/{:05X}{}'.format('data', next_id, image_path.suffix),
        'md5': md5(image_path.read_bytes()).hexdigest(),

        'size': image_path.stat().st_size,
        'width': image.width,
        'height': image.height,

        'old_location': image_path.as_posix(),
        'old_filename': image_path.name,
    })

    next_id += 1

    image.close()

    images = images.append(image_info)

images


Images: 100%|██████████| 15198/15198 [03:52<00:00, 65.26it/s]


Unnamed: 0_level_0,filepath,md5,size,width,height,old_location,old_filename
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,data/images/00000.tif,3ab70521b3fdcd223949fda05b60f2f0,15229794.0,4399.0,1154.0,raw_backup/new/Нурик1092/Ainur2308/327010D0120...,327010D01201909060002.tif
1,data/images/00001.tif,4873919d851f8bcc8421e3dd6a8c7781,14862814.0,4293.0,1154.0,raw_backup/new/Нурик1092/Ainur2308/327010D0120...,327010D01201909100004.tif
2,data/images/00002.tif,b1ddc51d2a1cf08e7d619bb9e4bda828,13287580.0,3838.0,1154.0,raw_backup/new/Нурик1092/Ainur2308/327010D0120...,327010D01201909100037.tif
3,data/images/00003.tif,187a39ed319c8506d959906f3b510abe,13291042.0,3839.0,1154.0,raw_backup/new/Нурик1092/Ainur2308/327010D0120...,327010D01201909120012.tif
4,data/images/00004.tif,26594213171935bb519219fc95ff6b89,13332586.0,3851.0,1154.0,raw_backup/new/Нурик1092/Ainur2308/327010D0120...,327010D01201909030024.tif
...,...,...,...,...,...,...,...
15193,data/images/03B59.tif,1b469be64a9c6963a6f714a0a89225c9,8390562.0,3496.0,1200.0,raw_backup/old/Готовые для передачи 2806/для п...,image2704 (576).tif
15194,data/images/03B5A.tif,6332a0f7a1f8effba22169a2217955a8,10257762.0,4274.0,1200.0,raw_backup/old/Готовые для передачи 2806/для п...,image2704 (540).tif
15195,data/images/03B5B.tif,fccb2a0320903b505703c6221af92781,8047362.0,3353.0,1200.0,raw_backup/old/Готовые для передачи 2806/для п...,image2704 (563).tif
15196,data/images/03B5C.tif,7be52e434fefb6eb1c900207ee1cb339,13903362.0,5793.0,1200.0,raw_backup/old/Готовые для передачи 2806/для п...,image2704 (520).tif


Clean up images

In [7]:
images_filtered = images \
    .drop(columns=['old_filename',
                   'old_location']) \
    .sort_index() \
    .drop_duplicates('md5') \
    .reset_index() \
    .merge(images[['md5',
                   'old_filename',
                   'old_location']],
           on='md5') \
    .drop_duplicates() \
    .set_index('image_id')

images_filtered

Unnamed: 0_level_0,filepath,md5,size,width,height,old_filename,old_location
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,data/images/00000.tif,3ab70521b3fdcd223949fda05b60f2f0,15229794.0,4399.0,1154.0,327010D01201909060002.tif,raw_backup/new/Нурик1092/Ainur2308/327010D0120...
1,data/images/00001.tif,4873919d851f8bcc8421e3dd6a8c7781,14862814.0,4293.0,1154.0,327010D01201909100004.tif,raw_backup/new/Нурик1092/Ainur2308/327010D0120...
2,data/images/00002.tif,b1ddc51d2a1cf08e7d619bb9e4bda828,13287580.0,3838.0,1154.0,327010D01201909100037.tif,raw_backup/new/Нурик1092/Ainur2308/327010D0120...
3,data/images/00003.tif,187a39ed319c8506d959906f3b510abe,13291042.0,3839.0,1154.0,327010D01201909120012.tif,raw_backup/new/Нурик1092/Ainur2308/327010D0120...
4,data/images/00004.tif,26594213171935bb519219fc95ff6b89,13332586.0,3851.0,1154.0,327010D01201909030024.tif,raw_backup/new/Нурик1092/Ainur2308/327010D0120...
...,...,...,...,...,...,...,...
15193,data/images/03B59.tif,1b469be64a9c6963a6f714a0a89225c9,8390562.0,3496.0,1200.0,image2704 (576).tif,raw_backup/old/Готовые для передачи 2806/для п...
15194,data/images/03B5A.tif,6332a0f7a1f8effba22169a2217955a8,10257762.0,4274.0,1200.0,image2704 (540).tif,raw_backup/old/Готовые для передачи 2806/для п...
15195,data/images/03B5B.tif,fccb2a0320903b505703c6221af92781,8047362.0,3353.0,1200.0,image2704 (563).tif,raw_backup/old/Готовые для передачи 2806/для п...
15196,data/images/03B5C.tif,7be52e434fefb6eb1c900207ee1cb339,13903362.0,5793.0,1200.0,image2704 (520).tif,raw_backup/old/Готовые для передачи 2806/для п...


In [8]:
images_filtered.drop_duplicates('md5')

Unnamed: 0_level_0,filepath,md5,size,width,height,old_filename,old_location
image_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,data/images/00000.tif,3ab70521b3fdcd223949fda05b60f2f0,15229794.0,4399.0,1154.0,327010D01201909060002.tif,raw_backup/new/Нурик1092/Ainur2308/327010D0120...
1,data/images/00001.tif,4873919d851f8bcc8421e3dd6a8c7781,14862814.0,4293.0,1154.0,327010D01201909100004.tif,raw_backup/new/Нурик1092/Ainur2308/327010D0120...
2,data/images/00002.tif,b1ddc51d2a1cf08e7d619bb9e4bda828,13287580.0,3838.0,1154.0,327010D01201909100037.tif,raw_backup/new/Нурик1092/Ainur2308/327010D0120...
3,data/images/00003.tif,187a39ed319c8506d959906f3b510abe,13291042.0,3839.0,1154.0,327010D01201909120012.tif,raw_backup/new/Нурик1092/Ainur2308/327010D0120...
4,data/images/00004.tif,26594213171935bb519219fc95ff6b89,13332586.0,3851.0,1154.0,327010D01201909030024.tif,raw_backup/new/Нурик1092/Ainur2308/327010D0120...
...,...,...,...,...,...,...,...
15193,data/images/03B59.tif,1b469be64a9c6963a6f714a0a89225c9,8390562.0,3496.0,1200.0,image2704 (576).tif,raw_backup/old/Готовые для передачи 2806/для п...
15194,data/images/03B5A.tif,6332a0f7a1f8effba22169a2217955a8,10257762.0,4274.0,1200.0,image2704 (540).tif,raw_backup/old/Готовые для передачи 2806/для п...
15195,data/images/03B5B.tif,fccb2a0320903b505703c6221af92781,8047362.0,3353.0,1200.0,image2704 (563).tif,raw_backup/old/Готовые для передачи 2806/для п...
15196,data/images/03B5C.tif,7be52e434fefb6eb1c900207ee1cb339,13903362.0,5793.0,1200.0,image2704 (520).tif,raw_backup/old/Готовые для передачи 2806/для п...


In [9]:
images = images_filtered

Import existsing annotations

In [10]:
try:
    annotations = pd.read_json(outdir / 'annotations.json.gz',
                               orient='records',
                               typ='frame',
                               compression='gzip')
    annotations = annotations.set_index('bbox_id')

    next_id = annotations.index.max() + 1

except:
    
    annotations = pd.DataFrame()
    annotations.index.name = 'bbox_id'
    next_id = 0

annotations

Load new annotations

In [11]:
for src in tqdm.tqdm(list(indir.glob('**/*.json')),
                     desc='JSONs'):

    # Annotations loaded
    with src.open('rb') as fs:
        anns = json.load(fs)

    # For each group of annotations for one image
    ann: Dict[str, Any]
    for ref, ann in anns.items():

        # Try to find the corresponding image
        sel = images.loc[(images['old_filename'] == ann['filename'])
                         & (images['size'] == ann['size'])]

        sel = sel.drop_duplicates('md5')

        if len(sel) == 0:
            continue
        elif len(sel) > 1:
            display(sel)

        # In some JSON files regions are dict, in some are lists
        if isinstance(ann['regions'], dict):
            regions = list(ann['regions'].values())
        else:
            regions = ann['regions']

        # For each bbox
        for reg in regions:

            # Skip missing bboxes
            if reg is None:
                continue

            # Try to indentify bbox shape
            if reg['shape_attributes']['name'] == 'polyline' \
                    or reg['shape_attributes']['name'] == 'polygon':
                x = reg['shape_attributes']['all_points_x']
                y = reg['shape_attributes']['all_points_y']

            elif reg['shape_attributes']['name'] == 'rect':
                x = reg['shape_attributes']['x']
                y = reg['shape_attributes']['y']
                w = reg['shape_attributes']['width']
                h = reg['shape_attributes']['height']

                x = [x, x + w, x + w, x]
                y = [y, y, y + h, y + h]
            else:
                print('Omitting', reg['shape_attributes'])
                continue
            
            x_min = min(x)
            w = max(x) - x_min
            y_min = min(y)
            h = max(y) - y_min

            # Case for missing labels

            try:
                label = reg['region_attributes']['class name'].lower()
            except KeyError:
                label = None

            bbox_info = pd.Series(name=next_id,
                                  data={
                                      'image_id': sel.iloc[0].name,
                                      'x_min': x_min,
                                      'y_min': y_min,
                                      'width': w,
                                      'height': h,
                                      'label': label
                                  })
            next_id += 1

            annotations = annotations.append(bbox_info)

annotations


JSONs:  41%|████▏     | 55/133 [01:02<01:58,  1.52s/it]

Omitting {'name': 'ellipse', 'cx': 516, 'cy': 177, 'rx': 12.659, 'ry': 10.127, 'theta': 3.142}


JSONs: 100%|██████████| 133/133 [02:29<00:00,  1.13s/it]


Unnamed: 0_level_0,image_id,x_min,y_min,width,height,label
bbox_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,470.0,2760.0,115.0,416.0,893.0,scooter
1,470.0,846.0,637.0,324.0,177.0,scooter
2,455.0,2077.0,877.0,1261.0,210.0,spare parts
3,455.0,672.0,126.0,154.0,67.0,motorbike
4,114.0,2116.0,691.0,1161.0,337.0,tools
...,...,...,...,...,...,...
41998,14366.0,127.0,73.0,4064.0,750.0,shoes
41999,14387.0,122.0,129.0,2912.0,630.0,clothes
42000,14342.0,131.0,119.0,1156.0,729.0,clothes
42001,14342.0,3188.0,262.0,455.0,541.0,equipment


In [15]:
annotations = annotations \
    .dropna() \
    .drop_duplicates()
annotations['label'] = annotations['label'].apply(str.lower).apply(str.strip)
annotations

Unnamed: 0_level_0,image_id,x_min,y_min,width,height,label
bbox_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,470.0,2760.0,115.0,416.0,893.0,scooter
1,470.0,846.0,637.0,324.0,177.0,scooter
2,455.0,2077.0,877.0,1261.0,210.0,spare parts
3,455.0,672.0,126.0,154.0,67.0,motorbike
4,114.0,2116.0,691.0,1161.0,337.0,tools
...,...,...,...,...,...,...
41998,14366.0,127.0,73.0,4064.0,750.0,shoes
41999,14387.0,122.0,129.0,2912.0,630.0,clothes
42000,14342.0,131.0,119.0,1156.0,729.0,clothes
42001,14342.0,3188.0,262.0,455.0,541.0,equipment


In [14]:
annotations['image_id'].drop_duplicates()

bbox_id
0          470.0
2          455.0
4          114.0
8          384.0
10         716.0
          ...   
41994    14349.0
41997    14365.0
41998    14366.0
41999    14387.0
42000    14342.0
Name: image_id, Length: 9938, dtype: float64

In [13]:
print(' | '.join(annotations['label'].drop_duplicates().sort_values().to_list()))

accessory | agriculturaltires | air freshener | apple | appliance | appliances | armchair | aubergines | auto parts | bags | ball | bed linen | bicycle | bike | box | broccoli | brokkoli | bus | cabbage | car | car wheels | carrot | cars | carweels | carwheels | cataloga | celery | clothes | clouthes | collagen mask | cookies and candy | cucumber | curtains | disc | dummy | egg plant | electric car | electronics | equipment | equpment | fabrics | finished products | fittings | food material | garlic | genus pimenta | ginger | glasses | glue | grape | grapes | home appliances | household | household equipment | household goods | hydro scooter | intestinal raw materials | jalousie | kiwi | label | lamps | leek | lemon | lettuce | machine | mandarin | mango | meat | medical gloves | medications | motobike | motorbike | motorbike parts | motorcycle | narrow ribbons | nectarin | nectarine | office supplies | packages | pampers | parts of shoes | parts shoes | peach | pear | pears | pepper |

In [17]:
images.reset_index().to_json(outdir / 'images.json.gz',
                             orient='records',
                             compression='gzip',
                             default_handler=str)

annotations.reset_index().to_json(outdir / 'annotations.json.gz',
                                  orient='records',
                                  compression='gzip',
                                  default_handler=str)