### ICDAR2017 전체 데이터셋을 UFO로 변환


In [76]:
# 사전 데이터셋 구존

# input/data/ICDAR17_All
### images
##### {file_name}   (이미지 저장소)
### ufo             (annotation.json 저장소)
### zips_ann        
##### all_ann ()    (이미지별 annotation.txt 저장소)

In [1]:
import zipfile
import pandas as pd
import numpy as np
import re
import glob
import json
import copy
import os

In [2]:
# 사전작업 1. zip으로 묶인 이미지를 모두 같은 저장소로 합체
save_path = '/opt/ml/input/data/ICDAR17_All/images/'

if len(os.listdir(save_path)) == 0:
    for i in range(1, 9):
        zipfile.ZipFile('/opt/ml/input/data/ICDAR17_All/zips_image/ch8_training_images_{}.zip'.format(i)).extractall(save_path)

In [3]:
# 사전작업 2. zip으로 묶인 annotation.txt를 모두 같은 저장소로 합체
save_ann_path = '/opt/ml/input/data/ICDAR17_All/zips_ann/all_ann'

if len(os.listdir(save_ann_path)) == 0:
    zipfile.ZipFile('/opt/ml/input/data/ICDAR17_All/zips_ann/ch8_training_localization_transcription_gt_v2.zip').extractall(save_ann_path)

In [27]:
# 이미지 번호 순서대로 파일이름을 추출
# zip으로 페어링
all_ann_file_name_list = sorted(glob.glob('/opt/ml/input/data/ICDAR17_All/zips_ann/all_ann/*.txt'), key=lambda x : (len(x.split('_')[5]), x))
all_img_file_name_list = sorted(glob.glob('/opt/ml/input/data/ICDAR17_All/images/*'), key=lambda x : (len(x.split('_')[2]), x))
file_name_zip = zip(all_ann_file_name_list, all_img_file_name_list)

# 각 이미지마다 존재하는 annotation.txt를 UFO 포맷의 dict로 재구성
# 조건 1. Latin, Korean 외의 글자는 illegibility=True로 처리 & transcription='###'으로 처리
# 조건 2. orientation은 'Horizontal'로 일괄처리
care_lang = ['Latin', 'Korean']
cp = re.compile('img_[0-9]+[.][A-Za-z]+')

def convert2UFO(txt, img):
    word = {}
    info = {'points' : '', 'transcription' : '', 'language' : '', 'illegibility' : '', 'orientation' : 'Horizontal'}
    
    with open(txt, 'r') as f:
        idx = 0
        while True:
            ann = f.readline()
            if not ann: break
            ann_info = ann.split(',')
            
            points = np.array(list(map(int, ann_info[:8]))).reshape(4, 2)
            language = ann_info[8]
            if language not in care_lang:
                text = '###'
                illegibility = True
                language = 'Others'
            else:
                text = ann_info[9]
                illegibility = False
            
            info['points'] = points.tolist()
            info['transcription'] = text
            info['language'] = language
            info['illegibility'] = illegibility
            word[str(idx)] = copy.deepcopy(info)
            idx += 1
            
    img_name = cp.findall(img)[0]
    return img_name, {'words' : word}

In [28]:
# UFO 포맷의 dict구조를 생성하고, 이미지별 annotation 정보를 업데이트
annotation = {'images' :{}}

for txt, img in file_name_zip:
    img_name, word = convert2UFO(txt, img)
    if pd.DataFrame(word)['words'].apply(lambda x : x['language'] == 'Others').any():
        continue
    annotation['images'][img_name] = word
    
annotation

{'images': {'img_802.jpg': {'words': {'0': {'points': [[131, 79],
      [461, 77],
      [464, 120],
      [144, 123]],
     'transcription': 'WELCOME\n',
     'language': 'Latin',
     'illegibility': False,
     'orientation': 'Horizontal'},
    '1': {'points': [[245, 134], [339, 132], [333, 179], [242, 180]],
     'transcription': 'TO\n',
     'language': 'Latin',
     'illegibility': False,
     'orientation': 'Horizontal'},
    '2': {'points': [[101, 188], [490, 190], [490, 253], [90, 248]],
     'transcription': 'CARLTON\n',
     'language': 'Latin',
     'illegibility': False,
     'orientation': 'Horizontal'},
    '3': {'points': [[206, 257], [268, 260], [262, 288], [206, 287]],
     'transcription': 'EST\n',
     'language': 'Latin',
     'illegibility': False,
     'orientation': 'Horizontal'},
    '4': {'points': [[280, 260], [367, 260], [366, 289], [279, 287]],
     'transcription': '1968\n',
     'language': 'Latin',
     'illegibility': False,
     'orientation': 'Horizon

In [30]:
# 완성된 포맷을 json 파일로 저장
with open('ICDAR17_ALL.json', 'w') as f:
    json.dump(annotation, f, indent=2)

In [None]:
# all : 5803
# any : 2626