# 고장난 이미지 찾기

고장난 이미지를 찾기 위해 모든 이미지를 한 번씩 훑어봅니다.


In [38]:
from os import path
import os
from pprint import pprint
from typing import (
    Any, Dict, List, Tuple, NamedTuple, Optional, NewType
)
import typing

from concurrent import futures

from PIL import Image
from tqdm import tqdm


In [18]:
# Type definitions shared by the cells below.

# A filesystem path carried as a plain string, given its own name for
# type checkers.
PathStr = NewType('PathStr', str)


class Inspection(NamedTuple):
    """Record of one file that failed inspection."""

    # Location of the failing file.
    path: PathStr
    # The exception that was raised while checking the file.
    why: Exception


In [42]:
def get_broken(
    top: PathStr, executor: Optional[futures.Executor] = None
) -> List[Inspection]:
    """Check every file under ``top`` and report the ones that fail.

    Each file found by ``os.walk(top)`` is submitted to ``executor`` as an
    ``image_broken`` task, and a progress bar advances as tasks complete.

    Args:
        top: Root directory to scan recursively.
        executor: Executor to run the checks on.  When omitted, a
            ``ThreadPoolExecutor`` is created for this call and shut down
            before the function exits; a caller-supplied executor is left
            running.

    Returns:
        One ``Inspection`` per failing file, in completion order, with the
        path expressed relative to ``top``.
    """
    owns_executor = executor is None
    if owns_executor:
        executor = futures.ThreadPoolExecutor()

    tasks = []
    try:
        for stem, _, leaves in os.walk(top):
            for leaf in leaves:
                tasks.append(
                    executor.submit(image_broken, path.join(stem, leaf))
                )

        broken = []
        with tqdm(total=len(tasks), desc='Testing images') as pbar:
            for done in futures.as_completed(tasks):
                faulty = done.result()
                if faulty:
                    pathname, why = faulty
                    broken.append(
                        Inspection(path.relpath(pathname, top), why)
                    )
                pbar.update(1)
        return broken
    except BaseException:
        # On any failure (including Ctrl-C) drop the tasks that have not
        # started yet, so the shutdown below does not have to wait for the
        # whole backlog to be processed.
        for task in tasks:
            task.cancel()
        raise
    finally:
        # Only tear down a pool this call created, and do so on every exit
        # path so worker threads are never leaked.
        if owns_executor:
            executor.shutdown()


In [40]:
def image_broken(pathname: PathStr) -> Optional[Tuple[PathStr, Exception]]:
    """Report whether the file at ``pathname`` fails to open or verify.

    The file is opened with ``Image.open`` and ``verify()`` is called on the
    result.  Any ``Exception`` raised by either step is treated as evidence
    that the file is broken and is handed back instead of propagating.

    Args:
        pathname: Path of the file to check.

    Returns:
        ``(pathname, exception)`` when a step raised, otherwise ``None``.
    """
    try:
        with Image.open(pathname) as candidate:
            candidate.verify()
    except Exception as error:
        # Deliberately broad: every kind of failure counts as "broken".
        return pathname, error
    return None


In [43]:
# Scan each dataset split. No executor is passed, so get_broken falls back
# to its default for each call.
train_broken = get_broken('./dataset/training')
val_broken = get_broken('./dataset/validation')

# Print the number of broken files per split: training first, then
# validation, one count per line.
print(len(train_broken), len(val_broken), sep='\n')

Testing images: 100%|██████████████████████████████████████████| 609926/609926 [22:57<00:00, 442.90it/s]
Testing images: 100%|█████████████████████████████████████████| 76377/76377 [00:00<00:00, 166455.63it/s]

1186
0





## 판별된 결과를 저장합니다.

저장 포맷은 텍스트로 하는 편이 범용성이 좋을 것 같습니다.

유니코드가 살짝 걱정되지만 지금 걱정할 일은 아닌 것 같습니다.


In [46]:
# Save the broken training-image paths as UTF-8 text, one path per line.
with open('./preproc/broken_images.txt', 'w', encoding='utf-8') as txt_out:
    txt_out.writelines(found.path + '\n' for found in train_broken)

