In [1]:
import numpy as np
import pandas as pd
from glob import glob
import os
from os.path import join, basename, dirname
import shutil

In [2]:
def get_subdirs(path):
    """Return the sorted paths of the directories directly inside ``path``.

    Candidates are collected with the glob pattern ``<path>/*``; every entry
    that is not a directory is discarded. The returned paths include the
    ``path`` prefix.
    """
    pattern = os.path.join(path, '*')
    directories = list(filter(os.path.isdir, glob(pattern)))
    directories.sort()
    return directories

def path_check(path, false = 'assert', return_flag = False):
    """Check that ``path`` exists and react to a missing path as requested.

    Parameters
    ----------
    path : str
        File or directory path, tested with ``os.path.exists``.
    false : str, default 'assert'
        Action taken when ``path`` does not exist: ``'assert'`` raises
        ``AssertionError``; ``'print'`` prints the message and continues;
        any other value does nothing.
    return_flag : bool, default False
        When true, the existence flag is returned.

    Returns
    -------
    bool or None
        ``True``/``False`` for existing/missing when ``return_flag`` is true,
        otherwise ``None``.

    Raises
    ------
    AssertionError
        If the path is missing and ``false == 'assert'``.
    """
    exists = os.path.exists(path)
    if not exists:
        message = f"path not exists: {path}"
        if false == 'assert':
            # Raised explicitly instead of `assert False, ...`: assert
            # statements are removed when Python runs with -O, which would
            # turn this check into a silent no-op.
            raise AssertionError(message)
        elif false == 'print':
            print(message)
    if return_flag:
        return exists
    return None

In [3]:
# Drive roots (machine-specific absolute paths).
HDD_path1 = 'D:/' # dataset
HDD_path2 = 'E:/' # original image

# Stop right away (path_check raises by default) if either drive lacks
# the directory this notebook expects on it.
for required_dir in (f"{HDD_path1}dataset_for_yolo", f"{HDD_path2}Site_1149"):
    path_check(required_dir)

In [4]:
# Create the output skeleton: <new_dataset_dir>/00_original/{images,labels}.
# exist_ok=True makes re-running this cell harmless.
new_dataset_dir = f"{HDD_path1}dataset_for_yolo/20221224_U1366_from_detection"
for folder in ('images', 'labels'):
    target_dir = join(new_dataset_dir, '00_original', folder)
    os.makedirs(target_dir, exist_ok=True)

In [5]:
def _strip_confidence(raw_lines):
    """Return cleaned label rows with a trailing 6th column removed.

    Each row is split on whitespace. Blank rows are dropped, rows with exactly
    six fields lose their last field (the confidence value written by the
    detector), and all other rows are kept as they are. The returned strings
    carry no trailing newline.
    """
    cleaned = []
    for raw in raw_lines:
        fields = raw.split()
        if not fields:
            continue
        if len(fields) == 6:
            fields = fields[:-1]
        cleaned.append(' '.join(fields))
    return cleaned


def process_img(img_name, detection_dir_site, new_dataset_dir, site):
    """Copy one detected image and its label file into the new dataset.

    The sample and slide folder names are derived from ``img_name`` by
    dropping its last three / last two ``_``-separated parts. The image is
    looked up under ``<img_dir_site>/<sample>/<slide>/`` (always with a
    ``.jpg`` extension) and the label under
    ``<detection_dir_site>/<sample>/<slide>/labels/``.

    Nothing is copied when the label or image file is missing (a message is
    printed by ``path_check``), when the image name is already in
    ``contained``, or when the label file holds no rows.

    Parameters
    ----------
    img_name : str
        Image file name as listed in the detection results.
    detection_dir_site : str
        Detection output directory of the current site.
    new_dataset_dir : str
        Dataset root; files go to ``00_original/images`` and
        ``00_original/labels`` below it.
    site : str
        Unused; kept so existing calls keep working.

    Notes
    -----
    Reads two module-level names that are NOT parameters: ``img_dir_site``
    (image root of the current site) and ``contained`` (image names of the
    previous dataset). Both must be defined before this function is called.
    """
    # Derive the sample / slide folder names from the file name.
    name_parts = img_name.split('_')
    sample = '_'.join(name_parts[:-3])
    slide = '_'.join(name_parts[:-2])

    # splitext instead of [:-4] so extensions of any length are handled.
    stem = os.path.splitext(img_name)[0]
    label_name = stem + '.txt'
    img_name = stem + '.jpg'
    img_path = join(img_dir_site, sample, slide, img_name)
    label_path = join(detection_dir_site, sample, slide, 'labels', label_name)

    # Skip when the label or the image is missing. Both checks always run so
    # that every missing path is reported.
    label_ok = path_check(label_path, false='print', return_flag=True)
    img_ok = path_check(img_path, false='print', return_flag=True)
    if not (label_ok and img_ok):
        return

    # Skip images that already belong to the previous dataset.
    if img_name in contained:
        return

    # Read the labels BEFORE copying anything, so a bad label file cannot
    # leave an image without a label in the new dataset.
    with open(label_path, 'r') as src:
        label_rows = _strip_confidence(src.readlines())
    if not label_rows:
        print(f"label file is empty: {label_path}")
        return

    new_img_path = join(new_dataset_dir, '00_original', 'images', img_name)
    new_label_path = join(new_dataset_dir, '00_original', 'labels', label_name)

    # Copy the image, then write the labels without the confidence column.
    shutil.copyfile(img_path, new_img_path)
    with open(new_label_path, 'w') as dst:
        dst.write('\n'.join(label_rows))

In [6]:
# Images that are already part of the previous dataset
previous_dataset_dir = f"{HDD_path1}dataset_for_yolo/20221209"
path_check(previous_dataset_dir)
previous_image_glob = join(previous_dataset_dir, '02_all', '*', 'images', '*.jpg')
# Only the file names are kept; membership is later tested by name.
contained = {basename(image_path) for image_path in glob(previous_image_glob)}
print(len(contained), 'images')

12217 images


In [7]:
# Samples that are skipped entirely when harvesting detections.
exclude_samples = {
    '576B_03_06_81',
    '576B_04_02_25',
    '576B_04_04_75',
    '576B_05_01_25',
    '576B_05_05_25',
}

# Samples whose detections are used without filtering by class number.
learn_tooth_samples = {
    '576B_01_02_77',
    '576B_01_06_52',
    '576B_02_03_125',
    '576B_02_07_23',
    '576B_03_03_125',
    '1149A_14_05_87',
    '1149A_18_04_38',
    '1149B_03_06_25',
    'U1366C_02_CC_11',
    'U1366C_03_03_125',
}

In [8]:
# Sites to process, and the class names that end up in classes.txt.
sites = ['Site_U1366']
class_names = ['tooth', 'denticle', 'sawToothed']

# Each site needs its image directory on the original-image drive.
for site in sites:
    site_image_dir = f"{HDD_path2}{site}"
    path_check(site_image_dir)

In [9]:
# Class numbers kept for samples that are NOT in `learn_tooth_samples`.
# NOTE(review): presumably these index `class_names` (1 = denticle,
# 2 = sawToothed), i.e. class 0 (tooth) is dropped — confirm.
non_tooth_class_ids = [1, 2]

# Does not depend on the site, so it is checked once, before the loop.
path_check(previous_dataset_dir)

for site in sites:

    # NOTE: `img_dir_site` must stay a module-level name; process_img reads
    # it as a global.
    img_dir_site = f"{HDD_path2}{site}"
    detection_dir_site = f"../runs/detect/v7x_20221209_3cls_all/{site}"

    for p in (img_dir_site, detection_dir_site):
        path_check(p)

    # Sample folder names of this site, minus the excluded ones.
    samples = sorted(
        name for name in map(basename, get_subdirs(img_dir_site))
        if name not in exclude_samples
    )

    for sample in samples:
        detection_csv = join(detection_dir_site, 'cropped_images',
                             sample, 'detections.csv')
        path_check(detection_csv)
        df = pd.read_csv(detection_csv, index_col=0)
        if sample not in learn_tooth_samples:
            df = df[df['class_no'].isin(non_tooth_class_ids)]

        # De-duplicate, then sort so the processing (and message) order is
        # the same on every run; iterating a bare set is not reproducible.
        img_names = sorted(set(df['filename']))

        for img_name in img_names:
            process_img(img_name, detection_dir_site, new_dataset_dir, site)

# Write the class list next to the label files.
class_label_path = join(new_dataset_dir, '00_original', 'labels', 'classes.txt')
with open(class_label_path, 'w') as f3:
    f3.write('\n'.join(class_names))

print('Done')

Done
