In [1]:
import os
import matplotlib
from pathlib import Path
import pandas as pd
import shutil

In [2]:
# Root data folder, relative to the notebook's working directory.
# The circle1, circle2 and merged f_circle paths are all built from it.
DATA = Path('./data/final')

# CIRCLE

In [3]:
# Locations of the first circle dataset: bbox folder, image folder and label table
c1 = DATA.joinpath('circle1')
c1_bbox, c1_img, c1_label = (
    c1.joinpath('bbox'),
    c1.joinpath('image'),
    c1.joinpath('label.csv'),
)

In [4]:
# Compare how many bbox files, image files and label rows circle1 has

c1_bboxes, c1_images = os.listdir(c1_bbox), os.listdir(c1_img)
c1_labeldf = pd.read_csv(c1_label, index_col=0)  # first CSV column becomes the index

# The counts differ when files are missing (here: fewer bbox files than images)
tuple(map(len, (c1_bboxes, c1_images, c1_labeldf)))

(521, 529, 529)

In [5]:
# Locations of the second circle dataset: bbox folder, image folder and label table
c2 = DATA.joinpath('circle2')
c2_bbox, c2_img, c2_label = (
    c2.joinpath('bbox'),
    c2.joinpath('image'),
    c2.joinpath('label.csv'),
)

In [6]:
# Compare how many bbox files, image files and label rows circle2 has

c2_bboxes, c2_images = os.listdir(c2_bbox), os.listdir(c2_img)
c2_labeldf = pd.read_csv(c2_label, index_col=0)  # first CSV column becomes the index

# Equal counts mean no file is missing for circle2
tuple(map(len, (c2_bboxes, c2_images, c2_labeldf)))

(226, 226, 226)

# Merge into 1 folder only

In [7]:
# Merge circle1 and circle2 into a single folder (f_circle)
f_circle = DATA / 'f_circle'
f_bbox = f_circle / 'bbox'
f_image = f_circle / 'image'
f_label = f_circle / 'label.csv'

# Creating the two leaf folders also creates f_circle; exist_ok keeps the cell re-runnable
os.makedirs(f_bbox, exist_ok=True)
os.makedirs(f_image, exist_ok=True)

# Copy bbox files first, then images, from both source folders into the merged folder
for sources, target in (((c1_bbox, c2_bbox), f_bbox), ((c1_img, c2_img), f_image)):
    origin = {}  # file name -> source folder it was copied from during this run
    for source in sources:
        for f in source.iterdir():
            if not f.is_file():
                continue  # shutil.copy only handles files; skip sub-directories
            if f.name in origin:
                # Same file name in both sources: copying would silently replace the first file
                raise FileExistsError(
                    f"{f.name!r} exists in both {origin[f.name]} and {source}; "
                    f"refusing to overwrite it in {target}"
                )
            origin[f.name] = source
            shutil.copy(f, target / f.name)

In [8]:
# Combine the circle1 and circle2 label tables into one frame.
# ignore_index=True renumbers the rows 0..n-1 in one step; unlike
# reset_index().drop(columns=['index']) it does not depend on the old index being unnamed.
f_labeldf = pd.concat([c1_labeldf, c2_labeldf], ignore_index=True)
# NOTE(review): the write to f_label is disabled; at this point the frame still holds
# rows for ids that the mismatch check further down removes.
# f_labeldf.to_csv(str(f_label))

In [9]:
# Collect the name keys of every source so mismatched entries can be found.
# A file's key is the part of its name before the first dot.

bbox_names = [entry.name.partition('.')[0] for entry in f_bbox.iterdir()]
img_names = [entry.name.partition('.')[0] for entry in f_image.iterdir()]
label_names = f_labeldf.loc[:, 'id'].tolist()

In [10]:
# Number of bbox files, image files and label rows after merging
tuple(map(len, (bbox_names, img_names, label_names)))

(747, 755, 755)

In [11]:
# Peek at the first two names from each source
tuple(names[:2] for names in (bbox_names, img_names, label_names))

(['ckm4i98se010t3a6870lsokzl', 'ckm4l96uj010y3a6884viub14'],
 ['cklmbf086000i3a62l63tud9h', 'cklnu3t7p00003a68xbkcjn77'],
 ['cklmbf086000i3a62l63tud9h', 'cklmfyw2600003b68ypk1esxh'])

In [12]:
# Build each set once, then derive the union, the intersection and their difference
bbox_set, img_set, label_set = set(bbox_names), set(img_names), set(label_names)

all_names = list(bbox_set | img_set | label_set)    # seen in at least one source
valid_names = list(bbox_set & img_set & label_set)  # present in all three sources
invalid_names = list(set(all_names) - set(valid_names))  # missing from at least one source

tuple(map(len, (all_names, valid_names, invalid_names)))

(755, 747, 8)

In [13]:
# Search and remove invalid files.
# Files are matched by the same key the mismatch check uses (the part of the file name
# before the first dot), so an invalid entry is removed whatever its extension is,
# instead of only when it is named exactly <id>.csv / <id>.jpeg.
invalid_set = set(invalid_names)
for folder in (f_bbox, f_image):
    # list() snapshots the listing so files are not deleted while the directory is being iterated
    for path in list(folder.iterdir()):
        if path.is_file() and path.name.split('.')[0] in invalid_set:
            os.remove(path)

# Keep only the label rows whose id is present in all three sources
f_labeldf = f_labeldf[f_labeldf['id'].isin(valid_names)]
# NOTE(review): no visible cell writes this filtered frame to f_label — confirm whether
# label.csv for the merged folder is saved elsewhere.

# Check again
bbox_names = [i.name.split('.')[0] for i in f_bbox.iterdir()]
img_names = [i.name.split('.')[0] for i in f_image.iterdir()]
label_names = f_labeldf['id'].tolist()
len(bbox_names), len(img_names), len(label_names)

(747, 747, 747)

# DONE