In [2]:
import pandas as pd
import os, shutil

import xml.etree.ElementTree as ET
import cv2

In [3]:
# Build one row per annotated object from the Pascal VOC XML files that sit
# next to the images in the test-task folder.
# NOTE: despite its name, `train_dir` points at the test folder here; the same
# name is rebound to 'data/train' in a later cell.
train_dir = 'test_task/test'
files = os.listdir(train_dir)
data = list()

for file in files:
    # splitext copes with names that have no dot (no IndexError) and with
    # names that contain extra dots (the real extension is still detected).
    stem, ext = os.path.splitext(file)
    if ext[1:] not in ['jpeg', 'jpg', 'png']:
        continue

    # The annotation file shares the image's stem.
    root = ET.parse(os.path.join(train_dir, stem + '.xml')).getroot()
    objects = root.findall('object')
    if not objects:
        # No annotated objects: nothing to record, so the image is not read.
        continue

    # Read the image once per file rather than once per object.
    image_path = os.path.join(train_dir, file)
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f'cv2.imread could not read image: {image_path}')
    height, width = image.shape[:2]

    for obj in objects:
        bbox = [obj.find('name').text]
        for coord in obj.findall('bndbox'):
            for attr in ['xmin', 'ymin', 'xmax', 'ymax']:
                bbox.append(int(coord.find(attr).text))
        data.append([file, height, width] + bbox)

test_df = pd.DataFrame(data, columns=['image', 'height', 'width', 'label', 'xmin', 'ymin', 'xmax', 'ymax'])
test_df

Unnamed: 0,image,height,width,label,xmin,ymin,xmax,ymax
0,ckicup6z1000g246a7fphj2ry.jpeg,157,321,6,13,11,44,146
1,ckicup6z1000g246a7fphj2ry.jpeg,157,321,0,50,33,78,146
2,ckicup6z1000g246a7fphj2ry.jpeg,157,321,4,84,11,114,147
3,ckicup6z1000g246a7fphj2ry.jpeg,157,321,6,119,11,149,146
4,ckicup6z1000g246a7fphj2ry.jpeg,157,321,J,166,21,195,134
...,...,...,...,...,...,...,...,...
194,ckn7cvs8i00613h6619wlyc3u.jpeg,61,84,A,63,16,70,56
195,ckn7dibfl00au3h66aqhhzp0m.jpeg,49,114,8,14,2,24,47
196,ckn7dibfl00au3h66aqhhzp0m.jpeg,49,114,8,25,2,35,47
197,ckn7dibfl00au3h66aqhhzp0m.jpeg,49,114,8,36,4,46,48


In [4]:
# Name of the folder holding the training images.
# NOTE(review): `new_dir` is not referenced by any other visible cell; later
# cells hard-code './train' instead.
new_dir = 'train'
# Load the annotation table from the tab-separated file, using its first
# column as the index.
train_df = pd.read_csv('aug_data.csv', sep='\t',  index_col=[0])
train_df

Unnamed: 0,image,height,width,label,xmin,ymin,xmax,ymax
0,ckictsenl0000246a2itn28ts.jpeg,110,457,0,44,29,63,97
1,ckictsenl0000246a2itn28ts.jpeg,110,457,5,82,22,101,96
2,ckictsenl0000246a2itn28ts.jpeg,110,457,2,122,14,144,97
3,ckictsenl0000246a2itn28ts.jpeg,110,457,8,163,14,185,96
4,ckictsenl0000246a2itn28ts.jpeg,110,457,L,258,16,283,97
...,...,...,...,...,...,...,...,...
5882,augment_409.jpeg,55,94,N,32,6,43,48
5883,augment_409.jpeg,55,94,S,43,8,52,48
5884,augment_409.jpeg,55,94,K,57,13,67,48
5885,augment_409.jpeg,55,94,R,66,13,76,47


In [5]:
# Give every distinct label in train_df an integer id, numbered in order of
# first appearance.
distinct_labels = train_df['label'].unique()
labels_dcit = dict(zip(distinct_labels, range(len(distinct_labels))))
labels_dcit

{'0': 0,
 '5': 1,
 '2': 2,
 '8': 3,
 'L': 4,
 'U': 5,
 'X': 6,
 '3': 7,
 'D': 8,
 'B': 9,
 '6': 10,
 '9': 11,
 'J': 12,
 'A': 13,
 '7': 14,
 'K': 15,
 'G': 16,
 '4': 17,
 '1': 18,
 'T': 19,
 'R': 20,
 'V': 21,
 'H': 22,
 'N': 23,
 'E': 24,
 'S': 25,
 'Z': 26}

In [6]:
# Number of distinct labels in the mapping.
len(labels_dcit)

27

In [7]:
def pacal_to_yoloy(df, label_map=None, out_dir='labels'):
    """Write one YOLO-format label file per image listed in ``df``.

    Each row of ``df`` describes one bounding box in Pascal VOC pixel
    coordinates. For every distinct image a text file ``<stem>.txt`` is
    created in ``out_dir`` with one line per box:
    ``<class_id> <x_center> <y_center> <width> <height>``, where the four
    geometry values are divided by the image width/height.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``image``, ``height``, ``width``, ``label``,
        ``xmin``, ``ymin``, ``xmax`` and ``ymax``. Columns are looked up by
        name, so extra columns and a different column order are fine.
    label_map : mapping, optional
        Maps a label value to its integer class id. Defaults to the
        notebook-level ``labels_dcit``.
    out_dir : str, optional
        Directory that receives the ``.txt`` files; created if missing.

    Raises
    ------
    KeyError
        If a label in ``df`` is absent from ``label_map``.
    FileExistsError
        If a label file for an image already exists (files are opened in
        exclusive-creation mode so stale output is never silently reused).
    """
    if label_map is None:
        # Fall back to the mapping built from the training labels.
        label_map = labels_dcit
    os.makedirs(out_dir, exist_ok=True)

    box_columns = ['height', 'width', 'label', 'xmin', 'ymin', 'xmax', 'ymax']
    # groupby(sort=False) visits images in order of first appearance and hands
    # over each image's rows directly, instead of re-filtering the whole frame
    # once per image.
    for name, rows in df.groupby('image', sort=False):
        # Format every line before touching the disk, so an unknown label
        # does not leave a half-written file behind.
        lines = []
        for image_height, image_width, label, xmin, ymin, xmax, ymax in rows[box_columns].itertuples(index=False, name=None):
            if label not in label_map:
                raise KeyError(f'label {label!r} (image {name!r}) is missing from the label mapping')
            x_center = (xmin + xmax) / 2.0 / image_width
            y_center = (ymin + ymax) / 2.0 / image_height
            width = (xmax - xmin) / image_width
            height = (ymax - ymin) / image_height
            lines.append(' '.join(str(t) for t in [label_map[label], x_center, y_center, width, height]) + '\n')

        # splitext drops only the final extension, so 'img.v2.jpeg' yields
        # 'img.v2.txt' and the label keeps the same stem as its image.
        stem = os.path.splitext(name)[0]
        with open(os.path.join(out_dir, stem + '.txt'), 'x') as f:
            f.writelines(lines)

In [8]:
# Root of the dataset layout and its three split directories.
# Note that `train_dir` was bound to 'test_task/test' in an earlier cell and
# is rebound here.
main_dir = 'data'
train_dir, test_dir, val_dir = (f'{main_dir}/{split}' for split in ('train', 'test', 'val'))

def copytree(src, dst, symlinks=False, ignore=None):
    """Copy the contents of directory ``src`` into directory ``dst``.

    Unlike ``shutil.copytree``, the destination may already exist: plain
    files are copied into it with ``shutil.copy2`` (overwriting files of the
    same name). ``dst`` is created, with any missing parents, if it is absent.

    Sub-directories of ``src`` are delegated to ``shutil.copytree``, which
    receives ``symlinks`` and ``ignore`` unchanged; that call still raises
    ``FileExistsError`` if the matching sub-directory already exists in
    ``dst``. ``ignore`` is therefore applied inside sub-directories only,
    not to the top-level entries of ``src``.
    """
    # Without this, copying a folder of plain files into a missing
    # destination fails with FileNotFoundError.
    os.makedirs(dst, exist_ok=True)
    for item in os.listdir(src):
        s = os.path.join(src, item)
        d = os.path.join(dst, item)
        if os.path.isdir(s):
            shutil.copytree(s, d, symlinks, ignore)
        else:
            shutil.copy2(s, d)

# Create the data/<split>/{images,labels} layout. makedirs builds missing
# parents (including main_dir), and exist_ok makes the cell safe to run when
# some or all of the directories are already there.
for path in [train_dir, test_dir, val_dir]:
    for sub in ['images', 'labels']:
        os.makedirs(os.path.join(path, sub), exist_ok=True)

copytree('./train', 'data/train/images')

# Convert the training annotations in a scratch ./labels directory. It is
# wiped first so that leftovers from a previous run cannot make makedirs or
# the exclusive-create open() inside pacal_to_yoloy fail.
shutil.rmtree('./labels', ignore_errors=True)
os.makedirs('./labels')
pacal_to_yoloy(train_df)
copytree('./labels', 'data/train/labels')

# Same for the test annotations, starting again from an empty scratch dir.
shutil.rmtree('./labels', ignore_errors=True)
os.makedirs('./labels')
pacal_to_yoloy(test_df)
copytree('./labels', 'data/test/labels')

l = len(os.listdir('data/train/images'))

# Move a validation split out of the training set.
# The reversed slice [:-100:-1] yields at most 99 names, taken from the end of
# the list. The listing is sorted because os.listdir returns entries in
# arbitrary order, and each label is located through its image's stem rather
# than by zipping two independent listings, so an image can never be paired
# with another image's label.
val_images = sorted(os.listdir('data/train/images'))[:-100:-1]
for image_name in val_images:
    label_name = os.path.splitext(image_name)[0] + '.txt'
    label_path = os.path.join('data/train/labels', label_name)
    shutil.move(os.path.join('data/train/images', image_name), 'data/val/images')
    # An image without any annotated object has no label file to move.
    if os.path.exists(label_path):
        shutil.move(label_path, 'data/val/labels')

# Copy the raw test folder, then drop the annotation/text files so that only
# images remain. splitext handles names without a dot (no IndexError) and
# names with extra dots (the real extension is inspected).
copytree('./test_task/test', 'data/test/images')
for file in os.listdir('data/test/images'):
    if os.path.splitext(file)[1] in ['.txt', '.xml']:
        os.remove(os.path.join('data/test/images', file))

In [63]:
# Preview the entries selected by the reversed slice: it walks backwards from
# the last listed name and stops before index -100, i.e. at most 99 names.
# NOTE(review): os.listdir does not guarantee any ordering - confirm the
# listing is stable on the target filesystem before relying on this selection.
os.listdir('data/train/images')[:-100:-1]

['ckn7dpc87001w3h66nfp0otgl.jpeg',
 'ckn7dmzrd00183h66joxua5db.jpeg',
 'ckn7dir8501jg3068balc54jg.jpeg',
 'ckn7dhjmw00ah3h6665lnlr86.jpeg',
 'ckn7dh3je01iu3068snby04rn.jpeg',
 'ckn7deu7101hs3068m7efvnxq.jpeg',
 'ckn7ddopj01h63068h5v2q77y.jpeg',
 'ckn7dddzj008x3h66b54u9qe8.jpeg',
 'ckn7d936n008b3h66bt0vc5k8.jpeg',
 'ckn7d83mv01ew3068hb7iluwa.jpeg',
 'ckn7d6qv001ea3068n03uk9t5.jpeg',
 'ckn7d39dd007p3h66gjeio0dd.jpeg',
 'ckn7d0pi701c83068l3u59ybi.jpeg',
 'ckn7cqorl004n3h66jq6uii6v.jpeg',
 'ckn7cpxgn019t3068m8oin3a6.jpeg',
 'ckn7comxi00413h66ud96f14z.jpeg',
 'ckn7cknnn002r3h66iyh76v2t.jpeg',
 'ckn7cgwc1016v30687fk1z4c5.jpeg',
 'ckn7cgkqf001s3h66ulbd74z0.jpeg',
 'ckn7cd7vt015e3068szo8o3o8.jpeg',
 'ckn7cbwkg014s30680jyijnxi.jpeg',
 'ckn7ca2h101443068w6pm5v6a.jpeg',
 'ckn7c8rgv013i3068m7fsw10h.jpeg',
 'ckn7c5mu8012k3068zcdr9iqs.jpeg',
 'ckn7c3yiy011y3068ifee2ecc.jpeg',
 'ckn7becgs00zu3068jm5bsyvb.jpeg',
 'ckn7b6drt00yh3068huxns41f.jpeg',
 'ckn7b42rh00xt3068la1z4fjx.jpeg',
 'ckn7b2qoh00x73068n

In [66]:
# Sanity check: every image among the last (up to) 99 listed training images
# must have exactly one label file with the same stem among the last (up to)
# 99 listed label files. Prints 'Error' once if any image fails the check.
# Both directories are listed a single time and label stems are counted up
# front, instead of re-listing and re-scanning the labels for every image.
image_stems = [entry.split('.')[0] for entry in os.listdir('data/train/images')[:-100:-1]]
label_stems = [entry.split('.')[0] for entry in os.listdir('data/train/labels')[:-100:-1]]

label_counts = {}
for stem in label_stems:
    label_counts[stem] = label_counts.get(stem, 0) + 1

if any(label_counts.get(stem, 0) != 1 for stem in image_stems):
    print('Error')