In [1]:
import os
import numpy as np
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [2]:
# Load all xml files and store in a list
xml_list = glob('./data_images/annotations/*.xml')
# data cleaning. replace \\ with /
xml_list = list(map(lambda x: x.replace('\\', '/'), xml_list))

In [3]:
xml_list

['./data_images/annotations/000001.xml',
 './data_images/annotations/000002.xml',
 './data_images/annotations/000007.xml',
 './data_images/annotations/000009.xml',
 './data_images/annotations/000012.xml',
 './data_images/annotations/000016.xml',
 './data_images/annotations/000017.xml',
 './data_images/annotations/000019.xml',
 './data_images/annotations/000020.xml',
 './data_images/annotations/000021.xml',
 './data_images/annotations/000023.xml',
 './data_images/annotations/000024.xml',
 './data_images/annotations/000026.xml',
 './data_images/annotations/000030.xml',
 './data_images/annotations/000032.xml',
 './data_images/annotations/000033.xml',
 './data_images/annotations/000034.xml',
 './data_images/annotations/000035.xml',
 './data_images/annotations/000036.xml',
 './data_images/annotations/000039.xml',
 './data_images/annotations/000041.xml',
 './data_images/annotations/000042.xml',
 './data_images/annotations/000044.xml',
 './data_images/annotations/000046.xml',
 './data_images/

In [4]:
# step-2: read xml files
# from each xml file we need to extract
# filesname, size(width, height), object(name, xmin, xmax, ymin, ymax)

def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    
    # extract filename
    image_name = root.find('filename').text
    #width & height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])
        
    return parser   

In [5]:
parser_all = list(map(extract_text, xml_list))

In [6]:
data = reduce(lambda x,y : x+y, parser_all)

In [7]:
df = pd.DataFrame(data, columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [8]:
df[['width','height','xmin','xmax','ymin','ymax']] = np.float32(df[['width','height','xmin','xmax','ymin','ymax']])

In [9]:
df['center_x'] = (df['xmin'] + df['xmax'])/(df['width']*2)
df['center_y'] = (df['ymin'] + df['ymax'])/(df['height']*2)
df['w'] = (df['xmax'] - df['xmin'])/(df['width'])
df['h'] = (df['ymax'] - df['ymin'])/(df['height'])

# split datas into train and test

In [10]:
images = df['filename'].unique()

In [11]:
len(images)

5012

In [12]:
# 80% train and 20% test
img_df = pd.DataFrame(images, columns = ['filename'])
img_train = tuple(img_df.sample(frac = 0.8)['filename']) # suffle & pick 80% of images

In [13]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename'])  # take rest 20% images

In [14]:
# Label encoding
def label_encoding(x):
    labels = {'person':0, 'car':1, 'chair':2, 'bottle':3, 'pottedplant':4, 'bird':5, 'dog':6,
             'sofa':7, 'bicycle':8, 'horse':9, 'boat':10, 'motorbike':11, 'cat':12, 'tvmonitor':13,
             'cow':14, 'sheep':15, 'aeroplane':16, 'train':17, 'diningtable':18, 'bus':19}
    return labels[x]

In [15]:
df['id'] = pd.Series(df['name'].apply(label_encoding))

In [16]:
df.head(4)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,000001.jpg,1024.0,657.0,car,14.0,301.0,335.0,522.0,0.153809,0.652207,0.280273,0.284627,1
1,000001.jpg,1024.0,657.0,car,269.0,571.0,345.0,489.0,0.410156,0.634703,0.294922,0.219178,1
2,000001.jpg,1024.0,657.0,car,502.0,798.0,342.0,450.0,0.634766,0.60274,0.289062,0.164384,1
3,000001.jpg,1024.0,657.0,car,709.0,1009.0,333.0,438.0,0.838867,0.586758,0.292969,0.159817,1


In [17]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [18]:
train_df['name'].unique()

array(['car', 'horse', 'person', 'bicycle', 'dog', 'aeroplane',
       'diningtable', 'tvmonitor', 'chair', 'cat', 'bird', 'bottle',
       'motorbike', 'boat', 'pottedplant', 'sheep', 'cow', 'train', 'bus',
       'sofa'], dtype=object)

In [19]:
test_df.head(40)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
13,000017.jpg,480.0,364.0,person,185.0,279.0,62.0,199.0,0.483333,0.358516,0.195833,0.376374,0
14,000017.jpg,480.0,364.0,horse,90.0,403.0,78.0,336.0,0.513542,0.568681,0.652083,0.708791,9
15,000019.jpg,500.0,375.0,cat,231.0,483.0,88.0,256.0,0.714,0.458667,0.504,0.448,12
16,000019.jpg,500.0,375.0,cat,11.0,266.0,113.0,259.0,0.277,0.496,0.51,0.389333,12
28,000024.jpg,500.0,335.0,train,196.0,489.0,165.0,247.0,0.685,0.614925,0.586,0.244776,17
29,000026.jpg,500.0,333.0,car,90.0,337.0,125.0,212.0,0.427,0.506006,0.494,0.261261,1
40,000034.jpg,360.0,500.0,train,116.0,360.0,167.0,400.0,0.661111,0.567,0.677778,0.466,17
41,000034.jpg,360.0,500.0,train,141.0,333.0,153.0,229.0,0.658333,0.382,0.533333,0.152,17
51,000042.jpg,500.0,335.0,train,263.0,500.0,32.0,295.0,0.763,0.48806,0.474,0.785075,17
52,000042.jpg,500.0,335.0,train,1.0,235.0,36.0,299.0,0.236,0.5,0.468,0.785075,17


# save Images and Labels in text

In [20]:
import os
from shutil import move 

In [25]:
train_folder_new = 'data_images/train_new'
test_folder_new = 'data_images/test_new'

os.mkdir(train_folder_new)
os.mkdir(test_folder_new)

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'data_images/train_new'

In [26]:
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [27]:
# save each images in train/test folder and respective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move images
    src = os.path.join('data_images', filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst) # move image to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
        os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index = False, header = False)
    

In [28]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [30]:
filename_series

0       000001.jpg
1       000002.jpg
2       000007.jpg
3       000009.jpg
4       000012.jpg
           ...    
4005    009950.jpg
4006    009954.jpg
4007    009958.jpg
4008    009959.jpg
4009    009961.jpg
Length: 4010, dtype: object

In [29]:
filename_series.apply(save_data, args = (train_folder_new, groupby_obj_train))

0       None
1       None
2       None
3       None
4       None
        ... 
4005    None
4006    None
4007    None
4008    None
4009    None
Length: 4010, dtype: object

In [31]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())

In [33]:
filename_series_test.apply(save_data, args = (test_folder_new, groupby_obj_test))

0       None
1       None
2       None
3       None
4       None
        ... 
997     None
998     None
999     None
1000    None
1001    None
Length: 1002, dtype: object