In [1]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from xml.etree import ElementTree as et # parse information from XML
from functools import reduce

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [3]:
# step-1: get path of each xml file
xmlfiles = glob('./images/*.xml')
# replace \\ with /
# replace_text = lambda x: x.replace('\\','/')
# xmlfiles = list(map(replace_text,xmlfiles))

In [4]:
xmlfiles

['./images/25.xml',
 './images/43.xml',
 './images/44.xml',
 './images/82.xml',
 './images/180.xml',
 './images/174.xml',
 './images/99.xml',
 './images/80.xml',
 './images/41.xml',
 './images/140.xml',
 './images/48.xml',
 './images/102.xml',
 './images/150.xml',
 './images/109.xml',
 './images/96.xml',
 './images/22.xml',
 './images/107.xml',
 './images/46.xml',
 './images/51.xml',
 './images/104.xml',
 './images/69.xml',
 './images/1.xml',
 './images/166.xml',
 './images/122.xml',
 './images/167.xml',
 './images/20.xml',
 './images/87.xml',
 './images/185.xml',
 './images/34.xml',
 './images/148.xml',
 './images/47.xml',
 './images/165.xml',
 './images/136.xml',
 './images/39.xml',
 './images/75.xml',
 './images/108.xml',
 './images/120.xml',
 './images/153.xml',
 './images/103.xml',
 './images/5.xml',
 './images/81.xml',
 './images/40.xml',
 './images/15.xml',
 './images/27.xml',
 './images/113.xml',
 './images/63.xml',
 './images/106.xml',
 './images/54.xml',
 './images/115.xml',


In [5]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])
        
    return parser

In [6]:
parser_all = list(map(extract_text,xmlfiles))

In [7]:
data = reduce(lambda x, y : x+y,parser_all)

In [8]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [9]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,25.jpg,5705,3803,butterfly,1665,4521,276,2783
1,25.jpg,5705,3803,flower,959,3171,1958,3658
2,43.jpg,2032,3048,butterfly,182,1444,428,1605
3,43.jpg,2032,3048,flower,187,649,1424,1967
4,44.jpg,4910,3274,flower,618,2712,537,2905


In [10]:
df.shape

(339, 8)

In [11]:
df['name'].value_counts()

name
flower       250
butterfly     89
Name: count, dtype: int64

#### Conversion
![image-2.png](attachment:image-2.png)

In [12]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339 entries, 0 to 338
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  339 non-null    object
 1   width     339 non-null    object
 2   height    339 non-null    object
 3   name      339 non-null    object
 4   xmin      339 non-null    object
 5   xmax      339 non-null    object
 6   ymin      339 non-null    object
 7   ymax      339 non-null    object
dtypes: object(8)
memory usage: 21.3+ KB


In [13]:
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 339 entries, 0 to 338
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  339 non-null    object
 1   width     339 non-null    int64 
 2   height    339 non-null    int64 
 3   name      339 non-null    object
 4   xmin      339 non-null    int64 
 5   xmax      339 non-null    int64 
 6   ymin      339 non-null    int64 
 7   ymax      339 non-null    int64 
dtypes: int64(6), object(2)
memory usage: 21.3+ KB


In [14]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w 
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h 
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [15]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,25.jpg,5705,3803,butterfly,1665,4521,276,2783,0.542156,0.402182,0.500613,0.659216
1,25.jpg,5705,3803,flower,959,3171,1958,3658,0.361963,0.738364,0.38773,0.447016
2,43.jpg,2032,3048,butterfly,182,1444,428,1605,0.400098,0.333497,0.621063,0.386155
3,43.jpg,2032,3048,flower,187,649,1424,1967,0.205709,0.556266,0.227362,0.17815
4,44.jpg,4910,3274,flower,618,2712,537,2905,0.339104,0.525657,0.426477,0.723274


### split data into train and test

In [16]:
images = df['filename'].unique()

In [17]:
len(images)

185

In [18]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [19]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [20]:
len(img_train), len(img_test)

(148, 37)

In [21]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [22]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,25.jpg,5705,3803,butterfly,1665,4521,276,2783,0.542156,0.402182,0.500613,0.659216
1,25.jpg,5705,3803,flower,959,3171,1958,3658,0.361963,0.738364,0.38773,0.447016
2,43.jpg,2032,3048,butterfly,182,1444,428,1605,0.400098,0.333497,0.621063,0.386155
3,43.jpg,2032,3048,flower,187,649,1424,1967,0.205709,0.556266,0.227362,0.17815
4,44.jpg,4910,3274,flower,618,2712,537,2905,0.339104,0.525657,0.426477,0.723274


In [23]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
9,174.jpg,4592,3448,flower,1448,2938,71,1397,0.47757,0.212877,0.324477,0.384571
21,140.jpg,4797,3837,butterfly,157,4645,106,3700,0.500521,0.49596,0.935585,0.936669
59,1.jpg,3024,4032,flower,1,605,1153,1603,0.100198,0.341766,0.199735,0.111607
60,1.jpg,3024,4032,flower,286,649,1434,1703,0.154597,0.389013,0.12004,0.066716
61,1.jpg,3024,4032,flower,305,930,840,1472,0.2042,0.286706,0.20668,0.156746


### Assign id number to object names

In [24]:
# label encoding
def label_encoding(x):
    labels = {'flower':0, 'butterfly':1}
    return labels[x]

In [25]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

In [26]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,25.jpg,5705,3803,butterfly,1665,4521,276,2783,0.542156,0.402182,0.500613,0.659216,1
1,25.jpg,5705,3803,flower,959,3171,1958,3658,0.361963,0.738364,0.38773,0.447016,0
2,43.jpg,2032,3048,butterfly,182,1444,428,1605,0.400098,0.333497,0.621063,0.386155,1
3,43.jpg,2032,3048,flower,187,649,1424,1967,0.205709,0.556266,0.227362,0.17815,0
4,44.jpg,4910,3274,flower,618,2712,537,2905,0.339104,0.525657,0.426477,0.723274,0
5,44.jpg,4910,3274,flower,2018,4265,221,2573,0.639817,0.426695,0.457637,0.718387,0
6,82.jpg,3739,3354,flower,490,1911,403,1935,0.321075,0.348539,0.380048,0.456768,0
7,180.jpg,2081,2835,butterfly,536,1357,808,1582,0.454829,0.421517,0.394522,0.273016,1
8,180.jpg,2081,2835,flower,549,1862,1130,1982,0.579289,0.548854,0.630947,0.300529,0
10,99.jpg,5100,3000,butterfly,250,4994,506,3000,0.514118,0.584333,0.930196,0.831333,1


### Save Image and Labels in text

In [27]:
import os
from shutil import move

In [28]:
train_folder = 'images/train'
test_folder = 'images/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

In [29]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [30]:
#groupby_obj_train.get_group('000009.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('images',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move image to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)
    

In [31]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [32]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
143    None
144    None
145    None
146    None
147    None
Length: 148, dtype: object

In [33]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
dtype: object