In [13]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [14]:
# load all xml files and store in a list
xml_list = glob('./Data_Images/*.xml')
# dat cleaning. replace \\ with /
xml_list = list(map(lambda x: x.replace('\\','/'),xml_list))

In [15]:
xml_list

['./Data_Images/1551.xml',
 './Data_Images/1552.xml',
 './Data_Images/1553.xml',
 './Data_Images/1554.xml',
 './Data_Images/1555.xml',
 './Data_Images/1556.xml',
 './Data_Images/1557.xml',
 './Data_Images/1558.xml',
 './Data_Images/1559.xml',
 './Data_Images/1560.xml',
 './Data_Images/1561.xml',
 './Data_Images/1562.xml',
 './Data_Images/1563.xml',
 './Data_Images/1564.xml',
 './Data_Images/1565.xml',
 './Data_Images/1566.xml',
 './Data_Images/1567.xml',
 './Data_Images/1568.xml',
 './Data_Images/1569.xml',
 './Data_Images/1570.xml',
 './Data_Images/1571.xml',
 './Data_Images/1572.xml',
 './Data_Images/1573.xml',
 './Data_Images/1574.xml',
 './Data_Images/1575.xml',
 './Data_Images/1576.xml',
 './Data_Images/1577.xml',
 './Data_Images/1578.xml',
 './Data_Images/1579.xml',
 './Data_Images/1580.xml',
 './Data_Images/1581.xml',
 './Data_Images/1582.xml',
 './Data_Images/1583.xml',
 './Data_Images/1584.xml',
 './Data_Images/1585.xml',
 './Data_Images/1586.xml',
 './Data_Images/1587.xml',
 

In [16]:
# read xml files
# from each xml file we need to extract 
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_info(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    
    # extract file name
    image_name = root.find('filename').text
    #width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objects = root.findall('object')
    parser = []
    for obj in objects:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])
    
    return parser

In [17]:
parser_all = list(map(extract_info,xml_list))

data = reduce(lambda x, y : x+y, parser_all)

df = pd.DataFrame(data,columns = ['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])

df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,1551.jpg,5184,3456,dog,918,3065,138,3364
1,1552.jpg,1851,2780,dog,277,908,1285,2585
2,1552.jpg,1851,2780,dog,873,1555,1198,2633
3,1553.jpg,2568,3876,dog,189,2383,843,3631
4,1554.jpg,2744,4049,dog,366,2410,893,3661


In [18]:
df['name'].value_counts()

name
person    212
dog       102
Name: count, dtype: int64

In [19]:
#type conversion
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 314 entries, 0 to 313
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  314 non-null    object
 1   width     314 non-null    int32 
 2   height    314 non-null    int32 
 3   name      314 non-null    object
 4   xmin      314 non-null    int32 
 5   xmax      314 non-null    int32 
 6   ymin      314 non-null    int32 
 7   ymax      314 non-null    int32 
dtypes: int32(6), object(2)
memory usage: 12.4+ KB


In [20]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [21]:
# split data into train and split
images = df['filename'].unique()

In [22]:
len(images)

199

In [23]:
# 89% train and 20% test
img_df = pd.DataFrame(images, columns=['filename'])
img_train = tuple(img_df.sample(frac = 0.8)['filename'])

In [24]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename'])

In [25]:
len(img_test), len(img_train)

(40, 159)

In [26]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [27]:
# label encoding
def label_encoding(x):
    labels = {'person' : 0, 'dog' : 1} 
    return labels[x]

In [32]:
train_df.loc[:, 'id'] = train_df['name'].apply(label_encoding)
test_df.loc[:, 'id'] = test_df['name'].apply(label_encoding)

In [33]:
import os
from shutil import move

In [34]:
train_folder = 'Data_Images/train'
test_folder ='Data_Images/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

FileExistsError: [WinError 183] Cannot create a file when that file already exists: 'Data_Images/train'

In [35]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [36]:
#groupby_obj_train.get_group('000009.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    src = os.path.join('Data_Images', filename)
    dst = os.path.join(folder_path, filename)
    
    try:
        move(src, dst)  # move image to the destination folder
        
        # Save the labels
        text_filename = os.path.join(folder_path, os.path.splitext(filename)[0] + '.txt')
        group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)
        
    except FileNotFoundError as e:
        # Handle the case where the file is not found
        print(f"File '{filename}' not found. Skipping...")
        print(e)  # Optionally, you can print or log the error message for debugging purposes
        # You can add further handling here if needed, such as logging the error or skipping the file.
        pass  # If you want to skip the file silently

In [37]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0      None
1      None
2      None
3      None
4      None
       ... 
154    None
155    None
156    None
157    None
158    None
Length: 159, dtype: object

In [38]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
35    None
36    None
37    None
38    None
39    None
dtype: object