In [5]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [6]:
# Collect every annotation XML under ./data_images.
# glob yields paths in filesystem-dependent order, so the list is sorted to make
# the listing (and everything derived from it) reproducible. Windows
# back-slashes are normalised to forward slashes.
xml_list = sorted(path.replace('\\', '/') for path in glob('./data_images/*.xml'))

In [7]:
# step-2: read xml files
def extract_text(filename):
    """Parse one annotation XML into a list of per-object rows.

    Parameters
    ----------
    filename : path or file object
        Anything accepted by ``ElementTree.parse``.

    Returns
    -------
    list of list of str
        One row per ``<object>`` element, laid out as
        ``[image_name, width, height, name, xmin, xmax, ymin, ymax]``.
        Every value is the raw element text (no numeric conversion).
        The list is empty when the file contains no ``<object>``.
    """
    root = et.parse(filename).getroot()

    # image-level fields, shared by every row of this file
    image_name = root.find('filename').text
    size = root.find('size')
    width = size.find('width').text
    height = size.find('height').text

    def _row(obj):
        # class name first, then the four box corners in output order
        name = obj.find('name').text
        box = obj.find('bndbox')
        xmin, xmax, ymin, ymax = [box.find(tag).text
                                  for tag in ('xmin', 'xmax', 'ymin', 'ymax')]
        return [image_name, width, height, name, xmin, xmax, ymin, ymax]

    return [_row(obj) for obj in root.findall('object')]

In [None]:
# Parse every annotation file; each entry is the list of rows for one XML file.
parser_all = [extract_text(xml_path) for xml_path in xml_list]

In [None]:
# Inspect the parsed result: a list with one entry (a list of rows) per XML file.
parser_all

In [None]:
# Flatten the per-file row lists into one list of rows.
# A comprehension runs in linear time and yields [] when no XML files were
# found, whereas reduce() without an initial value raises TypeError on an
# empty sequence.
data = [row for rows in parser_all for row in rows]

In [None]:
# Inspect the flattened rows (one row per annotated object).
data

In [None]:
# Tabulate the rows; the column order mirrors the row layout built by extract_text.
df = pd.DataFrame(
    data,
    columns=[
        'filename', 'width', 'height',
        'name',
        'xmin', 'xmax', 'ymin', 'ymax',
    ],
)

In [None]:
# (rows, columns) of the annotation table: one row per object, eight columns.
df.shape

In [None]:
# Number of annotated objects per class name.
df['name'].value_counts()

In [None]:
# The XML values were read as text; cast the numeric columns to integers.
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df = df.astype({col: int for col in cols})
df.info()

In [None]:
# Box centre, expressed as a fraction of the image width / height.
df['centerx'] = df['xmin'].add(df['xmax']).div(2).div(df['width'])
df['centery'] = df['ymin'].add(df['ymax']).div(2).div(df['height'])
# Box width and height, likewise as fractions of the image size.
df['w'] = df['xmax'].sub(df['xmin']).div(df['width'])
df['h'] = df['ymax'].sub(df['ymin']).div(df['height'])

In [None]:
# Re-check column dtypes now that centerx, centery, w and h have been added.
df.info()

In [None]:
# Preview the first rows of the annotation table.
df.head()

In [None]:
# Distinct image filenames, in order of first appearance.
images = pd.unique(df['filename'])

In [None]:
# Number of distinct images that have at least one annotated object.
len(images)

In [None]:
# 80% of the images go to the training split, the remaining 20% to the test split.
img_df = pd.DataFrame(images, columns=['filename'])
# sample() shuffles and picks 80% of the images; random_state is fixed so the
# same images are selected on every run (an unseeded sample gives a different
# split each time the cell is executed).
img_train = tuple(img_df.sample(frac=0.8, random_state=42)['filename'])

In [None]:
# Test images are those not chosen for training. A boolean mask via isin()
# replaces the query string built from the tuple's repr, which is fragile
# for empty tuples or file names containing quotes.
img_test = tuple(img_df.loc[~img_df['filename'].isin(img_train), 'filename'])

In [None]:
# Number of images in the training and test splits.
len(img_train),len(img_test)

In [None]:
# Select annotation rows by image membership. isin() avoids building a query
# string from the tuple's repr, and .copy() makes each split an independent
# frame so that columns can be added to it later without a
# SettingWithCopyWarning.
train_df = df[df['filename'].isin(img_train)].copy()
test_df = df[df['filename'].isin(img_test)].copy()

In [None]:
# Training on text labels is not recommended, so class names are label-encoded.
# The position of a name in this tuple is its integer id. The lookup table is
# built once here instead of on every call (the function runs once per row).
_CLASS_NAMES = (
    'person', 'car', 'chair', 'bottle', 'pottedplant', 'bird', 'dog',
    'sofa', 'bicycle', 'horse', 'boat', 'motorbike', 'cat', 'tvmonitor',
    'cow', 'sheep', 'aeroplane', 'train', 'diningtable', 'bus',
)
_LABEL_IDS = {class_name: idx for idx, class_name in enumerate(_CLASS_NAMES)}


def label_encoding(x):
    """Return the integer id (0-19) of class name ``x``.

    Raises
    ------
    KeyError
        If ``x`` is not one of the known class names.
    """
    return _LABEL_IDS[x]

In [None]:
# Attach the integer class id to every row. assign() returns a new frame, so
# nothing is written into a filtered slice of df (which would raise a
# SettingWithCopyWarning).
train_df = train_df.assign(id=train_df['name'].apply(label_encoding))
test_df = test_df.assign(id=test_df['name'].apply(label_encoding))

In [None]:
# Inspect the training rows, which now carry the integer 'id' column.
train_df

In [None]:
import os
from shutil import move

In [None]:
# Destination folders for the two splits.
train_folder = 'data_images/train'
test_folder  = 'data_images/test'

# exist_ok=True keeps this cell re-runnable: os.mkdir raises FileExistsError
# once the folders are there, and fails if the parent folder is missing.
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

In [None]:
# Keep only the columns written to the label files (plus the grouping key),
# then group the rows of each split by image.
cols = ['filename', 'id', 'centerx', 'centery', 'w', 'h']
groupby_obj_train = train_df.loc[:, cols].groupby(by='filename')
groupby_obj_test = test_df.loc[:, cols].groupby(by='filename')

In [None]:
# Preview the label rows of one training image. The image is taken from the
# groups themselves rather than a hard-coded file name, because the random
# split does not guarantee that any particular file lands in the training set
# (a missing key makes get_group raise KeyError).
groupby_obj_train.get_group(next(iter(groupby_obj_train.groups))).set_index('filename')

In [None]:
# Save each image into its train/test folder together with its labels (.txt).
def save_data(filename, folder_path, group_obj, src_folder='data_images'):
    """Write the label file for one image, then move the image next to it.

    Parameters
    ----------
    filename : str
        Image file name; also the group key looked up in ``group_obj``.
    folder_path : str
        Destination folder for both the image and its ``.txt`` label file.
    group_obj : pandas groupby object
        Rows grouped by ``'filename'``; the group for ``filename`` holds the
        label rows of that image.
    src_folder : str, optional
        Folder the image currently lives in. Defaults to ``'data_images'``.

    Raises
    ------
    KeyError
        If ``filename`` is not a group of ``group_obj``.
    """
    # Look the labels up and write them BEFORE touching the image: if the
    # lookup or the write fails, the image is still in its original place and
    # the call can simply be repeated.
    labels = group_obj.get_group(filename).set_index('filename')

    stem = os.path.splitext(filename)[0]
    text_filename = os.path.join(folder_path, stem + '.txt')
    # space-separated, no header; the filename index is not written
    labels.to_csv(text_filename, sep=' ', index=False, header=False)

    # move the image to the destination folder
    src = os.path.join(src_folder, filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst)
    

In [4]:
# Build a Series of the image filenames that key the groups of groupby_obj_train,
# then call save_data once per filename (apply is used here for its side effects).
# NOTE(review): the output recorded for this cell is a NameError, i.e. it was run
# before groupby_obj_train was defined; execute the notebook top to bottom.
filename_series = pd.Series(groupby_obj_train.groups.keys())
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

NameError: name 'groupby_obj_train' is not defined