In [76]:
import os
from functools import reduce
from glob import glob
from shutil import move
from xml.etree import ElementTree as et

import pandas as pd

In [77]:
xml_list = glob('./Imagens/*.xml')
xml_list = list(map(lambda x: x.replace('\\','/'),xml_list))

In [78]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])
        
    return parser

In [79]:
parser_all = list(map(extract_text,xml_list))
data = reduce(lambda x, y : x+y,parser_all)

In [80]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,1.jpg,1280,720,Livre,263,352,58,97
1,1.jpg,1280,720,Livre,291,366,91,126
2,1.jpg,1280,720,Livre,306,388,124,168
3,1.jpg,1280,720,Livre,326,442,167,206
4,1.jpg,1280,720,Livre,327,464,204,258


In [81]:
df['name'].value_counts()

name
Ocupado         106
Livre            89
Fora_da_vaga      9
Name: count, dtype: int64

In [82]:
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 204 entries, 0 to 203
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  204 non-null    object
 1   width     204 non-null    int32 
 2   height    204 non-null    int32 
 3   name      204 non-null    object
 4   xmin      204 non-null    int32 
 5   xmax      204 non-null    int32 
 6   ymin      204 non-null    int32 
 7   ymax      204 non-null    int32 
dtypes: int32(6), object(2)
memory usage: 8.1+ KB


In [83]:
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']

df['w'] = (df['xmax']-df['xmin'])/df['width']

df['h'] = (df['ymax']-df['ymin'])/df['height']

In [84]:
images = df['filename'].unique()

In [85]:
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename'])

In [86]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [87]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [88]:
def label_encoding(x):
    labels = {'Livre':0, 'Ocupado':1, 'Fora_da_vaga':2}
    return labels[x]

In [89]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = test_df['name'].apply(label_encoding)


In [90]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [91]:
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('Imagens',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst)
    
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)

In [92]:
train_folder = 'folders/train'
test_folder = 'folders/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

In [93]:
filename_series = pd.Series(groupby_obj_train.groups.keys())
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0    None
1    None
2    None
3    None
4    None
dtype: object

In [94]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0    None
dtype: object