In [2]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

In [33]:
# Load all xml files and store in a list
xml_list = glob('.\data_images\*.xml')

# Data Cleaning - replace // with \
xml_files = list(map(lambda x: x.replace('\\','/'),xml_list))

In [34]:
# step 2- read xml files
# from each xml file we need to extract
# filename, size(width,height), object(name,xmin,xmax,ymin,ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text

    # extract width and height of image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        [name,xmin,xmax,ymin,ymax]
        parser.append([image_name,width,height,name,xmin,xmax,ymin,ymax])
    return parser


In [44]:
parser_all = list(map(extract_text,xml_files))
data = reduce(lambda x,y: x+y,parser_all)
df = pd.DataFrame(data, columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])
df.shape
df.head()
df['name'].value_counts()

car           65
semi_truck    43
airplane      39
motorcycle    33
Name: name, dtype: int64

In [45]:
df.info()
# create x_center and y_center for each bndbox


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 180 entries, 0 to 179
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  180 non-null    object
 1   width     180 non-null    object
 2   height    180 non-null    object
 3   name      180 non-null    object
 4   xmin      180 non-null    object
 5   xmax      180 non-null    object
 6   ymin      180 non-null    object
 7   ymax      180 non-null    object
dtypes: object(8)
memory usage: 11.4+ KB


In [46]:

# create x_center and y_center for each bndbox

cols = ['width','height','xmin','xmax','ymin','ymax']

df[cols] = df[cols].astype(int)

In [49]:
# center x, center y
df['center_x'] = ((df['xmin']+df['xmax'])/2)/df['width']
df['center_y'] = ((df['ymin']+df['ymax'])/2)/df['height']
df['w'] = (df['xmax']-df['xmin'])/df['width']
df['h'] = (df['ymax']-df['ymin'])/df['height']
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,00001.jpg,1600,900,semi_truck,213,1037,150,796,0.390625,0.525556,0.515,0.717778
1,00002.jpg,1536,1024,car,9,1434,173,824,0.469727,0.486816,0.927734,0.635742
2,00003.jpg,1365,2048,car,817,1252,1110,1477,0.757875,0.631592,0.318681,0.179199
3,00003.jpg,1365,2048,car,622,853,1075,1262,0.540293,0.570557,0.169231,0.091309
4,00003.jpg,1365,2048,car,334,427,1084,1175,0.278755,0.551514,0.068132,0.044434


## Split data into train and test

In [52]:
images = df['filename'].unique()
len(images)

# 80% train and 20% test


135

In [57]:
img_df = pd.DataFrame(images,columns = ['filename'])
img_df.head()
img_train = tuple(img_df.sample(frac = 0.8)['filename']) # shuffle and pick 80% of the images

In [61]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename'])

(108, 27)

In [64]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,00001.jpg,1600,900,semi_truck,213,1037,150,796,0.390625,0.525556,0.515,0.717778
1,00002.jpg,1536,1024,car,9,1434,173,824,0.469727,0.486816,0.927734,0.635742
2,00003.jpg,1365,2048,car,817,1252,1110,1477,0.757875,0.631592,0.318681,0.179199
3,00003.jpg,1365,2048,car,622,853,1075,1262,0.540293,0.570557,0.169231,0.091309
4,00003.jpg,1365,2048,car,334,427,1084,1175,0.278755,0.551514,0.068132,0.044434


In [67]:
# label encoding
def label_encoding(x):
    labels = {
        'car':0, 'semi_truck':1, 'airplane':2, 'motorcycle':3
    }
    return labels[x]

In [69]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)
train_df.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,00001.jpg,1600,900,semi_truck,213,1037,150,796,0.390625,0.525556,0.515,0.717778,1
1,00002.jpg,1536,1024,car,9,1434,173,824,0.469727,0.486816,0.927734,0.635742,0
2,00003.jpg,1365,2048,car,817,1252,1110,1477,0.757875,0.631592,0.318681,0.179199,0
3,00003.jpg,1365,2048,car,622,853,1075,1262,0.540293,0.570557,0.169231,0.091309,0
4,00003.jpg,1365,2048,car,334,427,1084,1175,0.278755,0.551514,0.068132,0.044434,0
5,00003.jpg,1365,2048,car,508,636,1070,1198,0.419048,0.553711,0.093773,0.0625,0
6,00004.jpg,800,600,car,0,771,194,565,0.481875,0.6325,0.96375,0.618333,0
7,00005.jpg,610,417,car,11,600,90,393,0.50082,0.579137,0.965574,0.726619,0
8,00006.jpg,1536,1024,car,237,1261,393,749,0.48763,0.557617,0.666667,0.347656,0
9,00007.jpg,800,600,car,57,755,161,432,0.5075,0.494167,0.8725,0.451667,0


## Save image and labels in text

In [70]:
import os
from shutil import move

In [76]:
train_folder = 'data_images/train'
test_folder = 'data_images/test'

#os.mkdir(train_folder)
#os.mkdir(test_folder)

In [94]:
cols = ['filename','id','center_x','center_y','w','h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

# groupby_obj_train.get_group('00001.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)

In [99]:
# save each img in train/test folder

def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('data_images',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move imgs to dst folder
    
    # save the label info
    text_filename = os.path.join(folder_path,
        os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)
    

In [100]:
filename_series = pd.Series(groupby_obj_train.groups.keys())
filename_series
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0      00001.jpg
1      00002.jpg
2      00003.jpg
3      00004.jpg
4      00005.jpg
         ...    
103    00131.jpg
104    00132.jpg
105    00133.jpg
106    00134.jpg
107    00135.jpg
Length: 108, dtype: object

In [104]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
dtype: object