In [1]:
import os
from glob import glob # extract path of each file
import pandas as pd # data preprocessing
from xml.etree import ElementTree as et # parse information from XML
from functools import reduce

In [2]:
import warnings
warnings.filterwarnings('ignore')

In [5]:
# step-1: get path of each xml file
xmlfiles = glob('/home/cpow/Desktop/eageleeyes/dataset/pascal/labeled_test/*.xml')
# replace \\ with /
replace_text = lambda x: x.replace('\\','/')
xmlfiles = list(map(replace_text,xmlfiles))

In [6]:
xmlfiles

['/home/cpow/Desktop/eageleeyes/dataset/pascal/labeled_test/36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.xml',
 '/home/cpow/Desktop/eageleeyes/dataset/pascal/labeled_test/KakaoTalk_20221018_001425789_07_jpg.rf.afa1e8ac8ff8cbd3a5686e56511885da.xml',
 '/home/cpow/Desktop/eageleeyes/dataset/pascal/labeled_test/feliz-administrador-de-almacen-pie-en-una-fabrica-con-casco-blanco-y-chaleco-seguridad-naranja-mirando-la-sonrisa-camara-173591370_jpg.rf.1ca9470ab4c46ac6d52ed134fb13258e.xml',
 '/home/cpow/Desktop/eageleeyes/dataset/pascal/labeled_test/WhatsApp-Image-2023-10-10-at-11-18-26_a11a66e1_jpg.rf.4aba94595630757ac4a4a4f63127c555.xml',
 '/home/cpow/Desktop/eageleeyes/dataset/pascal/labeled_test/00000206_jpg.rf.3421fc09104c3afc6e8cbb3bc5a9a8c9.xml',
 '/home/cpow/Desktop/eageleeyes/dataset/pascal/labeled_test/images-31-.rf.0211bd26e06fb898b7c48989199ceabd.xml',
 '/home/cpow/Desktop/eageleeyes/dataset/pascal/labeled_test/Video1_228_jpg.rf.a4b27b89f18bd7141f1ca2373d10e49f.xml',
 '/home/cpow/Deskt

In [7]:
# step-2: read xml files
# from each xml file we need to extract
# filename, size(width, height), object(name, xmin, xmax, ymin, ymax)
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()

    # extract filename
    image_name = root.find('filename').text
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name,xmin,xmax,ymin,ymax])
        
    return parser

In [8]:
parser_all = list(map(extract_text,xmlfiles))

In [9]:
data = reduce(lambda x, y : x+y,parser_all)

In [10]:
df = pd.DataFrame(data,columns = ['filename','width','height','name','xmin','xmax','ymin','ymax'])

In [11]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.jpg,292,494,hat,94,229,1,139
1,36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.jpg,292,494,vest,29,293,139,442
2,36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.jpg,292,494,vest,1,50,217,495
3,KakaoTalk_20221018_001425789_07_jpg.rf.afa1e8a...,640,640,hat,101,196,256,391
4,KakaoTalk_20221018_001425789_07_jpg.rf.afa1e8a...,640,640,vest,196,456,209,379


In [12]:
df.shape

(209, 8)

In [13]:
df['name'].value_counts()

name
hat        81
vest       73
no vest    31
no hat     24
Name: count, dtype: int64

In [14]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  209 non-null    object
 1   width     209 non-null    object
 2   height    209 non-null    object
 3   name      209 non-null    object
 4   xmin      209 non-null    object
 5   xmax      209 non-null    object
 6   ymin      209 non-null    object
 7   ymax      209 non-null    object
dtypes: object(8)
memory usage: 13.2+ KB


In [15]:
# type conversion
cols = ['width','height','xmin','xmax','ymin','ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 209 entries, 0 to 208
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  209 non-null    object
 1   width     209 non-null    int64 
 2   height    209 non-null    int64 
 3   name      209 non-null    object
 4   xmin      209 non-null    int64 
 5   xmax      209 non-null    int64 
 6   ymin      209 non-null    int64 
 7   ymax      209 non-null    int64 
dtypes: int64(6), object(2)
memory usage: 13.2+ KB


In [16]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w 
df['w'] = (df['xmax']-df['xmin'])/df['width']
# h 
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [17]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.jpg,292,494,hat,94,229,1,139,0.553082,0.1417,0.462329,0.279352
1,36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.jpg,292,494,vest,29,293,139,442,0.55137,0.588057,0.90411,0.61336
2,36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.jpg,292,494,vest,1,50,217,495,0.087329,0.720648,0.167808,0.562753
3,KakaoTalk_20221018_001425789_07_jpg.rf.afa1e8a...,640,640,hat,101,196,256,391,0.232031,0.505469,0.148438,0.210938
4,KakaoTalk_20221018_001425789_07_jpg.rf.afa1e8a...,640,640,vest,196,456,209,379,0.509375,0.459375,0.40625,0.265625


In [18]:
images = df['filename'].unique()

In [19]:
len(images)

44

In [20]:
# 80% train and 20% test
img_df = pd.DataFrame(images,columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename']) # shuffle and pick 80% of images

In [21]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename']) # take rest 20% images

In [22]:
len(img_train), len(img_test)

(35, 9)

In [23]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [24]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.jpg,292,494,hat,94,229,1,139,0.553082,0.1417,0.462329,0.279352
1,36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.jpg,292,494,vest,29,293,139,442,0.55137,0.588057,0.90411,0.61336
2,36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.jpg,292,494,vest,1,50,217,495,0.087329,0.720648,0.167808,0.562753
5,feliz-administrador-de-almacen-pie-en-una-fabr...,800,550,hat,288,567,1,191,0.534375,0.174545,0.34875,0.345455
6,feliz-administrador-de-almacen-pie-en-una-fabr...,800,550,vest,230,607,228,551,0.523125,0.708182,0.47125,0.587273


In [25]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
3,KakaoTalk_20221018_001425789_07_jpg.rf.afa1e8a...,640,640,hat,101,196,256,391,0.232031,0.505469,0.148438,0.210938
4,KakaoTalk_20221018_001425789_07_jpg.rf.afa1e8a...,640,640,vest,196,456,209,379,0.509375,0.459375,0.40625,0.265625
13,00000206_jpg.rf.3421fc09104c3afc6e8cbb3bc5a9a8...,256,256,hat,178,212,53,87,0.761719,0.273438,0.132812,0.132812
14,00000206_jpg.rf.3421fc09104c3afc6e8cbb3bc5a9a8...,256,256,hat,172,216,82,153,0.757812,0.458984,0.171875,0.277344
15,images-31-.rf.0211bd26e06fb898b7c48989199ceabd...,474,640,hat,4,276,1,206,0.295359,0.161719,0.57384,0.320312


In [26]:
# label encoding
def label_encoding(x):
    labels = {'hat':0, 'vest':1, 'no hat':2, 'no vest':3}
    return labels[x]

In [27]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

In [28]:
train_df.head(10)

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id
0,36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.jpg,292,494,hat,94,229,1,139,0.553082,0.1417,0.462329,0.279352,0
1,36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.jpg,292,494,vest,29,293,139,442,0.55137,0.588057,0.90411,0.61336,1
2,36_jpg.rf.e4757f0ab57054029ade0bb5d666673a.jpg,292,494,vest,1,50,217,495,0.087329,0.720648,0.167808,0.562753,1
5,feliz-administrador-de-almacen-pie-en-una-fabr...,800,550,hat,288,567,1,191,0.534375,0.174545,0.34875,0.345455,0
6,feliz-administrador-de-almacen-pie-en-una-fabr...,800,550,vest,230,607,228,551,0.523125,0.708182,0.47125,0.587273,1
7,WhatsApp-Image-2023-10-10-at-11-18-26_a11a66e1...,1600,1200,no hat,1026,1094,476,534,0.6625,0.420833,0.0425,0.048333,2
8,WhatsApp-Image-2023-10-10-at-11-18-26_a11a66e1...,1600,1200,no vest,979,1054,521,669,0.635312,0.495833,0.046875,0.123333,3
9,WhatsApp-Image-2023-10-10-at-11-18-26_a11a66e1...,1600,1200,hat,1491,1601,404,494,0.96625,0.374167,0.06875,0.075,0
10,WhatsApp-Image-2023-10-10-at-11-18-26_a11a66e1...,1600,1200,hat,1428,1494,454,523,0.913125,0.407083,0.04125,0.0575,0
11,WhatsApp-Image-2023-10-10-at-11-18-26_a11a66e1...,1600,1200,vest,1404,1489,511,654,0.904062,0.485417,0.053125,0.119167,1


In [29]:
import os
from shutil import move

In [30]:
train_folder = '/home/cpow/Desktop/eageleeyes/dataset/pascal/train'
test_folder = '/home/cpow/Desktop/eageleeyes/dataset/pascal/test'


os.mkdir(train_folder)
os.mkdir(test_folder)

FileExistsError: [Errno 17] File exists: '/home/cpow/Desktop/eageleeyes/dataset/pascal/train'

In [31]:
cols = ['filename','id','center_x','center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [32]:
#groupby_obj_train.get_group('000009.jpg').set_index('filename').to_csv('sample.txt',index=False,header=False)
# save each image in train/test folder and repective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('/home/cpow/Desktop/eageleeyes/dataset/pascal/labeled_test',filename)
    dst = os.path.join(folder_path,filename)
    move(src,dst) # move image to the destination folder
    
    # save the labels
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename,sep=' ',index=False,header=False)
    

In [33]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [34]:
filename_series.apply(save_data,args=(train_folder,groupby_obj_train))

0     None
1     None
2     None
3     None
4     None
5     None
6     None
7     None
8     None
9     None
10    None
11    None
12    None
13    None
14    None
15    None
16    None
17    None
18    None
19    None
20    None
21    None
22    None
23    None
24    None
25    None
26    None
27    None
28    None
29    None
30    None
31    None
32    None
33    None
34    None
dtype: object

In [35]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data,args=(test_folder,groupby_obj_test))

0    None
1    None
2    None
3    None
4    None
5    None
6    None
7    None
8    None
dtype: object