In [6]:
import os
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

## XML File process
* Load all xml files and store them in a list
* Read xml files

In [88]:
xml_files = glob('datasets/Annotation/*/*.xml')

In [95]:
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    
    # extract filename
    image_name = f"datasets/Images/n{root.find('folder').text}-{root.find('object').find('name').text}/{root.find('filename').text}.jpg"
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])
        
    return parser

In [96]:
parser_all = list(map(extract_text, xml_files))

In [97]:
data = reduce(lambda x, y : x+y, parser_all)

In [98]:
df = pd.DataFrame(data, columns= ['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])

In [99]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,datasets/Images/n02097658-silky_terrier/n02097...,500,375,silky_terrier,82,378,7,355
1,datasets/Images/n02097658-silky_terrier/n02097...,375,500,silky_terrier,43,321,270,498
2,datasets/Images/n02097658-silky_terrier/n02097...,448,400,silky_terrier,41,296,30,398
3,datasets/Images/n02097658-silky_terrier/n02097...,500,333,silky_terrier,146,356,10,332
4,datasets/Images/n02097658-silky_terrier/n02097...,333,500,silky_terrier,0,331,12,498


In [101]:
df.shape

(22126, 8)

In [102]:
df['name'].value_counts()

name
Afghan_hound          287
Maltese_dog           264
Irish_wolfhound       263
Leonberg              256
Scottish_deerhound    246
                     ... 
Sussex_spaniel        153
malinois              153
Pekinese              152
groenendael           152
redbone               151
Name: count, Length: 120, dtype: int64

In [103]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22126 entries, 0 to 22125
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  22126 non-null  object
 1   width     22126 non-null  object
 2   height    22126 non-null  object
 3   name      22126 non-null  object
 4   xmin      22126 non-null  object
 5   xmax      22126 non-null  object
 6   ymin      22126 non-null  object
 7   ymax      22126 non-null  object
dtypes: object(8)
memory usage: 1.4+ MB


---
## Conversion

In [104]:
# type conversion
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22126 entries, 0 to 22125
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  22126 non-null  object
 1   width     22126 non-null  int64 
 2   height    22126 non-null  int64 
 3   name      22126 non-null  object
 4   xmin      22126 non-null  int64 
 5   xmax      22126 non-null  int64 
 6   ymin      22126 non-null  int64 
 7   ymax      22126 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 1.4+ MB


In [105]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w & h
df['w'] = (df['xmax']-df['xmin'])/df['width']
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [106]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,datasets/Images/n02097658-silky_terrier/n02097...,500,375,silky_terrier,82,378,7,355,0.46,0.482667,0.592,0.928
1,datasets/Images/n02097658-silky_terrier/n02097...,375,500,silky_terrier,43,321,270,498,0.485333,0.768,0.741333,0.456
2,datasets/Images/n02097658-silky_terrier/n02097...,448,400,silky_terrier,41,296,30,398,0.376116,0.535,0.569196,0.92
3,datasets/Images/n02097658-silky_terrier/n02097...,500,333,silky_terrier,146,356,10,332,0.502,0.513514,0.42,0.966967
4,datasets/Images/n02097658-silky_terrier/n02097...,333,500,silky_terrier,0,331,12,498,0.496997,0.51,0.993994,0.972


---
## Split data into train and test

In [107]:
images = df['filename'].unique()

In [108]:
len(images)

19960

In [109]:
# 80% train and 20% test
img_df = pd.DataFrame(images, columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename'])

In [110]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename'])

In [111]:
len(img_train), len(img_test)

(15968, 3992)

In [112]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [113]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,datasets/Images/n02097658-silky_terrier/n02097...,500,375,silky_terrier,82,378,7,355,0.46,0.482667,0.592,0.928
1,datasets/Images/n02097658-silky_terrier/n02097...,375,500,silky_terrier,43,321,270,498,0.485333,0.768,0.741333,0.456
2,datasets/Images/n02097658-silky_terrier/n02097...,448,400,silky_terrier,41,296,30,398,0.376116,0.535,0.569196,0.92
3,datasets/Images/n02097658-silky_terrier/n02097...,500,333,silky_terrier,146,356,10,332,0.502,0.513514,0.42,0.966967
4,datasets/Images/n02097658-silky_terrier/n02097...,333,500,silky_terrier,0,331,12,498,0.496997,0.51,0.993994,0.972


In [114]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
11,datasets/Images/n02097658-silky_terrier/n02097...,500,375,silky_terrier,116,454,2,364,0.57,0.488,0.676,0.965333
16,datasets/Images/n02097658-silky_terrier/n02097...,375,500,silky_terrier,10,321,10,480,0.441333,0.49,0.829333,0.94
17,datasets/Images/n02097658-silky_terrier/n02097...,500,375,silky_terrier,111,371,24,289,0.482,0.417333,0.52,0.706667
18,datasets/Images/n02097658-silky_terrier/n02097...,500,400,silky_terrier,172,420,20,399,0.592,0.52375,0.496,0.9475
26,datasets/Images/n02097658-silky_terrier/n02097...,500,375,silky_terrier,57,432,6,346,0.489,0.469333,0.75,0.906667


## Assign Id number to object names

In [141]:
# label encoding
def get_second_part_of_folder_names(path):
    folder_names = []
    # Parcourir tous les fichiers et dossiers dans le répertoire courant
    for item in os.listdir(path):
        full_path = os.path.join(path, item)
        # Vérifier si l'item est un dossier
        if os.path.isdir(full_path):
            # Splitter le nom du dossier sur le tiret
            parts = item.split('-')            

            # Vérifier si le dossier a un nom avec un tiret et ajouter la deuxième partie
            if len(parts) > 1:
                folder_names.append(parts[1])

    return tuple(folder_names)

In [145]:
directory_path = 'datasets/Annotation/'
items = get_second_part_of_folder_names(directory_path)
string = "{"
for i, item in enumerate(items):
    string += f"\'{item}\':{i}, "
    
string += '}'
print(string)

{'silky_terrier':0, 'Scottish_deerhound':1, 'Chesapeake_Bay_retriever':2, 'Ibizan_hound':3, 'wire':4, 'Saluki':5, 'cocker_spaniel':6, 'schipperke':7, 'borzoi':8, 'Pembroke':9, 'komondor':10, 'Staffordshire_bullterrier':11, 'standard_poodle':12, 'Eskimo_dog':13, 'English_foxhound':14, 'golden_retriever':15, 'Sealyham_terrier':16, 'Japanese_spaniel':17, 'miniature_schnauzer':18, 'malamute':19, 'malinois':20, 'Pekinese':21, 'giant_schnauzer':22, 'Mexican_hairless':23, 'Doberman':24, 'standard_schnauzer':25, 'dhole':26, 'German_shepherd':27, 'Bouvier_des_Flandres':28, 'Siberian_husky':29, 'Norwich_terrier':30, 'Irish_terrier':31, 'Norfolk_terrier':32, 'Saint_Bernard':33, 'Border_terrier':34, 'briard':35, 'Tibetan_mastiff':36, 'bull_mastiff':37, 'Maltese_dog':38, 'Kerry_blue_terrier':39, 'kuvasz':40, 'Greater_Swiss_Mountain_dog':41, 'Lakeland_terrier':42, 'Blenheim_spaniel':43, 'basset':44, 'West_Highland_white_terrier':45, 'Chihuahua':46, 'Border_collie':47, 'redbone':48, 'Irish_wolfhound'

In [146]:
def label_encoding(x):
    labels = {'silky_terrier':0, 'Scottish_deerhound':1, 'Chesapeake_Bay_retriever':2, 'Ibizan_hound':3,'wire':4, 'Saluki':5, 'cocker_spaniel':6, 'schipperke':7, 'borzoi':8, 'Pembroke':9, 'komondor':10, 'Staffordshire_bullterrier':11, 'standard_poodle':12, 'Eskimo_dog':13, 'English_foxhound':14, 'golden_retriever':15, 'Sealyham_terrier':16, 'Japanese_spaniel':17, 'miniature_schnauzer':18, 'malamute':19, 'malinois':20, 'Pekinese':21, 'giant_schnauzer':22, 'Mexican_hairless':23, 'Doberman':24, 'standard_schnauzer':25, 'dhole':26, 'German_shepherd':27, 'Bouvier_des_Flandres':28, 'Siberian_husky':29, 'Norwich_terrier':30, 'Irish_terrier':31, 'Norfolk_terrier':32, 'Saint_Bernard':33, 'Border_terrier':34, 'briard':35, 'Tibetan_mastiff':36, 'bull_mastiff':37, 'Maltese_dog':38, 'Kerry_blue_terrier':39, 'kuvasz':40, 'Greater_Swiss_Mountain_dog':41, 'Lakeland_terrier':42, 'Blenheim_spaniel':43, 'basset':44, 'West_Highland_white_terrier':45, 'Chihuahua':46, 'Border_collie':47, 'redbone':48, 'Irish_wolfhound':49, 'bluetick':50, 'miniature_poodle':51, 'Cardigan':52, 'EntleBucher':53, 'Norwegian_elkhound':54, 'German_short':55, 'Bernese_mountain_dog':56, 'papillon':57, 'Tibetan_terrier':58, 'Gordon_setter':59, 'American_Staffordshire_terrier':60, 'vizsla':61, 'kelpie':62, 'Weimaraner':63, 'miniature_pinscher':64, 'boxer':65, 'chow':66, 'Old_English_sheepdog':67, 'pug':68, 'Rhodesian_ridgeback':69, 'Scotch_terrier':70, 'Shih':71, 'affenpinscher':72, 'whippet':73, 'Sussex_spaniel':74, 'otterhound':75, 'flat':76, 'English_setter':77, 'Italian_greyhound':78, 'Labrador_retriever':79, 'collie':80, 'cairn':81, 'Rottweiler':82, 'Australian_terrier':83, 'toy_terrier':84, 'Shetland_sheepdog':85, 'African_hunting_dog':86, 'Newfoundland':87, 'Walker_hound':88, 'Lhasa':89, 'beagle':90, 'Samoyed':91, 'Great_Dane':92, 'Airedale':93, 'bloodhound':94, 'Irish_setter':95, 'keeshond':96, 'Dandie_Dinmont':97, 'basenji':98, 'Bedlington_terrier':99, 'Appenzeller':100, 'clumber':101, 'toy_poodle':102, 'Great_Pyrenees':103, 'English_springer':104, 'Afghan_hound':105, 'Brittany_spaniel':106, 'Welsh_springer_spaniel':107, 'Boston_bull':108, 'dingo':109, 'soft':110, 'curly':111, 'French_bulldog':112, 'Irish_water_spaniel':113, 'Pomeranian':114, 'Brabancon_griffon':115, 'Yorkshire_terrier':116, 'groenendael':117, 'Leonberg':118, 'black':119}
    
    return labels[x]

In [148]:
label_encoding('Lhasa')

89