In [6]:
from glob import glob
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et

## XML File process
* Load all xml files and store them in a list
* Read xml files

In [326]:
import os
import shutil

def move_files_to_parent_dir(parent_dir, sub_dirs):
    for sub_dir in sub_dirs:
        sub_dir_path = os.path.join(parent_dir, sub_dir)

        # Vérifier si le sous-dossier existe
        if os.path.exists(sub_dir_path):
            for filename in os.listdir(sub_dir_path):
                src_path = os.path.join(sub_dir_path, filename)

                # Vérifier si c'est un fichier avant de le déplacer
                if os.path.isfile(src_path):
                    dst_path = os.path.join(parent_dir, filename)
                    shutil.move(src_path, dst_path)
                elif os.path.isdir(src_path):
                    # Si c'est un sous-dossier, répéter le processus récursivement
                    move_files_to_parent_dir(parent_dir, [os.path.join(sub_dir, filename)])

            # Optionnel : Supprimer le sous-dossier maintenant vide
            os.rmdir(sub_dir_path)

parent_directory = 'datasets'
sub_directories = ['Annotation', 'Images']

move_files_to_parent_dir(parent_directory, sub_directories)


In [327]:
def unknown_filename_to_title(file_path):
    # Extraire le nom de base du fichier (sans l'extension)
    nom_base = os.path.basename(file_path)
    nom_sans_extension = os.path.splitext(nom_base)[0]

    # Load the XML file
    arbre = et.parse(file_path)
    racine = arbre.getroot()

    modification_apportee = False

    # Search every <filename> and replace %s by the file of the name
    for filename in racine.iter('filename'):
        if filename.text == '%s':
            filename.text = nom_sans_extension
            modification_apportee = True

    # Save the modified XML file
    if modification_apportee:
        arbre.write(file_path)

def treat_all(folder):
    for file in os.listdir(folder):
        if file.endswith('.xml'):
            full_path = os.path.join(folder, file)
            unknown_filename_to_title(full_path)

# Exemple d'utilisation
dossier_datasets = 'datasets'
treat_all(dossier_datasets)


In [328]:
xml_files = glob('datasets/*.xml')

In [329]:
def extract_text(filename):
    tree = et.parse(filename)
    root = tree.getroot()
    
    # extract filename
    image_name = f"{root.find('filename').text}.jpg"
    # width and height of the image
    width = root.find('size').find('width').text
    height = root.find('size').find('height').text
    objs = root.findall('object')
    parser = []
    for obj in objs:
        name = obj.find('name').text
        bndbox = obj.find('bndbox')
        xmin = bndbox.find('xmin').text
        xmax = bndbox.find('xmax').text
        ymin = bndbox.find('ymin').text
        ymax = bndbox.find('ymax').text
        parser.append([image_name, width, height, name, xmin, xmax, ymin, ymax])
        
    return parser

In [330]:
parser_all = list(map(extract_text, xml_files))

In [331]:
data = reduce(lambda x, y : x+y, parser_all)

In [332]:
df = pd.DataFrame(data, columns= ['filename', 'width', 'height', 'name', 'xmin', 'xmax', 'ymin', 'ymax'])

In [333]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax
0,n02094433_5155.jpg,500,333,Yorkshire_terrier,123,237,80,280
1,n02109047_5588.jpg,333,500,Great_Dane,75,244,141,405
2,n02104365_6395.jpg,500,376,schipperke,27,448,42,365
3,n02108000_2653.jpg,450,450,EntleBucher,90,372,56,429
4,n02088466_6712.jpg,375,500,bloodhound,56,350,88,412


In [334]:
df.shape

(22126, 8)

In [335]:
df['name'].value_counts()

name
Afghan_hound          287
Maltese_dog           264
Irish_wolfhound       263
Leonberg              256
Scottish_deerhound    246
                     ... 
affenpinscher         153
malinois              153
Pekinese              152
groenendael           152
redbone               151
Name: count, Length: 120, dtype: int64

In [336]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22126 entries, 0 to 22125
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  22126 non-null  object
 1   width     22126 non-null  object
 2   height    22126 non-null  object
 3   name      22126 non-null  object
 4   xmin      22126 non-null  object
 5   xmax      22126 non-null  object
 6   ymin      22126 non-null  object
 7   ymax      22126 non-null  object
dtypes: object(8)
memory usage: 1.4+ MB


---
## Conversion

In [337]:
# type conversion
cols = ['width', 'height', 'xmin', 'xmax', 'ymin', 'ymax']
df[cols] = df[cols].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 22126 entries, 0 to 22125
Data columns (total 8 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   filename  22126 non-null  object
 1   width     22126 non-null  int64 
 2   height    22126 non-null  int64 
 3   name      22126 non-null  object
 4   xmin      22126 non-null  int64 
 5   xmax      22126 non-null  int64 
 6   ymin      22126 non-null  int64 
 7   ymax      22126 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 1.4+ MB


In [338]:
# center x, center y
df['center_x'] = ((df['xmax']+df['xmin'])/2)/df['width']
df['center_y'] = ((df['ymax']+df['ymin'])/2)/df['height']
# w & h
df['w'] = (df['xmax']-df['xmin'])/df['width']
df['h'] = (df['ymax']-df['ymin'])/df['height']

In [339]:
df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,n02094433_5155.jpg,500,333,Yorkshire_terrier,123,237,80,280,0.36,0.540541,0.228,0.600601
1,n02109047_5588.jpg,333,500,Great_Dane,75,244,141,405,0.478979,0.546,0.507508,0.528
2,n02104365_6395.jpg,500,376,schipperke,27,448,42,365,0.475,0.541223,0.842,0.859043
3,n02108000_2653.jpg,450,450,EntleBucher,90,372,56,429,0.513333,0.538889,0.626667,0.828889
4,n02088466_6712.jpg,375,500,bloodhound,56,350,88,412,0.541333,0.5,0.784,0.648


---
## Split data into train and test

In [340]:
images = df['filename'].unique()

In [341]:
len(images)

20580

In [342]:
# 80% train and 20% test
img_df = pd.DataFrame(images, columns=['filename'])
img_train = tuple(img_df.sample(frac=0.8)['filename'])

In [343]:
img_test = tuple(img_df.query(f'filename not in {img_train}')['filename'])

In [344]:
len(img_train), len(img_test)

(16464, 4116)

In [345]:
train_df = df.query(f'filename in {img_train}')
test_df = df.query(f'filename in {img_test}')

In [346]:
train_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
0,n02094433_5155.jpg,500,333,Yorkshire_terrier,123,237,80,280,0.36,0.540541,0.228,0.600601
1,n02109047_5588.jpg,333,500,Great_Dane,75,244,141,405,0.478979,0.546,0.507508,0.528
2,n02104365_6395.jpg,500,376,schipperke,27,448,42,365,0.475,0.541223,0.842,0.859043
3,n02108000_2653.jpg,450,450,EntleBucher,90,372,56,429,0.513333,0.538889,0.626667,0.828889
4,n02088466_6712.jpg,375,500,bloodhound,56,350,88,412,0.541333,0.5,0.784,0.648


In [347]:
test_df.head()

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h
5,n02085620_4700.jpg,375,500,Chihuahua,30,372,32,498,0.536,0.53,0.912,0.932
7,n02109961_4369.jpg,500,375,Eskimo_dog,249,499,34,333,0.748,0.489333,0.5,0.797333
8,n02109961_4369.jpg,500,375,Eskimo_dog,37,291,41,334,0.328,0.5,0.508,0.781333
26,n02102040_6603.jpg,500,375,English_springer,78,364,42,374,0.442,0.554667,0.572,0.885333
27,n02093991_4462.jpg,400,500,Irish_terrier,6,392,147,465,0.4975,0.612,0.965,0.636


## Assign Id number to object names

In [348]:
def label_encoding(x):
    labels = {'silky_terrier':0, 'Scottish_deerhound':1, 'Chesapeake_Bay_retriever':2, 'Ibizan_hound':3,'wire-haired_fox_terrier':4, 'Saluki':5, 'cocker_spaniel':6, 'schipperke':7, 'borzoi':8, 'Pembroke':9, 'komondor':10, 'Staffordshire_bullterrier':11, 'standard_poodle':12, 'Eskimo_dog':13, 'English_foxhound':14, 'golden_retriever':15, 'Sealyham_terrier':16, 'Japanese_spaniel':17, 'miniature_schnauzer':18, 'malamute':19, 'malinois':20, 'Pekinese':21, 'giant_schnauzer':22, 'Mexican_hairless':23, 'Doberman':24, 'standard_schnauzer':25, 'dhole':26, 'German_shepherd':27, 'Bouvier_des_Flandres':28, 'Siberian_husky':29, 'Norwich_terrier':30, 'Irish_terrier':31, 'Norfolk_terrier':32, 'Saint_Bernard':33, 'Border_terrier':34, 'briard':35, 'Tibetan_mastiff':36, 'bull_mastiff':37, 'Maltese_dog':38, 'Kerry_blue_terrier':39, 'kuvasz':40, 'Greater_Swiss_Mountain_dog':41, 'Lakeland_terrier':42, 'Blenheim_spaniel':43, 'basset':44, 'West_Highland_white_terrier':45, 'Chihuahua':46, 'Border_collie':47, 'redbone':48, 'Irish_wolfhound':49, 'bluetick':50, 'miniature_poodle':51, 'Cardigan':52, 'EntleBucher':53, 'Norwegian_elkhound':54, 'German_short-haired_pointer':55, 'Bernese_mountain_dog':56, 'papillon':57, 'Tibetan_terrier':58, 'Gordon_setter':59, 'American_Staffordshire_terrier':60, 'vizsla':61, 'kelpie':62, 'Weimaraner':63, 'miniature_pinscher':64, 'boxer':65, 'chow':66, 'Old_English_sheepdog':67, 'pug':68, 'Rhodesian_ridgeback':69, 'Scotch_terrier':70, 'Shih-Tzu':71, 'affenpinscher':72, 'whippet':73, 'Sussex_spaniel':74, 'otterhound':75, 'flat-coated_retriever':76, 'English_setter':77, 'Italian_greyhound':78, 'Labrador_retriever':79, 'collie':80, 'cairn':81, 'Rottweiler':82, 'Australian_terrier':83, 'toy_terrier':84, 'Shetland_sheepdog':85, 'African_hunting_dog':86, 'Newfoundland':87, 'Walker_hound':88, 'Lhasa':89, 'beagle':90, 'Samoyed':91, 'Great_Dane':92, 'Airedale':93, 'bloodhound':94, 'Irish_setter':95, 'keeshond':96, 'Dandie_Dinmont':97, 'basenji':98, 'Bedlington_terrier':99, 'Appenzeller':100, 'clumber':101, 'toy_poodle':102, 'Great_Pyrenees':103, 'English_springer':104, 'Afghan_hound':105, 'Brittany_spaniel':106, 'Welsh_springer_spaniel':107, 'Boston_bull':108, 'dingo':109, 'soft-coated_wheaten_terrier':110, 'curly-coated_retriever':111, 'French_bulldog':112, 'Irish_water_spaniel':113, 'Pomeranian':114, 'Brabancon_griffon':115, 'Yorkshire_terrier':116, 'groenendael':117, 'Leonberg':118, 'black-and-tan_coonhound':119}
    
    return labels[x]

In [349]:
train_df['id'] = train_df['name'].apply(label_encoding)
test_df['id'] = test_df['name'].apply(label_encoding)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['id'] = train_df['name'].apply(label_encoding)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_df['id'] = test_df['name'].apply(label_encoding)


## Save images and labels in text

In [350]:
from shutil import move

In [351]:
train_df[train_df['filename'] == '%s.jpg']

Unnamed: 0,filename,width,height,name,xmin,xmax,ymin,ymax,center_x,center_y,w,h,id


In [352]:
train_df = train_df[train_df['filename'] != '%s.jpg']
test_df = test_df[test_df['filename'] != '%s.jpg']

In [353]:
train_folder = 'datasets/train'
test_folder = 'datasets/test'

os.mkdir(train_folder)
os.mkdir(test_folder)

In [354]:
cols = ['filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df[cols].groupby('filename')
groupby_obj_test = test_df[cols].groupby('filename')

In [355]:
# save each image in train/test folder and respective labels in .txt
def save_data(filename, folder_path, group_obj):
    # move image
    src = os.path.join('datasets', filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst)
    
    # save the labels
    text_filename = os.path.join(folder_path, 
                                 os.path.splitext(filename)[0]+'.txt')
    group_obj.get_group(filename).set_index('filename').to_csv(text_filename, sep=' ', index=False, header=False)

In [356]:
filename_series = pd.Series(groupby_obj_train.groups.keys())

In [358]:
# full path filename are in the column 'filename' in train_df
filename_series.apply(save_data, args=(train_folder, groupby_obj_train))

0        None
1        None
2        None
3        None
4        None
         ... 
16459    None
16460    None
16461    None
16462    None
16463    None
Length: 16464, dtype: object

In [359]:
filename_series_test = pd.Series(groupby_obj_test.groups.keys())
filename_series_test.apply(save_data, args=(test_folder, groupby_obj_test))

0       None
1       None
2       None
3       None
4       None
        ... 
4111    None
4112    None
4113    None
4114    None
4115    None
Length: 4116, dtype: object

In [360]:
# Define the paths for the source and destination directories
source_folder = 'datasets'
destination_folder = 'datasets/Annotations'

# Create the destination folder if it doesn't exist
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Iterate through all files in the source folder
for file in os.listdir(source_folder):
    # Check if the file is an .xml file
    if file.endswith('.xml'):
        # Construct the full path for the source and destination files
        source_path = os.path.join(source_folder, file)
        destination_path = os.path.join(destination_folder, file)

        # Move the file
        shutil.move(source_path, destination_path)

print("XML files have been moved successfully.")


XML files have been moved successfully.
