# AFTER APPLYING ALL THESE COMMANDS ON THE DATA AND 
*"""
1.LOADING XML FILES 
2.BASIC CLEANING
3.PARSING AND EXTRACTING 
4.CONVERTING INTO DATAFRAMES
5.PREPARING LABELS FOR YOLO MODEL
6.DATASET SPLITTING INTO TRAIN AND TEST 
7.LABEL ENCODING 
8.FOLDER CREATIONS AND SAVING WITH TXT FILE
"""
RE-RUNNING THE CELLS WILL GIVE ERRORS, BECAUSE ALL THE DATA HAS ALREADY BEEN MOVED TO ITS RESPECTIVE FOLDERS AND ALL THE OPERATIONS HAVE ALREADY BEEN APPLIED 

In [None]:
# few libraries used 
import os
from glob import glob
""" The glob module finds all path names that match a Unix shell-style wildcard pattern (for example '*.xml'). 
It matches file and directory names, not the text inside files; here it is used to collect the paths of the xml files.
"""
import pandas as pd
from functools import reduce
from xml.etree import ElementTree as et
"""
The xml.etree.ElementTree module implements a simple and efficient API for parsing and creating XML data.
"""


In [None]:
# Load the paths of all XML annotation files and store them in a list.
# glob expands shell-style wildcards (not regular expressions):
#   './data_images/' -- directory to retrieve the files from
#   '*'              -- matches any file name ('all')
#   '.xml'           -- file type to choose
# glob returns its matches in arbitrary order, so sort them to get the
# same listing on every run and platform.
xml_list = sorted(glob('./data_images/*.xml'))

In [None]:
# Basic cleaning: replace every backslash in the paths with a forward
# slash so all entries of xml_list use the same separator.
# The comprehension applies the replacement to each path and the result
# is stored back under the same name.
xml_list = [path.replace('\\', '/') for path in xml_list]

XML is an inherently hierarchical data format, and the most natural way to represent it is with a tree. ET has two classes for this purpose - ElementTree represents the whole XML document as a tree, and Element represents a single node in this tree. Interactions with the whole document (reading and writing to/from files) are usually done on the ElementTree level. Interactions with a single XML element and its sub-elements are done on the Element level.

In [None]:
# Read one XML annotation file and extract from it:
#   1. filename
#   2. size (width and height of the image)
#   3. every object (name, xmin, ymin, xmax, ymax)

# Parse one xml file. The forward slash works on every OS; the
# backslash-separated form of this path only resolves on Windows.
tree = et.parse('./data_images/00026.xml')
root = tree.getroot()

# Extract the file name: find() takes the tag name, .text gives its content.
image_name = root.find('filename').text
# Extract width and height of the image; nested tags need one find() per level.
image_width = root.find('size').find('width').text
image_height = root.find('size').find('height').text

# Extract the object information.
# An image can contain many objects, so loop over every <object> tag
# and collect one row per object.
parser = []
objs = root.findall('object')
for obj in objs:
    name = obj.find('name').text
    boundbox = obj.find('bndbox')
    xmin = boundbox.find('xmin').text
    ymin = boundbox.find('ymin').text
    xmax = boundbox.find('xmax').text
    ymax = boundbox.find('ymax').text
    # Same field order as extract_text() and the DataFrame columns:
    # xmin, ymin, xmax, ymax.
    parser.append([image_name, image_width, image_height, name, xmin, ymin, xmax, ymax])
print(f"{parser} \n")


# Extracting the information of a single object (the first one).

obj = root.findall('object')
# Index into the list of objects to reach one object's tags.
name = obj[0].find('name').text
# root.find('object').find('bndbox') reaches the same element.
boundbox = obj[0].find('bndbox')
xmin = boundbox.find('xmin').text
ymin = boundbox.find('ymin').text
xmax = boundbox.find('xmax').text
ymax = boundbox.find('ymax').text
[name, xmin, ymin, xmax, ymax]

In [None]:
# THE ABOVE CODE OF PARSING AND EXTRACTING THE INFORMATION FOR ONE FILE CAN BE FUNCTIONISED 
def extract_text(filename):
    """Parse one annotation XML file into a list of per-object rows.

    Each row has the form
    [image_name, image_width, image_height, name, xmin, ymin, xmax, ymax]
    and every value is kept as the text read from the XML. A file with
    no <object> tags yields an empty list.
    """
    root = et.parse(filename).getroot()
    size = root.find('size')
    # Fields shared by every object found in this image.
    image_fields = [
        root.find('filename').text,
        size.find('width').text,
        size.find('height').text,
    ]

    rows = []
    for node in root.findall('object'):
        label = node.find('name').text
        box = node.find('bndbox')
        corners = [box.find(tag).text for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        rows.append(image_fields + [label] + corners)
    return rows
    

In [None]:
# Run extract_text on every xml path; the result holds one list of
# rows per file.
parser_all = [extract_text(xml_path) for xml_path in xml_list]

In [None]:
# parser_all holds one entry per parsed XML file, so its length equals
# the number of annotation files found in the directory.
len(parser_all)

In [None]:
# Flatten parser_all (one list of rows per file) into a single list of
# rows; this flat list is the data used for all the following steps.
# The nested comprehension visits each row once and also works when
# parser_all is empty, whereas reduce() without an initial value raises
# TypeError on an empty list.
data = [row for rows in parser_all for row in rows]

In [None]:
# Put the flat rows into a DataFrame -- easier to read and operate on.

data_df = pd.DataFrame(
    data=data,
    columns=[
        'Filename', 'Image_width', 'Image_height',
        'Name', 'xmin', 'ymin', 'xmax', 'ymax',
    ],
)
data_df

In [None]:
# Count how many rows carry each object name; these names will be our classes.
data_df['Name'].value_counts()

# Preparing labels for yolo model
* for yolo we require the following information
1. Center_X  : center position x coordinate of the object normalized to width of the image 
2. Center_y  : center position y coordinate of the object normalized to height of the image 
3. w  : width of bounding box normalized to width of the image 
4. h  : height of the bounding box normalized to height of the image 

* Let image be 500*300 
  bounding box : [car, 50, 100, 220, 200] ([obj_name, xmin, ymin, xmax, ymax])
  convert this information into ([obj_name, center_x, center_y, w, h])
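  #Conversion formula :
  1. center_x = ((xmin + xmax) / 2) / width of the image 
  2. center_y = ((ymin + ymax) / 2) / height of the image
  3. w = (xmax - xmin) / width of the image 
  4. h = (ymax - ymin) / height of the image 
  For the example above: center_x = ((50 + 220) / 2) / 500 = 0.27, center_y = ((100 + 200) / 2) / 300 = 0.5,
  w = (220 - 50) / 500 = 0.34, h = (200 - 100) / 300 ≈ 0.333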

# FOLDER FORMAT FOR STORING THE ABOVE INFORMATION:
data_images  -------- train(training dataset)--- for each image the relevant information must be stored in '.txt' file 
            --------- test (validate the results)--- similarily as the train data set
    

In [None]:
# Conversion of information: inspect the column dtypes first.
data_df.info()                 
# Every value was read from the XML as text, so all columns have the object dtype.
# Width, height, xmin, xmax, ymin and ymax must be converted to int or float before doing arithmetic on them.

In [None]:
# 1. Type conversion
# Columns whose text values have to become integers.
cols = ['Image_width', 'Image_height', 'xmin', 'ymin', 'xmax', 'ymax']
# 'int' is the default integer type (int64 by default); pass 'int32' for 32-bit values.
data_df = data_df.astype({column: 'int' for column in cols})
data_df.info()

In [None]:
# YOLO labels: box centre and box size, each normalised by the image
# size. Every result is stored as a new column of data_df.

# center_x, center_y: midpoint of the box divided by image width / height
data_df['center_x'] = data_df['xmin'].add(data_df['xmax']).div(2).div(data_df['Image_width'])
data_df['center_y'] = data_df['ymin'].add(data_df['ymax']).div(2).div(data_df['Image_height'])

# w: box width divided by image width
data_df['w'] = data_df['xmax'].sub(data_df['xmin']).div(data_df['Image_width'])
# h: box height divided by image height
data_df['h'] = data_df['ymax'].sub(data_df['ymin']).div(data_df['Image_height'])




In [None]:
#Split data into Train and Test set

In [None]:
# Distinct image file names taken from the Filename column; the split is done on these names.
images = data_df['Filename'].unique()
len(images)            # number of distinct images to split into train and test set (503 in the author's run)

In [None]:
# Wrap the array of image names in a one-column DataFrame.
img_df = pd.DataFrame({'Filename': images})
img_df

In [None]:
# Dataset splitting: 80% of the images for training, the rest for testing.

# sample(frac=0.8) draws a random 80% of the rows (pass random_state=<int>
# to get the same split on every run); the names are kept as a tuple.
img_train = tuple(img_df.sample(frac=0.8)['Filename'])
# The test images are the files not in img_train. A boolean isin() mask
# compares the values directly, instead of pasting the repr of the whole
# tuple into a query() expression string that then has to be parsed.
img_test = tuple(img_df.loc[~img_df['Filename'].isin(img_train), 'Filename'])
len(img_train), len(img_test)

In [None]:
# Now split the object rows into training and testing DataFrames.
# isin() selects the rows by image file name; .copy() makes each split an
# independent DataFrame, so adding a column to it later does not raise
# SettingWithCopyWarning.
train_df = data_df[data_df['Filename'].isin(img_train)].copy()
test_df = data_df[data_df['Filename'].isin(img_test)].copy()
train_df.head()

In [None]:
# Preview the first rows of the test split.
test_df.head()


# Changing object names into specific ids ---- deep learning model cannot be trained on text 
# LABEL ENCODING --- CHANGING THE DATATYPE OF CLASSES FROM STRING TO NUMBERS 



In [None]:
# Assign id number to object names 

# label encoding 
def label_encoding(x):
    """Return the integer class id for the object name ``x``.

    Raises KeyError when ``x`` is not one of the 20 known object names.
    """
    class_names = (
        'person', 'car', 'chair', 'bottle', 'potted plant', 'sheep', 'cow', 'boat',
        'horse', 'motor bike', 'bicycle', 'dog', 'bird', 'sofa', 'bus', 'tv monitor',
        'cat', 'train', 'aeroplane', 'dining table',
    )
    # The id of a name is its position in the tuple above.
    name_to_id = {name: class_id for class_id, name in enumerate(class_names)}
    return name_to_id[x]

In [None]:
# Add the numeric class id to the train and test sets.
# assign() returns a new DataFrame with the extra column, so the id is
# never written into a frame that pandas tracks as a filtered subset of
# data_df (which raises SettingWithCopyWarning).

train_df = train_df.assign(id=train_df['Name'].apply(label_encoding))
test_df = test_df.assign(id=test_df['Name'].apply(label_encoding))

# Creating Folder Structure for train and test folders ----- each willl store the respective images and text file containing the information(yolo coordinates)


In [None]:
# SAVE IMAGES AND LABELS IN TEXT

import os 
from shutil import move 
"""
The shutil module offers a number of high-level operations on files and collections of files. 
In particular, functions are provided which support file copying and removal."""

In [None]:
# Create the train and test folders inside the data_images folder.

train_folder = 'data_images/train'
test_folder = 'data_images/test'

# exist_ok=True keeps this cell re-runnable: os.mkdir raises
# FileExistsError once the folders have been created.
os.makedirs(train_folder, exist_ok=True)
os.makedirs(test_folder, exist_ok=True)

A groupby object in pandas is like a collection of smaller DataFrames, each corresponding to a group defined by the unique values in the 'Filename' column.
.groupby('Filename'): This groups the resulting DataFrame by the values in the 'Filename' column.

    Each group will contain rows where the 'Filename' column has the same value.
    The result is a groupby object (groupby_obj_train) that contains groups of rows, where each group corresponds to a unique value in the 'Filename' column.

In [None]:
# Keep only the columns needed for the label files, then group the rows
# of each split by image file name.
columns = ['Filename', 'id', 'center_x', 'center_y', 'w', 'h']
groupby_obj_train = train_df.loc[:, columns].groupby(by='Filename')
groupby_obj_test = test_df.loc[:, columns].groupby(by='Filename')

In [None]:
# Store the label information of one sample image in a txt file.
# The train/test split is random, so '00004.jpg' is not guaranteed to be
# among the training groups; fall back to the first available image
# instead of failing with KeyError.
sample_name = '00004.jpg'
if sample_name not in groupby_obj_train.groups:
    sample_name = next(iter(groupby_obj_train.groups))
# set_index('Filename') moves the Filename column into the row index, so
# it is no longer one of the data columns.
# to_csv('sample.txt', ...) writes the remaining columns in CSV format:
#   index=False  -- do not write the index (here: the Filename values)
#   header=False -- do not write the row of column names
groupby_obj_train.get_group(sample_name).set_index('Filename').to_csv('sample.txt', index=False, header=False)
# sample.txt is comma separated; the label files must be saved without
# the commas, with the values separated by spaces instead.

# IDEA -- SAVE EACH IMAGE IN TRAIN OR TEST FOLDER AND RESPECTIVE LABELS IN '.txt' file

In [None]:
# Function to save data in folders 

def save_data(filename, folder_path, group_obj, source_folder='data_images'):
    """Move one image into ``folder_path`` and write its label file there.

    Parameters
    ----------
    filename : str
        Image file name; also the group key looked up in ``group_obj``.
    folder_path : str
        Destination folder for the image and its ``.txt`` label file.
    group_obj : pandas groupby object
        Rows grouped by 'Filename'; each group has a 'Filename' column
        plus the label columns to write.
    source_folder : str, optional
        Folder the image is moved from (default ``'data_images'``).

    Raises
    ------
    KeyError
        If ``filename`` is not a group of ``group_obj``. This is raised
        before anything is moved or written.
    """
    # Look the labels up first, so a missing group cannot leave an image
    # moved into the split folder without a label file.
    labels = group_obj.get_group(filename).drop(columns='Filename')

    # Move the image from source to destination.
    src = os.path.join(source_folder, filename)
    dst = os.path.join(folder_path, filename)
    move(src, dst)

    # Label file: same base name as the image ([0] is the part before the
    # extension), '.txt' extension, saved in the same folder as the image.
    text_filename = os.path.join(folder_path,
                                 os.path.splitext(filename)[0] + '.txt')
    # One space-separated line per object, without header or index.
    labels.to_csv(text_filename, sep=' ', index=False, header=False)
    

In [None]:
# Every file name among the training groups has to go through save_data.
# Collect the group keys in a Series so the function can be applied to
# each of them.
filename_train_series = pd.Series(list(groupby_obj_train.groups))
filename_train_series


In [None]:
# Apply the save_data function to every file name in the training series.

filename_train_series.apply(save_data, args=(train_folder,groupby_obj_train))    
# apply takes the function, and args supplies the function's remaining arguments -- here folder_path and group_obj

In [None]:
# Group keys of the test split, collected in a Series of file names.
filename_test_series = pd.Series(list(groupby_obj_test.groups))
filename_test_series

In [None]:
# Apply save_data to every test file name, with test_folder as folder_path and groupby_obj_test as group_obj.
filename_test_series.apply(save_data, args=(test_folder,groupby_obj_test))  