# Script that merges the JSON file "groundTruth" with the annotation XML files and writes the result to a CSV file
The coordinates in the XML files describe the bounding box of the odometer in each picture.



# Import cell 

In [13]:
import xml.etree.ElementTree as ET
import pandas as pd
import numpy as np
import json
import math
import os 

# XML file reading 

In [19]:

def read_odometer_boxes(img, path='../Data/pascal voc 1.1/Annotations/'):
    """Return the odometer bounding box stored in the annotation of one image.

    The annotation file is looked up as ``<path>/<image stem>.xml``. The
    ``object`` entries are searched by tag name instead of by child position,
    so the lookup does not depend on the order of the children of ``object``.

    Args:
        img: Image file name, e.g. ``'photo.jpg'``. Only the final extension
            is stripped, so stems that contain dots are preserved.
        path: Directory that holds the XML annotation files.

    Returns:
        A one-element list ``[[xmin, ymin, xmax, ymax]]``. The coordinates
        are the raw text values from the XML (strings, not numbers).

    Raises:
        IndexError: If the annotation contains no ``object`` named
            ``odometer`` with a complete ``bndbox``.
    """
    stem = os.path.splitext(img)[0]
    xml_file = os.path.join(path, stem + '.xml')
    root = ET.parse(xml_file).getroot()

    for obj in root.findall('object'):
        if obj.findtext('name') != 'odometer':
            continue
        bndbox = obj.find('bndbox')
        if bndbox is None:
            continue
        coords = [bndbox.findtext(tag)
                  for tag in ('xmin', 'ymin', 'xmax', 'ymax')]
        if None not in coords:
            # Only the first odometer box is used.
            return [coords]

    # IndexError is kept so existing callers that catch it keep working,
    # but now it says which file is the problem.
    raise IndexError(
        "no complete 'odometer' bounding box found in {}".format(xml_file))



def filelist(root, file_type):
    """Collect the paths of all files below ``root`` whose name ends with ``file_type``.

    The directory tree is walked recursively with ``os.walk``. Each returned
    path is the walked directory joined with the file name, so the paths are
    relative or absolute exactly as ``root`` is.
    """
    matches = []
    for directory_path, _subdirectories, file_names in os.walk(root):
        for file_name in file_names:
            if file_name.endswith(file_type):
                matches.append(os.path.join(directory_path, file_name))
    return matches

def generate_train_df(anno_path):
    """Build a DataFrame with one odometer bounding box per annotation file.

    Every ``.xml`` file found below ``anno_path`` is parsed. For each file
    the first ``object`` named ``odometer`` that has a complete ``bndbox``
    contributes one row. Elements are looked up by tag name, so the result
    does not depend on the order of the children of ``object``.

    Args:
        anno_path: Directory that is searched recursively for ``.xml``
            annotation files.

    Returns:
        A ``pandas.DataFrame`` with the columns ``width``, ``height``,
        ``image``, ``class``, ``xmin``, ``ymin``, ``xmax`` and ``ymax``.
        All values are the raw text from the XML (strings). ``image`` is the
        annotation file's stem with ``.jpg`` appended. The columns are
        present even when no annotation file is found.

    Raises:
        IndexError: If an annotation file contains no ``odometer`` object
            with a complete ``bndbox``.
    """
    columns = ['width', 'height', 'image', 'class',
               'xmin', 'ymin', 'xmax', 'ymax']
    box_tags = ('xmin', 'ymin', 'xmax', 'ymax')

    anno_list = []
    for xml_path in filelist(anno_path, '.xml'):
        root = ET.parse(xml_path).getroot()

        coords = None
        for obj in root.findall('object'):
            bndbox = obj.find('bndbox')
            if obj.findtext('name') != 'odometer' or bndbox is None:
                continue
            candidate = [bndbox.findtext(tag) for tag in box_tags]
            if None not in candidate:
                # Only the first odometer box of a file is used.
                coords = candidate
                break
        if coords is None:
            # IndexError is kept for callers that already catch it, but the
            # message now names the offending file.
            raise IndexError(
                "no complete 'odometer' bounding box found in {}".format(xml_path))

        # basename/splitext work with any path separator and keep dots that
        # are part of the file stem.
        stem = os.path.splitext(os.path.basename(xml_path))[0]

        anno = {
            'width': root.find("./size/width").text,
            'height': root.find("./size/height").text,
            'image': stem + '.jpg',
            'class': 'odometer',
        }
        anno.update(zip(box_tags, coords))
        anno_list.append(anno)

    # Explicit columns keep the schema stable when anno_list is empty.
    return pd.DataFrame(anno_list, columns=columns)


# Build the annotation DataFrame from the XML files found under the annotations directory.
pdXML = generate_train_df('../Data/pascal voc 1.1/Annotations/')

# Bare expression: displays the DataFrame when it is the last line of a notebook cell.
pdXML

Unnamed: 0,width,height,image,class,xmin,ymin,xmax,ymax
0,900,1600,00001909-PHOTO-2020-12-20-10-36-05.jpg,odometer,223.4,809.5,486.29999999999995,872.9
1,472,1024,00000095-PHOTO-2020-11-21-20-48-57.jpg,odometer,142.5,458.88,295.31,529.83
2,766,1024,00000563-PHOTO-2020-11-30-13-25-25.jpg,odometer,281.23,340.0,592.3,465.8
3,768,1024,00000241-PHOTO-2020-11-24-08-54-09.jpg,odometer,157.6,410.61,355.2,492.49
4,576,1024,00001513-PHOTO-2020-12-15-22-19-59.jpg,odometer,233.9,586.35,364.07,616.37
...,...,...,...,...,...,...,...,...
2384,768,1024,00001143-PHOTO-2020-12-10-11-20-27.jpg,odometer,226.07,420.08,417.46,469.4
2385,575,1024,00001218-PHOTO-2020-12-10-11-47-26.jpg,odometer,206.56,384.5,351.32,491.03
2386,768,1024,00000166-PHOTO-2020-11-22-21-02-40.jpg,odometer,218.44,374.3,562.1,598.0
2387,768,1024,00001422-PHOTO-2020-12-15-22-17-24.jpg,odometer,152.74,264.08,563.62,386.05


# JSON groundtruth reading 

In [27]:
# pd.read_json is not used here because the ground-truth JSON is nested;
# the file is loaded with the json module and flattened explicitly instead.
with open('../Data/groundTruth/groundtruth.json','r') as f:
    groundTruth = json.load(f)

# One row per entry of the "odometers" list.
groundTruth_DF = pd.json_normalize(groundTruth, record_path=['odometers'])

# Show all rows so the whole DataFrame can be scrolled through.
pd.set_option('display.max_rows', None)

# Left join on the image name: every ground-truth row is kept and the
# annotation columns are attached where a matching image exists.
merged_JSON_XML_DF = groundTruth_DF.merge(pdXML, how='left', on='image')

# Encode the odometer type as an integer: analog -> 0, digital -> 1.
class_dict = {'analog': 0, 'digital': 1}
merged_JSON_XML_DF['odometer_type'] = merged_JSON_XML_DF['odometer_type'].apply(
    lambda odometer_kind: class_dict[odometer_kind]
)

# Display the merged DataFrame.
merged_JSON_XML_DF

Unnamed: 0,image,odometer_type,mileage,width,height,class,xmin,ymin,xmax,ymax
0,00000002-PHOTO-2020-11-20-11-21-22.jpg,0,244362.0,768,1024,odometer,249.0,399.21,452.9,456.74
1,00000003-PHOTO-2020-11-20-11-21-23.jpg,1,64750.0,768,1024,odometer,300.0,413.31,420.6,485.75
2,00000004-PHOTO-2020-11-20-11-21-25.jpg,1,159073.0,1024,768,odometer,461.27,324.0,931.55,532.14
3,00000005-PHOTO-2020-11-20-11-21-26.jpg,0,18613.0,576,1024,odometer,216.78,582.97,333.7,626.14
4,00000006-PHOTO-2020-11-20-11-21-26.jpg,0,35376.0,768,1024,odometer,230.34,504.17,474.49,671.03
5,00000007-PHOTO-2020-11-20-11-21-27.jpg,0,7863.0,576,1024,odometer,171.15,393.51,336.25,458.5
6,00000008-PHOTO-2020-11-20-11-21-28.jpg,1,64750.0,576,1024,odometer,200.15,417.78,317.81,487.4
7,00000009-PHOTO-2020-11-20-11-21-29.jpg,0,2183.0,575,1024,odometer,206.05,561.19,376.96,601.1
8,00000010-PHOTO-2020-11-20-11-21-30.jpg,0,1187.0,768,1024,odometer,104.72,556.52,312.02,603.37
9,00000011-PHOTO-2020-11-20-11-21-30.jpg,1,189350.0,1024,768,odometer,591.45,406.18,815.33,505.87


# Save as a CSV to be able to use in another notebook

In [28]:
# Write the merged DataFrame to a CSV file in the current working directory.
merged_JSON_XML_DF.to_csv('dataFrameMerged.csv')
# To load it again in another notebook, use:
# df = pd.read_csv('dataFrameMerged.csv', index_col=0)

In [29]:
# Read the saved CSV back; index_col=0 uses the first CSV column as the index.
df = pd.read_csv('dataFrameMerged.csv', index_col=0)
# Bare expression: displays the DataFrame when it is the last line of a notebook cell.
df

Unnamed: 0,image,odometer_type,mileage,width,height,class,xmin,ymin,xmax,ymax
0,00000002-PHOTO-2020-11-20-11-21-22.jpg,0,244362.0,768,1024,odometer,249.0,399.21,452.9,456.74
1,00000003-PHOTO-2020-11-20-11-21-23.jpg,1,64750.0,768,1024,odometer,300.0,413.31,420.6,485.75
2,00000004-PHOTO-2020-11-20-11-21-25.jpg,1,159073.0,1024,768,odometer,461.27,324.0,931.55,532.14
3,00000005-PHOTO-2020-11-20-11-21-26.jpg,0,18613.0,576,1024,odometer,216.78,582.97,333.7,626.14
4,00000006-PHOTO-2020-11-20-11-21-26.jpg,0,35376.0,768,1024,odometer,230.34,504.17,474.49,671.03
5,00000007-PHOTO-2020-11-20-11-21-27.jpg,0,7863.0,576,1024,odometer,171.15,393.51,336.25,458.5
6,00000008-PHOTO-2020-11-20-11-21-28.jpg,1,64750.0,576,1024,odometer,200.15,417.78,317.81,487.4
7,00000009-PHOTO-2020-11-20-11-21-29.jpg,0,2183.0,575,1024,odometer,206.05,561.19,376.96,601.1
8,00000010-PHOTO-2020-11-20-11-21-30.jpg,0,1187.0,768,1024,odometer,104.72,556.52,312.02,603.37
9,00000011-PHOTO-2020-11-20-11-21-30.jpg,1,189350.0,1024,768,odometer,591.45,406.18,815.33,505.87
