In [150]:
import xml.etree.ElementTree as ET
from collections import defaultdict
from shapely.geometry import box
import os
import plotly.graph_objects as go

In [2]:
def read_annotation(file_path):
    tree = ET.parse(file_path)
    root = tree.getroot()

    annotation = {
        'filename': root.find('filename').text,
        'objects': []
    }

    for obj in root.findall('object'):
        obj_data = {
            'name': obj.find('name').text,
            'bbox': (
                int(obj.find('bndbox/xmin').text),
                int(obj.find('bndbox/ymin').text),
                int(obj.find('bndbox/xmax').text),
                int(obj.find('bndbox/ymax').text)
            )
        }
        annotation['objects'].append(obj_data)

    return annotation

In [95]:
def calculate_intersection(bbox1, bbox2):
    rect1 = box(*bbox1)
    rect2 = box(*bbox2)
    intersection = rect1.intersection(rect2)
    intersection_area = intersection.area

    min_area = min(rect1.area, rect2.area)
    percentage_intersection = (intersection_area / min_area) * 100

    area_difference = abs(rect1.area - rect2.area)
    percentage_area_difference = (area_difference / min_area) * 100

    return percentage_intersection, percentage_area_difference

In [189]:
def compare_annotations(annotation1, annotation2, discrepancies_name, no_discrepancies):
    discrepancies = defaultdict(list)
    discrepancies_flag = False
    obj1 = annotation1['objects'][0]
    obj2 = annotation2['objects'][0]
    intersection_area, area_difference = calculate_intersection(obj1['bbox'], obj2['bbox'])

    if obj1['name'] != obj2['name']:
        discrepancies['class'].append((obj1['name'], obj2['name']))
        discrepancies_flag = True
        discrepancies_name += 1

    if intersection_area < 92 or area_difference > 22:
        discrepancies['bbox'].append((obj1['bbox'], obj2['bbox']))
        discrepancies['intersection_area'].append(intersection_area)
        if intersection_area < 92:
            discrepancies_mas_intersection_area['file'].append(annotation1['filename'])
            discrepancies_mas_intersection_area['intersection_area'].append(intersection_area)
        discrepancies['area_difference'].append(area_difference)
        if area_difference > 22:
            discrepancies_mas_area_difference['file'].append(annotation1['filename'])
            discrepancies_mas_area_difference['area_difference'].append(area_difference)
        discrepancies_flag = True

    if discrepancies_flag:
        print(
            f"Discrepancies: {discrepancies['class']}\n{discrepancies['bbox']}\n{discrepancies['intersection_area']}\n{discrepancies['area_difference']}\n\n")
    else:
        print('OK\n\n')
        no_discrepancies += 1
    return discrepancies_name, no_discrepancies

In [126]:
def get_files_in_directory(directory, extension):
    files = []
    for file in os.listdir(directory):
        if file.endswith(extension):
            files.append(os.path.join(directory, file))
    return files

In [127]:
def get_matching_files(dir1, dir2, file_extension):
    matching_files = []

    files1 = get_files_in_directory(dir1, file_extension)
    files2 = get_files_in_directory(dir2, file_extension)

    file_names1 = {os.path.splitext(os.path.basename(file))[0] for file in files1}
    file_names2 = {os.path.splitext(os.path.basename(file))[0] for file in files2}

    common_file_names = file_names1.intersection(file_names2)

    matching_files = [(os.path.join(dir1, f"{file_name}{file_extension}"),
                       os.path.join(dir2, f"{file_name}{file_extension}"))
                      for file_name in common_file_names]

    return matching_files

In [192]:
def visual_discrepancies():
    ideal_value = 92  

    relative_values = [100 * (ideal_value - value) / ideal_value for value in discrepancies_mas_intersection_area['intersection_area']]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=list(range(len(discrepancies_mas_intersection_area['intersection_area']))), y=relative_values, mode='lines', name='Intersection area is less than 92%'))
    fig.update_layout(
        title='The spread of data relative to the ideal 92%',
        xaxis=dict(title='Data point number'),
        yaxis=dict(title='Relative value from ideal (%)'),
        template='plotly_dark',
        showlegend=True
    )
    fig.show()
    
    ideal_value = 22 

    relative_values = [100 * (value - ideal_value) / ideal_value for value in discrepancies_mas_area_difference['area_difference']]

    fig = go.Figure()
    fig.add_trace(go.Scatter(x=list(range(len(discrepancies_mas_area_difference['area_difference']))), y=relative_values, mode='lines', name='Area difference is more than 22%'))
    fig.update_layout(
        title='The spread of data relative to the ideal 22%',
        xaxis=dict(title='Data point number'),
        yaxis=dict(title='Relative value from ideal (%)'),
        template='plotly_dark',
        showlegend=True
    )
    fig.show()

In [190]:
dir_expert1 = 'D:\\Katerina\\HaGRID_marked\\'
dir_expert2 = 'D:\\Katerina\\HaGRID_marked2\\'
file_extension = '.xml'
discrepancies_mas_intersection_area = defaultdict(list)
discrepancies_mas_area_difference = defaultdict(list)
discrepancies_name = 0
no_discrepancies = 0

matching_files = get_matching_files(dir_expert1, dir_expert2, file_extension)

for file_pair in matching_files:
    print(f"Files for comparison: {file_pair[0]} и {file_pair[1]}")
    annotation_expert1 = read_annotation(file_pair[0])
    annotation_expert2 = read_annotation(file_pair[1])

    discrepancies_name, no_discrepancies = compare_annotations(annotation_expert1, annotation_expert2,discrepancies_name, no_discrepancies)

print(f'Discrepancies in name: {discrepancies_name}\nNo discrepancies: {no_discrepancies}')

Files for comparison: D:\Katerina\HaGRID_marked\247.xml и D:\Katerina\HaGRID_marked2\247.xml
OK


Files for comparison: D:\Katerina\HaGRID_marked\244.xml и D:\Katerina\HaGRID_marked2\244.xml
Discrepancies: []
[((939, 214, 1337, 718), (965, 206, 1299, 674))]
[98.29059829059828]
[28.327959465684017]


Files for comparison: D:\Katerina\HaGRID_marked\24.xml и D:\Katerina\HaGRID_marked2\24.xml
OK


Files for comparison: D:\Katerina\HaGRID_marked\278.xml и D:\Katerina\HaGRID_marked2\278.xml
Discrepancies: []
[((711, 678, 829, 842), (709, 674, 851, 844))]
[100.0]
[24.741628772219926]


Files for comparison: D:\Katerina\HaGRID_marked\285.xml и D:\Katerina\HaGRID_marked2\285.xml
OK


Files for comparison: D:\Katerina\HaGRID_marked\29.xml и D:\Katerina\HaGRID_marked2\29.xml
OK


Files for comparison: D:\Katerina\HaGRID_marked\220.xml и D:\Katerina\HaGRID_marked2\220.xml
Discrepancies: []
[((151, 624, 367, 968), (139, 642, 383, 950))]
[89.53488372093024]
[1.1412575366063737]


Files for compariso

Видно, что не все данные были размечены идеально. У двух разметчиков случилось 4 расхождение при классификации жестов на 215 фотографий. При выделении жеста прямоугольником двумя разметчиками получилось так, что у 14 изображений выделяемая площадь пересекалась меньше, чем на 92%. У 46 изображений разница в площадях составила больше 22%, т.е. кто-то из разметчиков мог выделить лишнюю область.

In [164]:
len(discrepancies_mas_intersection_area)

14

In [165]:
len(discrepancies_mas_area_difference)

46

Из первого графика видно, что для 3 и 12 элементов (в списке расхождений по пересечениям площадей) слишком большая разница с эталоном. Значит кто-то из разметчиков мог отрезать что-то важное. Из второго графика видно, что для 15, 34 и 36 элемента второй разметчик добавил слишком много лишнего в выделяемую область.
Стоит просмотреть эти элементы, чтобы подкорректировать разметку.

In [193]:
visual_discrepancies()

In [195]:
discrepancies_mas_area_difference['area_difference'][15]

77.58985200845666

In [199]:
discrepancies_mas_area_difference['file'][15]

'108.jpg'

In [196]:
discrepancies_mas_intersection_area['intersection_area'][3]

85.92225243960966