# Data preprocess

In [2]:
import os
import xml.etree.ElementTree as ET
from collections import Counter

# functions to parse annotations, filter out invalid annotations, and analyze the class distribution
def parse_annotations(annotation_dir):
    """Parse XML annotations and return a list of annotations."""
    annotations = []
    #select 6k annotations to avoid processing all annotations because we only need a subset with 3000 to 5000 images.
    count = 0
    for filename in os.listdir(annotation_dir):
        count += 1
        if count > 6000: #select 6k annotations
            break
        if not filename.endswith('.xml'):
            continue
        file_path = os.path.join(annotation_dir, filename)
        tree = ET.parse(file_path)
        root = tree.getroot()
        
        image_data = {
            'filename': root.find('filename').text,
            'size': {
                'width': int(root.find('size/width').text),
                'height': int(root.find('size/height').text),
            },
            'objects': []
        }
        
        for obj in root.findall('object'):
            bbox = obj.find('bndbox')
            object_data = {
                'name': obj.find('name').text,
                'difficult': int(obj.find('difficult').text),
                'bbox': {
                    'xmin': int(bbox.find('xmin').text),
                    'ymin': int(bbox.find('ymin').text),
                    'xmax': int(bbox.find('xmax').text),
                    'ymax': int(bbox.find('ymax').text),
                }
            }
            image_data['objects'].append(object_data)
        
        annotations.append(image_data)
    return annotations

def check_bounding_box_area(annotations, min_area=500, max_area=50000, min_aspect_ratio=0.2, max_aspect_ratio=5.0):
    """Filter out annotations with unusual bounding box areas or aspect ratios."""
    filtered_annotations = []
    for annotation in annotations:
        valid = True
        for obj in annotation['objects']:
            bbox = obj['bbox']
            width = bbox['xmax'] - bbox['xmin']
            height = bbox['ymax'] - bbox['ymin']
            area = width * height
            aspect_ratio = width / height if height > 0 else 0
            
            if area < min_area or area > max_area:
                valid = False
                break
            if aspect_ratio < min_aspect_ratio or aspect_ratio > max_aspect_ratio:
                valid = False
                break
        
        if valid:
            filtered_annotations.append(annotation)
    
    return filtered_annotations

def calculate_iou(box1, box2):
    """Calculate Intersection over Union (IoU) for two bounding boxes."""
    x1 = max(box1['xmin'], box2['xmin'])
    y1 = max(box1['ymin'], box2['ymin'])
    x2 = min(box1['xmax'], box2['xmax'])
    y2 = min(box1['ymax'], box2['ymax'])
    
    intersection_area = max(0, x2 - x1) * max(0, y2 - y1)
    box1_area = (box1['xmax'] - box1['xmin']) * (box1['ymax'] - box1['ymin'])
    box2_area = (box2['xmax'] - box2['xmin']) * (box2['ymax'] - box2['ymin'])
    
    union_area = box1_area + box2_area - intersection_area
    iou = intersection_area / union_area if union_area > 0 else 0
    return iou

def filter_high_iou_annotations(annotations, iou_threshold=0.9):
    """Remove annotations where bounding boxes have a high IoU, indicating potential duplicates."""
    filtered_annotations = []
    for annotation in annotations:
        valid = True
        num_objects = len(annotation['objects'])
        for i in range(num_objects):
            for j in range(i + 1, num_objects):
                iou = calculate_iou(annotation['objects'][i]['bbox'], annotation['objects'][j]['bbox'])
                if iou > iou_threshold:
                    valid = False
                    break
            if not valid:
                break
        if valid:
            filtered_annotations.append(annotation)
    return filtered_annotations

def check_class_distribution(annotations, min_count=5):
    """Analyze the class distribution and flag underrepresented classes."""
    class_counter = Counter()
    for annotation in annotations:
        for obj in annotation['objects']:
            class_counter[obj['name']] += 1
    
    underrepresented_classes = [cls for cls, count in class_counter.items() if count < min_count]
    return underrepresented_classes

In [3]:
# Directory containing XML annotation files
annotation_dir = '/Users/siyunhe/Desktop/CS_Master/year2/CS5330-CV/lab3/dataset_20210629145407_top_600/annotations'

# Step 1: Parse annotations
annotations = parse_annotations(annotation_dir)

# Step 2: Perform bounding box area checks
annotations = check_bounding_box_area(annotations)

# Step 3: Filter annotations with high IoU values
annotations = filter_high_iou_annotations(annotations)

# Step 4: Check class distribution
underrepresented_classes = check_class_distribution(annotations)

# Output the results
print(f'Filtered annotations count: {len(annotations)}')
if underrepresented_classes:
    print('Underrepresented classes:', underrepresented_classes)
else:
    print('No underrepresented classes found.')

Filtered annotations count: 4277
Underrepresented classes: ['40345', '47397', '41539', '32018']


In [3]:
# print an example annotation
annotations[0]

{'filename': '9ef6d7dc-e5bf-11eb-a3df-b0c090bd3910.jpg',
 'size': {'width': 300, 'height': 300},
 'objects': [{'name': '48336',
   'difficult': 0,
   'bbox': {'xmin': 0, 'ymin': 68, 'xmax': 79, 'ymax': 132}},
  {'name': '30350b',
   'difficult': 0,
   'bbox': {'xmin': 158, 'ymin': 104, 'xmax': 300, 'ymax': 207}}]}

In [4]:
# Save the filtered annotations to a new directory in the original pascal voc xml format, exclude underrepresented classes
output_dir = '/Users/siyunhe/Desktop/CS_Master/year2/CS5330-CV/lab3/dataset_20210629145407_top_600/filtered_annotations'
os.makedirs(output_dir, exist_ok=True)

for i, annotation in enumerate(annotations):
    filename = annotation['filename']
    output_path = os.path.join(output_dir, filename)
    
    root = ET.Element('annotation')
    ET.SubElement(root, 'filename').text = filename
    size = ET.SubElement(root, 'size')
    ET.SubElement(size, 'width').text = str(annotation['size']['width'])
    ET.SubElement(size, 'height').text = str(annotation['size']['height'])
    
    for obj in annotation['objects']:
        if obj['name'] in underrepresented_classes:
            continue
        
        obj_elem = ET.SubElement(root, 'object')
        ET.SubElement(obj_elem, 'name').text = obj['name']
        ET.SubElement(obj_elem, 'difficult').text = str(obj['difficult'])
        bbox = ET.SubElement(obj_elem, 'bndbox')
        ET.SubElement(bbox, 'xmin').text = str(obj['bbox']['xmin'])
        ET.SubElement(bbox, 'ymin').text = str(obj['bbox']['ymin'])
        ET.SubElement(bbox, 'xmax').text = str(obj['bbox']['xmax'])
        ET.SubElement(bbox, 'ymax').text = str(obj['bbox']['ymax'])
    
    tree = ET.ElementTree(root)
    #write in xml format
    output_file = os.path.join(output_dir, annotation['filename'].replace('.jpg', '.xml'))
    tree.write(output_file, encoding='utf-8', xml_declaration=True)

In [5]:
# Save the filtered images to a new directory
output_dir = '/Users/siyunhe/Desktop/CS_Master/year2/CS5330-CV/lab3/dataset_20210629145407_top_600/filtered_images'
os.makedirs(output_dir, exist_ok=True)

for annotation in annotations:
    filename = annotation['filename']
    image_path = os.path.join('/Users/siyunhe/Desktop/CS_Master/year2/CS5330-CV/lab3/dataset_20210629145407_top_600/images', filename)
    output_path = os.path.join(output_dir, filename)
    os.system(f'cp {image_path} {output_path}')