# Trash Dataset Labeling Script

This notebook processes the Pascal VOC XML annotation files of the trash dataset and writes a CSV file that assigns the same standardized labels to every image:
- Main Category: "Public Cleanliness & Public Property Damage"
- Sub Category: "Garbage Dumping"

In [1]:
import xml.etree.ElementTree as ET
import csv
import os
from pathlib import Path

# Labels shared by every image in the trash dataset
main_category = "Public Cleanliness & Public Property Damage"
sub_category = "Garbage Dumping"

# Input folder holding the XML annotations, and the CSV file to be produced
annotation_dir = "Trash/annotation/trash annotated"
output_csv = "trash_labels.csv"

# Echo the configured locations so the run is self-describing
print(
    f"Processing XML annotations from: {annotation_dir}",
    f"Output CSV file: {output_csv}",
    sep="\n",
)

Processing XML annotations from: Trash/annotation/trash annotated
Output CSV file: trash_labels.csv


In [2]:
# Function to parse XML annotation file
def parse_xml_annotation(xml_file_path):
    """
    Parse a Pascal VOC XML annotation file and extract image filename and objects.

    The function never raises: every failure is reported with a printed
    message and signalled to the caller through a ``None`` filename.

    Args:
        xml_file_path (str or os.PathLike): Path to the XML annotation file

    Returns:
        tuple: (image_filename, list_of_object_names)
            - image_filename is the whitespace-stripped text of the root's
              <filename> child, or None when the file cannot be read/parsed
              or has no non-empty <filename>.
            - list_of_object_names holds the whitespace-stripped <name> text
              of each <object> child of the root, in document order. Objects
              whose <name> is missing or empty are skipped. The list is empty
              whenever image_filename is None.
    """
    # Only the parse step can fail on I/O or malformed XML, so the try block
    # is limited to it.
    try:
        root = ET.parse(xml_file_path).getroot()
    except ET.ParseError as e:
        print(f"Error parsing {xml_file_path}: {e}")
        return None, []
    except Exception as e:
        # Best effort: an unreadable file must not abort a batch run.
        print(f"Unexpected error processing {xml_file_path}: {e}")
        return None, []

    # findtext() returns None for a missing element and '' for an empty one;
    # both are treated as "no filename".
    filename = (root.findtext('filename') or '').strip()
    if not filename:
        print(f"Unexpected error processing {xml_file_path}: missing or empty <filename> element")
        return None, []

    # Collect object names, skipping objects without a usable <name> so that
    # one incomplete object does not discard the whole image and no None
    # value ends up in the returned list.
    objects = []
    for obj in root.findall('object'):
        name = (obj.findtext('name') or '').strip()
        if name:
            objects.append(name)

    return filename, objects

# Smoke-test the parser on a single known annotation file
test_xml = Path(annotation_dir, "Datacluster Trash (1).xml")
if not test_xml.exists():
    print(f"Test file not found: {test_xml}")
else:
    filename, objects = parse_xml_annotation(test_xml)
    print(f"Test file: {filename}")
    print(f"Objects found: {objects}")

Test file: Datacluster Trash (1).jpg
Objects found: ['Domestic Trash\tGarbage']


In [4]:
# Process all XML files and build one result row per unique image
processed_images = set()  # Image filenames already recorded, to avoid duplicates
results = []
unreadable_files = []  # Annotation files that yielded no image filename
duplicate_files = []   # Annotation files whose image was already recorded

# Get all XML files in the annotation directory.
# Path.glob yields entries in arbitrary, filesystem-dependent order, so the
# list is sorted to make the row order of the CSV reproducible across runs.
annotation_path = Path(annotation_dir)
xml_files = sorted(annotation_path.glob("*.xml"))

print(f"Found {len(xml_files)} XML annotation files")

for xml_file in xml_files:
    filename, objects = parse_xml_annotation(xml_file)

    # The parser signals failure with a None filename.
    if not filename:
        unreadable_files.append(xml_file.name)
        continue
    if filename in processed_images:
        duplicate_files.append(xml_file.name)
        continue
    processed_images.add(filename)

    # Drop empty/None names so the join below cannot fail on them.
    object_names = [name for name in objects if name]

    # For trash dataset, all images get the same labels regardless of specific object types
    results.append({
        'image_file': filename,
        'main_category': main_category,
        'sub_category': sub_category,
        'detected_objects': '; '.join(object_names) if object_names else 'No objects detected'
    })

print(f"Processed {len(results)} unique images")

# Report what was left out so that gaps between "found" and "processed" are explained.
if unreadable_files:
    print(f"Skipped {len(unreadable_files)} annotation file(s) without a usable image filename: {unreadable_files}")
if duplicate_files:
    print(f"Skipped {len(duplicate_files)} annotation file(s) referring to an already processed image: {duplicate_files}")

Found 248 XML annotation files
Processed 248 unique images


In [5]:
# Persist the collected rows: one header line followed by one line per image
fieldnames = ['image_file', 'main_category', 'sub_category', 'detected_objects']
with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results)

print(f"\nCSV file '{output_csv}' created successfully!")
print(f"Total images labeled: {len(results)}")

# Preview the first few labelled images
print("\nSample of first 5 entries:")
for position, entry in enumerate(results[:5], start=1):
    print(f"{position}. {entry['image_file']} -> {entry['main_category']} | {entry['sub_category']}")


CSV file 'trash_labels.csv' created successfully!
Total images labeled: 248

Sample of first 5 entries:
1. Datacluster Trash (1).jpg -> Public Cleanliness & Public Property Damage | Garbage Dumping
2. Datacluster Trash (10).jpg -> Public Cleanliness & Public Property Damage | Garbage Dumping
3. Datacluster Trash (100).jpg -> Public Cleanliness & Public Property Damage | Garbage Dumping
4. Datacluster Trash (101).jpg -> Public Cleanliness & Public Property Damage | Garbage Dumping
5. Datacluster Trash (102).jpg -> Public Cleanliness & Public Property Damage | Garbage Dumping


In [6]:
# Display summary statistics
from collections import Counter

print("\n=== SUMMARY ===")
print(f"Total XML files found: {len(xml_files)}")
print(f"Total unique images processed: {len(results)}")
print(f"Main category assigned to all images: {main_category}")
print(f"Sub category assigned to all images: {sub_category}")

# Show distribution of detected object types (for information).
# Each result row stores its object names joined with '; ', so they are split
# back into individual names here. Rows carrying the placeholder text have no
# objects and are skipped.
all_objects = []
for result in results:
    if result['detected_objects'] != 'No objects detected':
        all_objects.extend(result['detected_objects'].split('; '))

if all_objects:
    object_counts = Counter(all_objects)
    print("\nDetected object types and their frequencies:")
    for obj_type, count in object_counts.most_common():
        # The counter tallies every annotated object, and one image may
        # contain several objects of the same type, so the unit is
        # annotations rather than images.
        print(f"  - {obj_type}: {count} annotation(s)")
else:
    print("\nNo specific object types detected in annotations")


=== SUMMARY ===
Total XML files found: 248
Total unique images processed: 248
Main category assigned to all images: Public Cleanliness & Public Property Damage
Sub category assigned to all images: Garbage Dumping

Detected object types and their frequencies:
  - trash: 240 images
  - dw: 13 images
  - Domestic Trash	Garbage: 1 images


In [7]:
import pandas as pd

# Read the CSV written earlier back in as a DataFrame for inspection and clean-up
trash_labels = pd.read_csv(filepath_or_buffer=output_csv)

In [12]:
# Save the cleaned labels without the informational detected_objects column.
# The column is dropped here (errors='ignore' tolerates it being gone already)
# so the cleaned file has the same content whether or not the separate drop
# cell has been executed before this one.
trash_labels.drop(columns=['detected_objects'], errors='ignore').to_csv('trash_labels_cleaned.csv', index=False)


In [9]:
# Drop the detected_objects column if not needed.
# Rebinding instead of inplace=True, together with errors='ignore', keeps this
# cell idempotent: re-running it after the column is gone no longer raises KeyError.
trash_labels = trash_labels.drop(columns=['detected_objects'], errors='ignore')

In [11]:
trash_labels

Unnamed: 0,image_file,main_category,sub_category
0,Datacluster Trash (1).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
1,Datacluster Trash (10).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
2,Datacluster Trash (100).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
3,Datacluster Trash (101).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
4,Datacluster Trash (102).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
...,...,...,...
243,Datacluster Trash (95).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
244,Datacluster Trash (96).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
245,Datacluster Trash (97).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
246,Datacluster Trash (98).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
