# Trash Dataset Labeling Script

This script processes XML annotation files for the trash dataset and creates a CSV file with standardized labels:
- Main Category: "Public Cleanliness & Public Property Damage"
- Sub Category: "Garbage Dumping"

In [1]:
# Setup (unchanged parts omitted)
import xml.etree.ElementTree as ET
import csv, os
from pathlib import Path

# --- Input locations ---------------------------------------------------------
# Folder containing the Pascal VOC XML annotation files.
annotation_dir = "Trash/annotation/trash annotated"
# Folder containing the image files referenced by the annotations.
# Point this at annotation_dir instead if the images sit next to the XMLs.
image_base_dir = "Trash/trash/trash"

# --- Output ------------------------------------------------------------------
output_csv = "trash_labels.csv"

# --- Labels applied to every image in this dataset ---------------------------
main_category = "Public Cleanliness & Public Property Damage"
sub_category = "Garbage Dumping"

# True  -> image paths are written as absolute paths (machine-specific)
# False -> image paths are written relative to the current working directory
store_absolute_paths = True

# Echo the effective configuration so it is visible in the cell output.
print(
    f"Processing XML annotations from: {annotation_dir}",
    f"Image base directory: {image_base_dir}",
    f"Output CSV file: {output_csv}",
    sep="\n",
)

Processing XML annotations from: Trash/annotation/trash annotated
Image base directory: Trash/trash/trash
Output CSV file: trash_labels.csv


In [2]:
# Parser returning the image path and object labels for one annotation file
def parse_xml_annotation(xml_file_path: Path, image_dir=None, absolute=None):
    """
    Parse a Pascal VOC XML annotation file and extract the image path and object labels.

    Args:
        xml_file_path: Path (or path string) of the XML annotation file.
        image_dir: Directory searched first for the image named in <filename>.
            When None, the notebook-level ``image_base_dir`` is used.
        absolute: If true, the returned path is absolute; otherwise it is made
            relative to the current working directory when possible.
            When None, the notebook-level ``store_absolute_paths`` is used.

    Returns:
        A tuple ``(image_path_str, objects)``:
        - image_path_str: path of the referenced image as a string. If the image
          is found neither in ``image_dir`` nor next to the XML, the (non-existent)
          path inside ``image_dir`` is still returned so the row is recorded.
        - objects: list of non-empty, stripped <object>/<name> labels.
        ``(None, [])`` is returned when the file cannot be parsed or has no
        usable <filename>; the error is printed rather than raised.
    """
    try:
        xml_file_path = Path(xml_file_path)
        # Explicit arguments win; otherwise fall back to the notebook configuration.
        base_dir_path = Path(image_base_dir if image_dir is None else image_dir)
        use_absolute = store_absolute_paths if absolute is None else absolute

        root = ET.parse(xml_file_path).getroot()

        # Strip BEFORE testing for emptiness: a whitespace-only <filename> would
        # otherwise become "" and resolve to the image directory itself.
        filename = (root.findtext('filename') or '').strip()
        if not filename:
            return None, []

        # Candidate locations, in order of preference
        candidates = [
            base_dir_path / filename,        # preferred (configured image dir)
            xml_file_path.parent / filename  # fallback: same dir as XML
        ]
        # First existing candidate; if none exists, keep the preferred location
        # so the missing image is still recorded.
        image_path = next((c for c in candidates if c.exists()), candidates[0])

        resolved_path = image_path.resolve()
        if use_absolute:
            image_path_str = str(resolved_path)
        else:
            # Store path relative to project root (current working directory)
            try:
                image_path_str = str(resolved_path.relative_to(Path.cwd()))
            except ValueError:
                # Image lives outside the working directory: keep the path as built
                image_path_str = str(image_path)

        # Collect labels, skipping objects whose <name> is missing or blank
        objects = []
        for obj in root.findall('object'):
            label = (obj.findtext('name') or '').strip()
            if label:
                objects.append(label)

        return image_path_str, objects

    except ET.ParseError as e:
        print(f"Parse error: {xml_file_path} -> {e}")
        return None, []
    except Exception as e:
        # Deliberate best-effort: one bad file must not abort the whole batch
        print(f"Unexpected error: {xml_file_path} -> {e}")
        return None, []

In [3]:
# Test one file (adjust name if needed)
from pathlib import Path

# Smoke-test the parser on one known annotation file before the full run
test_xml = Path(annotation_dir, "Datacluster Trash (1).xml")
if not test_xml.exists():
    print(f"Test file not found: {test_xml}")
else:
    img_path, objects = parse_xml_annotation(test_xml)
    print(f"Test image path: {img_path}")
    print(f"Objects: {objects}")

Test image path: C:\Users\lchat\One Drive-UoM\OneDrive - University of Moratuwa\Datathon-2025-\DATATHON 2025\Trash\trash\trash\Datacluster Trash (1).jpg
Objects: ['Domestic Trash\tGarbage']


In [4]:
# Process all XML files
# Builds `results`: one label row per unique image path.
annotation_path = Path(annotation_dir)
# glob() yields entries in filesystem order, which differs between platforms;
# sorting keeps the row order of the CSV reproducible.
xml_files = sorted(annotation_path.glob("*.xml"))
print(f"Found {len(xml_files)} XML files")

processed_images = set()   # image paths already emitted (de-duplication)
results = []               # rows later written to the CSV
skipped_files = []         # XMLs that produced no row (parse failure or duplicate image)

for xml_file in xml_files:
    img_path, objects = parse_xml_annotation(xml_file)
    if not img_path or img_path in processed_images:
        skipped_files.append(xml_file.name)
        continue
    processed_images.add(img_path)
    results.append({
        'image_file': img_path,  # path string as returned by the parser
        'main_category': main_category,
        'sub_category': sub_category,
        'detected_objects': '; '.join(objects) if objects else 'No objects detected'
    })

print(f"Processed {len(results)} unique images")
# Surface dropped files instead of losing them silently
if skipped_files:
    print(f"Skipped {len(skipped_files)} XML files (unparseable or duplicate image): {skipped_files}")

Found 248 XML files
Processed 248 unique images


In [5]:
# Persist the collected label rows to the output CSV
fieldnames = ['image_file', 'main_category', 'sub_category', 'detected_objects']
with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()        # header row
    writer.writerows(results)   # one row per labeled image

print(f"\nCSV file '{output_csv}' created successfully!")
print(f"Total images labeled: {len(results)}")
print("\nSample of first 5 entries:")
# Preview the first few rows, numbered from 1
for position, result in enumerate(results[:5], start=1):
    print(f"{position}. {result['image_file']} -> {result['main_category']} | {result['sub_category']}")


CSV file 'trash_labels.csv' created successfully!
Total images labeled: 248

Sample of first 5 entries:
1. Datacluster Trash (1).jpg -> Public Cleanliness & Public Property Damage | Garbage Dumping
2. Datacluster Trash (10).jpg -> Public Cleanliness & Public Property Damage | Garbage Dumping
3. Datacluster Trash (100).jpg -> Public Cleanliness & Public Property Damage | Garbage Dumping
4. Datacluster Trash (101).jpg -> Public Cleanliness & Public Property Damage | Garbage Dumping
5. Datacluster Trash (102).jpg -> Public Cleanliness & Public Property Damage | Garbage Dumping


In [5]:
# Write the label rows to the CSV, then echo a few rows for a quick check
fieldnames = ['image_file', 'main_category', 'sub_category', 'detected_objects']
with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results)

print(f"CSV '{output_csv}' written. Sample first 3 rows:")
for sample_row in results[:3]:
    print(sample_row)

CSV 'trash_labels.csv' written. Sample first 3 rows:
{'image_file': 'C:\\Users\\lchat\\One Drive-UoM\\OneDrive - University of Moratuwa\\Datathon-2025-\\DATATHON 2025\\Trash\\trash\\trash\\Datacluster Trash (1).jpg', 'main_category': 'Public Cleanliness & Public Property Damage', 'sub_category': 'Garbage Dumping', 'detected_objects': 'Domestic Trash\tGarbage'}
{'image_file': 'C:\\Users\\lchat\\One Drive-UoM\\OneDrive - University of Moratuwa\\Datathon-2025-\\DATATHON 2025\\Trash\\trash\\trash\\Datacluster Trash (10).jpg', 'main_category': 'Public Cleanliness & Public Property Damage', 'sub_category': 'Garbage Dumping', 'detected_objects': 'trash'}
{'image_file': 'C:\\Users\\lchat\\One Drive-UoM\\OneDrive - University of Moratuwa\\Datathon-2025-\\DATATHON 2025\\Trash\\trash\\trash\\Datacluster Trash (100).jpg', 'main_category': 'Public Cleanliness & Public Property Damage', 'sub_category': 'Garbage Dumping', 'detected_objects': 'trash'}


In [6]:
# Display summary statistics
from collections import Counter

print("\n=== SUMMARY ===")
print(f"Total XML files found: {len(xml_files)}")
print(f"Total unique images processed: {len(results)}")
print(f"Main category assigned to all images: {main_category}")
print(f"Sub category assigned to all images: {sub_category}")

# Show distribution of detected object types (for information).
# The report below is phrased as "<type>: N images", so each type is counted
# at most once per image; counting every annotation would overstate the number
# of images whenever one image carries several boxes of the same type.
all_objects = []
for result in results:
    if result['detected_objects'] != 'No objects detected':
        # Labels were joined with '; ' when the rows were built
        objects = result['detected_objects'].split('; ')
        # dict.fromkeys de-duplicates while keeping a deterministic order
        all_objects.extend(dict.fromkeys(objects))

if all_objects:
    object_counts = Counter(all_objects)
    print(f"\nDetected object types and their frequencies:")
    for obj_type, count in object_counts.most_common():
        print(f"  - {obj_type}: {count} images")
else:
    print("\nNo specific object types detected in annotations")


=== SUMMARY ===
Total XML files found: 248
Total unique images processed: 248
Main category assigned to all images: Public Cleanliness & Public Property Damage
Sub category assigned to all images: Garbage Dumping

Detected object types and their frequencies:
  - trash: 240 images
  - dw: 13 images
  - Domestic Trash	Garbage: 1 images


In [7]:
import pandas as pd
# Load the CSV at `output_csv` into a DataFrame for inspection and cleanup
trash_labels = pd.read_csv(output_csv)

In [12]:
# Export the label table without the informational `detected_objects` column.
# The column is dropped here, on a copy, so the exported file has the same
# contents no matter which order the notebook cells are executed in;
# errors='ignore' covers the case where the column has already been removed.
trash_labels.drop(columns=['detected_objects'], errors='ignore').to_csv('trash_labels_cleaned.csv', index=False)


In [9]:
# Drop the detected_objects column if not needed.
# Reassignment with errors='ignore' keeps this cell idempotent: re-running it
# after the column is gone no longer raises KeyError.
trash_labels = trash_labels.drop(columns=['detected_objects'], errors='ignore')

In [11]:
# Show the label table as the cell output (long frames are rendered truncated)
trash_labels

Unnamed: 0,image_file,main_category,sub_category
0,Datacluster Trash (1).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
1,Datacluster Trash (10).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
2,Datacluster Trash (100).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
3,Datacluster Trash (101).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
4,Datacluster Trash (102).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
...,...,...,...
243,Datacluster Trash (95).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
244,Datacluster Trash (96).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
245,Datacluster Trash (97).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
246,Datacluster Trash (98).jpg,Public Cleanliness & Public Property Damage,Garbage Dumping
