# Potholes Dataset Labeling Script

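This script processes the COCO annotation file for the potholes dataset and writes a CSV file that assigns standardized labels to each image:
- Main Category: "Road & Infrastructure Issues"
- Sub Category: "Potholes" (annotations named "broken road" are mapped to "Damaged/Cracked Roads" by the first code cell)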

In [10]:
import json
import csv

# Load the COCO annotations for the training split. The encoding is pinned
# to UTF-8 so the read does not depend on the platform's default encoding.
with open("Potholes/train/_annotations.coco.json", encoding="utf-8") as f:
    coco = json.load(f)

# Maps each original COCO category name to the standardized labels written
# to the CSV. Category names missing from this dictionary are ignored.
category_mapping = {
    "potholes-and-speedbreakers": {
        "main_category": "Road & Infrastructure Issues",
        "sub_category": "Potholes"
    },
    "broken road": {
        "main_category": "Road & Infrastructure Issues",
        "sub_category": "Damaged/Cracked Roads"
    },
    "potholes": {
        "main_category": "Road & Infrastructure Issues",
        "sub_category": "Potholes"
    }
}

# Map category id to name for convenience
id_to_name = {cat["id"]: cat["name"] for cat in coco["categories"]}

# Index the category ids present in each image with a single pass over the
# annotations. Re-scanning the full annotation list for every image costs
# O(images x annotations); this index makes the whole job linear.
category_ids_by_image = {}
for ann in coco["annotations"]:
    category_ids_by_image.setdefault(ann["image_id"], set()).add(ann["category_id"])

# Write one CSV row per image: file name, main categories, sub categories.
with open("image_labels.csv", "w", newline="", encoding="utf-8") as csvfile:
    writer = csv.writer(csvfile)
    writer.writerow(["image_file", "main_categories", "sub_categories"])

    for img in coco["images"]:
        image_id = img["id"]
        image_file = img["file_name"]

        # Unique category names annotated on this image; an image without
        # annotations yields an empty set and therefore empty label cells.
        category_names = {
            id_to_name[category_id]
            for category_id in category_ids_by_image.get(image_id, ())
        }

        main_cats = set()
        sub_cats = set()

        for cname in category_names:
            if cname in category_mapping and category_mapping[cname]["main_category"]:
                main_cats.add(category_mapping[cname]["main_category"])
                sub_cats.add(category_mapping[cname]["sub_category"])

        # Sets have no guaranteed iteration order, so sort before joining to
        # keep multi-label cells identical from one run to the next.
        # Joining an empty set gives the empty string.
        main_cat_str = "; ".join(sorted(main_cats))
        sub_cat_str = "; ".join(sorted(sub_cats))

        writer.writerow([image_file, main_cat_str, sub_cat_str])

print("Done labeling images with main and sub categories.")


Done labeling images with main and sub categories.


In [11]:
import pandas as pd
# Read back "image_labels.csv", the file written by the labeling cell, to inspect it.
potholes_labels_i_did = pd.read_csv("image_labels.csv")

In [12]:
# Display the loaded labels; the bare expression makes Jupyter render the DataFrame.
potholes_labels_i_did

Unnamed: 0,image_file,main_categories,sub_categories
0,513_jpg.rf.3949764ef9d4515b77482e4e506692db.jpg,Road & Infrastructure Issues,Potholes
1,226_jpg.rf.3934bd97b1934b62561499e7ce1c35dd.jpg,Road & Infrastructure Issues,Potholes
2,498_jpg.rf.38a095ff185ff22f980349f4799743cd.jpg,Road & Infrastructure Issues,Potholes
3,97_jpg.rf.38a662c6d41c468becef03a85d7c56a9.jpg,Road & Infrastructure Issues,Potholes
4,217_jpg.rf.3884b2f92c87dc7e5fe4937793ac7e8c.jpg,Road & Infrastructure Issues,Potholes
...,...,...,...
3790,235_jpg.rf.7dcbfa457e2fbfa563deadb8c7a35340.jpg,Road & Infrastructure Issues,Potholes
3791,557_jpg.rf.7e399ff7301c98943523950a3708d476.jpg,Road & Infrastructure Issues,Potholes
3792,585_jpg.rf.7e0f4047348ecb82d559cf31db0faaf7.jpg,Road & Infrastructure Issues,Potholes
3793,567_jpg.rf.7e2748568c089889833670675f956a17.jpg,Road & Infrastructure Issues,Potholes


# The rest of the code from here on was not used

In [2]:
# Load COCO annotation file
#
# Only the file read and JSON parse are guarded, so an error raised while
# printing the summary (e.g. a missing 'images' key) is not misreported as a
# load failure. Each handler prints its hint and then re-raises: the
# following cells need `coco`, and continuing after a failed load would
# leave it undefined or silently reuse a value from an earlier execution.
try:
    with open(annotation_file, 'r', encoding='utf-8') as f:
        coco = json.load(f)
except FileNotFoundError:
    print(f"Error: Could not find annotation file at {annotation_file}")
    print("Please check the file path.")
    raise
except json.JSONDecodeError:
    print(f"Error: Invalid JSON format in {annotation_file}")
    raise
except Exception as e:
    print(f"Unexpected error: {e}")
    raise
else:
    print("Successfully loaded COCO annotations")
    print(f"Number of images: {len(coco['images'])}")
    print(f"Number of annotations: {len(coco['annotations'])}")
    print(f"Number of categories: {len(coco['categories'])}")

    # Show category information
    print("\nCategories found in the dataset:")
    for cat in coco['categories']:
        print(f"  ID {cat['id']}: {cat['name']}")

Successfully loaded COCO annotations
Number of images: 3795
Number of annotations: 19607
Number of categories: 3

Categories found in the dataset:
  ID 0: potholes-and-speedbreakers
  ID 1: broken road
  ID 2: potholes


In [3]:
# Build the lookup from original category name to standardized labels.
# Every category in the dataset receives the same main/sub category pair,
# each as its own dictionary.
category_mapping = {
    cat['name']: {
        "main_category": main_category,
        "sub_category": sub_category
    }
    for cat in coco['categories']
}

print("Category mapping created:")
for source_name, labels in category_mapping.items():
    print(f"  '{source_name}' -> {labels['main_category']} | {labels['sub_category']}")

# Category id -> category name, used when resolving annotations
id_to_name = dict((cat["id"], cat["name"]) for cat in coco["categories"])
print(f"\nID to name mapping created with {len(id_to_name)} entries")

Category mapping created:
  'potholes-and-speedbreakers' -> Road & Infrastructure Issues | Potholes
  'broken road' -> Road & Infrastructure Issues | Potholes
  'potholes' -> Road & Infrastructure Issues | Potholes

ID to name mapping created with 3 entries


In [4]:
# Process images and create labels
results = []

print("Processing images...")

# Index the category ids present in each image with a single pass over the
# annotations. Filtering the whole annotation list once per image costs
# O(images x annotations); with this index the work is linear.
category_ids_by_image = {}
for ann in coco["annotations"]:
    category_ids_by_image.setdefault(ann["image_id"], set()).add(ann["category_id"])

for img in coco["images"]:
    image_id = img["id"]
    image_file = img["file_name"]

    # Unique category names for this image; an image without annotations
    # yields an empty set.
    category_names = {
        id_to_name[category_id]
        for category_id in category_ids_by_image.get(image_id, ())
    }

    # Collect main and sub categories
    main_cats = set()
    sub_cats = set()

    for cname in category_names:
        if cname in category_mapping:
            main_cats.add(category_mapping[cname]["main_category"])
            sub_cats.add(category_mapping[cname]["sub_category"])

    # Sets have no guaranteed iteration order, so sort before joining to keep
    # multi-valued cells identical between runs. Images with no mapped
    # category fall back to the default labels.
    main_cat_str = "; ".join(sorted(main_cats)) if main_cats else main_category
    sub_cat_str = "; ".join(sorted(sub_cats)) if sub_cats else sub_category

    # Store result
    results.append({
        'image_file': image_file,
        'main_category': main_cat_str,
        'sub_category': sub_cat_str,
        'detected_objects': '; '.join(sorted(category_names)) if category_names else 'No objects detected'
    })

print(f"Processed {len(results)} images")

Processing images...
Processed 3795 images
Processed 3795 images


In [5]:
# Persist the per-image labels: a header row followed by one row per result.
fieldnames = ['image_file', 'main_category', 'sub_category', 'detected_objects']
with open(output_csv, 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(results)

print(f"\nCSV file '{output_csv}' created successfully!")
print(f"Total images labeled: {len(results)}")

# Preview the first five labeled images, numbered from 1
print(f"\nSample of first 5 entries:")
for position, entry in enumerate(results[:5], start=1):
    print(f"{position}. {entry['image_file']} -> {entry['main_category']} | {entry['sub_category']}")


CSV file 'potholes_labels.csv' created successfully!
Total images labeled: 3795

Sample of first 5 entries:
1. 513_jpg.rf.3949764ef9d4515b77482e4e506692db.jpg -> Road & Infrastructure Issues | Potholes
2. 226_jpg.rf.3934bd97b1934b62561499e7ce1c35dd.jpg -> Road & Infrastructure Issues | Potholes
3. 498_jpg.rf.38a095ff185ff22f980349f4799743cd.jpg -> Road & Infrastructure Issues | Potholes
4. 97_jpg.rf.38a662c6d41c468becef03a85d7c56a9.jpg -> Road & Infrastructure Issues | Potholes
5. 217_jpg.rf.3884b2f92c87dc7e5fe4937793ac7e8c.jpg -> Road & Infrastructure Issues | Potholes


In [6]:
# Report overall totals and how often each original category was detected
from collections import Counter

print("\n=== SUMMARY ===")
print(f"Total images processed: {len(results)}")
print(f"Main category assigned to all images: {main_category}")
print(f"Sub category assigned to all images: {sub_category}")

# Flatten the '; '-joined detected_objects strings back into individual
# names, skipping images that carry the 'No objects detected' placeholder.
all_objects = [
    name
    for result in results
    if result['detected_objects'] != 'No objects detected'
    for name in result['detected_objects'].split('; ')
]

if not all_objects:
    print("\nNo specific object types detected in annotations")
else:
    # Each image lists a category at most once, so a count is a number of images
    object_counts = Counter(all_objects)
    print(f"\nDetected object types and their frequencies:")
    for obj_type, count in object_counts.most_common():
        print(f"  - {obj_type}: {count} images")


=== SUMMARY ===
Total images processed: 3795
Main category assigned to all images: Road & Infrastructure Issues
Sub category assigned to all images: Potholes

Detected object types and their frequencies:
  - potholes: 3183 images
  - broken road: 786 images


In [7]:
# Load and display the created CSV using pandas
import pandas as pd

# Read the CSV back from disk so the preview shows what was actually saved
potholes_labels = pd.read_csv(output_csv)
print(
    f"\nLoaded CSV with {len(potholes_labels)} rows",
    "\nFirst few rows:",
    sep="\n",
)
potholes_labels.head(5)


Loaded CSV with 3795 rows

First few rows:


Unnamed: 0,image_file,main_category,sub_category,detected_objects
0,513_jpg.rf.3949764ef9d4515b77482e4e506692db.jpg,Road & Infrastructure Issues,Potholes,potholes
1,226_jpg.rf.3934bd97b1934b62561499e7ce1c35dd.jpg,Road & Infrastructure Issues,Potholes,potholes
2,498_jpg.rf.38a095ff185ff22f980349f4799743cd.jpg,Road & Infrastructure Issues,Potholes,potholes
3,97_jpg.rf.38a662c6d41c468becef03a85d7c56a9.jpg,Road & Infrastructure Issues,Potholes,potholes
4,217_jpg.rf.3884b2f92c87dc7e5fe4937793ac7e8c.jpg,Road & Infrastructure Issues,Potholes,potholes


In [8]:
# Keep only the file name and the two label columns; detected_objects is
# dropped for cases where it is not needed for ML training.
potholes_labels_clean = potholes_labels.drop(['detected_objects'], axis='columns').copy()
print("Cleaned dataset (without detected_objects column):")
potholes_labels_clean.head(5)

Cleaned dataset (without detected_objects column):


Unnamed: 0,image_file,main_category,sub_category
0,513_jpg.rf.3949764ef9d4515b77482e4e506692db.jpg,Road & Infrastructure Issues,Potholes
1,226_jpg.rf.3934bd97b1934b62561499e7ce1c35dd.jpg,Road & Infrastructure Issues,Potholes
2,498_jpg.rf.38a095ff185ff22f980349f4799743cd.jpg,Road & Infrastructure Issues,Potholes
3,97_jpg.rf.38a662c6d41c468becef03a85d7c56a9.jpg,Road & Infrastructure Issues,Potholes
4,217_jpg.rf.3884b2f92c87dc7e5fe4937793ac7e8c.jpg,Road & Infrastructure Issues,Potholes


In [9]:
# Write the cleaned labels to disk without the pandas index column, then
# report the frame's shape, columns and distinct label values.
clean_csv = "potholes_labels_clean.csv"
potholes_labels_clean.to_csv(clean_csv, index=False)

clean_shape = potholes_labels_clean.shape
clean_columns = potholes_labels_clean.columns.tolist()
main_values = potholes_labels_clean['main_category'].unique()
sub_values = potholes_labels_clean['sub_category'].unique()

print(f"\nClean CSV saved as: {clean_csv}")
print(f"Shape: {clean_shape}")
print(f"\nColumns: {clean_columns}")
print(f"\nUnique main categories: {main_values}")
print(f"Unique sub categories: {sub_values}")


Clean CSV saved as: potholes_labels_clean.csv
Shape: (3795, 3)

Columns: ['image_file', 'main_category', 'sub_category']

Unique main categories: ['Road & Infrastructure Issues']
Unique sub categories: ['Potholes']
