## Mount Drive and set the Kaggle API token

In [None]:
# Mount Google Drive at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Install the `kaggle` package (IPython shell escape: notebook-only syntax).
!pip install kaggle

# Disabled example of installing the API token from a kaggle.json kept in Drive:
# create ~/.kaggle, copy the token there and restrict its permissions.
# !mkdir -p ~/.kaggle
# !cp /content/drive/MyDrive/kaggle.json ~/.kaggle
# !chmod 600 ~/.kaggle/kaggle.json
# Load Kaggle API token here

## Download the Kaggle datasets if they are not cached in Drive

In [None]:
import os
import shutil
import subprocess

print("\nDownloading datasets from Kaggle")

# Drive folder that caches the downloaded datasets; a dataset whose folder
# already exists here is not downloaded again.
drive_kaggle_base = '/content/drive/MyDrive/KaggleData/'

datasets = [
    'andrewmvd/pothole-detection', # good
    'chitholian/annotated-potholes-dataset', # good dataset (various image sizes, xml annotations)
    'atulyakumar98/pothole-detection-dataset', # Not annotated (no object frames) + no pothole images
    'rajdalsaniya/pothole-detection-dataset', # Good but most images are confusing (already in yolov8 format)
]

for ds in datasets:

  ds_user, ds_name = ds.split('/')

  # Cache folder is named "<dataset>_<user>".
  ds_folder_name = f"{ds_name}_{ds_user}"
  drive_path = f"{drive_kaggle_base}{ds_folder_name}"

  if os.path.exists(drive_path):
    print(f"{ds} already exists in drive. Skipping download")
    continue

  local_path = f'/content/KaggleData/{ds_folder_name}'

  # Download and unzip to local disk first. The command is passed as an
  # argument list (no shell) and its exit status is checked: a failed download
  # must not create the Drive folder, otherwise every later run would treat
  # the dataset as cached and skip it.
  try:
    subprocess.run(
        ['kaggle', 'datasets', 'download', '-d', ds, '-p', local_path, '--unzip'],
        check=True,
    )
  except (subprocess.CalledProcessError, FileNotFoundError) as e:
    print(f"Download of {ds} failed: {e}")
    continue

  # Copy the unzipped dataset into the Drive cache. copytree creates
  # drive_path (and missing parents) itself.
  try:
    shutil.copytree(local_path, drive_path)
  except OSError as e:
    # Remove a partial copy so the next run retries instead of skipping.
    shutil.rmtree(drive_path, ignore_errors=True)
    print(f"Copying {ds} to drive failed: {e}")
    continue

  print(f"Saved to {drive_path}")


## Pipeline for transforming xml annotations to YOLO Format

#### 1. Collecting data, renaming images and labels, generating the contents of the YOLO label (.txt) files

In [None]:
# Global running index used to build unique image/label names.
# The collection cells increment it once per registered sample, so re-run this
# cell to restart numbering at 1 before re-running them.
global_idx = 1

In [None]:
import xml.etree.ElementTree as ET
import re

# Datasets whose annotations are XML files (one per image) with <filename>,
# <size> and <object>/<bndbox> elements.
datasets = [
    {
      "name": "andrewmvd",
      "img_dir":f"{drive_kaggle_base}pothole-detection_andrewmvd/images",
      "labels_dir": f"{drive_kaggle_base}pothole-detection_andrewmvd/annotations"
    },
    {
      "name": "chitholian",
      "img_dir": f"{drive_kaggle_base}annotated-potholes-dataset_chitholian/annotated-images",
      "labels_dir": f"{drive_kaggle_base}annotated-potholes-dataset_chitholian/annotated-images",
    }
]

class_map = {'pothole': 0}
samples = []

# Every <object> is written with the single 'pothole' class id.
pothole_class_id = class_map['pothole']


def _voc_box_to_yolo_line(bndbox, img_w, img_h, class_id):
  """Convert one <bndbox> element into a YOLO label line.

  The corner coordinates are clamped to the image and put in ascending order,
  then converted to a normalized "class cx cy w h" string.

  Returns None when the box has no area left after clamping.
  """
  x_a = min(max(float(bndbox.find('xmin').text), 0.0), img_w)
  x_b = min(max(float(bndbox.find('xmax').text), 0.0), img_w)
  y_a = min(max(float(bndbox.find('ymin').text), 0.0), img_h)
  y_b = min(max(float(bndbox.find('ymax').text), 0.0), img_h)

  xmin, xmax = min(x_a, x_b), max(x_a, x_b)
  ymin, ymax = min(y_a, y_b), max(y_a, y_b)

  if xmax <= xmin or ymax <= ymin:
    return None

  cx = (xmin + xmax) / 2 / img_w
  cy = (ymin + ymax) / 2 / img_h
  pw = (xmax - xmin) / img_w
  ph = (ymax - ymin) / img_h

  return f"{class_id} {cx:.6f} {cy:.6f} {pw:.6f} {ph:.6f}"


for ds in datasets:

  ds_name = ds["name"]
  img_dir = ds["img_dir"]
  labels_dir = ds["labels_dir"]

  if not os.path.exists(img_dir) or not os.path.exists(labels_dir):
    print(f"Skipping dataset {ds_name}: missing dirs")
    continue

  # Sorted so that the generated numbering is the same on every run
  # (os.listdir returns entries in arbitrary order).
  xml_files = sorted(f for f in os.listdir(labels_dir) if f.endswith('.xml'))

  for xml_file in xml_files:
    try:
      root = ET.parse(os.path.join(labels_dir, xml_file)).getroot()

      filename = root.find("filename").text
      size = root.find("size")

      w = float(size.find("width").text)
      h = float(size.find("height").text)

      # The size is the divisor for normalization; reject unusable values
      # explicitly instead of relying on a ZeroDivisionError.
      if w <= 0 or h <= 0:
        print(f"{ds_name}: {xml_file} has invalid image size {w}x{h}. Skipping")
        continue

      # Find the associated image
      img_path = os.path.join(img_dir, filename)
      if not os.path.exists(img_path):
        print(f"{ds_name}: Image {filename} not found. Skipping")
        continue

      # Parse annotations
      yolo_lines = []

      for obj in root.iter("object"):
        line = _voc_box_to_yolo_line(obj.find("bndbox"), w, h, pothole_class_id)
        if line is None:
          print(f"{ds_name}: {xml_file} contains an empty bounding box. Box skipped")
          continue
        yolo_lines.append(line)

    except (ET.ParseError, AttributeError, TypeError, ValueError, OSError) as e:
      # Malformed XML, missing tags/text, non-numeric values or unreadable file.
      print(f"Error during parsing {xml_file}: {e}")
      continue

    # Create new image and label file names (the image keeps its extension)
    new_img_base = f"{ds_name}_{global_idx}"
    ext = os.path.splitext(filename)[1]
    new_img_name = f"{new_img_base}{ext}"
    new_label_name = f"{new_img_base}.txt"

    # Register the sample: original image path, new names and label content
    samples.append({
        "img_path": img_path,                # path for original img
        "new_img_name": new_img_name,
        "new_label_name": new_label_name,
        "yolo_lines": yolo_lines
    })

    # Only successfully registered samples consume an index.
    global_idx += 1

In [None]:
# Dataset that already ships YOLO .txt labels; image and label directories
# are paired by position (train with train, valid with valid).
dataset_rajdalsaniya = {
  "name": "rajdalsaniya",
  "img_dirs":[
    f"{drive_kaggle_base}pothole-detection-dataset_rajdalsaniya/train/images",
    f"{drive_kaggle_base}pothole-detection-dataset_rajdalsaniya/valid/images"
  ],
  "labels_dirs": [
    f"{drive_kaggle_base}pothole-detection-dataset_rajdalsaniya/train/labels",
    f"{drive_kaggle_base}pothole-detection-dataset_rajdalsaniya/valid/labels"
  ]
}

already_formated_samples = []

ds_name = dataset_rajdalsaniya["name"]

for idx, (img_dir, ann_dir) in enumerate(zip(dataset_rajdalsaniya['img_dirs'], dataset_rajdalsaniya['labels_dirs'])):

  if not os.path.exists(img_dir) or not os.path.exists(ann_dir):
    # Report which pair is missing (pair index and the actual paths).
    print(f"Skipping num_{idx} ({img_dir}, {ann_dir}): missing dirs")
    continue

  # Sorted so that the generated numbering is the same on every run
  # (os.listdir returns entries in arbitrary order).
  txt_files = sorted(f for f in os.listdir(ann_dir) if f.endswith('.txt'))

  for txt_file in txt_files:

    # We know that all images in this dataset have .jpg extension
    filename = f"{os.path.splitext(txt_file)[0]}.jpg"

    # Find the associated image
    img_path = os.path.join(img_dir, filename)
    if not os.path.exists(img_path):
      print(f"{ds_name}: Image {filename} not found. Skipping")
      continue

    # Create new image and label file names
    new_img_base = f"{ds_name}_{global_idx}"
    new_img_name = f"{new_img_base}.jpg"
    new_label_name = f"{new_img_base}.txt"

    label_path = os.path.join(ann_dir, txt_file)

    # Register the sample: original image/label paths and their new names
    already_formated_samples.append({
        "img_path": img_path,                    # path for original img
        "label_path": label_path,                # path for original label .txt file
        "new_img_name": new_img_name,
        "new_label_name": new_label_name,
    })

    # Only registered samples consume an index.
    global_idx += 1

In [None]:
# Images from the "normal" folder are registered without any label content,
# i.e. as negative samples.
dataset_atulyakumar98 = {
  "name": "atulyakumar98",
  "img_dir": f"{drive_kaggle_base}pothole-detection-dataset_atulyakumar98/normal"
}

ds_name = dataset_atulyakumar98['name']
img_dir = dataset_atulyakumar98['img_dir']

no_pothole_samples = []

# File extensions accepted as images (compared case-insensitively).
image_exts = ('.jpg', '.jpeg', '.png', '.bmp')

if not os.path.exists(img_dir):
  print(f"Skipping {ds_name} img_dir: missing dirs")

else:

  # Sorted so that the generated numbering is the same on every run
  # (os.listdir returns entries in arbitrary order).
  for filename in sorted(os.listdir(img_dir)):

    img_path = os.path.join(img_dir, filename)
    ext = os.path.splitext(filename)[1]

    # Sub-directories and non-image files must not become samples.
    if not os.path.isfile(img_path) or ext.lower() not in image_exts:
      print(f"{ds_name}: {filename} is not an image file. Skipping")
      continue

    # The new name keeps the source extension instead of forcing ".jpg".
    new_img_base = f"{ds_name}_{global_idx}"
    new_img_name = f"{new_img_base}{ext}"
    new_label_name = f"{new_img_base}.txt"

    no_pothole_samples.append({
        "img_path": img_path,                # path for original img
        "new_img_name": new_img_name,
        "new_label_name": new_label_name,
    })

    global_idx += 1


#### 2. Loading data to output dir

In [None]:
import shutil

# Root of the transformed dataset; images and labels each get a "train"
# sub-folder underneath it.
output_dir = "/content/drive/MyDrive/Pothole_detection_project/pothole_dataset"
images_dir, labels_dir = (
    os.path.join(output_dir, kind, "train") for kind in ("images", "labels")
)

# Create both target folders (no error if they already exist).
for target_dir in (images_dir, labels_dir):
    os.makedirs(target_dir, exist_ok=True)

print(f"\nSaving dataset to: {output_dir}")
print(f"   Images → {images_dir}")
print(f"   Labels → {labels_dir}")

def save_yolo_sample(img_path, new_img_name, new_label_name, yolo_lines=None, label_path=None,
                     out_images_dir=None, out_labels_dir=None):
    """
    Save one image + YOLO label to output dir.

    Both source files are checked before anything is written, so a failed
    call leaves no image without its label in the output folders.

    Args:
        img_path: source image path
        new_img_name: new filename (e.g. "ds_001.jpg")
        new_label_name: name of annotation file (e.g. "ds_001.txt")
        yolo_lines: list of strings (e.g. ["0 0.5 0.5 0.1 0.1"]) or None
        label_path: if provided, copy existing .txt file instead of creating
        out_images_dir: folder receiving the image; defaults to the
            module-level `images_dir`
        out_labels_dir: folder receiving the label; defaults to the
            module-level `labels_dir`

    Returns:
        True if image and label were written, False if the source image or
        the given label file does not exist (a warning is printed).
    """
    if out_images_dir is None:
        out_images_dir = images_dir
    if out_labels_dir is None:
        out_labels_dir = labels_dir

    if not os.path.exists(img_path):
        print(f"Warning: Image not found: {img_path}")
        return False

    # A label file that was requested but is missing must not fall through to
    # the "write empty file" branch: that would silently turn an annotated
    # image into a negative sample.
    if label_path and not os.path.exists(label_path):
        print(f"Warning: Label not found: {label_path}")
        return False

    # --- Copy Image ---
    shutil.copy2(img_path, os.path.join(out_images_dir, new_img_name))

    # --- Save Label ---
    dst_ann = os.path.join(out_labels_dir, new_label_name)

    if label_path:
        # Case: already has YOLO .txt → copy it
        shutil.copy2(label_path, dst_ann)
    else:
        # Case: XML to YOLO lines, or no pothole (empty)
        with open(dst_ann, 'w') as f:
            if yolo_lines:
                f.write('\n'.join(yolo_lines))
            # else: empty file → negative sample

    return True

In [None]:
from tqdm import tqdm

# One entry per sample group: (banner printed first, progress-bar label, samples).
#   - XML → YOLO (converted): samples carry "yolo_lines"
#   - Already in YOLO format: samples carry "label_path"
#   - No pothole (negative) samples: neither key, so an empty label is written
sample_groups = [
    ("\nProcessing XML samples...", "XML → YOLO", samples),
    ("\nProcessing pre-formatted YOLO samples...", "YOLO → YOLO", already_formated_samples),
    ("\nProcessing negative samples...", "Negative samples", no_pothole_samples),
]

# save_yolo_sample returns False when a sample could not be written; count
# those instead of discarding the result.
failed_count = 0

for banner, progress_label, group in sample_groups:
    print(banner)
    for sample in tqdm(group, desc=progress_label):
        saved = save_yolo_sample(
            img_path=sample["img_path"],
            new_img_name=sample["new_img_name"],
            new_label_name=sample["new_label_name"],
            # Absent keys fall back to None, the function's own defaults.
            yolo_lines=sample.get("yolo_lines"),
            label_path=sample.get("label_path"),
        )
        if not saved:
            failed_count += 1

if failed_count:
    print(f"\nWarning: {failed_count} sample(s) could not be saved")

print(f"\n\nDataset ready! Total images: {len(os.listdir(images_dir))}")