In [32]:
import os, glob
import pandas as pd

In [33]:
def find_repo_root(current_path=None, marker_dir="Datasets"):
    if current_path is None:
        current_path = os.getcwd()
    while True:
        if marker_dir in os.listdir(current_path):
            return current_path
        parent = os.path.dirname(current_path)
        if parent == current_path:
            raise FileNotFoundError(f"Could not find repo root containing '{marker_dir}'")
        current_path = parent

REPO_ROOT = find_repo_root()
print("Repo root found at:", REPO_ROOT)

Repo root found at: /root/MaskGenerator


In [34]:
def get_common_pairs(image_map, mask_map):
    image_ids = set(image_map.keys())
    mask_ids = set(mask_map.keys())

    missing_in_masks = image_ids - mask_ids
    missing_in_images = mask_ids - image_ids

    if missing_in_masks:
        print(f"Warning: {len(missing_in_masks)} images do not have corresponding masks.")
    if missing_in_images:
        print(f"Warning: {len(missing_in_images)} masks do not have corresponding images.")

    common_ids = sorted(image_ids & mask_ids)
    pairs = [(to_repo_relative(image_map[id_]), to_repo_relative(mask_map[id_])) for id_ in common_ids]

    if not pairs:
        raise ValueError("No common image-mask pairs found.")
    return pairs

def save_pairs_csv(pairs, output_file):
    if not pairs:
        raise ValueError("No valid image-mask pairs found.")
    pairs_df = pd.DataFrame(pairs, columns=["image_path", "mask_path"])
    pairs_df.to_csv(output_file, index=False)
    print(f"Saved {len(pairs)} pairs to {output_file}")

def to_repo_relative(path, repo_root=REPO_ROOT):
    abs_path = os.path.abspath(path)
    rel_path = os.path.relpath(abs_path, repo_root)
    return rel_path.replace("\\", "/")

## Index Simu

In [35]:
def index_simu(simu_dir, output_file):
    image_files = glob.glob(os.path.join(simu_dir, "Images", "*.png"))
    mask_files = glob.glob(os.path.join(simu_dir, "Masks", "*.png"))

    image_map = {os.path.splitext(os.path.basename(p))[0].split('_')[-1]: p for p in image_files}
    mask_map  = {os.path.splitext(os.path.basename(p))[0].split('_')[-1]: p for p in mask_files}

    pairs = get_common_pairs(image_map, mask_map)

    save_pairs_csv(pairs, output_file)

## Index CARLANE

In [36]:
def get_key_from_path(p, dataset_root):
    rel_path = os.path.relpath(p, dataset_root)  # ex : "val/source/123_image.jpg"
    dirname = os.path.dirname(rel_path)          # ex : "val/source"
    filename = os.path.basename(rel_path)        # ex : "123_image.jpg"
    file_id = filename.split('_')[0]              # ex : "123"
    key = os.path.join(dirname, file_id)          # ex : "val/source/123"
    return key

def index_carlane(carlane_dir, output_file):
    image_files = glob.glob(os.path.join(carlane_dir, "**", "*_image.jpg"), recursive=True)
    mask_files = glob.glob(os.path.join(carlane_dir, "**", "*_label.png"), recursive=True)

    print(f"Found {len(image_files)} images and {len(mask_files)} masks in {carlane_dir}")

    image_map = {get_key_from_path(p, carlane_dir): p for p in image_files}
    mask_map = {get_key_from_path(p, carlane_dir): p for p in mask_files}

    print(f"Image map size: {len(image_map)}")
    print(f"Mask map size: {len(mask_map)}")

    pairs = get_common_pairs(image_map, mask_map)

    save_pairs_csv(pairs, output_file)

In [37]:
DATASETS_DIR = "../Datasets/"

SIMU_DIR = os.path.join(DATASETS_DIR, "simu_v0")
CARLANE_DIR = os.path.join(DATASETS_DIR, "CARLANE")

SIMU_CSV = os.path.join(SIMU_DIR, "simu.csv")
CARLANE_CSV = os.path.join(CARLANE_DIR, "carlane.csv")

print("Indexing Simu dataset...")
index_simu(SIMU_DIR, SIMU_CSV)

print("Indexing CARLANE dataset...")
index_carlane(CARLANE_DIR, CARLANE_CSV)

Indexing Simu dataset...
Saved 4817 pairs to ../Datasets/simu_v0/simu.csv
Indexing CARLANE dataset...
Found 228987 images and 175340 masks in ../Datasets/CARLANE
Image map size: 225027
Mask map size: 171380
Saved 171380 pairs to ../Datasets/CARLANE/carlane.csv


## Global CSV

In [38]:
GLOBAL_CSV = os.path.join(DATASETS_DIR, "all.csv")
df_simu = pd.read_csv(SIMU_CSV)
df_carlane = pd.read_csv(CARLANE_CSV)

df_global = pd.concat([df_simu, df_carlane], ignore_index=True)

df_global.to_csv(GLOBAL_CSV, index=False)

print(f"Global dataset CSV saved to {GLOBAL_CSV}, total pairs: {len(df_global)}")

Global dataset CSV saved to ../Datasets/all.csv, total pairs: 176197


## Sub Datasets

In [40]:
MOLANE_DIR = os.path.join(CARLANE_DIR, "MoLane")
MOLANE_CSV = os.path.join(MOLANE_DIR, "molane.csv")

df_molane = df_carlane[df_carlane['image_path'].str.contains("MoLane")]
df_molane.to_csv(MOLANE_CSV, index=False)
print(f"MoLane dataset CSV saved to {MOLANE_CSV}, total pairs: {len(df_molane)}")

MOLANE_VAL_SOURCE_CSV = os.path.join(MOLANE_DIR, "molane_val_source.csv")
df_molane_val_source = df_molane[df_molane['image_path'].str.contains("val/source")]
df_molane_val_source.to_csv(MOLANE_VAL_SOURCE_CSV, index=False)
print(f"MoLane validation source CSV saved to {MOLANE_VAL_SOURCE_CSV}, total pairs: {len(df_molane_val_source)}")

MOLANE_VAL_TARGET_CSV = os.path.join(MOLANE_DIR, "molane_val_target.csv")
df_molane_val_target = df_molane[df_molane['image_path'].str.contains("val/target")]
df_molane_val_target.to_csv(MOLANE_VAL_TARGET_CSV, index=False)
print(f"MoLane validation target CSV saved to {MOLANE_VAL_TARGET_CSV}, total pairs: {len(df_molane_val_target)}")

MOLANE_TEST_CSV = os.path.join(MOLANE_DIR, "molane_test.csv")
df_molane_test = df_molane[df_molane['image_path'].str.contains("test")]
df_molane_test.to_csv(MOLANE_TEST_CSV, index=False)
print(f"MoLane test CSV saved to {MOLANE_TEST_CSV}, total pairs: {len(df_molane_test)}")

MoLane dataset CSV saved to ../Datasets/CARLANE/MoLane/molane.csv, total pairs: 87000
MoLane validation source CSV saved to ../Datasets/CARLANE/MoLane/molane_val_source.csv, total pairs: 4000
MoLane validation target CSV saved to ../Datasets/CARLANE/MoLane/molane_val_target.csv, total pairs: 2000
MoLane test CSV saved to ../Datasets/CARLANE/MoLane/molane_test.csv, total pairs: 1000
