# Data Splitter (v1.0)
This notebook covers the data preparation for version 1.0 of Project SBAFN. Specifically, given the 440k+ images downloaded to a specific developer's local system, we split the data by sequence into train, validation, and test sets, draw a sample of 1,600 images across those splits, and prepare that sample for annotation using Label Studio. In future versions, we will try to re-train the object detection model for increased performance.

In [1]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse, unquote
from pathlib import Path
import os, shutil

from sklearn.model_selection import GroupShuffleSplit
import pandas as pd
import numpy as np
import yaml

In [2]:
# Locate the directory this code lives in. `__file__` is only defined when the
# code runs as a script; in a notebook kernel the lookup raises NameError, so
# fall back to the current working directory.
try:
    HERE = Path(__file__).resolve().parent
except NameError:
    HERE = Path.cwd()
# The repository root is taken to be two levels above HERE.
# NOTE(review): in the notebook case this depends on the kernel's working
# directory - confirm it is started from this notebook's folder.
REPO_ROOT = HERE.parent.parent
PIPELINE_DIR = REPO_ROOT / "pipeline"
MODEL_DIR = REPO_ROOT / "models"

# Pipeline configuration (read below for the manifest directory and file name).
with open(PIPELINE_DIR / "configs" / "config.yaml", "r", encoding="utf-8") as f:
    pipeline_cfg = yaml.safe_load(f)

# Model configuration (read below for the shared random seed).
with open(MODEL_DIR / "configs"/ "model_config.yaml", "r", encoding="utf-8") as f:
    model_cfg = yaml.safe_load(f)

# Seed reused by every split/sample/shuffle call in this notebook; 67 is the
# fallback when the model config has no `random_state` key.
random_state = model_cfg.get("random_state", 67)
# Manifest directory (relative to REPO_ROOT) and file name, read from
# mapillary_api.manifest in the pipeline config, with literal fallbacks.
meta_outdir = pipeline_cfg.get("mapillary_api", {}).get("manifest", {}).get("out_dir", "data/meta/")
meta_name = pipeline_cfg.get("mapillary_api", {}).get("manifest", {}).get("local_manifest_name", "mapillary_manifest_local.csv")

In [3]:
# Load the local manifest and rename `file_path` to `image`, the path column
# name that the sharding and materialization helpers below look for first.
df = pd.read_csv(REPO_ROOT / meta_outdir / meta_name)
df = df.rename(columns={"file_path": "image"})
df

Unnamed: 0,id,thumb_kind,captured_at,camera_type,sequence,lat,lon,width,height,face,yaw_deg,pitch_deg,hfov_deg,image
0,2747432245565190,2048,1579449714000,perspective,mQI3T1Vx2dsT97eosEtUKA,14.555918,120.978320,4000,3000,,,,,C:\Prog Projects\project-sbafn\data\images\274...
1,1205619749867643,2048,1579449034000,perspective,TMirHKwg-xdFaFwsfrV0wQ,14.555843,120.979176,4000,3000,,,,,C:\Prog Projects\project-sbafn\data\images\120...
2,200620985244175,2048,1579449037000,perspective,TMirHKwg-xdFaFwsfrV0wQ,14.555939,120.979156,4000,3000,,,,,C:\Prog Projects\project-sbafn\data\images\200...
3,399048961209247,2048,1579449031000,perspective,TMirHKwg-xdFaFwsfrV0wQ,14.555843,120.979246,4000,3000,,,,,C:\Prog Projects\project-sbafn\data\images\399...
4,468411350925249,2048,1579449028000,perspective,TMirHKwg-xdFaFwsfrV0wQ,14.555886,120.979324,4000,3000,,,,,C:\Prog Projects\project-sbafn\data\images\468...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444517,1867264740438588,2048,1714921154000,spherical,PDbHRBavwWpLG74VCKU6do,14.641095,121.025562,1024,1024,forward,102.955234,-10.0,80.0,C:\Prog Projects\project-sbafn\data\images\186...
444518,1867264740438588,2048,1714921154000,spherical,PDbHRBavwWpLG74VCKU6do,14.641095,121.025562,1024,1024,right,192.955234,-10.0,80.0,C:\Prog Projects\project-sbafn\data\images\186...
444519,452452674286020,2048,1714892373000,perspective,IW8bvPja4THdFZM2Ko1srh,14.641407,121.025815,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\452...
444520,1395888600841751,2048,1646622207294,fisheye,QEv3Y6xFCO9lagsKXB4u07,14.640809,121.026698,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\139...


# Data Splitting
- This split ensures that images belonging to the same sequence id DO NOT get assigned to different split types (to prevent data leakage)

In [4]:
def ensure_sequence_id(df: pd.DataFrame,
                       seq_col: str = "sequence") -> pd.DataFrame:
    """Return a copy of ``df`` with a ``group_key`` column for group-aware splitting.

    If ``seq_col`` holds dict values, each dict is replaced by its ``"id"``
    entry. ``group_key`` is then the sequence value, falling back to the row's
    ``id`` where the sequence is missing, so an image without a sequence forms
    its own group.

    The input frame is never modified: the function always works on a copy.
    (Previously the copy was only made when dict values were present, so in the
    common case ``group_key`` was silently added to the caller's DataFrame.)

    Args:
        df: Manifest with at least the ``seq_col`` and ``id`` columns.
        seq_col: Name of the column holding the sequence identifier.

    Returns:
        A new DataFrame with the same rows plus a ``group_key`` column.
    """
    out = df.copy()
    seq = out[seq_col]
    # Only object-dtype columns can contain dicts; skip the scan otherwise.
    if seq.dtype == "object" and seq.apply(lambda v: isinstance(v, dict)).any():
        out[seq_col] = seq.apply(lambda v: v.get("id") if isinstance(v, dict) else v)
    out["group_key"] = out[seq_col].fillna(out["id"])
    return out

def split_by_sequence(df: pd.DataFrame,
                      train_size: float = 0.8,
                      val_size: float = 0.1,
                      test_size: float = 0.1,
                      random_state: int = 67) -> pd.DataFrame:
    """Assign every row to train/val/test so that no group spans two splits.

    Rows are grouped by ``group_key`` (built by ``ensure_sequence_id`` from the
    ``sequence`` column). The split is done in two group-aware stages: train
    versus a hold-out, then the hold-out into val versus test. The fractions
    apply to groups, so row-level proportions are only approximate.

    Args:
        df: Manifest with ``sequence`` and ``id`` columns.
        train_size, val_size, test_size: Fractions that must sum to 1.
        random_state: Seed used for both shuffle-split stages.

    Returns:
        The train, val and test rows concatenated in that order, with a fresh
        index, a ``group_key`` column and a ``split`` column.

    Raises:
        AssertionError: If the fractions do not sum to 1, or if any group ends
            up in more than one split.
    """
    assert abs(train_size + val_size + test_size - 1.0) < 1e-8
    keyed = ensure_sequence_id(df, "sequence")

    # Stage 1: carve the training groups out of the full set.
    outer = GroupShuffleSplit(n_splits=1, train_size=train_size, random_state=random_state)
    train_pos, holdout_pos = next(outer.split(keyed, groups=keyed["group_key"]))
    train_part = keyed.iloc[train_pos].copy()
    holdout = keyed.iloc[holdout_pos].copy()

    # Stage 2: divide the hold-out groups; val's share is relative to the hold-out.
    val_share = val_size / (val_size + test_size)
    inner = GroupShuffleSplit(n_splits=1, train_size=val_share, random_state=random_state)
    val_pos, test_pos = next(inner.split(holdout, groups=holdout["group_key"]))
    parts = {
        "train": train_part,
        "val": holdout.iloc[val_pos].copy(),
        "test": holdout.iloc[test_pos].copy(),
    }

    # Leakage guard: every pair of splits must have disjoint group keys.
    for left, right in (("train", "val"), ("train", "test"), ("val", "test")):
        assert set(parts[left]["group_key"]).isdisjoint(parts[right]["group_key"])

    # Label each part with its split name before stacking them.
    for label, part in parts.items():
        part["split"] = label
    return pd.concat([parts["train"], parts["val"], parts["test"]], ignore_index=True)

In [5]:
# Split the full manifest 80/10/10 by sequence, seeded from the model config.
res_df = split_by_sequence(df=df, train_size=0.8, val_size=0.1, test_size=0.1, random_state=random_state)
res_df

Unnamed: 0,id,thumb_kind,captured_at,camera_type,sequence,lat,lon,width,height,face,yaw_deg,pitch_deg,hfov_deg,image,group_key,split
0,1205619749867643,2048,1579449034000,perspective,TMirHKwg-xdFaFwsfrV0wQ,14.555843,120.979176,4000,3000,,,,,C:\Prog Projects\project-sbafn\data\images\120...,TMirHKwg-xdFaFwsfrV0wQ,train
1,200620985244175,2048,1579449037000,perspective,TMirHKwg-xdFaFwsfrV0wQ,14.555939,120.979156,4000,3000,,,,,C:\Prog Projects\project-sbafn\data\images\200...,TMirHKwg-xdFaFwsfrV0wQ,train
2,399048961209247,2048,1579449031000,perspective,TMirHKwg-xdFaFwsfrV0wQ,14.555843,120.979246,4000,3000,,,,,C:\Prog Projects\project-sbafn\data\images\399...,TMirHKwg-xdFaFwsfrV0wQ,train
3,468411350925249,2048,1579449028000,perspective,TMirHKwg-xdFaFwsfrV0wQ,14.555886,120.979324,4000,3000,,,,,C:\Prog Projects\project-sbafn\data\images\468...,TMirHKwg-xdFaFwsfrV0wQ,train
4,474598270316019,2048,1579449801000,perspective,1Xbwq-jWPihD8I7fVv5nNQ,14.555859,120.979577,4000,3000,,,,,C:\Prog Projects\project-sbafn\data\images\474...,1Xbwq-jWPihD8I7fVv5nNQ,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444517,472976081902195,2048,1714916876000,spherical,5TC3BLNwfgvWqiE4HtleFx,14.641265,121.026750,1024,1024,forward,12.667304,-10.0,80.0,C:\Prog Projects\project-sbafn\data\images\472...,5TC3BLNwfgvWqiE4HtleFx,test
444518,472976081902195,2048,1714916876000,spherical,5TC3BLNwfgvWqiE4HtleFx,14.641265,121.026750,1024,1024,right,102.667304,-10.0,80.0,C:\Prog Projects\project-sbafn\data\images\472...,5TC3BLNwfgvWqiE4HtleFx,test
444519,490871835866525,2048,1646622099297,fisheye,SIDvmjEw41tKFh6T7WQiOs,14.640779,121.029720,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\490...,SIDvmjEw41tKFh6T7WQiOs,test
444520,5103811173009148,2048,1646622100797,fisheye,SIDvmjEw41tKFh6T7WQiOs,14.640802,121.029672,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\510...,SIDvmjEw41tKFh6T7WQiOs,test


In [6]:
# Rows per split. Whole sequences are assigned to a single split, so the
# image-level proportions only approximate the requested 80/10/10.
res_df["split"].value_counts()

split
train    353003
test      46839
val       44680
Name: count, dtype: int64

# Annotation Subset for v1

In [7]:
def sample_annotation_v1(df: pd.DataFrame,
                         total: int = 1600,
                         ratios: dict[str, float] = {"train":0.8, "val":0.1, "test":0.1},
                         random_state: int = 67):
    """Flag a random annotation subset, stratified by the ``split`` column.

    For every split named in ``ratios``, ``round(ratio * total)`` rows are drawn
    without replacement from the rows of that split (or all of them, if the
    split has fewer rows). Any rounding remainder is added to ``"train"`` when
    it is one of the keys, otherwise to the split with the largest ratio, so the
    targets always add up to ``total``.

    Rows are selected by position rather than by index label, so exactly the
    drawn rows are flagged even when the index contains duplicate labels. For a
    frame with a unique index the selected rows are the same as with label-based
    selection, because the generator is called with the same arguments in the
    same order.

    Args:
        df: Frame with a ``split`` column. It is not modified.
        total: Overall number of rows to flag.
        ratios: Mapping of split name to its share of ``total``. The default
            dict is only read, never mutated.
        random_state: Seed for ``numpy.random.default_rng``.

    Returns:
        A copy of ``df`` with an integer ``annotate_v1`` column (1 = selected).
    """
    rng = np.random.default_rng(random_state)
    out = df.copy()

    targets = {name: int(round(share * total)) for name, share in ratios.items()}
    if targets:
        # Push the rounding remainder into one split so the targets sum to `total`.
        sink = "train" if "train" in targets else max(ratios, key=ratios.get)
        targets[sink] += total - sum(targets.values())

    flags = np.zeros(len(out), dtype=np.int64)
    for split, n in targets.items():
        positions = np.flatnonzero((out["split"] == split).to_numpy())
        if positions.size == 0:
            continue  # nothing to draw from; leave the generator state untouched
        take = min(n, positions.size)
        flags[rng.choice(positions, size=take, replace=False)] = 1

    out["annotate_v1"] = flags
    return out

In [8]:
# Flag 1600 rows for annotation, using the same 80/10/10 proportions as the split.
sample_df = sample_annotation_v1(
    df=res_df,
    total=1600,
    ratios={"train":0.8, "val":0.1, "test":0.1},
    random_state=random_state
)
# Keep only the flagged rows; the original row index from `res_df` is retained.
sample_df = sample_df[sample_df["annotate_v1"] == 1].copy()
sample_df = sample_df.drop(columns={"group_key", "annotate_v1"}) # not needed in manifest
sample_df

Unnamed: 0,id,thumb_kind,captured_at,camera_type,sequence,lat,lon,width,height,face,yaw_deg,pitch_deg,hfov_deg,image,split
210,1048274270352012,2048,1712387240200,perspective,XUP3hlknjWVNYorGwa5Avx,14.557162,120.983622,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\104...,train
300,162806036129159,2048,1646532375291,fisheye,IgWSUHEKV0eZlN1TwG6ynr,14.556830,120.984436,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\162...,train
904,2439813999548407,2048,1712387187000,perspective,clo42mIG0SxDwVBgFi1kvt,14.557095,120.983644,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\243...,train
942,2439144589627655,2048,1712387022000,perspective,UTxLK8JRQwOiCH6p5nWB2F,14.556305,120.984350,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\243...,train
1668,969327708218364,2048,1712387061000,perspective,UmXyVDhP3F0Als1BxJIbEC,14.556649,120.984284,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\969...,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444206,524349980047672,2048,1715391217000,perspective,mSHMaUVNPBhe6Qks4z3b0d,14.641056,121.010907,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\524...,test
444232,1045901917208653,2048,1715420004000,spherical,BiAH4kCndQrVaFPW657Mgj,14.641254,121.010587,1024,1024,forward,122.292397,-10.0,80.0,C:\Prog Projects\project-sbafn\data\images\104...,test
444353,504459358751825,2048,1714890743000,perspective,bJD3MuSHpO89krw1ICVFnm,14.640689,121.018096,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\504...,test
444360,523994895639336,2048,1535238441578,perspective,tgxc-0DSQO-MFQFP4mGVFA,14.641810,121.023008,4128,3096,,,,,C:\Prog Projects\project-sbafn\data\images\523...,test


In [9]:
# Sample size per split; with total=1600 at 0.8/0.1/0.1 this should be 1280/160/160.
sample_df["split"].value_counts()

split
train    1280
val       160
test      160
Name: count, dtype: int64

# Export Non-Sharded Manifest (v1.0)

In [10]:
# Two variants of the v1 manifest: the repo copy drops the `image` column
# (absolute paths on the developer's machine), the local copy keeps it.
sample_df_repo = sample_df.copy().drop(columns={"image"})
sample_df_local = sample_df.copy()
sample_df_local

Unnamed: 0,id,thumb_kind,captured_at,camera_type,sequence,lat,lon,width,height,face,yaw_deg,pitch_deg,hfov_deg,image,split
210,1048274270352012,2048,1712387240200,perspective,XUP3hlknjWVNYorGwa5Avx,14.557162,120.983622,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\104...,train
300,162806036129159,2048,1646532375291,fisheye,IgWSUHEKV0eZlN1TwG6ynr,14.556830,120.984436,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\162...,train
904,2439813999548407,2048,1712387187000,perspective,clo42mIG0SxDwVBgFi1kvt,14.557095,120.983644,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\243...,train
942,2439144589627655,2048,1712387022000,perspective,UTxLK8JRQwOiCH6p5nWB2F,14.556305,120.984350,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\243...,train
1668,969327708218364,2048,1712387061000,perspective,UmXyVDhP3F0Als1BxJIbEC,14.556649,120.984284,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\969...,train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
444206,524349980047672,2048,1715391217000,perspective,mSHMaUVNPBhe6Qks4z3b0d,14.641056,121.010907,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\524...,test
444232,1045901917208653,2048,1715420004000,spherical,BiAH4kCndQrVaFPW657Mgj,14.641254,121.010587,1024,1024,forward,122.292397,-10.0,80.0,C:\Prog Projects\project-sbafn\data\images\104...,test
444353,504459358751825,2048,1714890743000,perspective,bJD3MuSHpO89krw1ICVFnm,14.640689,121.018096,3840,2160,,,,,C:\Prog Projects\project-sbafn\data\images\504...,test
444360,523994895639336,2048,1535238441578,perspective,tgxc-0DSQO-MFQFP4mGVFA,14.641810,121.023008,4128,3096,,,,,C:\Prog Projects\project-sbafn\data\images\523...,test


In [11]:
sample_df_out_dir = REPO_ROOT / "data" / "meta" / "v1"
sample_df_repo_name = "annotation_v1.csv"
sample_df_local_name = "annotation_v1_local.csv"

# `to_csv` does not create missing parent directories, so make sure the output
# folder exists; otherwise this cell fails on a fresh checkout.
sample_df_out_dir.mkdir(parents=True, exist_ok=True)

# Export. Existing files are never overwritten, so an already published
# manifest stays stable across re-runs of the notebook.
if not (sample_df_out_dir / sample_df_repo_name).exists():
    sample_df_repo.to_csv(sample_df_out_dir / sample_df_repo_name, index=False)
else:
    print("[!] v1.0 manifest (repo) already exists.")

if not (sample_df_out_dir / sample_df_local_name).exists():
    sample_df_local.to_csv(sample_df_out_dir / sample_df_local_name, index=False)
else:
    print("[!] v1.0 manifest (local) already exists.")

# Four-Way Split for Local Annotation

In [12]:
def _to_rel_path(s: str, rel_base: Path) -> str:
    if isinstance(s, str) and s.startswith("file://"):
        up = urlparse(s)
        s = unquote(up.path)
        if os.name == "nt" and s.startswith("/") and ":" in s[1:3]:
            s = s[1:]
    p = Path(s).resolve()
    try:
        return str(p.relative_to(rel_base))
    except Exception:
        return p.name

def write_v1_shards(df: pd.DataFrame,
                    rel_base: str | Path,
                    out_dir: str | Path = "data/meta",
                    base_name: str = "annotation_v1_shard",
                    n_shards: int = 4,
                    random_state: int = 67,
                    filter_to_v1: bool = True,
                    write_meta_copy: bool = True,
                    add_filename: bool = True):
    """Write the annotation subset as ``n_shards`` CSV manifests, balanced per split.

    Each of the train/val/test subsets is shuffled and cut into ``n_shards``
    contiguous chunks whose sizes differ by at most one row (the first
    ``len % n_shards`` chunks get the extra row). Shard ``k`` is the
    concatenation of the k-th chunk of every split. Rows are sliced by position,
    so a non-unique index cannot duplicate rows in a shard.

    For each shard two files are written into ``out_dir``:
    ``{base_name}{k}_local.csv`` with all columns, and, if ``write_meta_copy``,
    ``{base_name}{k}.csv`` without the ``image``/``file_path`` columns.
    Existing files with those names are overwritten.

    Args:
        df: Frame with a ``split`` column and an ``image`` or ``file_path`` column.
        rel_base: Directory that ``image`` paths are made relative to.
        out_dir: Output directory; created if missing.
        base_name: File name prefix of the shard manifests.
        n_shards: Number of shards; must be at least 1.
        random_state: Seed for the per-split shuffle.
        filter_to_v1: Keep only rows with ``annotate_v1 == 1`` when that column exists.
        write_meta_copy: Also write the path-free copy of every shard.
        add_filename: Accepted for backward compatibility; currently has no effect.

    Raises:
        ValueError: If ``n_shards`` is below 1, or if ``df`` has neither an
            ``image`` nor a ``file_path`` column.
    """
    if n_shards < 1:
        raise ValueError("n_shards must be >= 1")
    out_dir = Path(out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    v1 = df[df["annotate_v1"] == 1].copy() if (filter_to_v1 and "annotate_v1" in df.columns) else df.copy()

    # Change `image` column to relative path
    if "image" not in v1.columns:
        if "file_path" in v1.columns:
            v1["image"] = v1["file_path"]
        else:
            raise ValueError("Need an 'image' or 'file_path' column.")
    v1["image"] = v1["image"].astype(str).map(lambda s: _to_rel_path(s, rel_base))

    shards = {k: [] for k in range(n_shards)}
    for split in ["train", "val", "test"]:
        part = v1[v1["split"] == split].sample(frac=1.0, random_state=random_state)
        # array_split hands the first `len(part) % n_shards` chunks one extra row.
        chunks = np.array_split(np.arange(len(part)), n_shards)
        for k, chunk in enumerate(chunks):
            if chunk.size:
                shards[k].append(part.iloc[chunk])

    cols_all = list(v1.columns)
    path_cols = [c for c in ("image", "file_path") if c in cols_all]

    for k in range(n_shards):
        # An empty shard still gets a header-only CSV with the full column set.
        shard_df = pd.concat(shards[k], ignore_index=True) if shards[k] else pd.DataFrame(columns=cols_all)

        # Full manifest (local copy)
        p_full = out_dir / f"{base_name}{k+1}_local.csv"
        shard_df.to_csv(p_full, index=False)

        # Metadata-only manifest (repo copy)
        if write_meta_copy:
            meta_cols = [c for c in shard_df.columns if c not in path_cols]
            p_meta = out_dir / f"{base_name}{k+1}.csv"
            shard_df.loc[:, meta_cols].to_csv(p_meta, index=False)

        # `split` is always present here: it was required to build the shards.
        tr = int((shard_df["split"] == "train").sum())
        va = int((shard_df["split"] == "val").sum())
        te = int((shard_df["split"] == "test").sum())
        print(f"[ok] {p_full.name} rows={len(shard_df)} (train={tr}, val={va}, test={te})"
              + (f" | wrote {base_name}{k+1}.csv (no paths)" if write_meta_copy else ""))

In [13]:
# Write the four shard manifests once. Only the first shard's local CSV is
# checked to decide whether the shards already exist.
out_dir = REPO_ROOT / "data" / "meta" / "v1"
if not (out_dir / "annotation_v1_shard1_local.csv").exists():
    write_v1_shards(
        df=sample_df,
        out_dir= out_dir,
        rel_base= REPO_ROOT / "data" / "images",
        n_shards=4,
        random_state=random_state
    )
else:
    print(f"[!] Shard manifest .csv files already exists.")

[!] Shard manifest .csv files already exists.


# Copy to Individual Folders

In [14]:
def _resolve_local(p, rel_base: Path | None = None) -> Path:
    """Return a local absolute Path from a relative path or file:// URI."""
    s = str(p)
    if s.startswith("file://"):
        up = urlparse(s)
        s = unquote(up.path)
        if os.name == "nt" and s.startswith("/") and ":" in s[1:3]:
            s = s[1:]
        pth = Path(s)
        return pth if pth.is_absolute() else (Path(rel_base) / pth).resolve() if rel_base else pth.resolve()

    pth = Path(s)
    if pth.is_absolute():
        return pth
    if rel_base is None:
        return pth.resolve()
    return (Path(rel_base) / pth).resolve()

def _copy_or_link(src: Path, dst: Path, mode: str = "auto"):
    """mode: 'copy' | 'hardlink' | 'symlink' | 'auto'."""
    dst.parent.mkdir(parents=True, exist_ok=True)
    if dst.exists():
        return True
    try:
        if mode == "auto":
            mode = "hardlink" if os.name == "nt" else "symlink"
        if mode == "hardlink":
            os.link(src, dst)
        elif mode == "symlink":
            os.symlink(src, dst)
        elif mode == "copy":
            shutil.copy2(src, dst)
        else:
            raise ValueError("mode must be copy|hardlink|symlink|auto")
        return True
    except Exception:
        try:
            shutil.copy2(src, dst)
            return True
        except Exception:
            return False

def materialize_shard(shard_csv: Path,
                      out_root: Path,
                      images_base: Path | None = None,
                      by_split: bool = True,
                      preserve_tree: bool = True,
                      link_mode: str = "auto",
                      max_workers: int = 8):
    """Link or copy every image listed in one shard manifest into its own folder.

    Images land under ``out_root/<csv stem>/``, inside a ``<split>`` sub-folder
    when ``by_split`` is set. With ``preserve_tree`` and an ``images_base``, the
    path relative to ``images_base`` is kept below that folder; otherwise (or
    when the image is not under ``images_base``) only the file name is used.
    Rows whose source file does not exist are counted as missing and skipped.

    Args:
        shard_csv: Manifest with an ``image`` or ``file_path`` column.
        out_root: Directory that receives the per-shard folder.
        images_base: Base directory for relative image paths.
        by_split: Group files by the manifest's ``split`` column.
        preserve_tree: Keep the directory layout relative to ``images_base``.
        link_mode: Passed to ``_copy_or_link``.
        max_workers: Thread pool size for the link/copy work.

    Returns:
        The shard's output directory.

    Raises:
        ValueError: If the manifest lacks a path column, or lacks ``split``
            while ``by_split`` is set.
    """
    manifest = pd.read_csv(shard_csv)
    path_col = next((c for c in ("image", "file_path") if c in manifest.columns), None)
    if path_col is None:
        raise ValueError("CSV must have 'image' or 'file_path' column.")
    if by_split and "split" not in manifest.columns:
        raise ValueError("CSV is missing 'split' column needed for by_split=True.")

    shard_name = Path(shard_csv).stem
    out_base = Path(out_root) / shard_name
    images_base = Path(images_base).resolve() if images_base else None

    # One sub-folder name per row: the split label, or "" for a flat layout.
    subdirs = manifest["split"] if by_split else [""] * len(manifest)

    jobs = []
    missing = 0
    for raw_path, sub in zip(manifest[path_col], subdirs):
        src = _resolve_local(raw_path, rel_base=images_base)
        if not src.exists():
            missing += 1
            continue

        # Default to the bare file name; upgrade to the relative tree when possible.
        rel = src.name
        if preserve_tree and images_base:
            try:
                rel = src.relative_to(images_base)
            except Exception:
                pass
        jobs.append((src, out_base / sub / rel, link_mode))

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(_copy_or_link, *job) for job in jobs]
        ok = sum(1 for fut in as_completed(futures) if fut.result())

    print(f"[{shard_name}] linked/copied: {ok}   missing: {missing}   out: {out_base}")
    return out_base

def materialize_all_shards(shard_dir="data/meta",
                           pattern="annotation_v1_shard*_local.csv",
                           out_root="data/annotation_v1_images",
                           images_base="data/images",
                           link_mode="auto",
                           max_workers=8):
    """Run ``materialize_shard`` for every shard manifest found in ``shard_dir``.

    ``out_root`` is created if needed. Manifests matching ``pattern`` are
    processed in sorted order, each grouped by split and keeping the directory
    layout relative to ``images_base``.

    Raises:
        FileNotFoundError: If no manifest in ``shard_dir`` matches ``pattern``.
    """
    shard_dir = Path(shard_dir)
    out_root = Path(out_root)
    out_root.mkdir(parents=True, exist_ok=True)

    manifests = sorted(shard_dir.glob(pattern))
    if len(manifests) == 0:
        raise FileNotFoundError(f"No shard CSVs matching {pattern} in {shard_dir}")

    # Options shared by every shard.
    shared = dict(images_base=images_base,
                  by_split=True,
                  preserve_tree=True,
                  link_mode=link_mode,
                  max_workers=max_workers)
    for manifest_path in manifests:
        materialize_shard(manifest_path, out_root, **shared)

In [15]:
# Materialize the shard image folders once, as hard links to the downloaded images.
out_root  = REPO_ROOT / "data" / "annotation_v1_images"

# The whole step is skipped as soon as `out_root` exists.
# NOTE(review): a run that was interrupted part-way also leaves this directory
# behind, so an incomplete output would not be repaired here - confirm.
if not out_root.exists():
    materialize_all_shards(
        shard_dir = REPO_ROOT / "data" / "meta" / "v1",
        pattern   = "annotation_v1_shard*_local.csv",
        out_root  = out_root,
        images_base = REPO_ROOT / "data" / "images",
        link_mode = "hardlink",
        max_workers = 8
    )
else:
    print(f"[!] {out_root} already exists.")

[!] C:\Prog Projects\project-sbafn\data\annotation_v1_images already exists.
