# Dataset Preparation

In [1]:
import kagglehub
import os
import shutil, random
from pathlib import Path
from typing import List, Tuple, Set

In [2]:
# Download the Kaggle dataset "hristohristov21/pid-symbols" through kagglehub
# and keep the value it returns in `path`, which is printed as the location
# of the dataset files.
path = kagglehub.dataset_download("hristohristov21/pid-symbols")
print("Path to dataset files:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/hristohristov21/pid-symbols?dataset_version_number=1...


100%|██████████| 1.41G/1.41G [01:05<00:00, 23.3MB/s]

Extracting files...





Path to dataset files: /root/.cache/kagglehub/datasets/hristohristov21/pid-symbols/versions/1


In [1]:
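# NOTE: optional step, currently disabled (presumably because it only needs to run once).
# When enabled it moves everything under the download location (`path`) into the
# Drive folder below, which is the same folder the split configuration uses as its
# project root. Assumes Google Drive is already mounted at /content/drive — TODO confirm.
# drive_destination_path = "/content/drive/MyDrive/IndustrialAI"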
# os.makedirs(drive_destination_path, exist_ok=True)
# try:
#     for item in os.listdir(path):
#         s = os.path.join(path, item)
#         d = os.path.join(drive_destination_path, item)
#         shutil.move(s, d)
#     print(f"All contents from {path} moved to: {drive_destination_path}")
# except Exception as e:
#     print(f"Error moving contents: {e}")

In [4]:
# --- Split configuration ------------------------------------------------------
# Project root in Colab (a folder under the Drive path).
BASE_ROOT = Path("/content/drive/MyDrive/IndustrialAI")

# Inputs, all located directly under BASE_ROOT.
TRAIN_TXT = BASE_ROOT.joinpath("train.txt")    # list file read line by line for the train split
VAL_TXT = BASE_ROOT.joinpath("val.txt")        # list file read line by line for the val split
IMAGES_DIR = BASE_ROOT.joinpath("images")      # searched recursively for images
LABELS_DIR = BASE_ROOT.joinpath("labels")      # searched recursively for label .txt files

# Output: root of the new dataset layout (images/<subset>, labels/<subset>).
OUT_ROOT = BASE_ROOT.joinpath("Split_Dataset")

TEST_RATIO = 0.20    # fraction of the train list that is held out as the test subset
SEED = 42            # seed handed to random.seed before shuffling
USE_SYMLINK = True   # try symlinks first; copying is the fallback

In [5]:
# Seed Python's global RNG so that the random.shuffle call later in this cell
# yields a reproducible train/test partition for a given input list.
random.seed(SEED)

def index_by_stem(root: Path, pattern: str):
    """Build a lookup from file stem to path for matches under ``root``.

    Args:
        root: Directory that is searched recursively.
        pattern: Glob pattern handed to ``Path.rglob`` (e.g. ``"*.jpg"``).

    Returns:
        dict mapping each stem to one matching path. When several matches
        share a stem, the path that sorts first is kept, so the result does
        not depend on the order in which the filesystem lists entries.
    """
    idx = {}
    # Sort before indexing: rglob yields entries in directory-listing order,
    # which is arbitrary, so an unsorted "first seen" could pick a different
    # duplicate from one machine or run to the next.
    for p in sorted(root.rglob(pattern)):
        idx.setdefault(p.stem, p)
    return idx

def stems_from_list(list_file: Path):
    """Return the file stems named in a UTF-8 list file, in file order.

    Each line is stripped; blank lines and lines starting with ``#`` are
    ignored. Every remaining entry (e.g. ``./images/foo`` or
    ``./images/foo.jpg``) is reduced to its stem (``foo``). Repeated entries
    are kept.

    Note: ``Path.stem`` drops whatever follows the last dot, so an
    extension-less name that itself contains a dot is shortened as well.
    """
    raw_lines = list_file.read_text(encoding="utf-8").splitlines()
    entries = (raw.strip() for raw in raw_lines)
    return [
        Path(entry).stem
        for entry in entries
        if entry and not entry.startswith("#")
    ]

def ln_or_cp(src: Path, dst: Path, use_symlink=None):
    """Make ``dst`` provide the content of ``src``: symlink if possible, else copy.

    Args:
        src: Source file.
        dst: Destination path; missing parent directories are created.
        use_symlink: Whether to try a symlink before copying. ``None``
            (the default) defers to the module-level ``USE_SYMLINK`` setting.

    Behaviour:
        * An existing destination is left untouched.
        * A dangling symlink at ``dst`` is removed and re-created.
        * If creating the symlink fails, the file is copied with
          ``shutil.copy2`` instead.
    """
    dst.parent.mkdir(parents=True, exist_ok=True)

    if dst.is_symlink() and not dst.exists():
        # Dangling link: Path.exists() follows links and reports False, so the
        # plain "already there" guard would not catch it. Left in place, the
        # symlink attempt would fail and the copy fallback would write
        # *through* the stale link to its old target.
        dst.unlink()
    elif dst.exists():
        return

    if use_symlink is None:
        use_symlink = USE_SYMLINK

    if use_symlink:
        try:
            # Use an absolute target: a relative link target is interpreted
            # relative to dst's directory, not the current working directory.
            os.symlink(os.path.abspath(src), dst)
            return
        except (OSError, NotImplementedError):
            # Symlinks are unavailable here (unsupported filesystem, missing
            # privilege, ...): fall back to a real copy below.
            pass
    shutil.copy2(src, dst)

def stage(stems, subset, img_idx, lbl_idx):
    """Populate ``OUT_ROOT/images/<subset>`` and ``OUT_ROOT/labels/<subset>``.

    Args:
        stems: Iterable of file stems belonging to this subset.
        subset: Subset directory name (the script uses "train", "test", "val").
        img_idx: Mapping stem -> image path.
        lbl_idx: Mapping stem -> label path.

    For every stem with an indexed image, the image is staged via
    ``ln_or_cp`` under its original suffix. Its label is staged the same way
    when an existing label file is indexed; otherwise an empty
    ``<stem>.txt`` is written. Stems without an image are skipped. A one-line
    summary of the counters is printed at the end.
    """
    images_dir = OUT_ROOT / "images" / subset
    labels_dir = OUT_ROOT / "labels" / subset
    for target_dir in (images_dir, labels_dir):
        target_dir.mkdir(parents=True, exist_ok=True)

    n_ok = 0
    n_missing_img = 0
    n_missing_lbl = 0
    for stem in stems:
        src_img, src_lbl = img_idx.get(stem), lbl_idx.get(stem)

        # No image for this stem: nothing to stage, just count it.
        if not src_img:
            n_missing_img += 1
            continue

        ln_or_cp(src_img, images_dir / f"{stem}{src_img.suffix}")

        label_target = labels_dir / f"{stem}.txt"
        has_label = bool(src_lbl) and src_lbl.exists()
        if has_label:
            ln_or_cp(src_lbl, label_target)
        else:
            # Write an empty label file in place of the missing one.
            label_target.write_text("", encoding="utf-8")
            n_missing_lbl += 1

        n_ok += 1

    print(f"{subset:>5}: {n_ok} files  | missing images: {n_missing_img}  | empty labels created: {n_missing_lbl}")

# 1) Build quick lookups by name
# NOTE(review): only "*.jpg" files are indexed. Any listed sample whose image
# is stored under a different extension (or is absent from IMAGES_DIR) is
# reported as "missing images" by stage() — confirm the counts are expected.
img_idx = index_by_stem(IMAGES_DIR, "*.jpg")
lbl_idx = index_by_stem(LABELS_DIR, "*.txt")

# 2) Read requested splits (by name)
# dict.fromkeys drops repeated entries while keeping first-seen order; a
# repeated stem could otherwise end up in both train and test after the
# shuffle below.
train_stems_all = list(dict.fromkeys(stems_from_list(TRAIN_TXT)))
val_stems       = list(dict.fromkeys(stems_from_list(VAL_TXT)))

# Remove any overlap (val has priority)
# Build the lookup set once instead of once per element of the comprehension.
val_stem_set = set(val_stems)
train_stems_all = [s for s in train_stems_all if s not in val_stem_set]

# 3) Split TRAIN => train + test
# The first k shuffled stems become the test subset, the rest stay in train.
random.shuffle(train_stems_all)
k = round(len(train_stems_all) * TEST_RATIO)
test_stems  = train_stems_all[:k]
train_stems = train_stems_all[k:]

# 4) Stage (symlink or copy)
stage(train_stems, "train", img_idx, lbl_idx)
stage(test_stems,  "test",  img_idx, lbl_idx)
stage(val_stems,   "val",   img_idx, lbl_idx)

print("✅ Done. Dataset at:", OUT_ROOT)


train: 12513 files  | missing images: 6932  | empty labels created: 0
 test: 3105 files  | missing images: 1756  | empty labels created: 0
  val: 1947 files  | missing images: 1053  | empty labels created: 0
✅ Done. Dataset at: /content/drive/MyDrive/IndustrialAI/Split_Dataset


In [3]:
## Images count

# Print the number of directory entries in each image subset, in the order
# train, test, val (one number per line).
# os.listdir counts every entry; to count regular files only, filter the
# names with os.path.isfile(os.path.join(subset_dir, name)).
# A dedicated variable is used so the dataset location stored in `path` by
# the download cell is not overwritten.
split_images_root = "/content/drive/MyDrive/IndustrialAI/Split_Dataset/images"
for subset_name in ("train", "test", "val"):
    subset_dir = os.path.join(split_images_root, subset_name)
    print(len(os.listdir(subset_dir)))

12513
3105
1947


In [4]:
## Labels count

# Print the number of directory entries in each label subset, in the order
# train, test, val (one number per line).
# os.listdir counts every entry; to count regular files only, filter the
# names with os.path.isfile(os.path.join(subset_dir, name)).
# A dedicated variable is used so the dataset location stored in `path` by
# the download cell is not overwritten.
split_labels_root = "/content/drive/MyDrive/IndustrialAI/Split_Dataset/labels"
for subset_name in ("train", "test", "val"):
    subset_dir = os.path.join(split_labels_root, subset_name)
    print(len(os.listdir(subset_dir)))

12513
3105
1947
