In [None]:
# Install necessary libraries
!pip -q install ultralytics roboflow==1.* rich
import os, shutil, json, yaml, re, glob, random, math
from pathlib import Path
import matplotlib.pyplot as plt
from collections import Counter, defaultdict


In [None]:
# Install necessary datasets
!rm -rf "/content/VIETNAM-SIGN-LANGUAGE-7"

from roboflow import Roboflow
import os
os.environ["ROBOFLOW_API_KEY"] = "aqOzlejG9d4xpiYBF3JI"

rf = Roboflow(api_key=os.environ["ROBOFLOW_API_KEY"])
project = rf.workspace("ho-chi-minh-university-of-technology-clmwp").project("vietnam-sign-language")
version = project.version(7)
dataset = version.download("yolov8")
print("Location:", dataset.location)


In [None]:
""" Reshape to YOLO shape """
# Everything below works relative to the folder the dataset was downloaded to.
root = Path(dataset.location)
print(root)

# If the download has a "valid" folder and no "val" folder, rename it to "val".
valid_dir = root/"valid"
val_dir = root/"val"
if valid_dir.exists() and not val_dir.exists():
    valid_dir.rename(val_dir)

def ensure_ultralytics_layout(root: Path):
    """Rearrange ``root/<split>/...`` into ``root/images/<split>`` and ``root/labels/<split>``.

    For each of ``train``, ``val`` and ``test`` that exists under ``root``:

    * images are moved to ``root/images/<split>`` -- every entry of
      ``<split>/images`` when that folder exists, otherwise the files directly
      inside ``<split>`` whose suffix is a known image extension;
    * ``*.txt`` label files are moved to ``root/labels/<split>`` -- from
      ``<split>/labels`` when it exists, otherwise from ``<split>`` itself;
    * the old ``<split>`` folder (and anything left in it) is then removed.

    Splits that do not exist are skipped.

    Args:
        root: Dataset root directory.
    """
    image_exts = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff", ".webp"}

    for split in ("train", "val", "test"):
        src = root/split
        if not src.exists():
            continue

        img_dst = root/"images"/split
        lbl_dst = root/"labels"/split
        img_dst.mkdir(parents=True, exist_ok=True)
        lbl_dst.mkdir(parents=True, exist_ok=True)

        # The glob results are materialised with list() so that moving files
        # cannot disturb a directory scan that is still in progress.
        if (src/"images").exists():
            for p in list((src/"images").glob("*")):
                shutil.move(str(p), str(img_dst/p.name))
        else:
            for p in list(src.glob("*")):
                if p.suffix.lower() in image_exts:
                    shutil.move(str(p), str(img_dst/p.name))

        if (src/"labels").exists():
            for p in list((src/"labels").glob("*.txt")):
                shutil.move(str(p), str(lbl_dst/p.name))
        else:
            for p in list(src.glob("*.txt")):
                shutil.move(str(p), str(lbl_dst/p.name))

        # Cleanup stays best-effort, but a failure is reported instead of
        # being silently discarded.
        try:
            shutil.rmtree(src)
        except OSError as exc:
            print(f"Warning: could not remove {src}: {exc}")

# Move the downloaded splits into the images/<split> + labels/<split> layout.
ensure_ultralytics_layout(root)

# Point data.yaml at the rearranged folders.
data_yaml = root/"data.yaml"
with data_yaml.open() as f:
    data = yaml.safe_load(f)

data.update({
    "path": str(root.resolve()),
    "train": "images/train",
    "val": "images/val",
})
# A test entry is written only when the test image folder exists on disk.
if (root/"images/test").exists():
    data["test"] = "images/test"

# sort_keys=False keeps the keys in their current order.
with data_yaml.open("w") as f:
    yaml.safe_dump(data, f, sort_keys=False)

print("Classes:", data.get("names"))


In [None]:
""" Divide into train, val, test """
from pathlib import Path
from sklearn.model_selection import train_test_split
from collections import Counter
import shutil, yaml

# --- Re-split configuration -------------------------------------------------
# Dataset root
ROOT = Path("/content/VIETNAM-SIGN-LANGUAGE-7")
# Fractions for (train, val, test)
RATIOS = (0.70, 0.20, 0.10)
# Seed handed to the splitter as random_state
SEED = 42
# File suffixes (lower-case, with dot) that count as images
IMG_EXTS = {
    ".jpg",
    ".jpeg",
    ".png",
    ".bmp",
    ".tif",
    ".tiff",
    ".webp",
}

# Collect pairs (image, label)
def find_pairs(root: Path, img_exts=None):
    """Collect ``(image_path, label_path)`` pairs from all splits under ``root``.

    Looks in ``root/images/<split>`` and ``root/labels/<split>`` for the splits
    ``train``, ``val`` and ``test``. A split is skipped unless both folders
    exist. An image is kept only when a label file with the same stem and a
    ``.txt`` suffix exists in the matching labels folder.

    Args:
        root: Dataset root directory.
        img_exts: Optional iterable of image suffixes (with leading dot,
            compared case-insensitively). Defaults to the module-level
            ``IMG_EXTS``.

    Returns:
        List of ``(image_path, label_path)`` tuples, ordered by split
        (train, val, test) and by path within each split.
    """
    exts = IMG_EXTS if img_exts is None else {e.lower() for e in img_exts}
    pairs = []
    for sp in ("train", "val", "test"):
        img_dir = root/"images"/sp
        lbl_dir = root/"labels"/sp
        if not (img_dir.exists() and lbl_dir.exists()):
            continue
        # iterdir() yields entries in arbitrary order. Sorting makes the
        # result, and any seeded split computed from it, reproducible.
        for img in sorted(img_dir.iterdir()):
            if img.suffix.lower() not in exts:
                continue
            lbl = lbl_dir/(img.stem + ".txt")
            if lbl.exists():
                pairs.append((img, lbl))
    return pairs

# Handle class for stratification
def majority_class(lbl_path: Path):
    """Return the class id that appears on the most lines of a label file.

    The first whitespace-separated token of every non-blank line is read as an
    integer class id. Returns 0 when the file has no non-blank lines, or when
    reading or parsing raises any exception.
    """
    tally = Counter()
    try:
        with open(lbl_path) as handle:
            for raw in handle:
                row = raw.strip()
                if row:
                    tally[int(row.split()[0])] += 1
    except Exception:
        # Any read or parse failure falls back to class 0.
        return 0
    if not tally:
        return 0
    (top_class, _count), = tally.most_common(1)
    return top_class

# Every image/label pair found under ROOT, whatever split it came from.
pairs = find_pairs(ROOT)
print(f"Found {len(pairs)} image/label pairs")

# Re-split dataset
# One stratification key per pair: the most frequent class id in its label file.
y_all = [majority_class(label_path) for (_, label_path) in pairs]

# Stage 1: separate the training share; the remainder is held out for val + test.
holdout_share = 1.0 - RATIOS[0]
train_pairs, temp_pairs, y_train, y_temp = train_test_split(
    pairs,
    y_all,
    test_size=holdout_share,
    random_state=SEED,
    stratify=y_all,
)

# Stage 2: divide the held-out part between val and test in proportion to
# their configured ratios.
val_portion = RATIOS[1] / (RATIOS[1] + RATIOS[2])
test_share_of_holdout = 1 - val_portion
val_pairs, test_pairs, _, _ = train_test_split(
    temp_pairs,
    y_temp,
    test_size=test_share_of_holdout,
    random_state=SEED,
    stratify=y_temp,
)

# Place into new folders
out_root = ROOT/"split_70_20_10"
# Start from an empty output tree. Without this, re-running the cell with a
# different seed or ratios would leave files from the previous run in place,
# and the same image could end up in more than one split.
if out_root.exists():
    shutil.rmtree(out_root)
for sp in ["train","val","test"]:
    (out_root/"images"/sp).mkdir(parents=True, exist_ok=True)
    (out_root/"labels"/sp).mkdir(parents=True, exist_ok=True)

def place(pairs, split):
    """Copy every (image, label) pair into the ``split`` folders under ``out_root``.

    Images go to ``out_root/images/<split>`` and labels to
    ``out_root/labels/<split>``, keeping their file names. ``shutil.copy2`` is
    used, so the source files stay where they are.
    """
    image_target = out_root/"images"/split
    label_target = out_root/"labels"/split
    for image_path, label_path in pairs:
        shutil.copy2(image_path, image_target/image_path.name)
        shutil.copy2(label_path, label_target/label_path.name)

# Copy each subset into its destination folders, in train / val / test order.
for split_name, split_pairs in (
    ("train", train_pairs),
    ("val", val_pairs),
    ("test", test_pairs),
):
    place(split_pairs, split_name)

# Write data.yaml
data = {
    "path": str(out_root.resolve()),
    "train": "images/train",
    "val": "images/val",
    "test": "images/test",
}
# Carry the class metadata (nc / names) over from the dataset's own data.yaml
# so the re-split dataset file is usable by itself. Without the class names it
# only describes folder locations.
source_yaml = ROOT/"data.yaml"
if source_yaml.exists():
    with open(source_yaml) as f:
        source_cfg = yaml.safe_load(f) or {}
    for key in ("nc", "names"):
        if key in source_cfg:
            data[key] = source_cfg[key]
# A context manager guarantees the file is flushed and closed.
with open(out_root/"data.yaml", "w") as f:
    yaml.safe_dump(data, f, sort_keys=False)

# Report how many images and label files ended up in each split folder.
for sp in ("train", "val", "test"):
    image_files = list((out_root/"images"/sp).glob("*"))
    label_files = list((out_root/"labels"/sp).glob("*.txt"))
    ni, nl = len(image_files), len(label_files)
    print(f"{sp}: {ni} images, {nl} labels")


In [None]:
# Read the class list from the dataset's data.yaml.
with open(root/"data.yaml") as cfg_file:
    data = yaml.safe_load(cfg_file)

names = data.get("names", [])
# Use the declared class count, falling back to the number of names when "nc"
# is absent or falsy.
declared_nc = data.get("nc", len(names))
nc = declared_nc or len(names)
# With no names at all, use the class indices as placeholder names.
if not names:
    names = [str(i) for i in range(nc)]
print(f"Classes ({len(names)}):", names)


In [None]:
# Count number of images and labels of each class

def yolo_box_counter(label_file):
    """Count the boxes of each class id in one YOLO label file.

    Each non-blank line contributes one box; its first whitespace-separated
    token is the integer class id. Blank lines (for example a trailing empty
    line) are skipped instead of raising ``IndexError``. Lines whose first
    token is not an integer are skipped too.

    Args:
        label_file: Path of the ``.txt`` label file.

    Returns:
        ``collections.Counter`` mapping class id -> number of boxes.
    """
    cnt = Counter()
    with open(label_file) as f:
        for line in f:
            fields = line.split()
            if not fields:
                continue  # blank line: nothing to count
            try:
                cid = int(fields[0])
            except ValueError:
                continue  # malformed class id: not a countable box
            cnt[cid] += 1
    return cnt

# Per-split totals plus an overall per-class box count.
split_stats = {}
class_counts = Counter()

for split in ("train", "val", "test"):
    # Boxes per class within this split only.
    boxes_in_split = Counter()
    for label_path in (root/"labels"/split).glob("*.txt"):
        boxes_in_split.update(yolo_box_counter(label_path))
    class_counts.update(boxes_in_split)

    # Every entry of the image folder is counted, whatever its suffix.
    image_total = len(list((root/"images"/split).glob("*")))
    split_stats[split] = {
        "images": image_total,
        "boxes": sum(boxes_in_split.values()),
    }

split_stats, class_counts


In [None]:
# Class distribution + imbalance ratio + bar chart
from pathlib import Path
from collections import Counter
import yaml, matplotlib.pyplot as plt

# Dataset root and the file suffixes treated as images.
ROOT = Path("/content/VIETNAM-SIGN-LANGUAGE-7")
IMG_EXTS = {
    ".jpg", ".jpeg", ".png", ".bmp",
    ".tif", ".tiff", ".webp",
}

# Class names come from the dataset's data.yaml; here nc is the number of names.
with (ROOT/"data.yaml").open() as f:
    data = yaml.safe_load(f)
names = data.get("names", [])
nc = len(names)

def yolo_box_counter(label_file: Path):
    """Count the boxes of each class id in one YOLO label file.

    The first whitespace-separated token of each non-blank line is parsed as
    the integer class id. Blank lines and lines whose first token is not an
    integer are skipped.

    Args:
        label_file: Path of the ``.txt`` label file.

    Returns:
        ``collections.Counter`` mapping class id -> number of boxes.
    """
    c = Counter()
    with open(label_file) as f:
        for line in f:
            parts = line.split()
            if not parts:
                continue
            # Only a failed int() conversion is tolerated. A bare `except:`
            # would also swallow KeyboardInterrupt and genuine bugs.
            try:
                cid = int(parts[0])
            except ValueError:
                continue
            c[cid] += 1
    return c

def split_dirs(split):
    """Locate the image and label folders of ``split`` under ``ROOT``.

    Two layouts are tried in order: ``ROOT/<split>/images`` with
    ``ROOT/<split>/labels``, then ``ROOT/images/<split>`` with
    ``ROOT/labels/<split>``. The first layout whose two folders both exist
    wins.

    Returns:
        ``(images_dir, labels_dir)``, or ``(None, None)`` when neither layout
        is present.
    """
    candidates = (
        (ROOT/split/"images", ROOT/split/"labels"),
        (ROOT/"images"/split, ROOT/"labels"/split),
    )
    for images_dir, labels_dir in candidates:
        if images_dir.exists() and labels_dir.exists():
            return images_dir, labels_dir
    return None, None

# Box counts per class: one Counter per split, plus a grand total.
per_split_counts = {}
total_counts = Counter()
for sp in ("train", "val", "test", "valid"):
    img_dir, lbl_dir = split_dirs(sp)
    if img_dir is None:
        # Neither folder layout exists for this split name.
        continue
    c = Counter()
    for lf in lbl_dir.glob("*.txt"):
        c.update(yolo_box_counter(lf))
    # A "valid" folder is reported under the key "val".
    split_key = "val" if sp == "valid" else sp
    per_split_counts[split_key] = c
    total_counts.update(c)

# Box count for each class id 0..nc-1; ids that never occur count as 0.
vals = [total_counts.get(class_id, 0) for class_id in range(nc)]

# The imbalance ratio compares the largest and smallest non-empty classes.
nonzero = [v for v in vals if v > 0]
if nonzero:
    imbalance_ratio = max(nonzero) / min(nonzero)
else:
    imbalance_ratio = float("nan")

print("Number of bbox per split:")
for sp, c in per_split_counts.items():
    print(f"  {sp:>5}: {sum(c.values())} boxes")
print(f"\nImbalance ratio (max/min,): {imbalance_ratio:.2f}")

if nonzero:
    # index() returns the first class holding the extreme value.
    max_i = vals.index(max(vals))
    min_i = vals.index(min(nonzero))
    print(f"  Max: {names[max_i]} = {vals[max_i]}")
    print(f"  Min : {names[min_i]} = {vals[min_i]}")

# Classes listed in data.yaml that have no box in any split.
missing = [names[i] for i in range(nc) if vals[i] == 0]
if missing:
    print("missing", missing)

# Bar chart: one bar per class, bar height = number of boxes.
plt.figure(figsize=(10, 4))
class_labels = [names[i] for i in range(nc)]
plt.bar(class_labels, vals)
plt.title("Bounding box in each class")
plt.xlabel("class")
plt.ylabel("Bounding box")
# Rotated, right-aligned tick labels.
plt.xticks(rotation=45, ha="right")
plt.tight_layout()
plt.show()


In [None]:
def check_label_file(path, nc):
    """Validate one YOLO label file and count the problems found.

    Every non-blank line is expected to hold exactly five whitespace-separated
    fields: ``class x y w h``. Blank lines are ignored.

    Args:
        path: Path of the ``.txt`` label file.
        nc: Number of classes; valid class ids are ``0 .. nc-1``.

    Returns:
        Dict with four counters:

        * ``format_error`` -- wrong number of fields, or a field that cannot
          be parsed (class as int, the rest as float);
        * ``class_out_of_range`` -- class id outside ``0 .. nc-1``;
        * ``zero_box`` -- width or height ``<= 0``;
        * ``coords_out_of_range`` -- any of x, y, w, h outside ``[0, 1]``
          (NaN included); counted at most once per line.

        One line can increment more than one of the last three counters.
    """
    bad = {"class_out_of_range": 0, "coords_out_of_range": 0, "zero_box": 0, "format_error": 0}
    with open(path) as f:
        for ln in f:
            parts = ln.split()
            if not parts:
                continue  # a blank line is not a malformed annotation
            if len(parts) != 5:
                bad["format_error"] += 1
                continue
            try:
                c = int(parts[0])
                x, y, w, h = (float(tok) for tok in parts[1:])
            except ValueError:
                # Only parse failures are format errors. A bare `except:`
                # would also hide KeyboardInterrupt and real bugs.
                bad["format_error"] += 1
                continue
            if not 0 <= c < nc:
                bad["class_out_of_range"] += 1
            if w <= 0 or h <= 0:
                bad["zero_box"] += 1
            # `not (0 <= v <= 1)` is also true for NaN, which a plain
            # `v < 0 or v > 1` comparison lets through.
            if any(not (0.0 <= v <= 1.0) for v in (x, y, w, h)):
                bad["coords_out_of_range"] += 1
    return bad

# Label files live in root/labels/<split>/*.txt. The former pattern
# rglob("labels/*.txt") only matches files placed directly inside a folder
# named "labels", so it found nothing in this layout and the check always
# reported no issues. The split folders are therefore scanned explicitly.
errors = Counter()
n_checked = 0
for split in ["train", "val", "test"]:
    for lf in sorted((root/"labels"/split).glob("*.txt")):
        errors.update(check_label_file(lf, len(names)))
        n_checked += 1

# Printing the file count makes an accidentally empty scan visible.
print(f"Checked {n_checked} label files")
print("Label issues:", dict(errors))


In [None]:
def stem(p):
    """Return ``p.stem``: the final path component without its last suffix."""
    return p.stem
# Compare image stems with label stems in every split. "no_label" counts
# images without a label file, "no_image" counts labels without an image.
for split in ("train", "val", "test"):
    imgs = set(map(stem, (root/"images"/split).glob("*")))
    lbls = set(map(stem, (root/"labels"/split).glob("*.txt")))
    unlabeled = imgs - lbls
    orphaned = lbls - imgs
    print(split, "no_label:", len(unlabeled), "no_image:", len(orphaned))


In [None]:
from ultralytics import YOLO
# One-epoch training run from the "yolov8n.pt" checkpoint.
# NOTE(review): this trains on root/"data.yaml", not on the data.yaml written
# under split_70_20_10 -- confirm that this is the intended dataset.
model = YOLO("yolov8n.pt")  # small model
train_args = dict(
    data=str(root/"data.yaml"),
    epochs=1,
    imgsz=640,
    device=0,  # presumably the first GPU -- TODO confirm for the target runtime
)
model.train(**train_args)


In [None]:
from google.colab import drive
import shutil
import os

# Mount Google Drive
drive.mount('/content/drive')

# Source: the local dataset folder. Destination: a folder on the mounted Drive.
DATASET_DIR = "/content/VIETNAM-SIGN-LANGUAGE-7"
TARGET_DIR  = "/content/drive/MyDrive/vietnam_sign_language_resplit"

# By default copytree refuses to write into an existing directory, so any
# earlier copy is removed first.
target_already_there = os.path.exists(TARGET_DIR)
if target_already_there:
    shutil.rmtree(TARGET_DIR)

# Copy the whole dataset folder, including everything beneath it, to Drive.
shutil.copytree(DATASET_DIR, TARGET_DIR)

print(f"Dataset saved to: {TARGET_DIR}")
