In [21]:
# %%
# üß© Convert YOLO + CSV Datasets into Pandas DataFrames (no saving)

import os
from pathlib import Path
import yaml
import pandas as pd
from glob import glob
from PIL import Image


# %%
# üìò 1. Load YOLO config

yaml_path = "datasets/Playing-Cards-Object-Detection-Dataset/data.yaml"
with open(yaml_path, "r") as f:
    yolo_cfg = yaml.safe_load(f)

yolo_class_names = yolo_cfg["names"]
yolo_class_to_id = {name: idx for idx, name in enumerate(yolo_class_names)}

print(f"Loaded {len(yolo_class_names)} classes from {yaml_path}")

# %%
# üß© Helpers

IMG_EXTS = [".jpg", ".jpeg", ".png"]

def find_image(images_dir: Path, stem: str) -> Path | None:
    """Return the first existing image path among common extensions."""
    for ext in IMG_EXTS:
        p = images_dir / f"{stem}{ext}"
        if p.exists():
            return p
    return None

def get_image_size(filename, base_folder):
    path = Path(base_folder) / filename
    with Image.open(path) as im:
        return im.width, im.height
# %%
# üìò 2. Parse YOLO label files into a DataFrame (using actual structure)

def load_yolo_split(base_dir: Path) -> pd.DataFrame:
    labels_dir = base_dir / "labels"
    images_dir = base_dir / "images"

    if not labels_dir.exists():
        print(f"‚ö†Ô∏è No labels folder found: {labels_dir}")
        return pd.DataFrame(columns=["image", "class_id", "class_name", "x_center", "y_center", "width", "height"])

    txt_files = sorted(labels_dir.glob("*.txt"))
    if not txt_files:
        print(f"‚ö†Ô∏è No label files found in {labels_dir}")
        return pd.DataFrame(columns=["image", "class_id", "class_name", "x_center", "y_center", "width", "height"])

    all_rows = []
    for txt_file in txt_files:
        stem = txt_file.stem
        image_path = find_image(images_dir, stem)
        image_str = str(image_path) if image_path else str(images_dir / f"{stem}.jpg")

        with open(txt_file, "r") as f:
            for line in f:
                parts = line.strip().split()
                if len(parts) != 5:
                    continue
                class_id, x_center, y_center, width, height = parts
                class_id = int(class_id)
                if class_id >= len(yolo_class_names):
                    continue
                class_name = yolo_class_names[class_id]
                all_rows.append(
                    [image_str, class_id, class_name, float(x_center), float(y_center), float(width), float(height)]
                )

    df = pd.DataFrame(all_rows, columns=["image", "class_id", "class_name", "x_center", "y_center", "width", "height"])
    return df

# Real dataset paths
base_path = Path("datasets/Playing-Cards-Object-Detection-Dataset")
train_base = base_path / "train"
valid_base = base_path / "valid"
test_base  = base_path / "test"

yolo_train_df = load_yolo_split(train_base)
yolo_valid_df = load_yolo_split(valid_base)
yolo_test_df  = load_yolo_split(test_base)

print(f"YOLO train: {len(yolo_train_df)}  |  valid: {len(yolo_valid_df)}  |  test: {len(yolo_test_df)}")

# %%
# üß† 3. Mapping from full names ‚Üí YOLO short names

full_to_short = {
    "ace of clubs": "Ac", "ace of diamonds": "Ad", "ace of hearts": "Ah", "ace of spades": "As",
    "two of clubs": "2c", "two of diamonds": "2d", "two of hearts": "2h", "two of spades": "2s",
    "three of clubs": "3c", "three of diamonds": "3d", "three of hearts": "3h", "three of spades": "3s",
    "four of clubs": "4c", "four of diamonds": "4d", "four of hearts": "4h", "four of spades": "4s",
    "five of clubs": "5c", "five of diamonds": "5d", "five of hearts": "5h", "five of spades": "5s",
    "six of clubs": "6c", "six of diamonds": "6d", "six of hearts": "6h", "six of spades": "6s",
    "seven of clubs": "7c", "seven of diamonds": "7d", "seven of hearts": "7h", "seven of spades": "7s",
    "eight of clubs": "8c", "eight of diamonds": "8d", "eight of hearts": "8h", "eight of spades": "8s",
    "nine of clubs": "9c", "nine of diamonds": "9d", "nine of hearts": "9h", "nine of spades": "9s",
    "ten of clubs": "10c", "ten of diamonds": "10d", "ten of hearts": "10h", "ten of spades": "10s",
    "jack of clubs": "Jc", "jack of diamonds": "Jd", "jack of hearts": "Jh", "jack of spades": "Js",
    "queen of clubs": "Qc", "queen of diamonds": "Qd", "queen of hearts": "Qh", "queen of spades": "Qs",
    "king of clubs": "Kc", "king of diamonds": "Kd", "king of hearts": "Kh", "king of spades": "Ks",
}

typo_fixes = {"eigth": "eight"}
SUIT_BY_PREFIX = {"c": "clubs", "d": "diamonds", "h": "hearts", "s": "spades"}

def normalize_full_name(s: str) -> str:
    s = s.strip().lower()
    for bad, good in typo_fixes.items():
        s = s.replace(bad, good)
    return " ".join(s.split())

def repair_label(label: str, filename: str) -> str | None:
    if label == "seven of seven":
        prefix = Path(filename).stem[0].lower()
        suit = SUIT_BY_PREFIX.get(prefix)
        if suit:
            return f"seven of {suit}"
    return None

def map_full_to_short(label: str, filename: str) -> str | None:
    norm = normalize_full_name(label)
    if norm in full_to_short:
        return full_to_short[norm]
    repaired = repair_label(norm, filename)
    if repaired in full_to_short:
        return full_to_short[repaired]
    return None

# %%
# üìò 4. Parse VOC CSVs and convert to YOLO format

def convert_voc_to_yolo(xmin, ymin, xmax, ymax, img_w, img_h):
    x_center = (xmin + xmax) / 2 / img_w
    y_center = (ymin + ymax) / 2 / img_h
    width = (xmax - xmin) / img_w
    height = (ymax - ymin) / img_h
    return x_center, y_center, width, height

def parse_voc_csv(csv_path: str, base_dir: str) -> pd.DataFrame:
    """
    Parse VOC-style CSV and convert to YOLO format.
    Adds full image path based on the dataset base directory.
    """
    if not os.path.exists(csv_path):
        return pd.DataFrame()

    df_csv = pd.read_csv(csv_path)
    all_rows = []
    skipped = 0

    for _, row in df_csv.iterrows():
        fname = str(row["filename"]).strip()
        label = str(row["class"]).strip()

        mapped = map_full_to_short(label, fname)
        if not mapped:
            print(f"‚ö†Ô∏è Skipping unknown class: '{label}' in {fname}")
            skipped += 1
            continue

        try:
            img_w = float(row["width"])
            img_h = float(row["height"])
            if img_w == 0 or img_h == 0:
                print(f"‚ö†Ô∏è Skipping {fname}: image size is zero ({img_w}x{img_h})")
                skipped += 1
                continue
        except Exception as e:
            print(f"‚ö†Ô∏è Skipping {fname}: bad width/height ({e})")
            skipped += 1
            continue

        x_center, y_center, width, height = convert_voc_to_yolo(
            float(row["xmin"]), float(row["ymin"]),
            float(row["xmax"]), float(row["ymax"]),
            img_w, img_h
        )

        class_id = yolo_class_to_id[mapped]
        image_path = str(Path(base_dir) / fname)

        all_rows.append([image_path, class_id, mapped, x_center, y_center, width, height])

    df = pd.DataFrame(
        all_rows,
        columns=["image", "class_id", "class_name", "x_center", "y_center", "width", "height"]
    )
    print(f"{csv_path}: parsed {len(df)} valid annotations, skipped {skipped}.")
    return df


voc_train_csv = "datasets/Playing-Cards-Images-Object-Detection-Dataset/train_labels.csv"
voc_test_csv  = "datasets/Playing-Cards-Images-Object-Detection-Dataset/test_labels.csv"

voc_train_base = "datasets/Playing-Cards-Images-Object-Detection-Dataset/train/train"
voc_test_base  = "datasets/Playing-Cards-Images-Object-Detection-Dataset/test/test"

voc_train_df = parse_voc_csv(voc_train_csv, voc_train_base)
voc_test_df  = parse_voc_csv(voc_test_csv, voc_test_base)

voc_df = pd.concat([voc_train_df, voc_test_df], ignore_index=True)
print(f"VOC total (CSV): {len(voc_df)} annotations")

# %%
# ‚úÖ 5. Final Output

print("YOLO (train):")
display(yolo_train_df.head())

print("YOLO (valid):")
display(yolo_valid_df.head())

print("YOLO (test):")
display(yolo_test_df.head())

print("VOC (train):")
display(voc_train_df.head())

print("VOC (test):")
display(voc_test_df.head())

# 6. Save to CSV
yolo_train_df.to_csv("datasets/Playing-Cards-Object-Detection-Dataset/yolo_train_converted.csv", index=False)
yolo_valid_df.to_csv("datasets/Playing-Cards-Object-Detection-Dataset/yolo_valid_converted.csv", index=False)
yolo_test_df.to_csv("datasets/Playing-Cards-Object-Detection-Dataset/yolo_test_converted.csv", index=False)
voc_train_df.to_csv("datasets/Playing-Cards-Images-Object-Detection-Dataset/voc_train_converted.csv", index=False)
voc_test_df.to_csv("datasets/Playing-Cards-Images-Object-Detection-Dataset/voc_test_converted.csv", index=False)


Loaded 52 classes from datasets/Playing-Cards-Object-Detection-Dataset/data.yaml
YOLO train: 53003  |  valid: 15159  |  test: 7588
‚ö†Ô∏è Skipping 3.png: image size is zero (0.0x0.0)
‚ö†Ô∏è Skipping 28.jpg: image size is zero (0.0x0.0)
‚ö†Ô∏è Skipping q2345235.png: image size is zero (0.0x0.0)
datasets/Playing-Cards-Images-Object-Detection-Dataset/train_labels.csv: parsed 536 valid annotations, skipped 3.
‚ö†Ô∏è Skipping unknown class: 'three of dimaonds' in d313.jpg
‚ö†Ô∏è Skipping unknown class: 'three of dimaonds' in d312.jpg
‚ö†Ô∏è Skipping unknown class: 'six of dimaonds' in d610.png
‚ö†Ô∏è Skipping unknown class: 'four of dimaonds' in d410.png
‚ö†Ô∏è Skipping unknown class: 'three of dimaonds' in d314.jpg
‚ö†Ô∏è Skipping unknown class: 'five of dimaonds' in d512.png
‚ö†Ô∏è Skipping unknown class: 'three of dimaonds' in d315.png
‚ö†Ô∏è Skipping unknown class: 'three of dimaonds' in d311.jpg
datasets/Playing-Cards-Images-Object-Detection-Dataset/test_labels.csv: parsed 90 valid ann

Unnamed: 0,image,class_id,class_name,x_center,y_center,width,height
0,datasets/Playing-Cards-Object-Detection-Datase...,51,Qs,0.460337,0.69351,0.064904,0.042067
1,datasets/Playing-Cards-Object-Detection-Datase...,37,Ad,0.644231,0.501202,0.063702,0.088942
2,datasets/Playing-Cards-Object-Detection-Datase...,37,Ad,0.614183,0.074519,0.064904,0.088942
3,datasets/Playing-Cards-Object-Detection-Datase...,17,5d,0.480769,0.793269,0.063702,0.055288
4,datasets/Playing-Cards-Object-Detection-Datase...,24,7c,0.445913,0.737981,0.066106,0.052885


YOLO (valid):


Unnamed: 0,image,class_id,class_name,x_center,y_center,width,height
0,datasets/Playing-Cards-Object-Detection-Datase...,27,7s,0.308894,0.709135,0.096154,0.078125
1,datasets/Playing-Cards-Object-Detection-Datase...,11,3s,0.280048,0.612981,0.104567,0.0625
2,datasets/Playing-Cards-Object-Detection-Datase...,38,Ah,0.268029,0.510817,0.097356,0.045673
3,datasets/Playing-Cards-Object-Detection-Datase...,38,Ah,0.581731,0.219952,0.09976,0.044471
4,datasets/Playing-Cards-Object-Detection-Datase...,12,4c,0.229567,0.465144,0.050481,0.098558


YOLO (test):


Unnamed: 0,image,class_id,class_name,x_center,y_center,width,height
0,datasets/Playing-Cards-Object-Detection-Datase...,39,As,0.645433,0.5,0.074519,0.088942
1,datasets/Playing-Cards-Object-Detection-Datase...,29,8d,0.588942,0.538462,0.066106,0.098558
2,datasets/Playing-Cards-Object-Detection-Datase...,24,7c,0.436298,0.572115,0.051683,0.100962
3,datasets/Playing-Cards-Object-Detection-Datase...,24,7c,0.138221,0.260817,0.054087,0.104567
4,datasets/Playing-Cards-Object-Detection-Datase...,30,8h,0.55649,0.145433,0.044471,0.097356


VOC (train):


Unnamed: 0,image,class_id,class_name,x_center,y_center,width,height
0,datasets/Playing-Cards-Images-Object-Detection...,38,Ah,0.501333,0.544167,0.605333,0.541667
1,datasets/Playing-Cards-Images-Object-Detection...,9,3d,0.494536,0.505455,0.57377,0.589091
2,datasets/Playing-Cards-Images-Object-Detection...,42,Jh,0.496441,0.479787,0.893238,0.751064
3,datasets/Playing-Cards-Images-Object-Detection...,27,7s,0.460784,0.493927,0.754902,0.882591
4,datasets/Playing-Cards-Images-Object-Detection...,24,7c,0.497268,0.503636,0.863388,0.869091


VOC (test):


Unnamed: 0,image,class_id,class_name,x_center,y_center,width,height
0,datasets/Playing-Cards-Images-Object-Detection...,26,7h,0.558659,0.521352,0.782123,0.629893
1,datasets/Playing-Cards-Images-Object-Detection...,8,3c,0.502688,0.483333,0.962366,0.92963
2,datasets/Playing-Cards-Images-Object-Detection...,23,6s,0.488701,0.459649,0.548023,0.442105
3,datasets/Playing-Cards-Images-Object-Detection...,30,8h,0.508287,0.467742,0.939227,0.878136
4,datasets/Playing-Cards-Images-Object-Detection...,37,Ad,0.509659,0.469065,0.8875,0.861871


In [24]:
# %%
# üß© Dataset Analysis for YOLO & VOC DataFrames
import pandas as pd
from pathlib import Path

# %%
# üìò 1. Load exported CSVs (update paths if needed)

datasets = {
    "yolo_train": "datasets/Playing-Cards-Object-Detection-Dataset/yolo_train_converted.csv",
    "yolo_valid": "datasets/Playing-Cards-Object-Detection-Dataset/yolo_valid_converted.csv",
    "yolo_test":  "datasets/Playing-Cards-Object-Detection-Dataset/yolo_test_converted.csv",
    "voc_train":  "datasets/Playing-Cards-Images-Object-Detection-Dataset/voc_train_converted.csv",
    "voc_test":   "datasets/Playing-Cards-Images-Object-Detection-Dataset/voc_test_converted.csv",
}

dfs = {}
for name, path in datasets.items():
    if Path(path).exists():
        df = pd.read_csv(path)
        dfs[name] = df
        print(f"‚úÖ Loaded {name}: {len(df):,} annotations")
    else:
        print(f"‚ö†Ô∏è Missing file: {path}")

# %%
# üßÆ 2. Count how many classes are found per split

def class_summary(df: pd.DataFrame, name: str):
    if df.empty:
        print(f"{name}: empty DataFrame")
        return None

    num_classes = df['class_id'].nunique()
    counts = df['class_name'].value_counts().reset_index()
    counts.columns = ['class_name', 'count']
    print(f"\n{name.upper()} ‚Äî {num_classes} unique classes\n{'-'*50}")
    print(counts.head(10).to_string(index=False))  # show top 10
    return counts

class_summaries = {k: class_summary(v, k) for k, v in dfs.items() if not v.empty}

# %%
# üìä 3. Combine split sizes overview

split_info = []
for split, df in dfs.items():
    if df.empty:
        continue
    total_anns = len(df)
    total_imgs = df['image'].nunique() if 'image' in df.columns else total_anns
    split_info.append({
        "Split": split,
        "Annotations": total_anns,
        "Unique Images": total_imgs,
        "Classes": df["class_id"].nunique()
    })

split_df = pd.DataFrame(split_info)
print("\n" + "=" * 60)
print("DATASET SPLIT OVERVIEW")
print("=" * 60)
print(split_df.to_string(index=False))

# %%
# üìò 4. Build image-label coverage statistics like your old code

rows = []
splits = {}

for split_name, df in dfs.items():
    if not df.empty:
        splits[split_name] = {
            "images": df["image"].unique().tolist(),
            "labels": df["image"].unique().tolist(),
        }

for split, d in splits.items():
    img_paths = d['images']
    lbl_paths = set(d['labels'])
    for ip in img_paths:
        rows.append({
            "split": split,
            "image_path": ip,
            "label_path": ip if ip in lbl_paths else None
        })

df = pd.DataFrame(rows)

split_stats = []
for split in splits.keys():
    split_df = df[df['split'] == split]
    num_images = len(split_df)
    num_labels = split_df['label_path'].notna().sum()
    missing_labels = split_df['label_path'].isna().sum()
    coverage = (num_labels / num_images * 100) if num_images > 0 else 0
    
    split_stats.append({
        'Split': split,
        'Images': num_images,
        'Labels': num_labels,
        'Missing': missing_labels,
        'Coverage': f"{coverage:.2f}%"
    })

stats_df = pd.DataFrame(split_stats)
print("\n" + "=" * 60)
print("DATASET DISTRIBUTION OVERVIEW")
print("=" * 60)
print(stats_df.to_string(index=False))

# %%
# üìò 5. Global statistics

total_images = df.shape[0]
total_labels = df['label_path'].notna().sum()
total_missing = df['label_path'].isna().sum()
missing_pct = (total_missing / total_images * 100) if total_images > 0 else 0

print("\n" + "-" * 60)
print("GLOBAL STATISTICS")
print("-" * 60)
print(f"Total images:          {total_images:,}")
print(f"Total labels:          {total_labels:,}")
print(f"Missing labels:        {total_missing:,}")
print(f"Missing percentage:    {missing_pct:.2f}%")
print(f"Dataset completeness:  {100 - missing_pct:.2f}%")
print("=" * 60)


‚úÖ Loaded yolo_train: 53,003 annotations
‚úÖ Loaded yolo_valid: 15,159 annotations
‚úÖ Loaded yolo_test: 7,588 annotations
‚úÖ Loaded voc_train: 536 annotations
‚úÖ Loaded voc_test: 90 annotations

YOLO_TRAIN ‚Äî 52 unique classes
--------------------------------------------------
class_name  count
        8d   1171
        4h   1117
        3h   1098
        7d   1085
        Jd   1075
        2d   1071
        3s   1066
        2c   1062
        Ac   1062
        Qd   1050

YOLO_VALID ‚Äî 52 unique classes
--------------------------------------------------
class_name  count
        3s    358
        8s    339
        6h    329
        As    322
       10h    322
        Qs    317
        Qd    317
        3h    313
        4h    312
        7d    311

YOLO_TEST ‚Äî 52 unique classes
--------------------------------------------------
class_name  count
        5d    173
       10d    170
        5h    169
        3h    169
        5s    168
        6d    167
        7h    167
        