In [1]:
from pathlib import Path
import re
from typing import Dict, List, Tuple

import pandas as pd

# Base directory for every dataset path built in this cell: the current
# working directory at the moment the cell is executed.
ROOT = Path.cwd()

def list_images(root: Path) -> List[Path]:
    """
    Recursively collect the image files below ``root``.

    A path qualifies when it is a regular file whose suffix, compared
    case-insensitively, is one of .jpg, .jpeg, .png, .bmp, .tif or .tiff.

    Returns:
        The matching paths in sorted order. ``rglob`` yields entries in
        whatever order the filesystem reports them, so sorting keeps slices
        such as ``list_images(root)[:5]`` reproducible across machines.
    """
    exts = {".jpg", ".jpeg", ".png", ".bmp", ".tif", ".tiff"}
    return sorted(
        p for p in root.rglob("*") if p.suffix.lower() in exts and p.is_file()
    )

def summarize_subfolders(root: Path) -> Dict[str, int]:
    """
    Count the images found under each immediate subdirectory of ``root``.

    Returns:
        A mapping of subdirectory name to the number of images that
        ``list_images`` finds inside it, inserted in sorted path order.
        An empty dict is returned when ``root`` does not exist.
    """
    if not root.exists():
        return {}
    subdirs = [entry for entry in root.iterdir() if entry.is_dir()]
    subdirs.sort()
    return {folder.name: len(list_images(folder)) for folder in subdirs}

def parse_xray_name(name: str) -> Tuple[str, str, int | None, int | None]:
    """
    Expected pattern:
    XRay_<LedType>_<Solder>_Panel<Num>_LED<Num>_<suffix>.jpg
    """
    parts = name.split("_")
    led_type = parts[1] if len(parts) > 1 else ""
    solder = parts[2] if len(parts) > 2 else ""
    panel_num = None
    led_num = None
    for part in parts:
        if part.startswith("Panel"):
            panel_num = int(re.sub(r"[^\d]", "", part) or 0) or None
        if part.startswith("LED"):
            led_num = int(re.sub(r"[^\d]", "", part) or 0) or None
    return led_type, solder, panel_num, led_num

def inspect_csvs(csv_dir: Path) -> None:
    """
    Print a short preview of every ``*.csv`` file directly inside ``csv_dir``.

    Each file is read with ``;`` as the separator and its first three rows
    are printed. When a "Void rate" column is present, a numeric summary of
    it is printed as well.

    Args:
        csv_dir: Directory to scan (not recursive). If it does not exist a
            warning is printed and nothing else happens.
    """
    if not csv_dir.exists():
        print(f"[WARN] Missing CSV dir: {csv_dir}")
        return
    csv_paths = sorted(csv_dir.glob("*.csv"))
    print(f"\nCSV files in {csv_dir}: {len(csv_paths)}")
    for csv_path in csv_paths:
        df = pd.read_csv(csv_path, sep=";")
        print(f"\n== {csv_path.name} ==")
        print(df.head(3))
        if "Void rate" in df.columns:
            # The column can load as object dtype, in which case describe()
            # reports count/unique/top/freq instead of mean/std/quantiles.
            # Coerce first; values that cannot be parsed become NaN and are
            # excluded from the summary's count.
            void_rate = pd.to_numeric(df["Void rate"], errors="coerce")
            print("Void rate summary:")
            print(void_rate.describe())

# Dataset name -> directory; each dataset lives in a folder nested inside a
# same-named parent folder under ROOT.
folders = {
    "SAM": ROOT.joinpath("SAM", "SAM"),
    "TTA": ROOT.joinpath("TTA", "TTA"),
    "TTA-Raw": ROOT.joinpath("TTA-Raw", "TTA-Raw"),
    "XRay": ROOT.joinpath("XRay", "XRay"),
}

# Per-dataset report: number of subfolders, total images, then one line per
# subfolder with its image count.
for name, path in folders.items():
    counts = summarize_subfolders(path)
    total = sum(counts.values())
    print(f"\n{name}: {path}", f"Subfolders: {len(counts)} | Images: {total}", sep="\n")
    for sub, cnt in counts.items():
        print(f"  - {sub}: {cnt}")

# Preview of the ratio tables.
inspect_csvs(ROOT / "CrackVoid Ratios")

# Show how the first few X-ray filenames are parsed.
xray_root = folders["XRay"]
if xray_root.exists():
    sample = list_images(xray_root)[:5]
    print("\nXRay filename samples:")
    for img in sample:
        led_type, solder, panel_num, led_num = parse_xray_name(img.name)
        summary = f"led_type={led_type}, solder={solder}, panel={panel_num}, led={led_num}"
        print(f"{img.name} -> {summary}")



SAM: d:\soldercracks\SAM\SAM
Subfolders: 8 | Images: 7548
  - FC-GB1: 1000
  - FC-GB2: 1000
  - FC-GB3: 1000
  - FC-SP1: 1000
  - FC-SP2: 1000
  - FC-SP3: 1000
  - FC-SP4: 1000
  - VTF1: 548

TTA: d:\soldercracks\TTA\TTA
Subfolders: 8 | Images: 0
  - FC-GB1: 0
  - FC-GB2: 0
  - FC-GB3: 0
  - FC-SP1: 0
  - FC-SP2: 0
  - FC-SP3: 0
  - FC-SP4: 0
  - VTF1: 0

TTA-Raw: d:\soldercracks\TTA-Raw\TTA-Raw
Subfolders: 5 | Images: 0
  - FC-GB1: 0
  - FC-GB2: 0
  - FC-GB3: 0
  - FC-SP1: 0
  - FC-SP2: 0

XRay: d:\soldercracks\XRay\XRay
Subfolders: 9 | Images: 1800
  - FC-GB1: 200
  - FC-GB2: 200
  - FC-GB3: 200
  - FC-SP1: 200
  - FC-SP2: 200
  - FC-SP3: 200
  - FC-SP4: 200
  - VTF1: 200
  - VTF2: 200

CSV files in d:\soldercracks\CrackVoid Ratios: 8

== CrackRatioFC-GB1.csv ==
  Led Type  Solder  Panel  LED Number  Total Crack Ratio 0 TSC  \
0   FC-GB1  SAC105      1           1                 3.764779   
1   FC-GB1  SAC105      1           2                 4.340386   
2   FC-GB1  SAC105      1 

In [7]:
import pandas as pd
from pathlib import Path

# X-ray void-ratio table; the file uses ';' as the field separator.
csv_path = Path("CrackVoid Ratios", "Xray Void Ratio.csv")
df = pd.read_csv(csv_path, delimiter=";")

# Summary of the column exactly as loaded, before any dtype conversion.
df.loc[:, "Void rate"].describe()





count     1800
unique    1743
top        0.0
freq        53
Name: Void rate, dtype: object

In [8]:
# Convert the column to numbers; entries that cannot be parsed become NaN.
df["Void rate"] = pd.to_numeric(df["Void rate"], errors="coerce")

# A notebook cell only displays its last expression, so the summary has to be
# printed explicitly or it is computed and thrown away.
print(df["Void rate"].describe())

# Upper quantiles, used to pick a defect threshold.
df["Void rate"].quantile([0.5, 0.75, 0.9, 0.95])

0.50    0.020186
0.75    0.036056
0.90    0.058460
0.95    0.073802
Name: Void rate, dtype: float64

In [9]:
# 0.036 sits at roughly the 75th percentile of the void rate and 0.058 at
# roughly the 90th, per the quantiles computed in the previous cell.
threshold = 0.036  # try 0.058 for stricter defect

# 1=defect, 0=normal. NaN void rates compare as False and so end up as 0.
df["label"] = df["Void rate"].ge(threshold).astype(int)
df["label"].value_counts()


label
0    1349
1     451
Name: count, dtype: int64

In [10]:
import re
from pathlib import Path

# Load the X-ray void-ratio table (';'-separated) and make the rate numeric.
csv_path = Path("CrackVoid Ratios") / "Xray Void Ratio.csv"
df = pd.read_csv(csv_path, sep=";")
df["Void rate"] = pd.to_numeric(df["Void rate"], errors="coerce")

# A rate that failed to parse is NaN, and NaN >= threshold is False, so such
# rows would silently be labelled "normal". Drop them and say so instead.
n_unparsed = int(df["Void rate"].isna().sum())
if n_unparsed:
    print(f"[WARN] Dropping {n_unparsed} CSV rows with a non-numeric 'Void rate'")
    df = df.dropna(subset=["Void rate"]).reset_index(drop=True)

threshold = 0.036
df["label"] = (df["Void rate"] >= threshold).astype(int)  # 1=defect, 0=normal

# Rows are matched to images on (Led Type, Panel, LED Number). If that key is
# not unique in the CSV, later rows overwrite earlier ones in the dict below
# and images receive another row's label. When a "Solder" column is available
# it is added to the key to disambiguate (the filename carries the solder as
# its third token -- presumably spelled the same way; verify against the CSV).
base_cols = ["Led Type", "Panel", "LED Number"]
use_solder = "Solder" in df.columns and bool(df.duplicated(subset=base_cols).any())
key_cols = base_cols + (["Solder"] if use_solder else [])
n_dup = int(df.duplicated(subset=key_cols).sum())
if n_dup:
    print(f"[WARN] {n_dup} CSV rows repeat a {tuple(key_cols)} key; the last such row wins")


def key_from_row(r):
    """Build the lookup key for one CSV row, laid out like the filename key."""
    base = (r["Led Type"], int(r["Panel"]), int(r["LED Number"]))
    return base + (r["Solder"],) if use_solder else base


lookup = {key_from_row(r): r for _, r in df.iterrows()}

# Parse XRay filenames: XRay_<LedType>_<Solder>_Panel<Num>_LED<Num>_<suffix>.jpg
xray_root = Path("XRay") / "XRay"
rows = []
n_unmatched = 0

for img_path in xray_root.rglob("*.jpg"):
    parts = img_path.name.split("_")
    if len(parts) < 5:
        continue
    led_type, solder = parts[1], parts[2]
    panel = None
    led_num = None
    for part in parts:
        # Match "<prefix><digits>" so a token without digits is skipped
        # instead of raising ValueError from int("").
        panel_match = re.match(r"Panel(\d+)", part)
        if panel_match:
            panel = int(panel_match.group(1))
        led_match = re.match(r"LED(\d+)", part)
        if led_match:
            led_num = int(led_match.group(1))
    if panel is None or led_num is None:
        continue
    key = (led_type, panel, led_num) + ((solder,) if use_solder else ())
    r = lookup.get(key)
    if r is None:
        n_unmatched += 1
        continue
    rows.append({
        "path": str(img_path),
        "label": int(r["label"]),
        "void_rate": float(r["Void rate"]),
        "led_type": led_type,
        "panel": panel,
        "led_number": led_num,
    })

if n_unmatched:
    print(f"[WARN] {n_unmatched} images had no matching CSV row and were skipped")

# Fix the column set so the frame has the expected columns even when no image
# matched (otherwise later column lookups raise KeyError).
labeled = pd.DataFrame(
    rows,
    columns=["path", "label", "void_rate", "led_type", "panel", "led_number"],
)
labeled.head()


Unnamed: 0,path,label,void_rate,led_type,panel,led_number
0,XRay\XRay\FC-GB1\XRay_FC-GB1_SAC105_Panel1_LED...,0,0.031611,FC-GB1,1,1
1,XRay\XRay\FC-GB1\XRay_FC-GB1_SAC105_Panel1_LED...,0,0.018718,FC-GB1,1,2
2,XRay\XRay\FC-GB1\XRay_FC-GB1_SAC105_Panel1_LED...,0,0.005978,FC-GB1,1,3
3,XRay\XRay\FC-GB1\XRay_FC-GB1_SAC105_Panel1_LED...,0,0.015036,FC-GB1,1,4
4,XRay\XRay\FC-GB1\XRay_FC-GB1_SAC105_Panel1_LED...,0,0.023375,FC-GB1,1,5


In [11]:
# Only the last expression of a cell is displayed, so the class balance is
# printed explicitly; the void-rate summary is shown as the cell result.
print(labeled["label"].value_counts())
labeled["void_rate"].describe()


count    1800.000000
mean        0.028636
std         0.023569
min         0.000000
25%         0.010268
50%         0.023420
75%         0.039848
max         0.107209
Name: void_rate, dtype: float64