## Number of labeled photos per annotator:

In [1]:
import os

"""
Script: count_json_files_by_labeler.py

Purpose:
    - Count how many LabelMe JSON files are associated with each annotator,
      inferred from annotator identifiers embedded in the filenames.

Inputs:
    - folder_path: Directory containing JSON files.
    - labelers:    List of annotator identifiers to match in filenames.

Outputs:
    - Console summary of counts per annotator.
    - Console total of matched JSON files.

Notes:
    - A file is counted for at most one annotator (first match wins).
"""

# === SETTINGS ===
# Folder scanned (non-recursively) for JSON annotation files
folder_path = "data_json"

# Annotator identifiers expected to appear somewhere in the filename
labelers = ["Chaudpb", "Thunv10", "NghiaNT20", "khaihoan2003"]

# === TALLY: one counter per annotator, all starting at zero ===
counts = dict.fromkeys(labelers, 0)

# === SCAN THE FOLDER ===
for filename in os.listdir(folder_path):
    if not filename.endswith(".json"):
        continue

    # First identifier (in list order) contained in the filename, if any;
    # taking only the first one keeps a file from being counted twice.
    owner = next((name for name in labelers if name in filename), None)
    if owner is not None:
        counts[owner] += 1

# === REPORT: per-annotator counts ===
for labeler, count in counts.items():
    print(f"{labeler}: {count} JSON file(s)")

# === REPORT: grand total of matched files ===
total_files = sum(counts.values())
print(f"Total matched JSON files: {total_files}")


Chaudpb: 557 JSON file(s)
Thunv10: 615 JSON file(s)
NghiaNT20: 364 JSON file(s)
khaihoan2003: 505 JSON file(s)
Total matched JSON files: 2041


## Normalize label names to lowercase:

In [1]:
# This cell normalizes all "label" fields in LabelMe JSON files to lowercase.
import os
import json

"""
Script: normalize_labels_to_lowercase.py

Purpose:
    - Iterate over LabelMe JSON files and convert every "label" value to lowercase in-place.

Inputs:
    - folder_path: Directory containing LabelMe JSON files.

Outputs:
    - Overwrites modified JSON files on disk.
    - Console messages indicating which files were updated and a completion notice.

Notes:
    - Only writes back files that actually changed (at least one label was not already lowercase).
"""

# === SETTINGS ===
folder_path = "data_json"  # Folder holding the LabelMe JSON files

# === PROCESS EVERY JSON FILE IN THE FOLDER ===
for filename in os.listdir(folder_path):
    if not filename.endswith(".json"):
        continue

    file_path = os.path.join(folder_path, filename)
    try:
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Track whether anything changed so untouched files are not rewritten
        modified = False

        for shape in data.get("shapes", []):
            label = shape.get("label")
            if not label:
                continue
            lowered = label.lower()
            if lowered != label:
                shape["label"] = lowered
                modified = True

        # Nothing to persist for files that were already lowercase
        if not modified:
            continue

        with open(file_path, "w", encoding="utf-8") as f:
            json.dump(data, f, ensure_ascii=False, indent=2)

        print(f"Normalized labels in: {filename}")

    except Exception as e:
        # Best effort: report the problem and move on to the next file
        print(f"Error processing {filename}: {e}")

print("Completed normalizing all labels to lowercase.")


Completed normalizing all labels to lowercase.


## Recheck the number of instances per label:

In [2]:
import os
import json
from collections import Counter

"""
Script: count_labels_from_labelme_json.py

Purpose:
    - Iterate through LabelMe JSON files and count how many times each label appears.

Inputs:
    - folder_path: Directory containing LabelMe JSON files (non-recursive).

Outputs:
    - Console summary listing each label and its count, sorted by descending frequency.

Notes:
    - Only keys under data["shapes"][*]["label"] are counted.
"""

# === SETTINGS ===
folder_path = "data_json"  # Folder holding the LabelMe JSON files

# === ACCUMULATOR: label -> number of shapes carrying that label ===
label_counter = Counter()

# === WALK THE FOLDER (top level only) ===
for filename in os.listdir(folder_path):
    if not filename.endswith(".json"):
        continue

    json_path = os.path.join(folder_path, filename)
    try:
        with open(json_path, "r", encoding="utf-8") as f:
            data = json.load(f)

        # Count every shape that has a non-empty label
        for shape in data.get("shapes", []):
            label = shape.get("label")
            if not label:
                continue
            label_counter[label] += 1

    except Exception as e:
        # Best effort: report unreadable files and keep counting the rest
        print(f"Error reading {filename}: {e}")

# === REPORT: most frequent labels first ===
print("Label frequency summary:")
for label, count in label_counter.most_common():
    print(f"- {label}: {count}")


Label frequency summary:
- unbet: 16818
- fanduel: 1435
- betway: 437
- draftkings: 260
- bally: 240
- bet365: 159
- gilariver: 152
- betrivers: 65
- pointsbet: 60
- betmgm: 50
- caesars: 38
- fanatics: 31
- betparx: 7
- casino: 4


## Convert JSON label files to TXT format (YOLO segmentation format):

In [3]:
"""
Script: labelme_to_yolov8_seg.py

Purpose:
    - Convert LabelMe JSON annotations to YOLOv8 Segmentation label files (.txt).
    - For each polygon, write one line per instance:
        class_id p1x p1y p2x p2y ... (coordinates normalized to [0, 1])
    - One output .txt per input JSON (same stem name).

Inputs:
    - input_folder:  Directory of LabelMe JSON files.
    - output_folder: Destination directory for YOLOv8 .txt labels.
    - label_names:   Mapping from label string (lowercased) to class_id.

Outputs:
    - For each JSON "imageName.json" -> "imageName.txt" written to output_folder.

Notes:
    - Only polygon shapes with >= 3 points are converted.
    - No bounding-box fields are written: in the Ultralytics segmentation label
      format every value after class_id is read as a polygon coordinate, so a
      leading "x_center y_center width height" would be parsed as two bogus
      vertices.
    - Labels not found in label_names are skipped (reported at the end).
    - If a JSON contains no valid polygons, an empty .txt is still created.
"""
# Imported here so the cell does not depend on an earlier cell having run.
import json
import os
from typing import Optional

# === CONFIGURATION ===
input_folder = "data_json"
output_folder = "data_txt"

# === CLASS-ID MAPPING (lowercase labels) ===
label_names = {
    "unbet": 0, "betrivers": 1, "fanduel": 2,
    "betway": 3, "caesars": 4, "bally": 5,
    "draftkings": 6, "pointsbet": 7, "bet365": 8,
    "fanatics": 9, "betparx": 10, "betmgm": 11,
    "gilariver": 12, "casino": 13
}


# === HELPERS ===
def normalize(value: float, max_value: float) -> float:
    """
    Normalize a pixel coordinate to [0, 1], rounded to 6 decimals.

    Args:
        value: Pixel coordinate.
        max_value: Image extent along the same axis (width or height).

    Returns:
        value / max_value clamped to [0, 1]; 0.0 when max_value <= 0.
    """
    if max_value <= 0:
        return 0.0
    # Clamp so points drawn slightly outside the image stay in range
    x = min(max(value / max_value, 0.0), 1.0)
    return round(x, 6)


def shape_to_seg_line(shape: dict, image_w: float, image_h: float,
                      class_ids: dict, unknown: set) -> Optional[str]:
    """
    Build the YOLO segmentation label line for one LabelMe shape.

    Args:
        shape: One entry of the JSON "shapes" list.
        image_w: Image width in pixels.
        image_h: Image height in pixels.
        class_ids: Mapping from lowercase label to class id.
        unknown: Set that collects labels missing from class_ids (mutated).

    Returns:
        "class_id x1 y1 x2 y2 ..." with normalized coordinates, or None when
        the shape is skipped (empty/unknown label, non-polygon, < 3 points).
    """
    label = (shape.get("label") or "").strip().lower()
    if not label:
        return None

    if label not in class_ids:
        unknown.add(label)
        return None

    # A missing shape_type is treated as a polygon; other types are skipped
    # even if they happen to have 3+ points (e.g. a linestrip).
    if (shape.get("shape_type") or "polygon") != "polygon":
        return None

    points = shape.get("points") or []
    if len(points) < 3:
        return None

    # Class id followed by the flattened polygon only -- no bbox fields.
    fields = [str(class_ids[label])]
    for pt in points:
        fields.append(str(normalize(pt[0], image_w)))
        fields.append(str(normalize(pt[1], image_h)))
    return " ".join(fields)


def convert_labelme_folder(src_dir: str, dst_dir: str, class_ids: dict) -> tuple:
    """
    Convert every *.json in src_dir into a same-stem .txt label in dst_dir.

    Args:
        src_dir: Directory of LabelMe JSON files (top level only).
        dst_dir: Destination directory; created if missing.
        class_ids: Mapping from lowercase label to class id.

    Returns:
        (files_processed, unknown_labels): number of .txt files written and
        the set of labels that were not found in class_ids.
    """
    os.makedirs(dst_dir, exist_ok=True)

    unknown = set()
    converted = 0

    for file in os.listdir(src_dir):
        if not file.endswith(".json"):
            continue

        try:
            with open(os.path.join(src_dir, file), "r", encoding="utf-8") as f:
                data = json.load(f)

            image_w = data.get("imageWidth")
            image_h = data.get("imageHeight")
            if not image_w or not image_h:
                print(f"[WARN] Missing imageWidth/Height in: {file} — skipped.")
                continue

            lines = []
            for shape in data.get("shapes") or []:
                line = shape_to_seg_line(shape, image_w, image_h, class_ids, unknown)
                if line is not None:
                    lines.append(line)

            # Write output .txt (left empty when there are no valid polygons)
            output_name = os.path.splitext(file)[0] + ".txt"
            with open(os.path.join(dst_dir, output_name), "w", encoding="utf-8") as f:
                f.write("\n".join(lines))

            converted += 1
            print(f"[OK] Processed: {file} -> {output_name} ({len(lines)} instance(s))")

        except Exception as e:
            # Per-file boundary: report the failure and continue with the rest
            print(f"[ERROR] {file}: {e}")

    return converted, unknown


# === CONVERT ALL JSON FILES ===
# In a notebook or when run as a script __name__ is "__main__", so the
# conversion runs as before; the guard only keeps an `import` of this code
# from touching the filesystem.
if __name__ == "__main__":
    files_processed, unknown_labels = convert_labelme_folder(
        input_folder, output_folder, label_names
    )

    print(f"Conversion finished. Files processed: {files_processed}")
    if unknown_labels:
        print(f"Unknown labels encountered (skipped): {sorted(unknown_labels)}")


[OK] Processed: BostonCeltics_NewYorkKnicks_1205_Chaudpb_000001.json -> BostonCeltics_NewYorkKnicks_1205_Chaudpb_000001.txt (20 instance(s))
[OK] Processed: BostonCeltics_NewYorkKnicks_1205_Chaudpb_000002.json -> BostonCeltics_NewYorkKnicks_1205_Chaudpb_000002.txt (14 instance(s))
[OK] Processed: BostonCeltics_NewYorkKnicks_1205_Chaudpb_000003.json -> BostonCeltics_NewYorkKnicks_1205_Chaudpb_000003.txt (10 instance(s))
[OK] Processed: BostonCeltics_NewYorkKnicks_1205_Chaudpb_000004.json -> BostonCeltics_NewYorkKnicks_1205_Chaudpb_000004.txt (11 instance(s))
[OK] Processed: BostonCeltics_NewYorkKnicks_1205_Chaudpb_000006.json -> BostonCeltics_NewYorkKnicks_1205_Chaudpb_000006.txt (10 instance(s))
[OK] Processed: BostonCeltics_NewYorkKnicks_1205_Chaudpb_000007.json -> BostonCeltics_NewYorkKnicks_1205_Chaudpb_000007.txt (10 instance(s))
[OK] Processed: BostonCeltics_NewYorkKnicks_1205_Chaudpb_000008.json -> BostonCeltics_NewYorkKnicks_1205_Chaudpb_000008.txt (9 instance(s))
[OK] Processed

## Convert all images to JPG:

In [4]:
from PIL import Image, UnidentifiedImageError
import os

"""
Script: png_to_jpg_inplace.py

Purpose:
    - Convert all PNG images in a folder to JPEG format.
    - If the PNG has an alpha channel (transparency), composite it over a solid
      background color (default: white) before saving as JPEG.
    - Optionally delete the original PNG after successful conversion.

Inputs:
    - image_folder: Directory containing the source .png files.

Outputs:
    - .jpg files with the same stem name as the original .png files.

Notes:
    - JPEG does not support transparency; transparent regions are filled with `bg_color`.
    - By default, the original .png is removed after conversion (configurable).
"""

# === SETTINGS ===
image_folder   = "data_json"          # Folder scanned for .png files
bg_color       = (255, 255, 255)      # Fill color placed behind transparent pixels (RGB)
jpeg_quality   = 92                   # Quality passed to the JPEG encoder
delete_source  = True                 # Delete the .png once its .jpg has been written

# === CONVERT EVERY PNG IN THE FOLDER ===
for file in os.listdir(image_folder):
    if not file.lower().endswith(".png"):
        continue

    src_path = os.path.join(image_folder, file)
    dst_name = os.path.splitext(file)[0] + ".jpg"
    dst_path = os.path.join(image_folder, dst_name)

    try:
        with Image.open(src_path) as img:
            has_alpha = img.mode in ("RGBA", "LA") or ("transparency" in img.info)

            if has_alpha:
                # JPEG cannot store alpha: flatten onto a solid background,
                # using the alpha channel (last band of RGBA) as paste mask.
                rgba = img.convert("RGBA")
                out = Image.new("RGB", rgba.size, bg_color)
                out.paste(rgba, mask=rgba.split()[-1])
            else:
                # No transparency: just make sure the image is 3-channel RGB
                out = img.convert("RGB")

            out.save(dst_path, "JPEG", quality=jpeg_quality, optimize=True, progressive=True)

        # Reached only when the save above succeeded
        if delete_source:
            os.remove(src_path)

        print(f"[OK] Converted {file} -> {dst_name}")

    except UnidentifiedImageError:
        print(f"[WARN] Skipped (unreadable image): {file}")
    except Exception as e:
        print(f"[ERROR] Failed to convert {file}: {e}")

print("PNG to JPEG conversion completed.")


PNG to JPEG conversion completed.


## Check for missing image/label pairs in the TXT dataset:

In [6]:
"""
Script: verify_image_label_pairs.py

Purpose:
    - Verify 1:1 pairing between image files and YOLO/segmentation label files (.txt)
      located in the same directory.
    - Report images that lack a matching .txt file and label files that lack a matching image.

Inputs:
    - folder_path: Directory containing both images and .txt label files.

Outputs:
    - Console report listing:
        * Images without label files
        * Label files without matching images
        * Totals and final status

Notes:
    - Pairing is determined by filename stem (e.g., "foo.jpg" <-> "foo.txt").
    - Matching is case-insensitive on file extensions.
"""

# === SETTINGS ===
folder_path = "data_txt"  # Folder that holds both the images and the .txt labels
image_exts = (".jpg", ".jpeg", ".png")  # Extensions treated as images (compared lowercased)

# === INDEX FILES BY STEM ===
image_bases: dict[str, str] = {}  # stem -> lowercased image extension
label_bases: dict[str, str] = {}  # stem -> ".txt"

for file in os.listdir(folder_path):
    base, ext = os.path.splitext(file)
    ext = ext.lower()

    if ext == ".txt":
        label_bases[base] = ext
    elif ext in image_exts:
        image_bases[base] = ext

image_names = set(image_bases)
label_names = set(label_bases)

# === STEMS PRESENT ON ONE SIDE ONLY ===
images_missing_label = image_names.difference(label_names)
labels_missing_image = label_names.difference(image_names)

print("=== Validation Report ===")

if images_missing_label:
    print("Images without label files:")
    for name in sorted(images_missing_label):
        print(f" - {name}{image_bases.get(name, '')}")

if labels_missing_image:
    print("\nLabel files without matching images:")
    for name in sorted(labels_missing_image):
        print(f" - {name}{label_bases.get(name, '')}")

# === TOTALS ===
total_images = len(image_names)
total_labels = len(label_names)
total_files = total_images + total_labels

print(f"\nTotals: {total_files} files (images: {total_images}, labels: {total_labels})")

# === VERDICT ===
if images_missing_label or labels_missing_image:
    print("Mismatch detected: At least one image or label file is missing its pair.")
else:
    print("All images and label files are correctly paired.")


=== Validation Report ===

Totals: 4082 files (images: 2041, labels: 2041)
All images and label files are correctly paired.


## Split the original dataset into train/val:

In [7]:
import os
import json
import shutil
from collections import defaultdict
from sklearn.model_selection import train_test_split

"""
Script: split_dataset_by_primary_class.py

Purpose:
    - Split a dataset into Train and Val subsets based on the primary (first) class_id
      found in each YOLO Segmentation label file (.txt).
    - Perform a per-class split using a fixed validation ratio.
    - Copy paired image/label files into the output structure:
          dataset/
            images/
              train/
              val/
            labels/
              train/
              val/

Inputs:
    - image_dir:  Directory containing source images (.jpg, .png).
    - label_dir:  Directory containing YOLO segmentation labels (.txt).
    - output_dir: Destination root directory for the split dataset.
    - val_ratio:  Validation split ratio (e.g., 0.2 for 80/20).

Outputs:
    - Files copied into {output_dir}/images/{train|val} and {output_dir}/labels/{train|val}.

Notes:
    - The "primary class" is taken as the first class_id token on the first non-empty line
      of each label file. If a file contains instances of multiple classes, this script
      still assigns the file to a single class by that first token.
    - A class bucket with < 2 files cannot be split; those files are put into Train.
"""

# === CONFIGURATION ===
image_dir = "data_txt"     # Directory containing source images (.jpg, .png)
label_dir = "data_txt"     # Directory containing YOLO segmentation label files (.txt)
output_dir = "dataset"     # Output root directory
val_ratio = 0.2            # Validation split ratio

# === CREATE OUTPUT FOLDERS ===
for split in ["train", "val"]:
    os.makedirs(os.path.join(output_dir, "images", split), exist_ok=True)
    os.makedirs(os.path.join(output_dir, "labels", split), exist_ok=True)

# === MAP EACH LABEL FILE TO ITS PRIMARY CLASS_ID ===
# label_to_class: file stem -> class id taken from the first non-empty line.
# skipped_labels: label files left out of the split because they contain no
# instance line or their first token is not an integer. They are reported
# below so the gap between input files and train+val totals is visible.
label_to_class = {}
skipped_labels = []

for fname in os.listdir(label_dir):
    if not fname.endswith(".txt"):
        continue

    path = os.path.join(label_dir, fname)
    with open(path, "r", encoding="utf-8") as f:
        # Only the first non-empty line decides the primary class
        first_line = next((line for line in f if line.strip()), None)

    if first_line is None:
        skipped_labels.append(fname)
        continue

    try:
        main_class = int(first_line.split()[0])
    except ValueError:
        # A malformed file should not abort the whole split
        print(f"[WARN] Unparsable class id in {fname} -> excluded from the split.")
        skipped_labels.append(fname)
        continue

    base_name = os.path.splitext(fname)[0]
    label_to_class[base_name] = main_class

if skipped_labels:
    print(
        f"[WARN] {len(skipped_labels)} label file(s) without a usable class id "
        f"(empty or malformed) were excluded from the split."
    )

# === GROUP FILES BY CLASS_ID ===
class_to_files = defaultdict(list)
for base_name, class_id in label_to_class.items():
    class_to_files[class_id].append(base_name)

# === PER-CLASS SPLIT (OR FALL BACK TO TRAIN IF NOT ENOUGH FILES) ===
train_set = set()
val_set = set()

for class_id, files in class_to_files.items():
    if len(files) < 2:
        # Not enough files to split -> assign all to Train
        train_set.update(files)
        print(f"Class {class_id} has {len(files)} file(s) -> assigning to Train.")
    else:
        # Fixed random_state keeps the split reproducible between runs
        train_files, val_files = train_test_split(
            files, test_size=val_ratio, random_state=42, shuffle=True
        )
        train_set.update(train_files)
        val_set.update(val_files)

# === FILE COPY HELPER ===
def copy_file(base_name: str, split: str) -> None:
    """
    Copy one sample's image and its .txt label into the given split folder.

    Args:
        base_name: File stem shared by the image and its label file.
        split: Split name used as the subfolder under
            {output_dir}/images and {output_dir}/labels.

    The image is looked up in image_dir by trying each candidate extension in
    order; the first existing file is copied with its extension unchanged.
    A missing image or label is reported with a [WARN] line and does not stop
    the other file from being copied.
    """
    # Same image types the pairing check accepts (.jpg/.jpeg/.png, any case);
    # upper-case variants matter on case-sensitive filesystems.
    candidate_exts = (".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG")

    # Copy image
    for ext in candidate_exts:
        img_path = os.path.join(image_dir, base_name + ext)
        if os.path.exists(img_path):
            shutil.copy2(img_path, os.path.join(output_dir, "images", split, base_name + ext))
            break
    else:
        # Loop finished without a break: no candidate existed
        print(f"[WARN] Image not found for base name: {base_name}")

    # Copy label
    label_src = os.path.join(label_dir, base_name + ".txt")
    if os.path.exists(label_src):
        shutil.copy2(label_src, os.path.join(output_dir, "labels", split, base_name + ".txt"))
    else:
        print(f"[WARN] Label file not found: {base_name}.txt")

# === COPY EACH SAMPLE INTO ITS SPLIT FOLDER ===
for split_name, members in (("train", train_set), ("val", val_set)):
    for b in members:
        copy_file(b, split_name)

# === REPORT SPLIT SIZES ===
print("\nTrain/Val split by primary class completed.")
print(f"Train set size: {len(train_set)} file(s)")
print(f"Val set size  : {len(val_set)} file(s)")


Class 7 has 1 file(s) -> assigning to Train.

Train/Val split by primary class completed.
Train set size: 1618 file(s)
Val set size  : 411 file(s)
