# 🛠️ Data Preparation V4 (Save Class List)

**Update:** Automatically saves `classes.json` into the Run ID folder to ensure class consistency during inference.
**Path:** /workspace/AiTaxonomy

In [1]:
import sys
import subprocess
import os
import glob
import random
import shutil
import json
import numpy as np
import tensorflow as tf
from tqdm.notebook import tqdm
from datetime import datetime, timezone, timedelta
import cv2
from PIL import Image
import pillow_heif

# ================= CONFIGURATION =================
# Source images: every sub-directory of DATA_DIR is treated as one class.
DATA_DIR = r"/workspace/Archive/All-Species"
# Each run gets its own sub-folder (named by run ID) under OUTPUT_BASE_DIR.
OUTPUT_BASE_DIR = r"/workspace/AiTaxonomy/TFRecords_AllSpecies_B6"
LOG_DIR = r"/workspace/AiTaxonomy/TF-Training-Logs-B6"

IMG_SIZE = 528            # images are resized to IMG_SIZE x IMG_SIZE before encoding
VAL_SPLIT = 0.2           # fraction of the shuffled file list used for validation
SEED = 123                # seed for the train/val shuffle
IMAGES_PER_SHARD = 2000   # input paths consumed per TFRecord shard
# =================================================

# Make sure both working directories exist before anything is written.
for _required_dir in (LOG_DIR, OUTPUT_BASE_DIR):
    os.makedirs(_required_dir, exist_ok=True)

def get_user_input(prompt):
    """Ask the user a question on stdin and return the reply without surrounding whitespace.

    Args:
        prompt: Text shown to the user.

    Returns:
        The typed reply with leading and trailing whitespace removed
        (an empty string when the user just presses Enter).
    """
    raw_reply = input(prompt)
    return raw_reply.strip()

def get_thai_timestamp():
    """Return the current time in the UTC+7 zone formatted as ``YYYYMMDD-HHMMSS``.

    The result is used as the identifier of a new run.
    """
    utc_plus_7 = timezone(timedelta(hours=7))
    now_local = datetime.now(tz=utc_plus_7)
    return now_local.strftime("%Y%m%d-%H%M%S")

# Plain literal: the message has no placeholders, so no f-string is needed.
print("‚úÖ Configuration Loaded.")

2025-12-04 14:17:45.455337: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-12-04 14:17:45.495496: I tensorflow/core/platform/cpu_feature_guard.cc:211] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE3 SSE4.1 SSE4.2 AVX, in other operations, rebuild TensorFlow with the appropriate compiler flags.


‚úÖ Configuration Loaded.


In [2]:
# =============================================================================
# 📝 STEP 1: Select ID
# =============================================================================

# List the run folders already present so the user can pick one to reuse.
if os.path.exists(OUTPUT_BASE_DIR):
    existing_ids = sorted(os.listdir(OUTPUT_BASE_DIR))
else:
    existing_ids = []
print(f"üìÇ Existing IDs: {existing_ids}")

user_id = get_user_input("Enter RUN ID to resume/overwrite (or press Enter for NEW): ")

if user_id:
    RUN_TIMESTAMP = user_id
    target_dir = os.path.join(OUTPUT_BASE_DIR, RUN_TIMESTAMP)
    if not os.path.exists(target_dir):
        # Unknown ID: start a new run under the name the user typed.
        MODE = 'NEW'
    else:
        print("1) Resume  2) Overwrite  3) Cancel")
        choice = get_user_input("Select: ")
        # Any answer other than '1' or '2' cancels.
        MODE = {'1': 'RESUME', '2': 'OVERWRITE'}.get(choice, 'CANCEL')
else:
    # Empty answer: create a fresh run named after the current UTC+7 time.
    RUN_TIMESTAMP = get_thai_timestamp()
    MODE = 'NEW'
    print(f"‚ú® NEW ID: {RUN_TIMESTAMP}")

üìÇ Existing IDs: ['20251202-115257', '20251203-065556', '20251204-211157']


Enter RUN ID to resume/overwrite (or press Enter for NEW):  20251204-211157


1) Resume  2) Overwrite  3) Cancel


Select:  1


In [3]:
# =============================================================================
# 📝 STEP 2: Processing Functions
# =============================================================================

def _bytes_feature(value):
    """Wrap a bytes value in a ``tf.train.Feature`` holding a one-element ``BytesList``.

    Args:
        value: The payload. If it is an instance of the type returned by
            ``tf.constant(0)``, it is unwrapped with ``.numpy()`` first.

    Returns:
        A ``tf.train.Feature`` whose ``bytes_list`` contains the single value.
    """
    constant_tensor_type = type(tf.constant(0))
    payload = value.numpy() if isinstance(value, constant_tensor_type) else value
    byte_list = tf.train.BytesList(value=[payload])
    return tf.train.Feature(bytes_list=byte_list)

def _int64_feature(value):
    """Wrap a single integer in a ``tf.train.Feature`` holding a one-element ``Int64List``.

    Args:
        value: The integer to store (used here for the class label).

    Returns:
        A ``tf.train.Feature`` whose ``int64_list`` contains the single value.
    """
    int_list = tf.train.Int64List(value=[value])
    return tf.train.Feature(int64_list=int_list)

def serialize_example(image_string, label):
    """Build a ``tf.train.Example`` from one image/label pair and serialize it.

    Args:
        image_string: Encoded image bytes, stored under the ``'image'`` key.
        label: Integer class index, stored under the ``'label'`` key.

    Returns:
        The result of ``SerializeToString()`` on the assembled example.
    """
    image_feature = _bytes_feature(image_string)
    label_feature = _int64_feature(label)
    features = tf.train.Features(feature={'image': image_feature, 'label': label_feature})
    example = tf.train.Example(features=features)
    return example.SerializeToString()

def process_image_safely(img_path, target_size):
    """Load one image, resize it to a square and re-encode it as JPEG bytes.

    ``.heic`` / ``.heif`` files are decoded with ``pillow_heif``; every other
    extension is decoded by OpenCV from the raw file contents.

    Args:
        img_path: Path of the image file to read.
        target_size: Edge length in pixels. The image is resized to
            ``target_size x target_size`` (aspect ratio is not preserved).

    Returns:
        JPEG-encoded ``bytes`` (quality 95), or ``None`` when the file is
        missing, empty, cannot be decoded or encoded, or any other
        ``Exception`` is raised while processing it.
    """
    try:
        if not os.path.exists(img_path) or os.path.getsize(img_path) == 0:
            return None

        ext = os.path.splitext(img_path)[1].lower()
        if ext in ('.heic', '.heif'):
            heif_file = pillow_heif.read_heif(img_path)
            image = Image.frombytes(heif_file.mode, heif_file.size, heif_file.data, "raw")
            # Normalise to RGB first so the array is always 3-channel,
            # whatever mode the HEIF file was stored in, before the RGB->BGR swap.
            img = cv2.cvtColor(np.array(image.convert("RGB")), cv2.COLOR_RGB2BGR)
        else:
            with open(img_path, "rb") as stream:
                raw = np.frombuffer(stream.read(), dtype=np.uint8)
            img = cv2.imdecode(raw, cv2.IMREAD_COLOR)

        if img is None or img.size == 0:
            return None

        img = cv2.resize(img, (target_size, target_size))
        is_success, img_encoded = cv2.imencode('.jpg', img, [int(cv2.IMWRITE_JPEG_QUALITY), 95])
        if not is_success:
            return None
        return img_encoded.tobytes()
    except Exception:
        # Best effort: one bad file must not abort the whole run. Catching
        # Exception (not a bare except) lets KeyboardInterrupt / SystemExit
        # propagate, so the user can still stop a long-running conversion.
        return None

def write_tfrecords(data, output_dir, prefix, class_map, resume=False):
    """Encode a list of image files into sharded TFRecord files.

    Shards are named ``{prefix}_{NNNN}.tfrecord``. A new shard is started every
    ``IMAGES_PER_SHARD`` *input paths* (skipped images still count), which is
    what lets a resumed run turn a shard number back into an index in ``data``.

    Args:
        data: Sequence of image paths. The name of each path's parent directory
            is looked up in ``class_map`` to obtain the label.
        output_dir: Directory that receives the shards (created if missing).
        prefix: File-name prefix of the shards.
        class_map: Mapping from class (directory) name to integer label. Paths
            whose class is not in the map are skipped.
        resume: When True, the newest existing shard is deleted (it may be
            incomplete) and writing continues from the first image of that
            shard. ``data`` must be in the same order as in the earlier run.

    Returns:
        None.
    """
    os.makedirs(output_dir, exist_ok=True)
    start_index = 0
    shard_idx = 0

    if resume:
        files = sorted(glob.glob(os.path.join(output_dir, f"{prefix}_*.tfrecord")))
        if files:
            last_file = files[-1]
            try:
                os.remove(last_file)
            except OSError:
                # Best effort: a failed delete is tolerated because the shard
                # with this index is written again below.
                pass
            shard_idx = len(files) - 1
            start_index = shard_idx * IMAGES_PER_SHARD
            print(f"üîÑ Resuming from index {start_index} (Shard {shard_idx})")

    if start_index >= len(data):
        print(f"‚úÖ {prefix} already complete.")
        return

    writer = None
    skipped = 0
    data_to_process = data[start_index:]
    print(f"Processing {len(data_to_process)} images...")

    try:
        for i, img_path in tqdm(enumerate(data_to_process), total=len(data_to_process)):
            if writer is None or (i % IMAGES_PER_SHARD == 0):
                if writer is not None:
                    writer.close()
                    writer = None  # avoid a second close in `finally` if opening fails
                shard_path = os.path.join(output_dir, f"{prefix}_{shard_idx:04d}.tfrecord")
                writer = tf.io.TFRecordWriter(shard_path)
                shard_idx += 1

            class_name = os.path.basename(os.path.dirname(img_path))
            label = class_map.get(class_name)
            if label is None:
                skipped += 1
                continue

            img_bytes = process_image_safely(img_path, IMG_SIZE)
            if not img_bytes:
                skipped += 1
                continue

            writer.write(serialize_example(img_bytes, label))
    finally:
        # Always close the current shard, including on errors and interrupts,
        # so the open writer is not leaked.
        if writer is not None:
            writer.close()

    if skipped:
        print(f"{prefix}: skipped {skipped} image(s) (unknown class or unreadable file).")
    print(f"‚úÖ {prefix} Done.")

In [None]:
# =============================================================================
# 📝 STEP 3: EXECUTE & SAVE CLASS MAP
# =============================================================================

# Runs the whole preparation for the run chosen in STEP 1: builds (or reloads)
# the class list and the train/val file split, then writes the TFRecord shards.
#
# A resumed run must use exactly the class indices and file order of the run
# that wrote the existing shards. Both are therefore saved in the run folder
# (classes.json, file_split.json) and reloaded on RESUME instead of being
# recomputed from a directory that may have changed in the meantime.
if MODE != 'CANCEL':
    SAVE_DIR = os.path.join(OUTPUT_BASE_DIR, RUN_TIMESTAMP)
    is_resume = (MODE == 'RESUME')

    if MODE == 'OVERWRITE' and os.path.isdir(SAVE_DIR):
        print(f"üóëÔ∏è Deleting old data in {SAVE_DIR}...")
        shutil.rmtree(SAVE_DIR)

    os.makedirs(SAVE_DIR, exist_ok=True)
    class_json_path = os.path.join(SAVE_DIR, "classes.json")
    split_json_path = os.path.join(SAVE_DIR, "file_split.json")

    # 1. Class list -> class map (index in the list is the label)
    if is_resume and os.path.exists(class_json_path):
        # Existing shards were labelled with this list; rescanning could shift
        # the indices if class folders were added or removed since.
        with open(class_json_path, "r", encoding="utf-8") as f:
            classes = json.load(f)
        print(f"Reusing saved Class List from: {class_json_path}")
    else:
        print("üîç Scanning classes...")
        classes = sorted([d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))])
        with open(class_json_path, "w", encoding="utf-8") as f:
            json.dump(classes, f, ensure_ascii=False, indent=4)
        print(f"üíæ Saved Class List to: {class_json_path}")
    class_map = {name: i for i, name in enumerate(classes)}

    # 2. Train/val split
    if is_resume and os.path.exists(split_json_path):
        # Reuse the exact split (and order) of the interrupted run.
        with open(split_json_path, "r", encoding="utf-8") as f:
            saved_split = json.load(f)
        train_files = saved_split["train"]
        val_files = saved_split["val"]
        all_files = val_files + train_files
        print(f"Reusing saved file split from: {split_json_path}")
    else:
        all_files = []
        valid_ext = {'.jpg', '.jpeg', '.png', '.bmp', '.webp', '.heic', '.heif'}

        for cls in tqdm(classes):
            cls_path = os.path.join(DATA_DIR, cls)
            if os.path.exists(cls_path):
                file_names = os.listdir(cls_path)
                if not is_resume:
                    # os.listdir order is arbitrary; sorting makes the seeded
                    # shuffle reproducible. Left untouched when resuming a run
                    # that has no saved split, to mirror how it was created.
                    file_names.sort()
                for file_name in file_names:
                    if os.path.splitext(file_name)[1].lower() in valid_ext:
                        # Files whose name contains 'bark01' are excluded.
                        if 'bark01' not in file_name.lower():
                            all_files.append(os.path.join(cls_path, file_name))

        # Shuffle & split: the first VAL_SPLIT fraction becomes validation.
        random.seed(SEED)
        random.shuffle(all_files)
        val_count = int(len(all_files) * VAL_SPLIT)
        train_files = all_files[val_count:]
        val_files = all_files[:val_count]

        # Save the split so a later RESUME sees the identical order.
        with open(split_json_path, "w", encoding="utf-8") as f:
            json.dump({"train": train_files, "val": val_files}, f)

    print(f"üìä Valid Files: {len(all_files)} (Train: {len(train_files)}, Val: {len(val_files)})")

    # 3. Write the shards
    write_tfrecords(train_files, os.path.join(SAVE_DIR, 'train'), 'train_data', class_map, resume=is_resume)
    write_tfrecords(val_files, os.path.join(SAVE_DIR, 'val'), 'val_data', class_map, resume=is_resume)

    print(f"\nüéâ DATA PREP COMPLETED. ID: {RUN_TIMESTAMP}")

üîç Scanning classes...
üíæ Saved Class List to: /workspace/AiTaxonomy/TFRecords_AllSpecies_B6/20251204-211157/classes.json


  0%|          | 0/542 [00:00<?, ?it/s]

üìä Valid Files: 169283 (Train: 135427, Val: 33856)
üîÑ Resuming from index 0 (Shard 0)
Processing 135427 images...


  0%|          | 0/135427 [00:00<?, ?it/s]

2025-12-04 14:18:33.393450: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 90683 MB memory:  -> device: 0, name: NVIDIA H100 NVL, pci bus id: 0000:26:00.0, compute capability: 9.0
2025-12-04 14:18:33.394841: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1928] Created device /job:localhost/replica:0/task:0/device:GPU:1 with 90683 MB memory:  -> device: 1, name: NVIDIA H100 NVL, pci bus id: 0000:8a:00.0, compute capability: 9.0
