# Create RNN dataset

## Use YOLO model to predict Heldout set

In [None]:
from ultralytics import YOLO
from pathlib import Path
import json
import pandas as pd

# Run the YOLO detector over every image of the development split that exists
# on disk and dump the per-image detections to JSON.
df = pd.read_csv("../data/splits/development.csv")

model = YOLO('../runs/detect/pill_imprint_final/weights/best.pt')

image_dir = Path('../data/pillbox_production_images_full_202008')
# Rows without an 'original_name' get no path and are treated as missing.
df['image_path'] = df['original_name'].apply(lambda x: image_dir / x if pd.notna(x) else None)
df['exists'] = df['image_path'].apply(lambda x: x.exists() if x else False)

valid_images = df[df['exists']]['image_path'].tolist()

predictions = []
batch_size = 250

# Create the output directory up front: failing on a missing folder only after
# the whole inference loop has finished would throw away every prediction.
pred_path = Path('../data/predictions/developement_yolo_pred.json')
pred_path.parent.mkdir(parents=True, exist_ok=True)

print(f"Processing {len(valid_images)} images in batches of {batch_size}...")


for i in range(0, len(valid_images), batch_size):
    batch = valid_images[i:i+batch_size]
    results = model.predict(
        batch,
        conf=0.15,
        agnostic_nms=True,
        verbose=False)

    # Results come back in the same order as the batch, so zip pairs them up.
    for img_path, result in zip(batch, results):
        detections = []
        for box in result.boxes:
            class_id = int(box.cls)
            detections.append({
                'class_id': class_id,
                'class_name': result.names[class_id].upper(),  # Convert to uppercase
                'confidence': float(box.conf),
                # Normalized (x_center, y_center, width, height)
                'bbox': box.xywhn.tolist()[0]
            })
        predictions.append({
            'image': img_path.name,
            'detections': detections
        })

    print(f"Processed {min(i+batch_size, len(valid_images))}/{len(valid_images)}")

with open(pred_path, 'w') as f:
    json.dump(predictions, f, indent=2)

print(f"Saved predictions to data/predictions/developement_yolo_pred.json")

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as patches
from PIL import Image
import random

# Visual sanity check: reload the saved YOLO predictions and draw the boxes of
# five randomly chosen images side by side.
with open('../data/predictions/developement_yolo_pred.json') as f:
    predictions = json.load(f)

samples = random.sample(predictions, 5)
fig, axes = plt.subplots(1, 5, figsize=(20, 4))

for ax, pred in zip(axes, samples):
    img = Image.open(image_dir / pred['image'])
    img_width, img_height = img.size
    ax.imshow(img)

    for det in pred['detections']:
        # Stored boxes are normalized centre/size; matplotlib wants the
        # top-left corner and the size in pixels.
        x_center, y_center, w, h = det['bbox']
        x = (x_center - w/2) * img_width
        y = (y_center - h/2) * img_height
        box_width = w * img_width
        box_height = h * img_height

        ax.add_patch(
            patches.Rectangle(
                (x, y), box_width, box_height,
                linewidth=2, edgecolor='red', facecolor='none',
            )
        )

        # Class name and confidence just above the box
        label_style = dict(boxstyle='round,pad=0.3', facecolor='white', alpha=0.7)
        ax.text(
            x, y - 5, f"{det['class_name']} ({det['confidence']:.2f})",
            color='red', fontsize=10, fontweight='bold',
            bbox=label_style,
        )

    ax.set_title(pred['image'], fontsize=8)
    ax.axis('off')

plt.tight_layout()
plt.show()

## Use ResNet model to predict on Heldout set

In [None]:
import torch
import torch.nn as nn
from torchvision import models, transforms
from PIL import Image
from tqdm import tqdm
import json

# Use the MPS backend when it is available, otherwise fall back to the CPU.
if torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')


class MultiTaskResNet(nn.Module):
    """ResNet-18 feature extractor with three linear classification heads.

    Args:
        num_shapes: number of output classes of the shape head.
        num_colors: number of output classes of the color head.
        num_forms: number of output classes of the form head.
    """

    def __init__(self, num_shapes, num_colors, num_forms):
        super().__init__()
        # Untrained backbone; the original fc layer is swapped for an identity
        # so the backbone returns the pooled feature vector.
        self.backbone = models.resnet18(weights=None)
        feature_dim = self.backbone.fc.in_features
        self.backbone.fc = nn.Identity()
        # One independent linear head per task, all reading the same features.
        self.shape_head = nn.Linear(feature_dim, num_shapes)
        self.color_head = nn.Linear(feature_dim, num_colors)
        self.form_head = nn.Linear(feature_dim, num_forms)

    def forward(self, x):
        """Return the (shape, color, form) head outputs for the batch ``x``."""
        embedding = self.backbone(x)
        shape_logits = self.shape_head(embedding)
        color_logits = self.color_head(embedding)
        form_logits = self.form_head(embedding)
        return shape_logits, color_logits, form_logits

# SECURITY NOTE: weights_only=False unpickles arbitrary Python objects (the
# checkpoint is also read for its label encoders below), so only load
# checkpoint files from a trusted source.
# map_location='cpu' makes the load independent of the device the checkpoint
# was saved from; load_state_dict then copies the weights into the model that
# already lives on `device`.
checkpoint = torch.load(
    '../resnet_model/pill_classifier_full.pth',
    map_location='cpu',
    weights_only=False,
)
resnet_model = MultiTaskResNet(
    checkpoint['num_shape_classes'],
    checkpoint['num_color_classes'],
    checkpoint['num_form_classes']
).to(device)
resnet_model.load_state_dict(checkpoint['model_state_dict'])
resnet_model.eval()

# Resize/crop to 224x224 and normalize with the listed channel means/stds.
transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize([0.485, 0.456, 0.406], [0.229, 0.224, 0.225])
])

# The YOLO prediction file provides the list of image names to classify.
with open('../data/predictions/developement_yolo_pred.json') as f:
    yolo_preds = json.load(f)

resnet_preds = []
for i in tqdm(range(0, len(yolo_preds), 100)):
    batch = yolo_preds[i:i+100]

    # Open each image in a context manager so the file handle is released as
    # soon as the tensor has been produced.
    img_tensors = []
    for p in batch:
        with Image.open(image_dir / p['image']) as im:
            img_tensors.append(transform(im.convert('RGB')))
    imgs = torch.stack(img_tensors).to(device)

    with torch.no_grad():
        shape_out, color_out, form_out = resnet_model(imgs)

    # One argmax per head for the whole batch instead of one per image.
    shape_ids = shape_out.argmax(dim=1).tolist()
    color_ids = color_out.argmax(dim=1).tolist()
    form_ids = form_out.argmax(dim=1).tolist()

    for j, pred in enumerate(batch):
        # Map the predicted class index back to its label via the encoders
        # stored in the checkpoint.
        resnet_preds.append({
            'image': pred['image'],
            'shape': checkpoint['shape_encoder'].inverse_transform([shape_ids[j]])[0],
            'color': checkpoint['color_encoder'].inverse_transform([color_ids[j]])[0],
            'form': checkpoint['form_encoder'].inverse_transform([form_ids[j]])[0]
        })

with open('../data/predictions/developement_resnet_pred.json', 'w') as f:
    json.dump(resnet_preds, f, indent=2)

print(f"Saved {len(resnet_preds)} predictions")

# Preprocess Splimprint (ground truth labels)

In [None]:
from pathlib import Path
# Folder holding the pill images (same path as the one set in the YOLO cell).
# NOTE(review): presumably repeated so this section can run on its own — confirm.
image_dir = Path('../data/pillbox_production_images_full_202008')

Remove `_` from `splimprint_clean` and save the result as `labels`, because YOLO never predicts `_`.

In [None]:
import pandas as pd

# Ground-truth table for the development split
df = pd.read_csv("../data/splits/development.csv")

# 'labels' = 'splimprint_clean' with every literal underscore stripped
# (regex=False, so '_' is matched as plain text).
labels_without_underscore = df['splimprint_clean'].str.replace('_', '', regex=False)
df['labels'] = labels_without_underscore
df.head()

# Encode ResNet and YOLO predictions

In [None]:
import json
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

# Read back the two prediction files written earlier in the notebook.
with open('../data/predictions/developement_yolo_pred.json', 'r') as f:
    yolo_predictions = json.load(f)

with open('../data/predictions/developement_resnet_pred.json', 'r') as f:
    resnet_predictions = json.load(f)

# image file name -> cleaned ground-truth label (df comes from the previous cell)
label_dict = dict(zip(df['original_name'], df['labels']))

# Keep an image only if it has a non-null label and at least one detection.
yolo_filtered = [
    pred for pred in yolo_predictions
    if pred['image'] in label_dict
    and pd.notna(label_dict[pred['image']])
    and len(pred.get('detections', [])) > 0
]

print(f"Filtered: {len(yolo_predictions)} -> {len(yolo_filtered)} images")
print(f"  (removed {len(yolo_predictions) - len(yolo_filtered)} without detections or labels)")

# Collect the vocabularies from the data itself rather than hard-coding them.
all_chars = {
    det['class_name'].upper()
    for pred in yolo_filtered
    for det in pred.get('detections', [])
}
all_shapes = {entry['shape'] for entry in resnet_predictions}
all_colors = {entry['color'] for entry in resnet_predictions}
all_forms = {entry['form'] for entry in resnet_predictions}

# Sorted lists give every category a stable position.
ALL_CHARS = sorted(all_chars)
ALL_SHAPES = sorted(all_shapes)
ALL_COLORS = sorted(all_colors)
ALL_FORMS = sorted(all_forms)

print(f"\nFound {len(ALL_CHARS)} unique characters: {ALL_CHARS}")
print(f"Found {len(ALL_SHAPES)} unique shapes: {ALL_SHAPES}")
print(f"Found {len(ALL_COLORS)} unique colors: {ALL_COLORS}")
print(f"Found {len(ALL_FORMS)} unique forms: {ALL_FORMS}")

# One dense one-hot encoder per vocabulary; fit() returns the encoder itself.
char_encoder = OneHotEncoder(categories=[ALL_CHARS], sparse_output=False).fit(
    np.array(ALL_CHARS).reshape(-1, 1)
)
shape_encoder = OneHotEncoder(categories=[ALL_SHAPES], sparse_output=False).fit(
    np.array(ALL_SHAPES).reshape(-1, 1)
)
color_encoder = OneHotEncoder(categories=[ALL_COLORS], sparse_output=False).fit(
    np.array(ALL_COLORS).reshape(-1, 1)
)
form_encoder = OneHotEncoder(categories=[ALL_FORMS], sparse_output=False).fit(
    np.array(ALL_FORMS).reshape(-1, 1)
)

# image file name -> ResNet prediction record
resnet_dict = {}
for entry in resnet_predictions:
    resnet_dict[entry['image']] = entry

def sort_boxes_left_to_right(boxes):
    """Return the boxes in reading order: by row first, then left to right.

    Each box is a dict whose 'bbox' entry starts with (x_center, y_center).
    Rows are formed by rounding ``y_center * 10`` to the nearest integer, so
    boxes landing in the same bucket are ordered by ``x_center``. Boxes with
    identical keys keep their input order. The input list is not modified.
    """
    def reading_order_key(box):
        x_center, y_center = box['bbox'][0], box['bbox'][1]
        # Coarse row bucket first, exact horizontal position second
        return (round(y_center * 10), x_center)

    return sorted(boxes, key=reading_order_key)

def process_single_image(yolo_pred, resnet_pred):
    """Build the per-character feature matrix for one image.

    Args:
        yolo_pred: prediction record with an optional 'detections' list; each
            detection provides 'bbox' (x_center, y_center, w, h) and 'class_name'.
        resnet_pred: record with the image-level 'shape', 'color' and 'form'.

    Returns:
        A NumPy array with one row per detection, in the order given by
        ``sort_boxes_left_to_right``. Each row is
        ``[x_center, y_center, char one-hot, shape one-hot, color one-hot, form one-hot]``.
    """
    ordered_detections = sort_boxes_left_to_right(yolo_pred.get('detections', []))

    # The image-level one-hots are the same for every character, so they are
    # encoded once and appended to each row.
    context = np.concatenate([
        shape_encoder.transform([[resnet_pred['shape']]])[0],
        color_encoder.transform([[resnet_pred['color']]])[0],
        form_encoder.transform([[resnet_pred['form']]])[0],
    ])

    rows = []
    for det in ordered_detections:
        # Only the box centre goes into the features; width/height are unused.
        x_center, y_center, _w, _h = det['bbox']
        char_ohe = char_encoder.transform([[det['class_name'].upper()]])[0]
        rows.append(np.concatenate([[x_center, y_center], char_ohe, context]))

    return np.array(rows)

# Build one record per filtered image that also has a ResNet prediction.
processed_data = []
for yolo_pred in yolo_filtered:
    img_name = yolo_pred['image']
    if img_name not in resnet_dict:
        continue
    features = process_single_image(yolo_pred, resnet_dict[img_name])
    processed_data.append(dict(
        image=img_name,
        features=features,
        num_chars=len(features),
        target_str=label_dict[img_name],
    ))

# One row per image
df_processed = pd.DataFrame(processed_data)

# 2 coordinates plus the width of every one-hot block
feature_dim = 2 + sum(len(vocab) for vocab in (ALL_CHARS, ALL_SHAPES, ALL_COLORS, ALL_FORMS))
print(f"\nProcessed {len(df_processed)} images")
print(f"Feature vector size: {feature_dim}")
print(f"  - Coordinates: 2")
print(f"  - Character OHE: {len(ALL_CHARS)}")
print(f"  - Shape OHE: {len(ALL_SHAPES)}")
print(f"  - Color OHE: {len(ALL_COLORS)}")
print(f"  - Form OHE: {len(ALL_FORMS)}")

## Padding input feature sequences (YOLO characters + ResNet context)

In [7]:
import torch
from torch.nn.utils.rnn import pad_sequence

# One float tensor per image, each holding that image's per-character rows
sequences = list(map(torch.FloatTensor, df_processed['features'].tolist()))

# Zero-pad every sequence to the longest one; batch_first=True puts the
# sample dimension first.
X_padded = pad_sequence(sequences, padding_value=0.0, batch_first=True)

print(f"Padded sequences:")
print(f"  Shape: {X_padded.shape}")
print(f"  Max length: {X_padded.shape[1]}")
print(f"  Format: (num_samples, max_len, feature_dim)")

Padded sequences:
  Shape: torch.Size([7154, 81, 66])
  Max length: 81
  Format: (num_samples, max_len, feature_dim)


## Encoding and padding target sequences

In [8]:
# Extend the character vocabulary with the sequence-control tokens. The guard
# keeps a re-run of this cell from adding them twice.
special_tokens = ['<SOS>', '<EOS>', '<PAD>']
tokens_already_present = [t for t in special_tokens if t in ALL_CHARS]
if not tokens_already_present:
    # Entries starting with '<' sort ahead of everything else; within each
    # group the order is plain string order.
    ALL_CHARS = sorted(special_tokens + ALL_CHARS, key=lambda tok: (tok[0] != '<', tok))

# Token <-> integer index lookups
char_to_idx = dict(zip(ALL_CHARS, range(len(ALL_CHARS))))
idx_to_char = {position: token for token, position in char_to_idx.items()}

SOS_IDX, EOS_IDX, PAD_IDX = (char_to_idx[t] for t in special_tokens)

print(f"Vocabulary ({len(ALL_CHARS)} tokens):")
print(f"  ALL_CHARS: {ALL_CHARS}")
print(f"  SOS_IDX: {SOS_IDX}")
print(f"  EOS_IDX: {EOS_IDX}")
print(f"  PAD_IDX: {PAD_IDX}")

# Convert target strings to sequences of indices with SOS and EOS
def encode_target(text):
    """Turn a label into token indices framed by <SOS> and <EOS>.

    The label is converted with ``str()`` and upper-cased, then each character
    is looked up in ``char_to_idx``. Characters missing from the vocabulary
    are encoded as ``PAD_IDX``.

    NOTE(review): an unknown character therefore puts a PAD token in the
    middle of the target — confirm this is intended rather than dropping the
    character or using a dedicated unknown token.
    """
    encoded = [SOS_IDX]
    for character in str(text).upper():
        encoded.append(char_to_idx.get(character, PAD_IDX))
    encoded.append(EOS_IDX)
    return encoded

# Token-index version of every ground-truth label, stored next to the label
df_processed['target_encoded'] = df_processed['target_str'].apply(encode_target)

target_tensors = [torch.LongTensor(t) for t in df_processed['target_encoded'].tolist()]

# Longest encoded target (includes the SOS/EOS tokens)
max_target_len = df_processed['target_encoded'].apply(len).max()

# Right-pad every target with the PAD index up to the longest one
y_padded = pad_sequence(target_tensors, padding_value=PAD_IDX, batch_first=True)

first_row = df_processed.iloc[0]

print(f"\nTarget Encoding:")
print(f"  Total samples: {len(df_processed)}")
print(f"  X_padded shape: {X_padded.shape}")
print(f"  y_padded shape: {y_padded.shape}")
print(f"  Max target length: {max_target_len}")
print(f"\nExample:")
print(f"  Original: {first_row['target_str']}")
print(f"  Encoded: {first_row['target_encoded']}")
print(f"  Decoded: {[idx_to_char[i] for i in first_row['target_encoded']]}")
print(f"  Padded: {y_padded[0].tolist()}")

Vocabulary (38 tokens):
  ALL_CHARS: ['<EOS>', '<PAD>', '<SOS>', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
  SOS_IDX: 2
  EOS_IDX: 0
  PAD_IDX: 1

Target Encoding:
  Total samples: 7154
  X_padded shape: torch.Size([7154, 81, 66])
  y_padded shape: torch.Size([7154, 33])
  Max target length: 33

Example:
  Original: W
  Encoded: [2, 34, 0]
  Decoded: ['<SOS>', 'W', '<EOS>']
  Padded: [2, 34, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


# Split and save data

In [None]:
from sklearn.model_selection import train_test_split

# 80/10/10 train/val/test split over row positions of df_processed.
# Step 1: hold out 20% of the rows.
all_positions = range(len(df_processed))
train_idx, temp_idx = train_test_split(all_positions, test_size=0.2, random_state=42)

# Step 2: cut the held-out 20% in half -> 10% validation, 10% test.
val_idx, test_idx = train_test_split(temp_idx, test_size=0.5, random_state=42)

# Apply the same positions to inputs, targets and the metadata frame.
split_positions = (train_idx, val_idx, test_idx)
X_train, X_val, X_test = (X_padded[pos] for pos in split_positions)
y_train, y_val, y_test = (y_padded[pos] for pos in split_positions)
df_train, df_val, df_test = (
    df_processed.iloc[pos].reset_index(drop=True) for pos in split_positions
)

print(f"RNN Dataset Split:")
print(f"  Train: {len(X_train)} ({len(X_train)/len(df_processed)*100:.1f}%)")
print(f"  Val:   {len(X_val)} ({len(X_val)/len(df_processed)*100:.1f}%)")
print(f"  Test:  {len(X_test)} ({len(X_test)/len(df_processed)*100:.1f}%)")
print(f"  Total: {len(df_processed)}")

# Everything the RNN notebook needs goes into one file: tensors, metadata
# frames, vocabularies, encoders and dimensions.
rnn_dataset = {
    'X_train': X_train,
    'X_val': X_val,
    'X_test': X_test,
    'y_train': y_train,
    'y_val': y_val,
    'y_test': y_test,
    'df_train': df_train,
    'df_val': df_val,
    'df_test': df_test,
    'char_to_idx': char_to_idx,
    'idx_to_char': idx_to_char,
    'char_encoder': char_encoder,
    'shape_encoder': shape_encoder,
    'color_encoder': color_encoder,
    'form_encoder': form_encoder,
    'ALL_CHARS': ALL_CHARS,
    'ALL_SHAPES': ALL_SHAPES,
    'ALL_COLORS': ALL_COLORS,
    'ALL_FORMS': ALL_FORMS,
    'max_len': X_padded.shape[1],
    'max_target_len': max_target_len,
    'feature_dim': X_padded.shape[2],
    'SOS_IDX': SOS_IDX,
    'EOS_IDX': EOS_IDX,
    'PAD_IDX': PAD_IDX
}
torch.save(rnn_dataset, '../data/predictions/rnn_dataset.pt')

print(f"\n✓ Saved to data/predictions/rnn_dataset.pt")
print(f"  - Input shape: (batch, {X_padded.shape[1]}, {X_padded.shape[2]})")
print(f"  - Target shape: (batch, {max_target_len})")
print(f"  - Special tokens: SOS={SOS_IDX}, EOS={EOS_IDX}, PAD={PAD_IDX}")

# Example: How a Prediction is Encoded

In [10]:
"""
Clean and Simple: How a Prediction is Encoded
================================================

This shows the complete encoding pipeline for a single pill image:
1. YOLO detects characters (e.g., 'A', '6', '7')
2. ResNet predicts shape/color/form (e.g., OVAL, PINK, C42931)
3. Each character gets encoded with position + one-hot encodings
4. Target label is encoded with special tokens
"""

# Select a random example
import random
example_idx = random.randint(0, len(df_processed) - 1)
example = df_processed.iloc[example_idx]

print("=" * 80)
print("EXAMPLE: Encoding Pipeline")
print("=" * 80)
print(f"\nImage: {example['image']}")
print(f"Target Label: '{example['target_str']}'")
print(f"Number of Characters Detected: {example['num_chars']}")

# Step 1: Show YOLO detections
print("\n" + "-" * 80)
print("STEP 1: YOLO Character Detections (sorted left-to-right)")
print("-" * 80)

yolo_pred = next(p for p in yolo_filtered if p['image'] == example['image'])
resnet_pred = resnet_dict[example['image']]

sorted_boxes = sort_boxes_left_to_right(yolo_pred['detections'])
for i, det in enumerate(sorted_boxes):
    x, y, w, h = det['bbox']
    print(f"  Char {i+1}: '{det['class_name']}' at position ({x:.3f}, {y:.3f}), confidence: {det['confidence']:.3f}")

# Step 2: Show ResNet predictions
print("\n" + "-" * 80)
print("STEP 2: ResNet Global Predictions (same for all characters)")
print("-" * 80)
print(f"  Shape: {resnet_pred['shape']}")
print(f"  Color: {resnet_pred['color']}")
print(f"  Form:  {resnet_pred['form']}")

# Step 3: Show encoding for FIRST character in detail
print("\n" + "-" * 80)
print("STEP 3: Encoding FIRST Character (in detail)")
print("-" * 80)

if len(sorted_boxes) > 0:
    first_det = sorted_boxes[0]
    x, y, w, h = first_det['bbox']
    char = first_det['class_name'].upper()

    # Get encodings
    char_ohe = char_encoder.transform([[char]])[0]
    shape_ohe = shape_encoder.transform([[resnet_pred['shape']]])[0]
    color_ohe = color_encoder.transform([[resnet_pred['color']]])[0]
    form_ohe = form_encoder.transform([[resnet_pred['form']]])[0]

    # ALL_CHARS was rebound with the special tokens prepended AFTER
    # char_encoder was fitted, so its positions no longer line up with the
    # one-hot vector. The encoder's own category list is the reliable lookup.
    char_categories = char_encoder.categories_[0]

    print(f"\nCharacter: '{char}'")
    print(f"  Position: ({x:.3f}, {y:.3f})")
    print(f"\n  Character One-Hot ({len(char_ohe)} dims): {char_ohe}")
    print(f"    → Index {np.argmax(char_ohe)} corresponds to '{char_categories[np.argmax(char_ohe)]}'")
    print(f"\n  Shape One-Hot ({len(ALL_SHAPES)} dims): {shape_ohe}")
    print(f"    → Index {np.argmax(shape_ohe)} corresponds to '{ALL_SHAPES[np.argmax(shape_ohe)]}'")
    print(f"\n  Color One-Hot ({len(ALL_COLORS)} dims): {color_ohe}")
    print(f"    → Index {np.argmax(color_ohe)} corresponds to '{ALL_COLORS[np.argmax(color_ohe)]}'")
    print(f"\n  Form One-Hot ({len(ALL_FORMS)} dims): {form_ohe}")
    print(f"    → Index {np.argmax(form_ohe)} corresponds to '{ALL_FORMS[np.argmax(form_ohe)]}'")

    # Concatenate
    feature_vector = np.concatenate([[x, y], char_ohe, shape_ohe, color_ohe, form_ohe])
    print(f"\n  Final Feature Vector ({len(feature_vector)} dims):")
    print(f"    [x={x:.3f}, y={y:.3f}, char_ohe({len(char_ohe)}), shape_ohe({len(shape_ohe)}), color_ohe({len(color_ohe)}), form_ohe({len(form_ohe)})]")
    print(f"    Shape: {feature_vector.shape}")

# Step 4: Show full sequence encoding
print("\n" + "-" * 80)
print("STEP 4: Full Sequence Encoding (all characters)")
print("-" * 80)
print(f"  Sequence Length: {len(example['features'])}")
print(f"  Feature Vector Dimension: {len(example['features'][0])}")
print(f"  Sequence Shape: {example['features'].shape}")

# Step 5: Show target encoding
print("\n" + "-" * 80)
print("STEP 5: Target Label Encoding")
print("-" * 80)
print(f"  Original: '{example['target_str']}'")
print(f"  Encoded:  {example['target_encoded']}")
print(f"  Decoded:  {[idx_to_char[i] for i in example['target_encoded']]}")
print(f"\n  Breakdown:")
for i, idx in enumerate(example['target_encoded']):
    print(f"    Position {i}: {idx:2d} → '{idx_to_char[idx]}'")

# Step 6: Show padded version
print("\n" + "-" * 80)
print("STEP 6: After Padding (for batching)")
print("-" * 80)

# X_padded / y_padded were built in df_processed row order, so the sampled
# row position addresses the matching padded rows directly. (Looking the row
# up again by image name could select a different row if a name repeats.)
original_idx = example_idx
X_sample = X_padded[original_idx]
y_sample = y_padded[original_idx]

print(f"  Input (X) Padded Shape: {X_sample.shape}")
print(f"    → (max_sequence_length={X_sample.shape[0]}, feature_dim={X_sample.shape[1]})")
print(f"    → Padding added: {X_sample.shape[0] - len(example['features'])} zeros")

print(f"\n  Target (y) Padded Shape: {y_sample.shape}")
print(f"    → Original: {example['target_encoded']}")
print(f"    → Padded:   {y_sample.tolist()}")
print(f"    → Padding added: {y_sample.shape[0] - len(example['target_encoded'])} PAD tokens ({PAD_IDX})")

print("\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)
print(f"Input:  {example['num_chars']} characters → Padded to {X_sample.shape[0]} timesteps")
print(f"        Each timestep has {X_sample.shape[1]} features")
print(f"Target: '{example['target_str']}' → {example['target_encoded']} → Padded to {y_sample.shape[0]}")
# Report the actual token indices instead of hard-coded numbers.
print(f"        Special tokens: <SOS>={SOS_IDX}, <EOS>={EOS_IDX}, <PAD>={PAD_IDX}")
print("=" * 80)

EXAMPLE: Encoding Pipeline

Image: 00555086002.jpg
Target Label: 'B860100'
Number of Characters Detected: 7

--------------------------------------------------------------------------------
STEP 1: YOLO Character Detections (sorted left-to-right)
--------------------------------------------------------------------------------
  Char 1: 'B' at position (0.522, 0.228), confidence: 0.990
  Char 2: '8' at position (0.320, 0.709), confidence: 0.989
  Char 3: '6' at position (0.387, 0.707), confidence: 0.981
  Char 4: '0' at position (0.451, 0.709), confidence: 0.947
  Char 5: '1' at position (0.577, 0.708), confidence: 0.966
  Char 6: '0' at position (0.629, 0.709), confidence: 0.949
  Char 7: '0' at position (0.696, 0.711), confidence: 0.933

--------------------------------------------------------------------------------
STEP 2: ResNet Global Predictions (same for all characters)
--------------------------------------------------------------------------------
  Shape: OVAL
  Color: WHITE
