In [5]:
# Show the kernel's working directory: the relative "../" paths used by the
# other cells of this notebook are resolved against it.
import os
print(os.getcwd())

c:\Users\jerry\IITM\CFI\Ai Club\2025-26\Precog\captcha_ocr\notebooks


In [None]:
# generate 100 random words

import random
import string

def random_capitalize(word):
    """Return `word` with the case of every character chosen at random.

    For each character, one `random.choice` call picks between its
    upper-case and lower-case form; characters without case (e.g. digits)
    are unchanged either way.
    """
    recased = []
    for ch in word:
        # Same draw as before: pick one of the two case variants uniformly.
        recased.append(random.choice([ch.upper(), ch.lower()]))
    return ''.join(recased)

# Alphabet the words are drawn from: a-z, A-Z and 0-9 (62 symbols).
# Built from `string` constants instead of a hand-typed literal; the value is identical.
vocab = string.ascii_lowercase + string.ascii_uppercase + string.digits

NUM_WORDS = 100    # number of distinct words to generate
WORD_LENGTH = 5    # characters per word
WORD_SEED = 42     # fixed seed: re-running the cell regenerates the same word list

# Dedicated generator so the result is reproducible without touching the
# state of the global `random` module.
rng = random.Random(WORD_SEED)

# Draw until NUM_WORDS *distinct* words exist. Sampling with replacement could
# otherwise emit the same word twice and silently yield fewer unique words
# than requested.
words = []
seen_words = set()
while len(words) < NUM_WORDS:
    word = ''.join(rng.choice(vocab) for _ in range(WORD_LENGTH))
    if word in seen_words:
        continue
    seen_words.add(word)
    words.append(word)

# Output columns: word_id, word, frequency.
# frequency is a dummy value (always 1), kept only for compatibility with the
# wiki_words format used in generate_captchas.
with open('../diverse_words.tsv', 'w', encoding='utf-8') as f:
    for word_id, word in enumerate(words):
        f.write(f"{word_id}\t{word}\t1\n")

In [3]:
# verifying the train and validation datasets

import sys
import yaml
import json
from pathlib import Path
from typing import List, Dict

# Add project root to path so we can import src
if "../" not in sys.path:
    sys.path.append("../")

from src.processor import CaptchaProcessor, ClassificationProcessor
from src.config.loader import hydrate_config
from src.config.config import ExperimentConfig

def load_config(config_path: str) -> ExperimentConfig:
    """Read a YAML experiment config and hydrate it.

    The file is looked up at `config_path` as given and, failing that, one
    directory up ("../" + config_path).

    Args:
        config_path: Location of the YAML file.

    Returns:
        Whatever `hydrate_config` produces from the parsed YAML data.

    Raises:
        FileNotFoundError: If neither candidate location exists.
    """
    candidates = (Path(config_path), Path("../") / config_path)
    # First existing candidate wins; None means both lookups failed.
    resolved = next((candidate for candidate in candidates if candidate.exists()), None)
    if resolved is None:
        raise FileNotFoundError(f"Config file not found: {config_path}")

    with open(resolved, 'r') as handle:
        raw = yaml.safe_load(handle)

    return hydrate_config(raw)

# Experiment definition; the loaded config is later handed to the processor.
config_path = "experiments/training_configs/classification/aconvnext_rnn_1.yaml"
config = load_config(config_path)

# Metadata locations. These are hard-coded here (they are not read from
# `config`); they are resolved further down, either as given or one directory up.
train_set_path, val_set_path = (
    "dataset_train/dataset/metadata.json",
    "dataset_test/metadata.json",
)

# Helper to resolve path relative to project root if running in notebook
def resolve_path(p):
    """Return `p` as a string, preferring a location that actually exists.

    Lookup order:
      1. `p` as given;
      2. "../" + `p`, used only when (1) does not exist but this one does.
    If neither exists, `p` is returned unchanged (as a string) so the caller's
    later `open` reports the original location.
    """
    direct = Path(p)
    if direct.exists():
        return str(direct)

    one_level_up = Path("../") / p
    return str(one_level_up if one_level_up.exists() else direct)

# Resolve the metadata locations (as given, or one directory up) before opening them.
train_set_path = resolve_path(train_set_path)
val_set_path = resolve_path(val_set_path)


def _load_metadata(path):
    """Read one metadata JSON file and return the decoded object."""
    # Decode explicitly as UTF-8 instead of relying on the platform's locale encoding.
    with open(path, 'r', encoding='utf-8') as f:
        return json.load(f)


def _collect_classes(entries):
    """Return the sorted, de-duplicated class labels of the given metadata entries.

    Each entry's label is its 'word_input' value, falling back to
    'word_rendered' and finally to the empty string.
    """
    return sorted({entry.get('word_input', entry.get('word_rendered', '')) for entry in entries})


print("--- Dataset Verification ---")
print(f"Config: {config_path}")

try:
    train_set = _load_metadata(train_set_path)
    val_set = _load_metadata(val_set_path)

    print(f"Train set: {train_set_path} ({len(train_set)} samples)")
    print(f"Val set:   {val_set_path} ({len(val_set)} samples)")

    # Distinct class labels found in each metadata file.
    train_classes = _collect_classes(train_set)
    val_classes = _collect_classes(val_set)

    print(f"\nClasses in train set: {len(train_classes)}")
    print(f"Classes in val set:   {len(val_classes)}")

    # Consistency check, direction 1: training classes never exercised by validation.
    missing_in_val = set(train_classes) - set(val_classes)
    if missing_in_val:
        print(f"WARNING: {len(missing_in_val)} classes in train but NOT in val.")
        print("Note: This is expected if 'generate' used random sampling with replacement (dataset_count > vocab_size).")
    else:
        print("SUCCESS: Validation set covers all training classes.")

    # Consistency check, direction 2: validation classes absent from training.
    # Previously unchecked, so such classes went unreported.
    unseen_in_train = set(val_classes) - set(train_classes)
    if unseen_in_train:
        print(f"WARNING: {len(unseen_in_train)} classes in val but NOT in train.")

    # Verifying with Processor
    print("\n--- Processor Verification ---")
    # The processor is pointed at the resolved training metadata, both through
    # the config attribute and through the explicit keyword argument.
    config.metadata_path = train_set_path
    processor = ClassificationProcessor(config, metadata_path=train_set_path)

    print(f"Processor class count: {len(processor.classes)}")
    if len(processor.classes) != len(train_classes):
        print(f"FAILURE: Mismatch! Processor has {len(processor.classes)} vs Metadata {len(train_classes)}")
    else:
        # Equal counts alone do not prove equal vocabularies: compare the labels too.
        missing_in_processor = set(train_classes) - set(processor.classes)
        if missing_in_processor:
            print(f"FAILURE: Class counts agree but {len(missing_in_processor)} metadata classes are absent from the processor vocabulary.")
        else:
            print("SUCCESS: Processor vocabulary matches metadata.")

    # Optional: print first 5 classes
    print(f"Sample classes: {processor.classes[:5]}...")

except Exception as e:
    # Deliberate best-effort: report the failure with its traceback instead of
    # aborting the notebook run.
    print(f"ERROR during verification: {e}")
    import traceback
    traceback.print_exc()


--- Dataset Verification ---
Config: experiments/training_configs/classification/aconvnext_rnn_1.yaml
Train set: ..\dataset_train\dataset\metadata.json (2000 samples)
Val set:   ..\dataset_test\metadata.json (400 samples)

Classes in train set: 100
Classes in val set:   99
Note: This is expected if 'generate' used random sampling with replacement (dataset_count > vocab_size).

--- Processor Verification ---
Processor class count: 100
SUCCESS: Processor vocabulary matches metadata.
Sample classes: ['0s637', '17EmL', '2H5au', '2NrY4', '2tNwn']...


In [16]:
# Re-check the working directory: the checkpoint in the following cell is
# loaded through a path relative to it.
import os
print(os.getcwd())

c:\Users\jerry\IITM\CFI\Ai Club\2025-26\Precog\captcha_ocr\notebooks


In [19]:
# Load the saved training checkpoint and list its top-level entries.
import torch

# Forward slashes resolve on Windows and POSIX alike; the previous backslash
# literal was only a valid path on Windows.
checkpoint_path = "../experiments/classification/aconvnext_rnn_1/checkpoints/best_classification_baseline.pth"

# NOTE: despite its name, `model` holds the whole checkpoint dictionary, not a
# network object. The name is kept because the next cell indexes `model[...]`.
# map_location="cpu" lets the file load on a machine without a GPU (the stored
# config reports device 'cuda').
# SECURITY: torch.load unpickles the file; only open checkpoints from a trusted source.
model = torch.load(checkpoint_path, map_location="cpu")
print(model.keys())

dict_keys(['epoch', 'state_dict', 'optimizer_state_dict', 'config', 'metrics'])


In [21]:
# Display the experiment configuration stored under the checkpoint's 'config' key.
model['config']

{'experiment_name': 'classification_baseline',
 'seed': 42,
 'dataset': {'width': 200,
  'height': 80,
  'noise_bg_density': 5000,
  'add_noise_dots': True,
  'add_noise_curve': True,
  'extra_spacing': -5,
  'spacing_jitter': 6,
  'target_height': 80,
  'width_divisor': 4,
  'width_bias': 0},
 'model': {'encoder_type': 'asymmetric_convnext',
  'projector_type': 'linear',
  'sequence_model_type': 'rnn',
  'head_type': 'classification',
  'd_vocab': 100,
  'd_model': 64,
  'loss_type': 'cross_entropy',
  'task_type': 'classification'},
 'training': {'batch_size': 32,
  'epochs': 10,
  'learning_rate': 0.0001,
  'optimizer_type': 'adamw',
  'weight_decay': 0.01,
  'grad_clip_norm': 1.0,
  'device': 'cuda'}}