# Breaking CAPTCHAs — PyTorch

This notebook provides a PyTorch implementation of the character-classifier CNN (LeNet-style) that matches the common TensorFlow/Keras architecture used in the lab.

**Model:**
- Input: (1, 20, 20)
- Conv(20, 5×5, padding='same') → ReLU → MaxPool(2)
- Conv(50, 5×5, padding='same') → ReLU → MaxPool(2)
- Flatten → FC(500) → ReLU → FC(n_classes)

> **Important:** Set `CHAR_IMAGE_FOLDER` to the folder that contains your per-class subfolders (e.g., `A/`, `B/`, ...).

In [None]:
# If you don't have PyTorch installed in this environment, run:
# !pip install torch torchvision

import os, pickle, math
import numpy as np
import cv2
import imutils
from imutils import paths

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset


## Helpers (preprocessing)

These mirror typical lab helper functions: resize to 20×20, scale to [0,1], and use **channel-first** `(1,20,20)` for PyTorch.

In [None]:
def resize_to_fit(image, width, height):
    """Return *image* resized to exactly ``width`` x ``height`` pixels.

    The image is first scaled with ``imutils.resize`` along its larger
    dimension (width when it is wider than tall, height otherwise), then
    padded on both sides of each axis by replicating the border pixels, and
    finally forced to the exact target size with ``cv2.resize``.
    """
    src_h, src_w = image.shape[:2]

    # Pick which dimension drives the first resize.
    resize_kwargs = {"width": width} if src_w > src_h else {"height": height}
    scaled = imutils.resize(image, **resize_kwargs)

    # Half of the remaining gap on each axis, truncated to whole pixels.
    pad_x = int((width - scaled.shape[1]) / 2.0)
    pad_y = int((height - scaled.shape[0]) / 2.0)

    padded = cv2.copyMakeBorder(
        scaled, pad_y, pad_y, pad_x, pad_x, cv2.BORDER_REPLICATE
    )

    # Truncation above can leave the result a pixel short of the target,
    # so a last resize pins the exact output dimensions.
    return cv2.resize(padded, (width, height))

def make_feature(image_gray_2d):
    """Build a PyTorch-ready feature array from a grayscale character image.

    The image is fitted to 20x20 with ``resize_to_fit``, converted to float32,
    divided by 255 (so 8-bit input lands in [0, 1]), and given a leading
    channel axis, yielding shape (1, 20, 20) for a 2D input.
    """
    fitted = resize_to_fit(image_gray_2d, 20, 20)
    scaled = fitted.astype(np.float32) / 255.0
    # Prepend the channel axis: (20,20) -> (1,20,20)
    return scaled[np.newaxis, ...]


## Dataset

Assumes character images are stored in a directory structure like:

```
CHAR_IMAGE_FOLDER/
  A/xxx.png
  B/yyy.png
  ...
```

The label is taken from the **parent directory name** of each image path.

In [None]:
class CharDataset(Dataset):
    """Dataset of character images stored as ``<folder>/<label>/<image>``.

    Image paths are collected once at construction time; the pixels are read
    lazily in ``__getitem__``. Each item is a ``(feature, label)`` pair where
    the feature comes from ``make_feature`` and the label is the integer index
    assigned by the supplied label encoder.
    """

    @staticmethod
    def _parent_dir_name(path):
        """Return the name of the directory that directly contains *path*."""
        return path.split(os.path.sep)[-2]

    def __init__(self, char_image_folder, label_encoder):
        self.label_encoder = label_encoder
        self.image_paths = list(paths.list_images(char_image_folder))

        # String labels come from the parent folder; the encoder maps them
        # to integer class indices.
        self.labels_str = [self._parent_dir_name(p) for p in self.image_paths]
        self.labels = self.label_encoder.transform(self.labels_str)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, idx):
        p = self.image_paths[idx]
        gray = cv2.imread(p, cv2.IMREAD_GRAYSCALE)
        # cv2.imread signals failure by returning None rather than raising.
        if gray is None:
            raise FileNotFoundError(f"Could not read image: {p}")

        feature = torch.from_numpy(make_feature(gray))  # (1,20,20)
        target = torch.tensor(self.labels[idx], dtype=torch.long)  # class index
        return feature, target


## Configure paths + build train/validation loaders

Set `CHAR_IMAGE_FOLDER` to your character dataset folder.

If the sanity check prints `num_images: 0`, your path is wrong or the dataset isn't extracted where you think it is.

In [None]:
# TODO: set this to match your lab's dataset location
CHAR_IMAGE_FOLDER = "./characters"
LABELS_PATH = "./labels.pkl"

# Sanity check: show what was actually found under the configured folder.
all_paths = list(paths.list_images(CHAR_IMAGE_FOLDER))
print("CHAR_IMAGE_FOLDER:", CHAR_IMAGE_FOLDER)
print("num_images:", len(all_paths))
print("sample_paths:", all_paths[:3])

# Fail fast with an actionable message. Without this, an empty folder still
# writes a useless labels file and only errors out later in the split.
if not all_paths:
    raise ValueError(
        f"No images found under {CHAR_IMAGE_FOLDER!r}; check CHAR_IMAGE_FOLDER "
        "and that the dataset is extracted there."
    )

# Build label encoder based on folder names (parent directory of each image)
labels_str_all = [p.split(os.path.sep)[-2] for p in all_paths]
le = LabelEncoder()
le.fit(labels_str_all)
n_classes = len(le.classes_)
print("n_classes:", n_classes)

# Save the label mapping (optional, like the TF lab)
with open(LABELS_PATH, "wb") as f:
    pickle.dump(le, f)

dataset = CharDataset(CHAR_IMAGE_FOLDER, le)

# Split by index so both subsets share the one underlying dataset object.
# The fixed random_state keeps the split repeatable for a given index range.
indices = np.arange(len(dataset))
train_idx, val_idx = train_test_split(indices, test_size=0.25, random_state=955996, shuffle=True)

train_ds = Subset(dataset, train_idx)
val_ds   = Subset(dataset, val_idx)

BATCH_SIZE = 32
# Only the training loader reshuffles; validation order stays fixed.
train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, shuffle=False, num_workers=0)


## Model (PyTorch)

This matches the common Keras LeNet-style model used in many CAPTCHA labs.

In [None]:
class CaptchaCNN(nn.Module):
    """LeNet-style classifier for single-channel 20x20 character images.

    Two 5x5 convolutions (20, then 50 filters), each followed by ReLU and a
    2x2 max-pool, feed a 500-unit hidden layer and a final linear layer that
    emits one raw logit per class (no softmax is applied here).
    """

    def __init__(self, n_classes):
        super().__init__()
        # padding=2 with a 5x5 kernel leaves the spatial size unchanged.
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=20, kernel_size=5, padding=2)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.conv2 = nn.Conv2d(in_channels=20, out_channels=50, kernel_size=5, padding=2)

        # Two poolings take 20x20 -> 10x10 -> 5x5, across 50 channels.
        self.fc1 = nn.Linear(in_features=50 * 5 * 5, out_features=500)
        self.fc2 = nn.Linear(in_features=500, out_features=n_classes)

    def forward(self, x):
        features = self.pool(F.relu(self.conv1(x)))         # (B,20,10,10)
        features = self.pool(F.relu(self.conv2(features)))  # (B,50,5,5)
        flat = features.flatten(start_dim=1)                # (B,1250)
        hidden = F.relu(self.fc1(flat))                     # (B,500)
        return self.fc2(hidden)                             # (B,n_classes) logits


## Train + validate

In [None]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("device:", device)

# Build the network and move its parameters onto the chosen device.
model = CaptchaCNN(n_classes)
model = model.to(device)

# The model emits raw logits, which is the input CrossEntropyLoss expects.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

def accuracy(loader):
    """Return the fraction of samples in *loader* that the model classifies correctly.

    Uses the module-level ``model`` and ``device``. The model is switched to
    eval mode and left there; gradients are disabled during the pass. An empty
    loader yields 0.0.
    """
    model.eval()
    n_correct, n_seen = 0, 0

    with torch.no_grad():
        for inputs, targets in loader:
            inputs = inputs.to(device)
            targets = targets.to(device)

            # Highest-scoring class per row is the prediction.
            predicted = model(inputs).argmax(dim=1)
            n_correct += (predicted == targets).sum().item()
            n_seen += targets.numel()

    if not n_seen:
        return 0.0
    return n_correct / n_seen

N_EPOCHS = 10

for epoch in range(1, N_EPOCHS + 1):
    # accuracy() leaves the model in eval mode, so re-enable training mode
    # at the start of every epoch.
    model.train()
    running_loss = 0.0

    for inputs, targets in train_loader:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        loss = criterion(model(inputs), targets)
        loss.backward()
        optimizer.step()

        # Weight each batch loss by its size so the epoch figure is a
        # per-sample average even when the last batch is smaller.
        running_loss += loss.item() * inputs.size(0)

    # Metrics are measured after the epoch's updates, with the model in eval mode.
    train_acc = accuracy(train_loader)
    val_acc = accuracy(val_loader)
    avg_loss = running_loss / len(train_loader.dataset)

    print(f"Epoch {epoch:02d}/{N_EPOCHS} | loss={avg_loss:.4f} | train_acc={train_acc:.4f} | val_acc={val_acc:.4f}")


## Save model weights (optional)

In [None]:
# Persist only the learned parameters (the state_dict), not the module object.
weights_path = "./captcha-model-pytorch.pt"
torch.save(model.state_dict(), weights_path)
print("Saved:", weights_path)


## Optional: Predict character labels with the trained model

If you already have a list of extracted character images (each a 2D grayscale image; `make_feature` resizes them to 20×20), you can batch-predict them and convert indices back to label strings.

In [None]:
def predict_chars_torch(char_images_20x20_gray):
    """Predict a label for each character image using the trained model.

    Relies on the module-level ``model``, ``device`` and label encoder ``le``.

    Args:
        char_images_20x20_gray: Iterable of grayscale character images. Each
            one is converted with ``make_feature`` before being batched.

    Returns:
        The output of ``le.inverse_transform`` on the predicted class indices,
        one label per input image. Empty input yields an empty array instead
        of an error.
    """
    # Materialise once so generators work and emptiness can be checked safely
    # (truth-testing a NumPy array directly would be ambiguous).
    images = list(char_images_20x20_gray)
    if not images:
        # np.stack raises ValueError on an empty sequence.
        return np.array([], dtype=le.classes_.dtype)

    feats = np.stack([make_feature(img) for img in images], axis=0)  # (N,1,20,20)
    batch = torch.from_numpy(feats).to(device)

    model.eval()
    with torch.no_grad():
        pred_idx = model(batch).argmax(dim=1).cpu().numpy()

    return le.inverse_transform(pred_idx)

# Example usage:
# pred_labels = predict_chars_torch(list_of_20x20_gray_images)
# print(pred_labels[:10])
