In [None]:
from models import FixedLengthCRNN, VariableLengthCRNN

# Fixed length CAPTCHA

# Variable length CAPTCHA model

### Model Architecture Overview

#### Input Layer
- **Size**: Matches the preprocessed CAPTCHA images, e.g., height 60 × width 160 with 1 channel (grayscale). Note that PyTorch expects channels-first tensors, i.e. `(1, 60, 160)` per image.

#### Convolutional Layers
- **Purpose**: Extract features. Increase depth while reducing spatial dimensions through pooling.
- **Configuration**: Several convolutional layers, each followed by max pooling layers.

#### Recurrent Layer
- **Type**: RNN with LSTM or GRU units.
- **Purpose**: Process features in sequence, crucial for recognizing characters in CAPTCHA.

#### Dense Layer with Softmax
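- **Function**: Multi-class classification at each time step of the feature sequence (one class per possible character, plus the CTC blank symbol).
- **Activation**: Softmax, to output probabilities for each possible character. PyTorch's `nn.CTCLoss` expects log-probabilities, so apply `log_softmax` to the model output when training with CTC.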

#### Connectionist Temporal Classification (CTC) Loss
- **Purpose**: Handles the variable length of CAPTCHA strings and the unknown alignment between input time steps and target characters.

### Rationale
- **CNN + RNN**: CNNs are excellent for spatial feature extraction, RNNs understand sequence dynamics. This combo is effective for spatial and sequential recognition tasks like CAPTCHA.
- **CTC Loss**: Ideal for sequence recognition with variable lengths and unknown alignment.

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# Model architecture
class VariableCRNN(nn.Module):
    """CNN + bidirectional LSTM that emits per-time-step class logits for CTC.

    The convolutional stack downsamples the image by a factor of 4 in both
    height and width. Each remaining column of the feature map becomes one
    time step of the sequence fed to the LSTM, so an input of width ``W``
    yields ``W // 4`` time steps.

    Args:
        input_height: Height (in pixels) of the images the model will receive.
            It fixes the LSTM input size, so it must match the data.
        num_classes: Number of output classes per time step, including the
            CTC blank (default: 10 digits + 1 blank).
    """

    def __init__(self, input_height=60, num_classes=11):
        super().__init__()
        # nn.Conv2d has no `activation` argument (that is a Keras idiom); the
        # ReLU is applied in forward(). padding=1 keeps the spatial size for a
        # 3x3 kernel, so only the pooling layers shrink the feature map.
        self.conv1 = nn.Conv2d(1, 32, kernel_size=(3, 3), padding=1)
        self.pool1 = nn.MaxPool2d((2, 2))
        self.conv2 = nn.Conv2d(32, 64, kernel_size=(3, 3), padding=1)
        self.pool2 = nn.MaxPool2d((2, 2))
        self.conv3 = nn.Conv2d(64, 64, kernel_size=(3, 3), padding=1)
        # Features per time step: 64 channels x the height left after two 2x2 pools.
        self.rnn_input_size = 64 * (input_height // 4)
        self.lstm = nn.LSTM(self.rnn_input_size, 128, bidirectional=True, batch_first=True)
        self.fc = nn.Linear(256, num_classes)  # 128 * 2 for bidirectional

    def forward(self, x):
        """Return raw logits of shape ``(batch, time, num_classes)``.

        Args:
            x: Image batch of shape ``(batch, 1, input_height, width)``.

        Raises:
            ValueError: If the feature map height does not match the
                ``input_height`` the model was built for.

        For ``nn.CTCLoss`` apply ``log_softmax`` over the last dimension and
        permute the result to ``(time, batch, num_classes)``.
        """
        x = F.relu(self.conv1(x))
        x = self.pool1(x)
        x = F.relu(self.conv2(x))
        x = self.pool2(x)
        x = F.relu(self.conv3(x))

        batch, channels, height, width = x.shape
        if channels * height != self.rnn_input_size:
            raise ValueError(
                f'Feature size per time step is {channels * height}, but the model '
                f'was built for {self.rnn_input_size}; check input_height.'
            )
        # (N, C, H, W) -> (N, W, C*H): every image column is one time step.
        x = x.permute(0, 3, 1, 2).reshape(batch, width, channels * height)
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x

# CTC Loss
# PyTorch already implements CTC loss (nn.CTCLoss), so no custom loss is needed.
# You provide log-probabilities from your model, targets (labels), input_lengths, and target_lengths.
#
# The blank symbol is placed on the extra 11th class (index 10). nn.CTCLoss defaults to
# blank=0, which would collide with the digit "0" because labels are built as int(ch), i.e. 0-9.
# NOTE(review): assumes the model's 11 outputs are digits 0-9 followed by the blank - confirm.
ctc_loss = nn.CTCLoss(blank=10)

# Example on how to calculate CTC Loss
# log_probs: tensor of shape (T, N, C) where T is the maximum sequence length, N is the batch size,
#            C is the number of classes (including blank)
# labels: 1-D tensor of shape (sum(target_lengths),) with all targets concatenated
# input_lengths: tensor of size (N)
# target_lengths: tensor of size (N)
# loss = ctc_loss(log_probs, labels, input_lengths, target_lengths)

# Note: nn.CTCLoss expects log-probabilities, so apply log_softmax over the class dimension
# to the model output before passing it to the loss.


# Model training

In [None]:
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os

class DigitsDataset(Dataset):
    """Digit-CAPTCHA images whose file name (without extension) is the label.

    Args:
        directory: Folder containing the image files.
        image_size: ``(height, width)`` every image is resized to.
        mode: PIL mode the image is converted to before it becomes a tensor.
            The default ``'RGB'`` gives 3-channel tensors; pass ``'L'`` for
            single-channel (grayscale) tensors.
    """

    def __init__(self, directory, image_size=(46, 160), mode='RGB'):
        self.directory = directory
        self.mode = mode
        # os.listdir returns entries in arbitrary order and includes hidden
        # files / sub-directories. Sort for a reproducible index -> sample
        # mapping and keep only regular, non-hidden files.
        self.filenames = sorted(
            name for name in os.listdir(directory)
            if not name.startswith('.')
            and os.path.isfile(os.path.join(directory, name))
        )
        self.transform = transforms.Compose([
            transforms.Resize(image_size),  # Resize if the images aren't already this size
            transforms.ToTensor(),  # Convert images to PyTorch tensors
        ])

    def __len__(self):
        """Return the number of image files in the dataset."""
        return len(self.filenames)

    def __getitem__(self, idx):
        """Return ``(image_tensor, label_tensor)`` for the file at ``idx``."""
        filename = self.filenames[idx]
        path = os.path.join(self.directory, filename)
        # Context manager guarantees the underlying file handle is released.
        with Image.open(path) as raw_image:
            image = self.transform(raw_image.convert(self.mode))
        return image, self._label_from_filename(filename)

    @staticmethod
    def _label_from_filename(filename):
        """Turn a file name such as ``'0192.png'`` into ``tensor([0, 1, 9, 2])``.

        Raises:
            ValueError: If the name (without extension) contains a non-digit.
        """
        stem = os.path.splitext(filename)[0]  # Assumes filename is the label
        try:
            digits = [int(ch) for ch in stem]
        except ValueError:
            raise ValueError(
                f'Cannot derive a digit label from file name {filename!r}'
            ) from None
        return torch.tensor(digits, dtype=torch.long)

In [None]:
from torch import optim

# Initialize dataset and data loader
dataset = DigitsDataset('/path/to/your/dataset')
data_loader = DataLoader(dataset, batch_size=32, shuffle=True)

# Initialize the model, loss function, and optimizer
model = FixedLengthCRNN()
loss_function = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Number of passes over the dataset. This name was used by the loop below but
# never defined, so the cell failed on a fresh kernel; tune as needed.
num_epochs = 10

# Training loop
model.train()  # make sure dropout / batch-norm layers (if any) are in training mode
for epoch in range(num_epochs):
    running_loss = 0.0
    num_batches = 0

    for images, labels in data_loader:
        optimizer.zero_grad()

        outputs = model(images)

        # outputs are expected as (batch_size, num_positions, num_classes) and labels as
        # (batch_size, num_positions): compute the loss per character position and sum up.
        # NOTE(review): FixedLengthCRNN is defined outside this notebook - confirm its output shape.
        loss = sum(
            loss_function(outputs[:, i], labels[:, i])
            for i in range(labels.size(1))
        )

        loss.backward()
        optimizer.step()

        running_loss += loss.item()
        num_batches += 1

    # Report the mean loss over the epoch rather than only the last batch;
    # max(..., 1) keeps this safe when the data loader yields no batches.
    print(f'Epoch {epoch+1}, Loss: {running_loss / max(num_batches, 1)}')
