# Captcha Recognition with CNN (TensorFlow/Keras)

This notebook implements a Convolutional Neural Network (CNN) to recognize captcha characters. 
It replaces the traditional SVM + HOG approach with a deep learning model for potentially higher accuracy.

## Pipeline
1. **Load Data**: Download dataset from Kaggle.
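2. **Preprocessing**: Binarize each captcha (Otsu threshold plus morphological clean-up) and segment it into single characters with a column projection (same mechanism as the current project).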
3. **Data Preparation**: Convert segmented characters to 32x32 arrays for CNN input.
4. **Model**: Build and train a CNN.
5. **Evaluation**: Plot the training curves and report per-class metrics and overall test accuracy.

In [None]:
# 1. Install Dependencies
!pip install -q scikit-image scikit-learn pandas joblib kagglehub matplotlib seaborn tqdm tensorflow

In [None]:
# 2. Imports
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from tqdm.notebook import tqdm
from skimage.io import imread
from skimage.color import rgb2gray
from skimage.filters import threshold_otsu
from skimage.transform import resize
from skimage.morphology import opening, footprint_rectangle, remove_small_objects
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import kagglehub

import tensorflow as tf
from tensorflow.keras import layers, models, callbacks

print(f"TensorFlow Version: {tf.__version__}")

In [None]:
# 3. Preprocessing Logic (Consistent with Project)

def preprocess_image_v3(img):
    """Binarize a captcha image into a cleaned foreground mask.

    The image is converted to grayscale, resized to 50x200, thresholded with
    a global Otsu threshold, polarity-normalized so that characters are 1 and
    background is 0, and finally de-noised.

    Args:
        img: Image array, e.g. as returned by ``skimage.io.imread``. Accepted
            layouts are 2-D grayscale, or 3-D with 1 (gray), 2 (gray + alpha),
            3 (RGB) or 4 (RGBA) channels on the last axis.

    Returns:
        A ``(50, 200)`` ``uint8`` array containing only 0 and 1, where 1 marks
        foreground (character) pixels.
    """
    if img.ndim == 3:
        n_channels = img.shape[2]
        if n_channels in (1, 2):
            # Gray or gray+alpha: rgb2gray rejects these layouts, so use the
            # luminance plane directly and ignore any alpha channel.
            img_gray = img[..., 0]
        else:
            if n_channels == 4:
                # RGBA: drop the alpha channel before the gray conversion.
                img = img[..., :3]
            img_gray = rgb2gray(img)
    else:
        # Already a 2-D grayscale array.
        img_gray = img

    img_resized = resize(img_gray, (50, 200))

    # Global Otsu threshold.
    thresh = threshold_otsu(img_resized)
    img_bin = img_resized > thresh

    # Background check: the four corner pixels vote on the background
    # polarity. If fewer than half are above the threshold the background is
    # dark, so the bright pixels are already the foreground; otherwise the
    # mask is inverted.
    corners = [img_bin[0, 0], img_bin[0, -1], img_bin[-1, 0], img_bin[-1, -1]]
    if np.mean(corners) < 0.5:
        img_inv = img_bin.astype(np.uint8)
    else:
        img_inv = (1 - img_bin).astype(np.uint8)

    # Clean up: a 2x2 opening removes thin noise, then connected components
    # smaller than 20 pixels are discarded.
    img_cleaned = opening(img_inv, footprint_rectangle((2, 2)))
    img_cleaned = remove_small_objects(img_cleaned.astype(bool), min_size=20).astype(np.uint8)

    return img_cleaned

def _column_runs(mask):
    """Return half-open ``(start, end)`` index pairs for every run of truthy values in *mask*."""
    runs = []
    run_start = None
    for i, val in enumerate(mask):
        if val and run_start is None:
            run_start = i
        elif not val and run_start is not None:
            runs.append((run_start, i))
            run_start = None
    if run_start is not None:
        # The last run touches the right edge of the image.
        runs.append((run_start, len(mask)))
    return runs


def _split_run(start, end, min_width=2, max_width=50, nominal_width=32):
    """Convert one column run into character spans: drop slivers, split merged characters evenly."""
    width = end - start
    if width < min_width:
        return []
    if width <= max_width:
        return [(start, end)]
    # A run this wide is assumed to hold several touching characters; cut it
    # into equal parts of roughly ``nominal_width`` columns each.
    num_splits = round(width / nominal_width)
    split_w = width / num_splits
    return [
        (int(start + s * split_w), int(start + (s + 1) * split_w))
        for s in range(num_splits)
    ]


def segment_characters_v2(img_cleaned, num_chars=5):
    """Cut a binarized captcha into fixed-size character images.

    Columns whose foreground count exceeds 10% of the busiest column are
    treated as character columns. Each contiguous run of such columns becomes
    one character; runs narrower than 2 columns are dropped and runs wider
    than 50 columns are split evenly. Every run gets the same treatment,
    including one that touches the right image edge.

    Args:
        img_cleaned: 2-D 0/1 foreground mask (rows x columns), such as the
            output of ``preprocess_image_v3``.
        num_chars: Number of character images to return.

    Returns:
        A list of exactly ``num_chars`` float arrays of shape ``(32, 32)``.
        Only the first ``num_chars`` segments (left to right) are used; if
        fewer are found the list is padded with all-zero images.
    """
    projection = np.sum(img_cleaned, axis=0)

    char_indices = []
    if projection.size:  # a zero-width image has no columns to threshold
        is_char = projection > np.max(projection) * 0.1
        for run_start, run_end in _column_runs(is_char):
            char_indices.extend(_split_run(run_start, run_end))

    characters = []
    for (start, end) in char_indices[:num_chars]:
        # Cast to float before resizing: resize() rescales integer images by
        # their dtype range, which would squash a 0/1 uint8 mask into
        # 0..1/255 instead of keeping it in 0..1.
        char_img = img_cleaned[:, start:end].astype(np.float64)
        h, w = char_img.shape
        if h == 0 or w == 0:
            continue

        # Pad the shorter side symmetrically so the crop is square and the
        # resize below does not distort the aspect ratio.
        diff = abs(h - w)
        p1, p2 = diff // 2, diff - (diff // 2)
        if h > w:
            char_img = np.pad(char_img, ((0, 0), (p1, p2)), mode='constant')
        else:
            char_img = np.pad(char_img, ((p1, p2), (0, 0)), mode='constant')

        characters.append(resize(char_img, (32, 32)))

    # Guarantee a fixed-length result for the caller.
    while len(characters) < num_chars:
        characters.append(np.zeros((32, 32)))

    return characters

In [None]:
# 4. Prepare Data
# Download the captcha dataset, segment every image into characters and build
# the (X, y_enc) arrays used for training. The file stem is the captcha text,
# so each of its characters labels the segment at the same position.

print("Downloading dataset...")
path = kagglehub.dataset_download("fournierp/captcha-version-2-images")
images_dir = Path(path) / "samples"
image_files = list(images_dir.glob("*.png")) + list(images_dir.glob("*.jpg"))
if not image_files:
    # Fail here with a clear message rather than much later during the split.
    raise FileNotFoundError(f"No .png/.jpg images found in {images_dir}")

X = []
y = []
failed = []            # (file name, exception) for images that could not be processed
length_mismatches = 0  # images whose label length differs from the segment count

print("Processing images...")
for image_path in tqdm(image_files, desc="Segmenting"):
    label_text = image_path.stem
    try:
        img = imread(str(image_path))
        img_cleaned = preprocess_image_v3(img)
        char_images = segment_characters_v2(img_cleaned, num_chars=5)
    except Exception as e:
        # Best effort: one unreadable image must not abort the whole run, but
        # the failure is recorded so it can be reported below.
        failed.append((image_path.name, e))
        continue

    # NOTE(review): segment_characters_v2 pads its result with blank images up
    # to num_chars, so this check only filters on label length; captchas with
    # too few detected segments still contribute blank samples.
    if len(char_images) != len(label_text):
        length_mismatches += 1
        continue

    X.extend(char_images)
    y.extend(label_text)

if failed:
    first_name, first_error = failed[0]
    print(f"Skipped {len(failed)} image(s) that failed to process; first: {first_name}: {first_error!r}")
if length_mismatches:
    print(f"Skipped {length_mismatches} image(s) whose label length does not match the segment count")
if not X:
    raise RuntimeError("No character samples were produced from the dataset")

X = np.array(X)
X = X.reshape(-1, 32, 32, 1)  # Add the channel dimension expected by the CNN
y = np.array(y)

le = LabelEncoder()
y_enc = le.fit_transform(y)

print(f"Original X Shape: {X.shape}")
print(f"Classes: {le.classes_}")
print(f"Num Classes: {len(le.classes_)}")

In [None]:
# 5. Build CNN Model

num_classes = len(le.classes_)

def create_cnn_model(n_classes=None, input_shape=(32, 32, 1)):
    """Build and compile the character-classification CNN.

    Three 3x3 convolution stages (32, 64, 64 filters; the first two followed
    by 2x2 max pooling) feed a 64-unit dense layer with dropout and a softmax
    output. The model is compiled with Adam and sparse categorical
    cross-entropy, so labels are expected as integer class indices.

    Args:
        n_classes: Size of the softmax output. Defaults to the module-level
            ``num_classes``.
        input_shape: Shape of one input sample, ``(height, width, channels)``.

    Returns:
        The compiled ``Sequential`` model.
    """
    if n_classes is None:
        n_classes = num_classes

    model = models.Sequential([
        # Explicit Input layer: passing ``input_shape`` to the first layer is
        # deprecated in Keras 3.
        layers.Input(shape=input_shape),

        # Conv Block 1
        layers.Conv2D(32, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),

        # Conv Block 2
        layers.Conv2D(64, (3, 3), activation='relu'),
        layers.MaxPooling2D((2, 2)),

        # Conv Block 3 (no pooling afterwards)
        layers.Conv2D(64, (3, 3), activation='relu'),

        layers.Flatten(),
        layers.Dense(64, activation='relu'),
        layers.Dropout(0.2),  # Reduce overfitting
        layers.Dense(n_classes, activation='softmax'),
    ])

    model.compile(optimizer='adam',
                  loss='sparse_categorical_crossentropy',
                  metrics=['accuracy'])
    return model

model = create_cnn_model()
model.summary()

In [None]:
# 6. Train Model

# Hold out 20% of the samples as a test set, stratified by class so every
# character keeps the same proportion in both splits.
X_train, X_test, y_train, y_test = train_test_split(X, y_enc, test_size=0.2, random_state=42, stratify=y_enc)

# Stop once the validation loss has not improved for 5 epochs and roll back
# to the best weights seen.
early_stopping = callbacks.EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# Validation data is carved out of the training set rather than reusing the
# test set: early stopping selects weights on the validation data, so using
# X_test here would bias the test accuracy upwards. validation_split takes the
# last 10% of X_train, which train_test_split has already shuffled.
history = model.fit(
    X_train, y_train,
    epochs=30,
    batch_size=32,
    validation_split=0.1,
    callbacks=[early_stopping]
)

In [None]:
# 7. Evaluation

# One panel per metric: (train key, validation key, train label, validation label, title).
curve_specs = [
    ('accuracy', 'val_accuracy', 'Train Acc', 'Val Acc', 'Accuracy'),
    ('loss', 'val_loss', 'Train Loss', 'Val Loss', 'Loss'),
]

plt.figure(figsize=(12, 4))
for panel, (train_key, val_key, train_label, val_label, title) in enumerate(curve_specs, start=1):
    plt.subplot(1, 2, panel)
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.legend()
    plt.title(title)
plt.show()

# Detailed report: the predicted class is the index of the highest softmax
# probability; both label vectors are mapped back to characters for readability.
y_pred_probs = model.predict(X_test)
y_pred = y_pred_probs.argmax(axis=1)

true_labels = le.inverse_transform(y_test)
predicted_labels = le.inverse_transform(y_pred)
print(classification_report(true_labels, predicted_labels))

# Overall loss and accuracy on the held-out test set.
test_loss, test_acc = model.evaluate(X_test, y_test)
print(f"\nTest Accuracy: {test_acc*100:.2f}%")