In [102]:
# import needed lib
import random
import numpy as np
import cv2
from matplotlib import pyplot as plt
import os

In [82]:
# read all the captcha 
def load_data(file_path):
    n_samples = len(os.listdir(file_path))
    imgs = []
    labels = []

    # read from the directory
    for i, img in enumerate(os.listdir(file_path)):
        if (i == 10):
            break;
        imgs.append(cv2.resize(cv2.imread(os.path.join(file_path, img), 0), (300, 57)))
        labels.append(img[0:-6])

    return np.array(imgs), labels

    # # print the ddtype
    # print(X[0].dtype) # uint8

    # # preprocess the image
    # for i in range(len(X)):
    #     X[i] = cv2.resize(X[i], (300, 57)) # reshape it to the same size
    #     X[i] = cv2.equalizeHist(X[i]) # increase the contrast of the img
    #     # X[i] = cv2.medianBlur(X[i], (2)) # try to remove noise term

In [70]:
def preprocess_data(imgs):
    processed_data = imgs
    return processed_data

In [72]:
from sklearn.preprocessing import LabelEncoder

# Encode labels as sequences of integers
def encode_labels(labels, max_length):
    label_encoder = LabelEncoder()
    label_encoder.fit(list("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789"))  # All possible characters
    encoded_labels = [label_encoder.transform(list(label)) for label in labels]
    padded_labels = np.array([np.pad(label, (0, max_length - len(label)), mode='constant') for label in encoded_labels]) # ? do we need to pad
    return padded_labels, label_encoder

In [122]:
from keras.models import Model
from keras.layers import Input, Conv2D, MaxPooling2D, Reshape, LSTM, Dense, Lambda
import keras.backend as K

# Define the CNN + RNN model
def create_crnn_model(input_shape, num_classes, max_length=6):
    # Input layer
    input_layer = Input(shape=input_shape)

    # CNN layers
    x = Conv2D(32, (3, 3), activation='relu', padding='same')(input_layer)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(64, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)
    x = Conv2D(128, (3, 3), activation='relu', padding='same')(x)
    x = MaxPooling2D((2, 2))(x)

    # Reshape for RNN
    x = Reshape((259, 128))(x)  # Reshape to (time_steps, features)

    # RNN layers
    x = LSTM(128, return_sequences=True)(x)
    x = LSTM(64, return_sequences=True)(x)

    # Output layer
    output_layer = Dense(num_classes + 1, activation='softmax')(x)  # +1 for CTC blank token

    # Create the model
    model = Model(inputs=input_layer, outputs=output_layer)
    return model

In [148]:
# CTC loss function
import tensorflow as tf

def ctc_loss(y_true, y_pred):
    # Get batch size, input length, and label length
    batch_size = tf.shape(y_true)[0]
    input_length = tf.shape(y_pred)[1]
    label_length = tf.shape(y_true)[1]

    # Cast input_length and label_length to int64
    input_length = tf.cast(input_length, dtype=tf.int64)
    label_length = tf.cast(label_length, dtype=tf.int64)

    # Create tensors for input_length and label_length
    input_length = input_length * tf.ones((batch_size, 1), dtype=tf.int64)
    label_length = label_length * tf.ones((batch_size, 1), dtype=tf.int64)

    # Compute CTC loss
    return tf.keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length)

In [136]:
from sklearn.model_selection import train_test_split
def split_data(X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train, X_val, y_train, y_val

In [154]:
# Predict a CAPTCHA image
def predict_captcha(image_path):
    # Preprocess the image
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    image = cv2.resize(image, (300, 57))  # Resize to match model input
    image = image / 255.0  # Normalize
    image = image.reshape(1, 57, 300, 1)  # Reshape for model input

    # Predict the CAPTCHA
    predictions = model.predict(image)
    decoded = K.ctc_decode(predictions, input_length=np.ones(predictions.shape[0]) * predictions.shape[1], greedy=True)[0][0]
    decoded = K.eval(decoded)  # Convert tensor to numpy array
    captcha_text = "".join(label_encoder.inverse_transform(decoded[0]))
    return captcha_text

In [None]:
# Evaluate the model on the validation set
def evaluate_model(model, X_val, y_val):
    correct = 0
    for i in range(len(X_val)):
        image = X_val[i].reshape(1, 32, 128, 1)
        prediction = model.predict(image)
        decoded = K.ctc_decode(prediction, input_length=np.ones(prediction.shape[0]) * prediction.shape[1], greedy=True)[0][0]
        decoded = K.eval(decoded)
        predicted_text = "".join(label_encoder.inverse_transform(decoded[0]))
        true_text = "".join(label_encoder.inverse_transform(y_val[i]))
        if predicted_text == true_text:
            correct += 1
    accuracy = correct / len(X_val)
    return accuracy

# Calculate accuracy
accuracy = evaluate_model(model, X_val, y_val)
print(f"Validation Accuracy: {accuracy * 100:.2f}%")

In [160]:
if __name__ == '__main__':
    # loading data
    imgs, labels = load_data("./data/train") # (10, 57, 300), (10)

    # # print the ddtype
    # print(imgs[0].dtype) # uint8
    
    # # print the original img
    # plt.figure(figsize=(8,8))
    # for i in range(len(imgs)):
    #     plt.subplot(5, 2, i+1)
    #     plt.imshow(imgs[i])
    #     plt.xlabel(f"{labels[i]}")
    # plt.show()

    # preprocess data
    processed_imgs = preprocess_data(imgs)

    # encode labels
    max_length = max(len(label) for label in labels)
    encoded_labels, label_encoder = encode_labels(labels, max_length)
    # print(label_encoder.inverse_transform(encoded_labels[0]), labels[0]) # check for the correctness of the encoding

    # Reshape images for CNN input
    processed_imgs_reshape = processed_imgs.reshape(-1, 57, 300, 1)  # (batch_size, height, width, channels)

    # Create the model
    input_shape = (57, 300, 1)  # Input image shape (height, width, channels)
    num_classes = len(label_encoder.classes_)  # Number of unique characters
    model = create_crnn_model(input_shape, num_classes)
    
    # Compile the model
    model.compile(optimizer='adam', loss=ctc_loss)

    # Split the data
    X_train, X_val, y_train, y_val = split_data(processed_imgs, encoded_labels)

    # Train the model
    history = model.fit(X_train, y_train, epochs=20, batch_size=32, validation_data=(X_val, y_val))
    
    # Save the model
    model.save("captcha_crnn_model.keras")

Epoch 1/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 8s/step - loss: 988.0058 - val_loss: 955.2221
Epoch 2/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 452ms/step - loss: 949.3893 - val_loss: 994.0943
Epoch 3/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 501ms/step - loss: 910.9783 - val_loss: 908.1239
Epoch 4/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 473ms/step - loss: 858.3888 - val_loss: 928.1896
Epoch 5/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 558ms/step - loss: 833.3911 - val_loss: 894.3937
Epoch 6/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 687ms/step - loss: 787.4226 - val_loss: 850.4167
Epoch 7/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 541ms/step - loss: 737.7437 - val_loss: 831.0126
Epoch 8/20
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 486ms/step - loss: 704.5043 - val_loss: 808.0816
Epoch 9/20
[1m1/1[0m [32