In [1]:
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
import cv2

In [2]:
from PIL import Image, ImageDraw, ImageFont

font_path = '../dados/targa/Targa.ttf'

In [3]:
import tensorflow as tf

import keras

2024-07-26 16:44:59.341670: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
!wget  -nc https://www.dropbox.com/scl/fi/uaiyxp0t2l8hfcszfadtj/dados.zip?rlkey=lnqcb79vbu8j6cdbfgofogius&dl=1
!unzip -n -q dados.zip?rlkey=lnqcb79vbu8j6cdbfgofogius

/bin/bash: /home/igu/miniconda3/envs/ml/lib/python3.9/site-packages/cv2/../../../../lib/libtinfo.so.6: no version information available (required by /bin/bash)
/bin/bash: /home/igu/miniconda3/envs/ml/lib/python3.9/site-packages/cv2/../../../../lib/libtinfo.so.6: no version information available (required by /bin/bash)
unzip:  cannot find or open dados.zip?rlkey=lnqcb79vbu8j6cdbfgofogius, dados.zip?rlkey=lnqcb79vbu8j6cdbfgofogius.zip or dados.zip?rlkey=lnqcb79vbu8j6cdbfgofogius.ZIP.

No zipfiles found.


In [5]:
image_path = '../dados/CAPTCHA-10k/treinamento'
def generate_df(image_path):
  label_path = '../dados/CAPTCHA-10k/labels10k'

  jpg_files = [f for f in os.listdir(image_path) if f.endswith('.jpg')]
  jpg_files.sort()
  data = []

  for jpg_file in jpg_files:
      txt_file = os.path.splitext(jpg_file)[0] + '.txt'
      txt_file_path = os.path.join(label_path, txt_file)

      if os.path.exists(txt_file_path):
          with open(txt_file_path, 'r') as file:
              txt_content = file.read().strip()

          data.append({'jpg_file': jpg_file, 'txt_content': txt_content})
  return pd.DataFrame(data)

df = generate_df(image_path)
df.head()

Unnamed: 0,jpg_file,txt_content
0,000001.jpg,RNINIC
1,000002.jpg,TVCFS8
2,000003.jpg,N1O1EH
3,000004.jpg,OQZSL4
4,000005.jpg,GST2YA


In [6]:
vocab = [ '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z']
num_classes = len(vocab)
char_to_index = {char: idx for idx, char in enumerate(vocab)}

In [7]:
def generate_clean_captcha(text):
    # Fixed parameters
    size = (180, 50)  # Change size to (height, width)
    font_size = 24
    num_parts = 6

    # Create a blank white image
    image = Image.new('L', size, 255)  # 'L' mode for grayscale

    # Load the custom font
    font = ImageFont.truetype(font_path, font_size)

    # Create a drawing context
    draw = ImageDraw.Draw(image)

    # Calculate positions for each part
    part_width = size[0] / num_parts
    horizontal_positions = [int(part_width * i + part_width / 2) for i in range(num_parts)]
    horizontal_positions = horizontal_positions[:len(text)]  # Adjust to the length of the text

    # Calculate y position to center the text vertically
    text_bbox = draw.textbbox((0, 0), text, font=font)
    text_height = text_bbox[3] - text_bbox[1]
    text_y = (size[1] - text_height) // 2

    # Draw each letter at the calculated position
    for char, x in zip(text, horizontal_positions):
        char_bbox = draw.textbbox((0, 0), char, font=font)
        char_width = char_bbox[2] - char_bbox[0]
        char_x = x - char_width // 2  # Center the character horizontally within its part
        draw.text((char_x, text_y), char, font=font, fill=0)

    # Convert to numpy array if needed for further processing with OpenCV
    captcha_image = np.array(image)

    return captcha_image

In [8]:
def preprocess(img):
  kernel  = cv2.getStructuringElement(cv2.MORPH_RECT, (4, 4))
  img     = cv2.morphologyEx(img, cv2.MORPH_CLOSE, kernel)
  _, img  = cv2.threshold(img, 90, 255, cv2.THRESH_BINARY)
  return img

In [9]:
def generate_X_Y(image_path):
  df = generate_df(image_path)
  X = [preprocess(cv2.imread(os.path.join(image_path, x),cv2.IMREAD_GRAYSCALE)) for x in df["jpg_file"]]
  X = np.array(X)
  X = np.expand_dims(X, axis=-1)

  Y = np.array([generate_clean_captcha(x[:6]) for x in df["txt_content"]])
  X = X.astype('float32') / 255.
  Y = Y.astype('float32') / 255.

  
  return X,Y,df['txt_content']

X_train, Y_train,labels_train = generate_X_Y('../dados/CAPTCHA-10k/treinamento')
X_val, Y_val,labels_val = generate_X_Y('../dados/CAPTCHA-10k/validacao')

In [10]:
def build_dataset(X,predictions, labels, batch_size=32):
    def encode_labels(labels):
        # Create an array to store one-hot encoded labels
        encoded_labels = np.zeros((len(labels), 6, num_classes), dtype=np.float32)
        
        for i, label in enumerate(labels):
            for j, char in enumerate(label[:6]):
                index = char_to_index.get(char, -1)
                encoded_labels[i, j, index] = 1.0
        return encoded_labels
    
    Y = encode_labels(labels)
    
    # Create TensorFlow datasets from X and Y
    dataset = tf.data.Dataset.from_tensor_slices((X, predictions,Y))

    # Shuffle, batch, and prefetch the dataset
    buffer_size = len(X)  # Typically set to the size of the dataset
    dataset = dataset.shuffle(buffer_size).batch(batch_size).prefetch(tf.data.AUTOTUNE)

    return dataset
train_dataset = build_dataset(X_train,Y_train, labels_train)
val_dataset = build_dataset(X_val,Y_val, labels_val)

2024-07-26 16:45:05.856219: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1639] Created device /job:localhost/replica:0/task:0/device:GPU:0 with 13795 MB memory:  -> device: 0, name: NVIDIA GeForce RTX 4060 Ti, pci bus id: 0000:01:00.0, compute capability: 8.9


In [11]:
def rmse(y_true, y_pred):
    return tf.sqrt(tf.reduce_mean(tf.square(y_pred - y_true)))

def psnr(y_true, y_pred):
    max_pixel = 1.0
    return tf.image.psnr(y_true, y_pred, max_val=max_pixel)

In [12]:
autoencoder = tf.keras.models.load_model('model_BCE_aug_best_unet.tf',custom_objects={"rmse": rmse,"psnr":psnr})
classifier = tf.keras.models.load_model('classifier_pre_trained.tf')

In [13]:
def classification_loss(y_true, y_pred):
    # Reshape y_true and y_pred to match the shape expected for loss calculation
    return tf.reduce_mean(tf.keras.losses.CategoricalCrossentropy()(y_true, y_pred))

def reconstruction_loss(autoencoder_output, captcha_predictions):
    return tf.keras.losses.binary_crossentropy(tf.keras.backend.flatten(autoencoder_output), tf.keras.backend.flatten(captcha_predictions))

In [14]:
LAMBDA = 10
@tf.function
def train_step(input_images, true_images, labels, autoencoder, classifier, optimizer):
    with tf.GradientTape() as tape:
        # Forward pass through the autoencoder
        autoencoder_output  = autoencoder(input_images, training=True)
        predictions_list = []

        c_loss = 0.0
        interval = [0,30,60,90,120,150,180]
        for i in range(1,len(interval)):
            fake_img = autoencoder_output[:,:,interval[i-1]:interval[i],:]
            y_pred = classifier(fake_img, training=True)

            predictions_list.append(y_pred)
            c_loss += classification_loss(labels[:,i-1], y_pred)

        c_loss /= len(interval)
        r_loss = reconstruction_loss(autoencoder_output, true_images)
        loss = r_loss + LAMBDA*c_loss
        
    # Compute gradients
    gradients = tape.gradient(loss, autoencoder.trainable_variables + classifier.trainable_variables)
    
    # Apply gradients
    optimizer.apply_gradients(zip(gradients, autoencoder.trainable_variables + classifier.trainable_variables))

    #  Compute accuracy
    predictions = tf.concat(predictions_list, axis=0)  # Concatenate predictions from all patches
    predicted_labels = tf.argmax(predictions, axis=-1)  # Convert to class indices
    predicted_labels = tf.reshape(predicted_labels,[-1,6])
    true_labels = tf.argmax(labels, axis=-1)  # Convert true labels to class indices
    accuracy = tf.reduce_mean(tf.cast(tf.equal(predicted_labels, true_labels), tf.float32))
    return loss, accuracy

In [15]:
optimizer = tf.keras.optimizers.Adam()
epochs = 1
for epoch in range(epochs):
    print(f"Epoch {epoch+1}/{epochs}")
    
    # Training loop
    for (input_images,true_images, labels) in train_dataset:
        loss,acc =  train_step(input_images, true_images, labels, autoencoder, classifier, optimizer)
        print(f"Training loss: {loss.numpy()}  acc {acc.numpy()}")

Epoch 1/1


2024-07-26 16:45:13.059949: I tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:432] Loaded cuDNN version 8907
2024-07-26 16:45:14.003791: I tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:606] TensorFloat-32 will be used for the matrix multiplication. This will only be logged once.
2024-07-26 16:45:14.179541: I tensorflow/compiler/xla/service/service.cc:168] XLA service 0x7be34bcdacc0 initialized for platform CUDA (this does not guarantee that XLA will be used). Devices:
2024-07-26 16:45:14.179564: I tensorflow/compiler/xla/service/service.cc:176]   StreamExecutor device (0): NVIDIA GeForce RTX 4060 Ti, Compute Capability 8.9
2024-07-26 16:45:14.183796: I tensorflow/compiler/mlir/tensorflow/utils/dump_mlir_util.cc:255] disabling MLIR crash reproducer, set env var `MLIR_CRASH_REPRODUCER_DIRECTORY` to enable.
2024-07-26 16:45:14.295525: I ./tensorflow/compiler/jit/device_compiler.h:186] Compiled cluster using XLA!  This line is logged at most once for the lifetime of th

Training loss: 14.763928413391113  acc 0.0572916679084301
Training loss: 7.739585876464844  acc 0.02083333395421505
Training loss: 8.827205657958984  acc 0.0520833320915699
Training loss: 6.311063289642334  acc 0.03125
Training loss: 5.5601630210876465  acc 0.0572916679084301
Training loss: 3.7353014945983887  acc 0.02083333395421505
Training loss: 3.5528697967529297  acc 0.0625
Training loss: 4.06256628036499  acc 0.03125
Training loss: 4.330389499664307  acc 0.046875
Training loss: 2.0340499877929688  acc 0.03125
Training loss: 2.915771961212158  acc 0.0520833320915699
Training loss: 2.887209892272949  acc 0.046875
Training loss: 3.3509342670440674  acc 0.02083333395421505
Training loss: 1.7997829914093018  acc 0.03125
Training loss: 1.803154468536377  acc 0.0625
Training loss: 2.359067440032959  acc 0.02604166604578495
Training loss: 2.098780393600464  acc 0.02604166604578495
Training loss: 1.812424898147583  acc 0.0520833320915699
Training loss: 2.8251683712005615  acc 0.046875
Tra