In [1]:
# Install required packages
!pip install --quiet tensorflow matplotlib pillow opencv-python kagglehub

import os
import random
import numpy as np
from glob import glob
from PIL import Image
import tensorflow as tf
from tensorflow.keras import layers, models
import kagglehub

# Fetch the latest published version of the signature dataset through kagglehub
# and remember where it was placed on disk.
DATASET_HANDLE = "divyanshrai/handwritten-signatures"
dataset_dir = kagglehub.dataset_download(DATASET_HANDLE)

# Function to collect all images from the dataset
def collect_images(base_dir):
    """Return the paths of every PNG/JPEG file found anywhere under ``base_dir``.

    Extensions are matched case-insensitively, so ``scan.PNG`` is picked up
    just like ``scan.png``. Only regular files are returned, and the result
    is sorted so repeated runs see the images in the same order.

    Args:
        base_dir: Root directory to search; all subfolders are included.

    Returns:
        Sorted list of matching file paths (empty if nothing matches or the
        directory does not exist).
    """
    exts = ('.png', '.jpg', '.jpeg')
    # One recursive walk over every entry, filtered afterwards, instead of one
    # case-sensitive walk per extension.
    candidates = glob(os.path.join(base_dir, '**', '*'), recursive=True)
    return sorted(
        path for path in candidates
        if path.lower().endswith(exts) and os.path.isfile(path)
    )

all_imgs = collect_images(dataset_dir)
if not all_imgs:
    # Fail fast: nothing downstream can train without images.
    raise RuntimeError(f"No PNG/JPEG images found under {dataset_dir!r}")

# Build a dictionary mapping writers to their images.
# The writer ID is the part of the file name before the first "_"; a file name
# without any "_" is used whole (extension included) as its own writer ID.
# NOTE(review): assumes files are named "<writer>_<rest>" - confirm against the
# actual dataset layout, since the pair labels depend entirely on this grouping.
writers_dict = {}
for img_path in all_imgs:
    filename = os.path.basename(img_path)
    writer_id = filename.split("_")[0]
    writers_dict.setdefault(writer_id, []).append(img_path)

writers = list(writers_dict.keys())

# Quick sanity check on how the files were grouped.
print(f"Found {len(all_imgs)} images from {len(writers)} writers")

# Function to load and preprocess images
IMG_SIZE = (100, 100)  # target size handed to PIL's resize
def load_img(path):
    """Load an image file as a normalized single-channel array.

    The image is converted to 8-bit grayscale, resized to ``IMG_SIZE`` and
    scaled from [0, 255] to [0, 1].

    Args:
        path: Location of the image file to read.

    Returns:
        Array of shape (height, width, 1) holding the scaled pixel values.
    """
    # The context manager releases the underlying file as soon as the pixels
    # have been copied, instead of leaving that to garbage collection.
    with Image.open(path) as source:
        gray = source.convert('L').resize(IMG_SIZE)
    arr = np.array(gray) / 255.0
    # Append the channel axis expected by the convolutional layers.
    return np.expand_dims(arr, axis=-1)

# Generator for image pairs (same writer or different writers)
def pair_generator(batch_size=32, writer_ids=None):
    """Yield endless batches of labelled image pairs.

    Each pair is, with equal probability, two distinct images of one writer
    (label 1) or one image from each of two different writers (label 0).
    Every batch holds exactly ``batch_size`` pairs: when the drawn pair type
    cannot be formed, the other type is used instead of leaving the slot
    empty, and same-writer pairs are drawn only from writers that have at
    least two images.

    Args:
        batch_size: Number of pairs per batch; must be at least 1.
        writer_ids: Optional iterable of ``writers_dict`` keys to sample
            from. Defaults to every writer in ``writers``.

    Yields:
        ``((X1, X2), y)`` where ``X1`` and ``X2`` are float32 arrays stacking
        the outputs of ``load_img`` and ``y`` is a float32 array of labels.

    Raises:
        ValueError: If ``batch_size`` is below 1, or if neither a same-writer
            nor a different-writer pair can be formed from the chosen writers.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    # De-duplicate while keeping order so two "different" writers are never the same ID.
    pool = list(dict.fromkeys(writers if writer_ids is None else writer_ids))
    # Only writers with two or more images can supply a same-writer pair.
    multi_image = [w for w in pool if len(writers_dict[w]) >= 2]
    can_match = bool(multi_image)
    can_mismatch = len(pool) >= 2
    if not (can_match or can_mismatch):
        raise ValueError(
            "Cannot form any pair: need a writer with at least two images "
            "or at least two writers"
        )

    while True:
        X1, X2, y = [], [], []
        for _ in range(batch_size):
            want_match = random.random() < 0.5
            # Fall back to the other pair type when the drawn one is impossible,
            # so the batch never comes up short.
            if (want_match and can_match) or not can_mismatch:
                w = random.choice(multi_image)
                imgs = random.sample(writers_dict[w], 2)
                label = 1
            else:
                w1, w2 = random.sample(pool, 2)
                imgs = [random.choice(writers_dict[w1]), random.choice(writers_dict[w2])]
                label = 0
            X1.append(load_img(imgs[0]))
            X2.append(load_img(imgs[1]))
            y.append(label)
        yield (np.array(X1, dtype=np.float32), np.array(X2, dtype=np.float32)), np.array(y, dtype=np.float32)

# Create TensorFlow dataset from the generator
BATCH_SIZE = 16
# load_img resizes with PIL, whose size tuple is (width, height), while the
# arrays it returns are (height, width, 1). Deriving the spec from IMG_SIZE
# keeps the declared shape in step with the preprocessing.
_pair_image_spec = tf.TensorSpec(shape=(None, IMG_SIZE[1], IMG_SIZE[0], 1), dtype=tf.float32)
train_dataset = tf.data.Dataset.from_generator(
    lambda: pair_generator(BATCH_SIZE),
    output_signature=(
        (_pair_image_spec, _pair_image_spec),
        tf.TensorSpec(shape=(None,), dtype=tf.float32),
    ),
).prefetch(tf.data.AUTOTUNE)  # prepare the next batch while the current one trains

# Build the base CNN for Siamese network
def build_base(input_shape=(100,100,1)):
    """Assemble the convolutional encoder applied to each image of a pair.

    Two Conv2D + MaxPooling2D stages (32 then 64 filters, 3x3 kernels, 2x2
    pooling) are followed by a flatten and a 128-unit ReLU dense layer.

    Args:
        input_shape: Shape of one input image, without the batch axis.

    Returns:
        The Sequential model mapping an image to a 128-dimensional vector.
    """
    encoder = models.Sequential()
    encoder.add(layers.Input(shape=input_shape))
    for n_filters in (32, 64):
        encoder.add(layers.Conv2D(n_filters, (3,3), activation="relu"))
        encoder.add(layers.MaxPooling2D((2,2)))
    encoder.add(layers.Flatten())
    encoder.add(layers.Dense(128, activation="relu"))
    return encoder

# Construct Siamese network
base_net = build_base()
input_a = layers.Input(shape=(100,100,1))
input_b = layers.Input(shape=(100,100,1))
# Both inputs go through the same base_net instance, so the two branches share weights.
feat_a = base_net(input_a)
feat_b = base_net(input_b)
# Element-wise |feat_a - feat_b| built from stock layers: |d| = relu(d) + relu(-d).
# A Lambda layer wrapping a Python lambda computes the same values but is saved
# as serialized bytecode, which Keras will not deserialize in safe mode; stock
# layers keep the saved model reloadable without extra flags.
diff_ab = layers.Subtract()([feat_a, feat_b])
diff_ba = layers.Subtract()([feat_b, feat_a])
distance = layers.Add()([layers.ReLU()(diff_ab), layers.ReLU()(diff_ba)])
# A single sigmoid unit turns the feature-wise distance into a same-writer probability.
output = layers.Dense(1, activation="sigmoid")(distance)
model = models.Model([input_a, input_b], output)
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
EPOCHS = 10
STEPS = 50
VAL_STEPS = 10
# The dataset is an endless generator, so both step counts must be given
# explicitly to delimit an epoch and a validation pass.
# NOTE(review): validation_data is the same generator-backed dataset used for
# training, so val_accuracy reflects freshly sampled pairs from the same pool
# of writers rather than a held-out split.
history = model.fit(
    x=train_dataset,
    epochs=EPOCHS,
    steps_per_epoch=STEPS,
    validation_data=train_dataset,
    validation_steps=VAL_STEPS,
)

# Print final training and validation accuracy
metric_log = history.history
final_train_acc = metric_log['accuracy'][-1]
final_val_acc = metric_log['val_accuracy'][-1]

print(f"Final Training Accuracy: {final_train_acc*100:.2f}%")
print(f"Final Validation Accuracy: {final_val_acc*100:.2f}%")


Downloading from https://www.kaggle.com/api/v1/datasets/download/divyanshrai/handwritten-signatures?dataset_version_number=2...


100%|██████████| 370M/370M [00:02<00:00, 169MB/s]

Extracting files...





Epoch 1/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 44s 815ms/step - accuracy: 0.8396 - loss: 0.5377 - val_accuracy: 0.9563 - val_loss: 0.4422
Epoch 2/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 81s 2s/step - accuracy: 0.9612 - loss: 0.4076 - val_accuracy: 0.9750 - val_loss: 0.3799
Epoch 3/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 44s 887ms/step - accuracy: 0.9476 - loss: 0.4044 - val_accuracy: 0.9438 - val_loss: 0.3785
Epoch 4/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 37s 738ms/step - accuracy: 0.9468 - loss: 0.4036 - val_accuracy: 0.9250 - val_loss: 0.4463
Epoch 5/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 37s 736ms/step - accuracy: 0.9638 - loss: 0.3683 - val_accuracy: 0.9563 - val_loss: 0.3555
Epoch 6/10
50/50 ━━━━━━━━━━━━━━━━━━━━ 44s 884ms/step - accuracy: 0.9508 - loss: 0.3711 - val_accuracy: 0.9563 - val_loss: 0.3516
Epoch 7/10
50/50

In [7]:
# Persist the trained Siamese model to a .keras file in the working directory.
model_path = "siamese_model.keras"
model.save(model_path)


In [9]:
# Offer the saved model as a browser download when running inside Google Colab.
try:
    from google.colab import files
except ImportError:
    # google.colab only exists on Colab runtimes; elsewhere the file simply
    # stays where model.save wrote it.
    print("google.colab is not available; siamese_model.keras was left in the working directory")
else:
    files.download("siamese_model.keras")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>