# CNN
This notebook aims to deploy the CNN.

In [112]:
import os
import numpy as np
import matplotlib.pyplot as plt
import cv2
import pandas as pd
import string
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical


---
## Loading the data
Data has to load out of the 'carolianminuscule-groundtruth'-folder 

In [113]:
def get_images(folder_path: str):
    """
    Load images and text files from the given path.
    :param folder_path: Path to the directory containing images and text files.
    :return: Two lists - one for image paths and one for text file paths.
    """
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Path {folder_path} does not exist.")

    images = []
    files = []

    for entry in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry)
        if os.path.isdir(entry_path):
            # Recursively get images and text files from subdirectories
            sub_images, sub_files = get_images(entry_path)
            images.extend(sub_images)
            files.extend(sub_files)
        elif entry.endswith(".png"):
            images.append(entry_path)
        elif entry.endswith(".txt"):
            files.append(entry_path)

    return images, files


# load the data from the directory
path = "carolineminuscule-groundtruth"
images, files = get_images(path)


# matched the .png- and .txt-file in a folder together
matched_list_path = [
    [img, file]
    for img in images
    for file in files
    if os.path.dirname(img) == os.path.dirname(file)
    and os.path.splitext(os.path.splitext(os.path.basename(img))[0])[0]
    == os.path.splitext(os.path.splitext(os.path.basename(file))[0])[0]
]


In [114]:
print(f"len matched: {len(matched_list_path)}")
print(f"matched_list:\n {matched_list_path[1]}")

len matched: 429
matched_list:
 ['carolineminuscule-groundtruth/bsb00095929/0011/010002.bin.png', 'carolineminuscule-groundtruth/bsb00095929/0011/010002.gt.txt']


---
## Creating the Dataset

In [115]:
# define a dataframe to store the image, image paths and their corresponding text files
df = pd.DataFrame(columns=["name", "image", "transcription"])

for i, (img_path, file_path) in enumerate(matched_list_path):
    # read the image
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    _, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)
    # convert the image to a numpy array
    img = np.array(img)
    # add the image to the dataframe, set "none" here to add transcription later
    df.loc[i] = [os.path.basename(img_path), img, None]
    # read the text file
    with open(file_path, "r") as f:
        # read the transcription
        transcription = f.read()
    # add the transcription to the dataframe
    df.loc[i, "transcription"] = transcription

In [116]:
df["transcription"] = df["transcription"].str.replace("\n", "", regex=False)
df.head(5)

Unnamed: 0,name,image,transcription
0,010005.bin.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",initio sicuti pleriq; studio ad empabacan
1,010002.bin.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",gla memores que s quis: faciliafacto putat
2,010007.bin.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",pro abstinentia ꝓuirtute audacia. largitio. au...
3,010008.bin.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",bant. Quę tametsianimus aspꝑnabatur. insolens ...
4,010017.bin.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",tilinę coniuratione quam uerissime potero paucis


For now it appear that the images only have "255" as values, i.e. white. That's why im checking for other values. But the edges of the images are all white, therefore this is the exspected behaviour.

In [117]:
non_255_values = df['image'].apply(lambda img: np.any(img != 255))
print(f"Rows with values other than 255: {non_255_values.sum()}")

Rows with values other than 255: 429


---
## Preparing the data

By calculating the maximum and avg. number of the height we can use that value later in our CNN.

In [118]:
# Calculate the maximum and average length of the lists in the 'image' column
image_lengths = df['image'].apply(lambda img: img.shape[0])  # Get the height of each image array
max_length = image_lengths.max()
avg_length = image_lengths.mean()

print(f"Maximum height of an image: {max_length}")
print(f"Average height of an image: {avg_length}")

Maximum height of an image: 263
Average height of an image: 140.2004662004662


Getting the number of unique character, server here more as an eploration into the the data. We stand at a pathway here: Either we manually map each of the 83 characters by hand to the according representative in the images or, use the CNN-RNN architecture. Which does not need manual character mapping, but is harder to train.

In [119]:
# Step 1: Create a character-to-index mapping
charset = sorted(set("".join(df['transcription'])))  # Unique characters in the dataset
char_to_index = {char: idx for idx, char in enumerate(charset)}
index_to_char = {idx: char for char, idx in char_to_index.items()}
num_classes = len(charset) + 1  # Add 1 for the blank character (CTC loss)

# Step 2: Convert transcriptions to sequences of integers
label_sequences = [[char_to_index[char] for char in transcription] for transcription in df['transcription']]

# Step 3: Pad sequences to a fixed length
max_sequence_length = 32  # Adjust based on your dataset
padded_labels = pad_sequences(label_sequences, maxlen=max_sequence_length, padding='post', value=num_classes - 1)

# Step 4: Normalize and reshape the images
X = np.array([cv2.resize(img, (128, 64)) for img in df['image']])  # Resize to (64, variable width)
X = X / 255.0  # Normalize pixel values
X = X.reshape(X.shape[0], 64, 128, 1)  # Add channel dimension


In [120]:
# Features (images) and labels (transcriptions)
X = df["image"].values  # Images as NumPy arrays
y = df["transcription"].values  # Transcriptions as strings

In [121]:
# Convert images to NumPy arrays and normalize
X = np.array(
    [cv2.resize(img, (128, 64)) for img in df["image"]], dtype=np.float32
)  # Resize and convert to float32
X = X / 255.0  # Normalize pixel values to [0, 1]
X = X.reshape(X.shape[0], 64, 128, 1)  # Add channel dimension (grayscale images)

print(f"X shape: {X.shape}, dtype: {X.dtype}")

X shape: (429, 64, 128, 1), dtype: float32


In [122]:
from tensorflow.keras.utils import to_categorical

# Convert transcriptions to sequences of integers
label_sequences = [[char_to_index[char] for char in transcription] for transcription in df['transcription']]

# Pad sequences to a fixed length
max_sequence_length = 32  # Adjust based on your dataset
padded_labels = pad_sequences(label_sequences, maxlen=max_sequence_length, padding='post', value=num_classes - 1)

# Convert padded labels to one-hot encoding
y = np.array([to_categorical(seq, num_classes=num_classes) for seq in padded_labels], dtype=np.float32)

print(f"y shape: {y.shape}, dtype: {y.dtype}")

y shape: (429, 32, 84), dtype: float32


In [123]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

num_classes = len(charset) + 1  # Number of unique characters

In [124]:
print(f"X_train shape: {X_train.shape}, dtype: {X_train.dtype}")
print(f"y_train shape: {y_train.shape}, dtype: {y_train.dtype}")

X_train shape: (343, 64, 128, 1), dtype: float32
y_train shape: (343, 32, 84), dtype: float32


---
The 'cnn_rnn'-model is comprised of an
- input layer
  - where images are read into with 64px, None and one channel
- A convolutional layer which is building 64, 3x3-filters
- Maxpooling to reduce parametersize

In [125]:
cnn_rnn = models.Sequential([
    # CNN layers
    layers.Conv2D(64, (3, 3), padding="same", activation="relu", input_shape=(64, None, 1)),
    layers.MaxPooling2D(pool_size=(2, 2)),
    layers.Conv2D(128, (3, 3), padding="same", activation="relu"),
    layers.MaxPooling2D(pool_size=(2, 2)),

    # Reshape for RNN
    layers.Reshape(target_shape=(-1, 128 * 16)),  # Adjust based on pooling and input size

    # RNN layers
    layers.Bidirectional(layers.LSTM(256, return_sequences=True)),
    layers.Bidirectional(layers.LSTM(256, return_sequences=True)),

    # Output layer
    layers.Dense(num_classes, activation="softmax")
])

cnn_rnn.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
    metrics=["accuracy"]
)

cnn_rnn.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [126]:


# # Reshape the images to match the input shape of the model
# X_train_reshaped = np.array(
#     [cv2.resize(img, (32, 32)).reshape(32, 32, 1) for img in X_train]
# )
# X_test_reshaped = np.array(
#     [cv2.resize(img, (32, 32)).reshape(32, 32, 1) for img in X_test]
# )


# #DEBUG
# print(f"X_train_reshaped shape: {X_train_reshaped.shape}")
# print(f"X_test_reshaped shape: {X_test_reshaped.shape}")
# print(f"y_train shape: {y_train.shape}")
# print(f"y_test shape: {y_test.shape}")



In [127]:
# Fit the model
cnn_rnn.fit(
    X_train,
    y_train,
    epochs=10,
    validation_data=(X_test, y_test),
)

Epoch 1/10


2025-05-14 12:20:08.136199: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 67108864 exceeds 10% of free system memory.
2025-05-14 12:20:08.213551: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 33554432 exceeds 10% of free system memory.
2025-05-14 12:20:09.581200: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 33554432 exceeds 10% of free system memory.
2025-05-14 12:20:09.633900: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 28311552 exceeds 10% of free system memory.


[1m 1/11[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m1:22[0m 8s/step - accuracy: 0.0000e+00 - loss: 4.4456

2025-05-14 12:20:09.789969: W external/local_xla/xla/tsl/framework/cpu_allocator_impl.cc:83] Allocation of 33554432 exceeds 10% of free system memory.


[1m 8/11[0m [32m━━━━━━━━━━━━━━[0m[37m━━━━━━[0m [1m5s[0m 2s/step - accuracy: 0.0824 - loss: 3.8128

KeyboardInterrupt: 