# CNN
This notebook aims to deploy the CNN.

In [40]:
import os
import numpy as np
import matplotlib.pyplot as plt
import cv2
import pandas as pd
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import models, layers
from tensorflow.keras.preprocessing.image import ImageDataGenerator

---
## Loading the data
Data has to load out of the 'carolianminuscule-groundtruth'-folder 

In [41]:
def get_images(folder_path: str):
    """
    Load images and text files from the given path.
    :param folder_path: Path to the directory containing images and text files.
    :return: Two lists - one for image paths and one for text file paths.
    """
    if not os.path.exists(folder_path):
        raise FileNotFoundError(f"Path {folder_path} does not exist.")

    images = []
    files = []

    for entry in os.listdir(folder_path):
        entry_path = os.path.join(folder_path, entry)
        if os.path.isdir(entry_path):
            # Recursively get images and text files from subdirectories
            sub_images, sub_files = get_images(entry_path)
            images.extend(sub_images)
            files.extend(sub_files)
        elif entry.endswith(".png"):
            images.append(entry_path)
        elif entry.endswith(".txt"):
            files.append(entry_path)

    return images, files


# load the data from the directory
path = "carolineminuscule-groundtruth"
images, files = get_images(path)


# matched the .png- and .txt-file in a folder together
matched_list_path = [
    [img, file]
    for img in images
    for file in files
    if os.path.dirname(img) == os.path.dirname(file)
    and os.path.splitext(os.path.splitext(os.path.basename(img))[0])[0]
    == os.path.splitext(os.path.splitext(os.path.basename(file))[0])[0]
]


In [42]:
print(f"len matched: {len(matched_list_path)}")
print(f"matched_list:\n {matched_list_path[1]}")

len matched: 429
matched_list:
 ['carolineminuscule-groundtruth/bsb00095929/0011/010002.bin.png', 'carolineminuscule-groundtruth/bsb00095929/0011/010002.gt.txt']


In [43]:
# define a dataframe to store the image, image paths and their corresponding text files
df = pd.DataFrame(columns=["name", "image", "transcription"])

for i, (img_path, file_path) in enumerate(matched_list_path):
    # read the image
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    _, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)
    # convert the image to a numpy array
    img = np.array(img)
    # add the image to the dataframe, set "none" here to add transcription later
    df.loc[i] = [os.path.basename(img_path), img, None]
    # read the text file
    with open(file_path, "r") as f:
        # read the transcription
        transcription = f.read()
    # add the transcription to the dataframe
    df.loc[i, "transcription"] = transcription

In [44]:
df.head(5)

Unnamed: 0,name,image,transcription
0,010005.bin.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",initio sicuti pleriq; studio ad empabacan\n
1,010002.bin.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",gla memores que s quis: faciliafacto putat\n
2,010007.bin.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",pro abstinentia ꝓuirtute audacia. largitio. au...
3,010008.bin.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",bant. Quę tametsianimus aspꝑnabatur. insolens ...
4,010017.bin.png,"[[255, 255, 255, 255, 255, 255, 255, 255, 255,...",tilinę coniuratione quam uerissime potero pauc...


For now it appear that the images only have "255" as values, i.e. white. That's why im checking for other values. But the edges of the images are all white, therefore this is the exspected behaviour.

In [45]:
non_255_values = df['image'].apply(lambda img: np.any(img != 255))
print(f"Rows with values other than 255: {non_255_values.sum()}")

Rows with values other than 255: 429


In [46]:
# Features (images) and labels (transcriptions)
X = df['image'].values  # Images as NumPy arrays
y = df['transcription'].values # Transcriptions as strings

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [47]:
# DEBUG
# Check for missing data in the DataFrame
print(f"Total rows in DataFrame: {len(df)}")
print(f"Rows with missing images: {df['image'].isnull().sum()}")
print(f"Rows with missing transcriptions: {df['transcription'].isnull().sum()}")

# Drop rows with missing data
df = df.dropna(subset=['image', 'transcription'])

# Ensure all arrays have the same number of samples
X = df['image'].values  # Images as NumPy arrays
y = df['transcription'].values  # Transcriptions as strings

print(f"Number of samples in X: {len(X)}")
print(f"Number of samples in y: {len(y)}")

Total rows in DataFrame: 429
Rows with missing images: 0
Rows with missing transcriptions: 0
Number of samples in X: 429
Number of samples in y: 429


In [48]:
model = keras.Sequential([
    keras.Input(shape = (32, 32, 1)),
    layers.Conv2D(10, kernel_size = (3, 3), padding = "same", activation = "relu"),
    layers.Conv2D(10, kernel_size = (3, 3), padding = "same", activation = "relu"),
    layers.Flatten(),
    layers.Dense(100, activation = "relu"),
    layers.Dense(10, activation = "softmax")
])

print(model.summary())

None


In [52]:
model.compile(
    optimizer=keras.optimizers.RMSprop(),
    loss=keras.losses.CategoricalCrossentropy(),
    metrics=[keras.metrics.CategoricalAccuracy()],
)

# Convert labels to categorical format
y_train_categorical = keras.utils.to_categorical(
    [ord(char) for char in "".join(y_train)]
)
y_test_categorical = keras.utils.to_categorical(
    [ord(char) for char in "".join(y_test)]
)

# Reshape the images to match the input shape of the model
X_train_reshaped = np.array(
    [cv2.resize(img, (32, 32)).reshape(32, 32, 1) for img in X_train]
)
X_test_reshaped = np.array(
    [cv2.resize(img, (32, 32)).reshape(32, 32, 1) for img in X_test]
)


#DEBUG
print(f"X_train shape: {X_train_reshaped.shape}")
print(f"X_test shape: {X_test_reshaped.shape}")
print(f"y_train shape: {y_train_categorical.shape}")
print(f"y_test shape: {y_test_categorical.shape}")




X_train shape: (343, 32, 32, 1)
X_test shape: (86, 32, 32, 1)
y_train shape: (16187, 42846)
y_test shape: (4069, 42842)


In [None]:
# Fit the model
model.fit(
    X_train_reshaped,
    y_train_categorical,
    epochs=10,
    validation_data=(X_test_reshaped, y_test_categorical),
)