In [None]:
%matplotlib inline
import os
os.environ["KMP_DUPLICATE_LIB_OK"]="TRUE"
import random
import numpy as np
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
import matplotlib.pyplot as plt
import cv2
import time
from PIL import Image, ImageEnhance

In [None]:
# Constants
vocab_set = set()      # filled with every label character during preprocessing
max_label_length = 0   # raised during preprocessing to the longest label seen
# The checked vocabulary listed below has 77 characters. CTC needs one more
# class for the blank symbol (the loss is built with blank = 0), so the
# network must emit 77 + 1 = 78 classes.
vocab_size = 78
input_size = 512 # features per time step: 256 CNN channels * (height / 16) rows
output_size = vocab_size
hidden_size = 256
batch_size = 64
dropout_rate = 0.1
epochs = 5 # change this to >=100 after testing a little
learning_rate = 0.01
in_channels = 1 # need to define this
height = 32
width = 128

# Checked the vocab:
# i|L|0|M|K|/|U|O|Y|s|F|N|S|W|:|D|2|-|P|4|9|j|r|J|(|;|?|Z|1|#|B|8|7|G|h|e|n|!|f|A|C|a|6|.|R|d| |q|V|H|m|z|I|l|E|w|v|5|"|,|T|)|Q|k|b|X|x|u|'|y|c|3|g|p|t|*|o|
# 77


In [None]:
# Fetch the IAM handwriting word dataset through kagglehub and keep whatever
# dataset_download returns in `path` (presumably the local directory holding
# the downloaded files — confirm against the kagglehub documentation).
import kagglehub
path = kagglehub.dataset_download("nibinv23/iam-handwriting-word-database")

In [None]:
# https://www.geeksforgeeks.org/image-resizing-using-opencv-python/
# width, height

# Walk the directory returned by kagglehub.dataset_download in the previous
# cell instead of overwriting `path` with a hard-coded cache location, so the
# listing still works when the cache directory or dataset version differs.
# os.path.join(..., "") only guarantees the trailing separator the old
# hard-coded string had; re-running the cell leaves `path` unchanged.
path = os.path.join(str(path), "")

# Print one summary line per directory (counts plus a few sample names); the
# full file lists are too long to be useful as notebook output.
for root, dirs, files in os.walk(path):
    preview = ", ".join(sorted(files)[:3])
    print(f"Root: {root}, Dirs: {len(dirs)}, Files: {len(files)} (e.g. {preview})")

Output hidden; open in https://colab.research.google.com to view.

In [None]:
def gray_scale_check(inputPath, outputPath):
  """Write a single-channel (mode "L") copy of an image and return it.

  Args:
    inputPath: path of the image to read.
    outputPath: path the grayscale image is saved to; the file format is
      chosen by Pillow from this path's extension.

  Returns:
    The grayscale PIL image, or None when the file cannot be read, decoded
    or written. Callers treat None as "exclude this image from the dataset".
  """
  try:
    # The context manager closes the underlying file as soon as the pixel
    # data has been copied/converted into a new in-memory image.
    with Image.open(inputPath) as image:
      if image.mode == "L":
        grayscaleImage = image.copy()
      else:
        # Pillow's built-in conversion applies the same 0.299/0.587/0.114
        # luma weights as the previous per-pixel loop, runs in native code,
        # and also covers colour modes other than plain RGB.
        grayscaleImage = image.convert("L")
    grayscaleImage.save(outputPath)
    return grayscaleImage
  except (OSError, Image.UnidentifiedImageError) as e:
    # Report which file failed and why, so skipped images can be traced.
    print(f"Image processing error, return none ({inputPath}: {e})")
    return None #these images will be excluded from the dataset


_IMAGE_EXTENSIONS = ('.png', '.jpg', '.jpeg')


def _read_labels(wordsPath):
    """Parse the words file into {file id: transcription}.

    Comment lines (starting with '#'), blank lines, rows whose second column
    is "err", and rows without a transcription column are skipped.
    """
    labels = {}
    with open(wordsPath, "r") as f:
        for line in f:
            if line.startswith("#") or len(line.strip()) == 0:
                continue
            cols = line.split()
            if len(cols) < 9 or cols[1] == "err":
                continue
            # Everything from the ninth column on is the transcription.
            labels[cols[0]] = " ".join(cols[8:])
    return labels


def _is_safe_dirname(label):
    """True when `label` can be used verbatim as exactly one directory name."""
    if label in ("", ".", ".."):
        return False
    separators = [os.sep] + ([os.altsep] if os.altsep else [])
    return not any(sep in label for sep in separators)


def _augmented_path(path, suffix):
    """Insert `suffix` in front of the file extension of `path`."""
    stem, ext = os.path.splitext(path)
    return f"{stem}{suffix}{ext}"


def label_augment_preprocess(
    datasetRoot="/root/.cache/kagglehub/datasets/nibinv23/iam-handwriting-word-database/versions/2/",
    outputDir="/content/labeledImages",
    preprocessedFolder="/content/preprocessed",
):
    """Sort the word images into per-label folders, then write grayscale and
    augmented copies of them.

    Step 1 copies every labelled image under `<datasetRoot>/iam_words/words`
    to `<outputDir>/<label>/<file>`, adding the label's characters to the
    global `vocab_set` and raising the global `max_label_length`.
    Step 2 walks `outputDir` and, mirroring its layout under
    `preprocessedFolder`, saves for each image a grayscale copy, a
    brightness/contrast-adjusted copy (`_bc` suffix) and, when the image is at
    least 10x10 pixels, a random 10x10 crop (`_crop` suffix).

    Labels that cannot serve as a directory name ("", ".", ".." or anything
    containing a path separator) are skipped and counted: the dataset class
    recovers the label from the folder name, so such images would otherwise be
    stored in the wrong directory and end up with a wrong label.

    Args:
        datasetRoot: directory containing `words_new.txt` and `iam_words/words`.
        outputDir: destination of the per-label copies of the original images.
        preprocessedFolder: destination of the grayscale / augmented images.
    """
    global max_label_length
    import shutil

    wordsNewPath = os.path.join(datasetRoot, "words_new.txt")
    iamWordsDir = os.path.join(datasetRoot, "iam_words/words")

    Labels = _read_labels(wordsNewPath)
    os.makedirs(preprocessedFolder, exist_ok=True)

    # Step 1: group the raw images by transcription.
    skippedLabels = 0
    for root, _, files in os.walk(iamWordsDir):
        for filename in files:
            if not filename.endswith(_IMAGE_EXTENSIONS): #valid extension
                continue
            fileId = filename.rsplit('.', 1)[0]
            label = Labels.get(fileId)
            if label is None:
                continue
            if not _is_safe_dirname(label):
                skippedLabels += 1
                continue

            max_label_length = max(max_label_length, len(label))
            vocab_set.update(label) # keeps the vocabulary in sync with the labels on disk

            labelDir = os.path.join(outputDir, label)
            os.makedirs(labelDir, exist_ok=True)
            shutil.copyfile(os.path.join(root, filename), os.path.join(labelDir, filename))

    if skippedLabels:
        print(f"Skipped {skippedLabels} images whose label is not usable as a folder name")

    # Step 2: grayscale conversion and augmentation.
    cropWidth, cropHeight = (10, 10)
    bFactor, cFactor = 1.2, 0.8 #small enough to be recognizable but still augment. can play around with these numbers
    for root, _, files in os.walk(outputDir):
        for filename in files:
            if not filename.endswith(_IMAGE_EXTENSIONS):
                continue
            inputPath = os.path.join(root, filename)
            relativePath = os.path.relpath(inputPath, outputDir)
            outputPath = os.path.join(preprocessedFolder, relativePath)

            os.makedirs(os.path.dirname(outputPath), exist_ok=True)
            grayscaleImage = gray_scale_check(inputPath, outputPath)
            if grayscaleImage is None:
                continue #exclude image from handling

            # NOTE(review): the 10x10 crop is stored under the full word's label
            # although it can only show a fragment of the word — confirm that
            # this augmentation is intended.
            imgWidth, imgHeight = grayscaleImage.size
            if imgWidth >= cropWidth and imgHeight >= cropHeight:
                #only crops if it is large enough to be cropped
                top = random.randint(0, imgHeight - cropHeight)
                left = random.randint(0, imgWidth - cropWidth)
                croppedImage = grayscaleImage.crop((left, top, left + cropWidth, top + cropHeight))
                # splitext keeps '.jpeg' files from overwriting the grayscale copy
                croppedImage.save(_augmented_path(outputPath, '_crop'))

            brightnessAdjusted = ImageEnhance.Brightness(grayscaleImage).enhance(bFactor)
            contrastAdjusted = ImageEnhance.Contrast(brightnessAdjusted).enhance(cFactor)
            contrastAdjusted.save(_augmented_path(outputPath, '_bc'))

label_augment_preprocess()

Image processing error, return none


In [None]:
# Instantiate a dataset class
class HandwritingDataset(Dataset):
    """Word images stored as `<data_dir>/<label>/<image file>`.

    The name of the folder an image sits in is its transcription. Items are
    `(image, label)` pairs where `label` is a LongTensor of vocabulary indices
    padded with zeros to a common length, so the default DataLoader collation
    can stack a batch.

    NOTE: 0 is the padding value here and the notebook builds its CTC loss with
    blank = 0, so `vocab` should map characters to indices >= 1.

    Args:
        preprocessedFolder: root directory that is scanned recursively for
            .png / .jpg / .jpeg files.
        vocab: dict mapping each character to its integer index.
        transform: optional callable applied to the PIL image.
        max_label_length: length every label is padded to. Defaults to the
            longest label found under `preprocessedFolder`, so the dataset does
            not depend on a notebook-level global having been set beforehand.
    """

    def __init__(self, preprocessedFolder, vocab, transform=None, max_label_length=None):
        self.data_dir = preprocessedFolder
        self.transform = transform
        self.vocab = vocab
        self.image_paths = []
        self.labels = []

        for root, dirs, files in os.walk(self.data_dir):
            dirs.sort()  # fixed traversal order -> reproducible sample order
            for file in sorted(files):
                if file.endswith(('.png', '.jpg', '.jpeg')):
                    self.image_paths.append(os.path.join(root, file))
                    self.labels.append(os.path.basename(root))

        if max_label_length is None:
            max_label_length = max((len(label) for label in self.labels), default=0)
        self.max_label_length = max_label_length

    def __len__(self):
        return len(self.image_paths)

    def _encode_label(self, label):
        """Map `label` to vocabulary indices and zero-pad to `max_label_length`.

        Raises KeyError when the label contains a character missing from `vocab`.
        """
        encoded = torch.tensor([self.vocab[char] for char in label], dtype=torch.long)
        padding_length = self.max_label_length - encoded.size(0)
        if padding_length > 0:
            encoded = torch.cat([encoded, torch.zeros(padding_length, dtype=torch.long)])
        return encoded

    def __getitem__(self, idx):
        image_path = self.image_paths[idx]
        # Force a single channel: the notebook normalizes with one mean/std and
        # the CNN is built for one input channel. The context manager closes
        # the file once the pixels are loaded.
        with Image.open(image_path) as handle:
            image = handle.convert("L")

        # transform (resizing and conversion to tensor)
        if self.transform:
            image = self.transform(image)

        return image, self._encode_label(self.labels[idx])

# Initialize folders and the datasets
preprocessedFolder = "/content/preprocessed"
nonprocessedFolder = "/content/labeledImages"

# Index 0 is reserved: the dataset pads labels with 0 and the CTC loss uses 0
# as its blank symbol, so real characters are numbered from 1 upwards.
vocab = {char: idx for idx, char in enumerate(sorted(vocab_set), start=1)}

transform = transforms.Compose([transforms.Resize((height, width)),
                                transforms.ToTensor(),
                                transforms.Normalize((0.5,), (0.5,))])

trainset = HandwritingDataset(preprocessedFolder, vocab, transform = transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size = batch_size, shuffle=True, num_workers = 2)

# NOTE(review): the images in nonprocessedFolder are the sources the
# preprocessed training images were generated from, so this "test" set is not
# held out — confirm whether a separate split is wanted.
testset = HandwritingDataset(nonprocessedFolder, vocab, transform = transform)
testloader = torch.utils.data.DataLoader(testset, batch_size = batch_size, shuffle=False, num_workers = 2)

In [None]:
# cnn
class testCNN(nn.Module):
  """Feature extractor made of four conv -> batch-norm -> ReLU -> max-pool stages.

  Every convolution is 3x3 with stride 1 and padding 1, and every stage ends
  with a 2x2 max-pool, so height and width are each divided by 16 overall.
  Channel widths are in_channels -> out_channels -> 64 -> 128 -> 256.
  Dropout (p = 0.3) follows stages 2, 3 and 4.
  """

  def __init__(self, in_channels: int, out_channels: int):
    super().__init__()
    conv_options = dict(kernel_size=3, stride=1, padding=1)

    # Stage 1
    self.conv1 = nn.Conv2d(in_channels, out_channels, **conv_options)
    self.bn1 = nn.BatchNorm2d(out_channels)
    # Stage 2
    self.conv2 = nn.Conv2d(out_channels, 64, **conv_options)
    self.bn2 = nn.BatchNorm2d(64)
    # Stage 3
    self.conv3 = nn.Conv2d(64, 128, **conv_options)
    self.bn3 = nn.BatchNorm2d(128)
    # Stage 4
    self.conv4 = nn.Conv2d(128, 256, **conv_options)
    self.bn4 = nn.BatchNorm2d(256)

    # Shared by all stages
    self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
    # could change dropout
    self.dropout = nn.Dropout(0.3)

  def _stage(self, x, conv, norm, with_dropout):
    """Run one conv/norm/ReLU/pool stage, optionally followed by dropout."""
    pooled = self.pool(F.relu(norm(conv(x))))
    return self.dropout(pooled) if with_dropout else pooled

  def forward(self, x):
    stages = (
      (self.conv1, self.bn1, False),
      (self.conv2, self.bn2, True),
      (self.conv3, self.bn3, True),
      (self.conv4, self.bn4, True),
    )
    for conv, norm, with_dropout in stages:
      x = self._stage(x, conv, norm, with_dropout)
    return x

In [None]:
# 2 LSTM layers
class BidirectionalLSTM(nn.Module):
    """CNN feature extractor followed by a 2-layer bidirectional LSTM and a
    per-time-step classifier.

    Args:
        input_size: features per time step fed to the LSTM, i.e. CNN channels
            times the feature-map height (256 * 2 = 512 for 32-pixel-high input).
        hidden_size: LSTM hidden units per direction.
        output_size: number of classes predicted at every time step.
        dropout_rate: dropout applied between the two LSTM layers.

    forward() takes images of shape (batch, 1, height, width) and returns class
    probabilities of shape (batch, width / 16, output_size).
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate):
        super().__init__()
        self.cnnLayer = testCNN(1, 32)
        # forward() hands the LSTM a (batch, time, features) tensor, so the
        # LSTM has to be batch-first. With batch_first = False it would treat
        # the batch axis as the sequence and recur across the samples of a
        # batch instead of across the columns of each image.
        self.lstm = nn.LSTM(input_size = input_size, hidden_size = hidden_size, num_layers = 2, bias = True, batch_first = True, dropout = dropout_rate, bidirectional = True)
        self.fc = nn.Linear(hidden_size * 2, output_size)

    def forward(self, x):
        x = self.cnnLayer(x)                     # (batch, 256, h/16, w/16)
        # Turn every feature-map column into one time step:
        # (batch, channels, h', w') -> (batch, w', channels, h') -> (batch, w', channels * h')
        x = x.permute(0, 3, 1, 2).contiguous()
        x = x.view(x.size(0), x.size(1), -1)
        x, _ = self.lstm(x)                      # (batch, w', 2 * hidden_size)
        x = self.fc(x)                           # (batch, w', output_size)
        x = torch.softmax(x, dim = 2)            # probabilities over the classes
        return x

In [None]:
# Loss and optimizers
# The network needs one output class per vocabulary entry plus one for the CTC
# blank (index 0). Deriving the count from `vocab` keeps it in step with the
# labels actually produced by the dataset; `output_size` acts as a lower bound.
num_classes = max(output_size, len(vocab) + 1)
model = BidirectionalLSTM(input_size = input_size, hidden_size = hidden_size, output_size = num_classes, dropout_rate = dropout_rate)

# zero_infinity = True turns the loss of samples that cannot be aligned
# (target longer than the output sequence) into 0 instead of inf.
criterion = nn.CTCLoss(blank = 0, zero_infinity = True)
optimizer = optim.Adam(model.parameters(), lr = learning_rate)

In [None]:
# Per-epoch history (one entry appended per epoch), kept for plotting
loss_metric, training_accuracy, test_accuracy = [], [], []

In [None]:
def _greedy_ctc_decode(batch_outputs, blank=0):
    """Best-path CTC decoding of outputs shaped (batch, time, classes).

    Picks the most likely class per time step, merges consecutive repeats and
    drops `blank`. Returns one list of class indices per sample.
    """
    decoded = []
    for path in batch_outputs.argmax(dim=2).tolist():
        sequence = []
        previous = None
        for idx in path:
            if idx != previous and idx != blank:
                sequence.append(idx)
            previous = idx
        decoded.append(sequence)
    return decoded


def _count_exact_matches(batch_outputs, padded_labels, blank=0):
    """Number of samples whose decoded sequence equals the un-padded target."""
    matches = 0
    for prediction, target_row in zip(_greedy_ctc_decode(batch_outputs, blank), padded_labels.tolist()):
        target = [idx for idx in target_row if idx != blank]
        matches += int(prediction == target)
    return matches


# NOTE(review): images are resized to a width of 128 and the CNN halves the
# width four times, leaving 8 time steps. Words longer than that cannot be
# aligned by CTC; with zero_infinity=True their loss is silently set to 0.
t0 = time.time()
for epoch in range(epochs):

    # Statistics
    model.train()  # enable dropout / batch-norm updates
    correct = 0
    total = 0
    epoch_loss_sum = 0.0   # whole epoch, used for the loss curve
    running_loss = 0.0     # window of 2000 batches, used for progress printing
    for i, data in enumerate(trainloader, 0):
        # get the inputs; data is a list of [inputs, labels]
        inputs, labels = data

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(inputs)  # (batch, time, classes) probabilities

        if int(labels.max()) >= outputs.size(2):
            raise ValueError(
                f"label index {int(labels.max())} does not fit the model's "
                f"{outputs.size(2)} output classes"
            )

        # CTCLoss expects log-probabilities laid out as (time, batch, classes).
        # The model emits softmax probabilities, hence the clamped log.
        log_probs = torch.log(outputs.clamp_min(1e-8)).permute(1, 0, 2)
        input_lengths = torch.full((inputs.size(0),), outputs.size(1), dtype=torch.long)
        # Labels are zero-padded, so the true length is the number of
        # non-zero entries rather than the padded width.
        target_lengths = (labels != 0).sum(dim=1)

        loss = criterion(log_probs, labels, input_lengths, target_lengths)
        loss.backward()
        optimizer.step()

        # print statistics and calculate accuracy
        batch_loss = loss.item()
        epoch_loss_sum += batch_loss
        running_loss += batch_loss
        if i % 2000 == 1999:    # print every 2000 mini-batches
            print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 2000:.3f}')
            running_loss = 0.0
        total += labels.size(0)
        correct += _count_exact_matches(outputs.detach(), labels)

    # Record the training accuracy and loss
    epoch_loss = epoch_loss_sum / max(len(trainloader), 1)
    epoch_accuracy = correct / total if total else 0.0
    loss_metric.append(epoch_loss)
    training_accuracy.append(epoch_accuracy)

    # Record the testing accuracy
    model.eval()  # disable dropout, use batch-norm running statistics
    correct = 0
    total = 0
    with torch.no_grad():
        for data in testloader:
            images, labels = data
            outputs = model(images)
            total += labels.size(0)
            correct += _count_exact_matches(outputs, labels)
    acc = correct / total if total else 0.0
    test_accuracy.append(acc)

print('Finished Training')
t1 = time.time()
elapsed = t1 - t0
print(f"Training took {elapsed:.2f} seconds for {epochs} epochs")

# Plot the three per-epoch curves side by side. Accuracies are stored as
# fractions and scaled to percent to match the axis label.
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
panels = [
    (loss_metric, 'Loss', 'Training Loss'),
    ([100 * value for value in training_accuracy], 'Accuracy (%)', 'Training Accuracy'),
    ([100 * value for value in test_accuracy], 'Accuracy (%)', 'Test Accuracy'),
]
for ax, (values, ylabel, title) in zip(axes, panels):
    ax.plot(values)
    ax.set_xlabel('Epoch')
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(True)

fig.tight_layout()
plt.show()


torch.Size([64, 1, 32, 128])
torch.Size([64, 19])
torch.Size([64])
torch.Size([64])
64
tensor([51, 68, 55,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0])
tensor([51, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0])
tensor([53, 58, 51, 62, 62, 55, 64, 57, 55,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0])
tensor([53, 51, 69, 55,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0])
tensor([43, 58, 55, 59, 62, 51,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0])
tensor([54, 55, 52, 51, 70, 55,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0])
tensor([8, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])
tensor([32, 55,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0])
tensor([58, 51, 64, 54,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0])
tensor([33, 70,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0])
tensor([73

RuntimeError: input_lengths must be of size batch_size