# English Character Categorisation
## Part One

This code implements a straightforward machine learning approach to identify and categorise handwritten English letters (A-Z and a-z), using data partitioned into training, testing and validation sets, and then to go on...

The 5 steps to build an image classification model:

1. Load and normalize the train and test data
2. Define the Convolutional Neural Network (CNN)
3. Define the loss function and optimizer
4. Train the model on the train data
5. Test the model on the test data

The English character data is taken from [Kaggle here](https://www.kaggle.com/datasets/dhruvildave/english-handwritten-characters-dataset).
This dataset is designed for computer vision tasks and contains 3,410 images of handwritten characters in English of 62 classes (0-9, A-Z and a-z) with 55 images of each class.

Initially, for our purposes, we use only the 2,860 letter images (A-Z and a-z: 52 classes × 55 images each).

[deCampos09]
  de Campos, T.E. and Babu, B.R. and Varma, M. Character recognition in natural images. Proceedings of the International Conference on Computer
  Vision Theory and Applications, Lisbon, Portugal. February, 2009.

Code Prerequisites & dependencies
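Third-party packages imported by the code below: `easydict`, `numpy`, `torch`, `torchvision` and `Pillow` (imported as `PIL`). Everything else comes from the Python standard library.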


In [21]:
import os
from time import localtime, strftime, time
import shutil
import random
from easydict import EasyDict as edict
import numpy as np
import torch
import torch.nn as nn
import torchvision
from torch.utils import data
# import matplotlib.pyplot as plt
# import skimage.metrics
from torchvision.transforms import transforms
from PIL import Image
import csv
from pprint import pprint

# Data

Prepare the data by dividing it into folders for training, testing and validation in a 6:2:2 ratio.

In [30]:
def create_folders(base_path):
    """Ensure the three split folders exist beneath ``base_path``.

    Creates ``train``, ``test`` and ``validation`` sub-folders (along with
    ``base_path`` itself when missing). Folders that already exist are left
    as they are.

    Returns:
        A ``(train_path, test_path, val_path)`` tuple of folder paths.
    """
    split_dirs = tuple(
        os.path.join(base_path, split_name)
        for split_name in ('train', 'test', 'validation')
    )

    for split_dir in split_dirs:
        os.makedirs(split_dir, exist_ok=True)

    return split_dirs

def move_data(destination_path, num_samples, data):
    """Copy the images listed in ``data`` into ``destination_path``, up to a cap.

    Despite the name, files are copied (``shutil.copy``), not moved; the
    source images stay where they are.

    Args:
        destination_path: Folder to copy into. Created if it does not exist.
        num_samples: Maximum number of entries the folder may hold. Files
            that are already present count towards this limit.
        data: Iterable of ``(image_path, label)`` rows. The label is not
            used here; only the image file is copied.

    Once the folder holds ``num_samples`` entries, a single warning is
    printed and the remaining rows are skipped.
    """
    os.makedirs(destination_path, exist_ok=True)

    # Scan the folder once instead of once per row. Tracking names in a set
    # (rather than a counter) keeps the count correct when a copy overwrites
    # a file that is already there.
    present = set(os.listdir(destination_path))

    for row in data:
        image_path, _label = row
        if len(present) >= num_samples:
            # The folder can only grow from here, so nothing further would
            # be copied: warn once and stop.
            print(f"Warning: Destination folder {destination_path} already contains images.")
            break
        shutil.copy(image_path, destination_path)
        # shutil.copy into a directory keeps the source file's base name.
        present.add(os.path.basename(image_path))


def split_data(csv_file, base_path):
    """Shuffle the rows of ``csv_file`` and copy the images into split folders.

    The first row of the CSV is always discarded as a header; every other
    non-blank row is expected to be ``image_path, label``. Rows are shuffled
    with the global ``random`` state, then divided 60% / 20% / remainder
    into the ``train``, ``test`` and ``validation`` folders under
    ``base_path``.

    Args:
        csv_file: Path to the CSV listing the images and their classes.
        base_path: Root folder that receives the three split folders.
    """
    train_path, test_path, val_path = create_folders(base_path)

    # newline='' is what the csv module requires so that it can interpret
    # line endings (including ones inside quoted fields) itself.
    with open(csv_file, 'r', newline='') as file:
        reader = csv.reader(file)
        # Drop the first row (assumed to be a header); the default keeps an
        # empty file from raising StopIteration.
        next(reader, None)

        # Blank lines come back from csv.reader as empty lists; skip them so
        # they neither skew the split sizes nor break the row unpacking later.
        data = [row for row in reader if row]

    random.shuffle(data)

    total_samples = len(data)
    train_samples = int(0.6 * total_samples)
    test_samples = int(0.2 * total_samples)

    train_data = data[:train_samples]
    test_data = data[train_samples:train_samples + test_samples]
    # Validation takes whatever is left, so rounding never drops a row.
    val_data = data[train_samples + test_samples:]
    print(
        f'total samples:{total_samples}\ntrain samples: {len(train_data)}\ntest_samples: {len(test_data)}\nval_samples: {len(val_data)}')

    move_data(train_path, train_samples, train_data)
    move_data(test_path, test_samples, test_data)
    move_data(val_path, total_samples - train_samples - test_samples, val_data)


# Inputs for the split. split_data skips the CSV's first row as a header and
# each remaining row is unpacked as (image path, label) when copying.
csv_file = "letters.csv"  # input CSV listing the images and their classes
base_path = "images"  # output root; train/, test/ and validation/ are created beneath it

# Runs when the cell executes: shuffles the rows and copies the image files.
split_data(csv_file, base_path)
print("Data split and copied successfully.")

total samples:2860
train samples: 2001
test_samples: 429
val_samples: 430
Data split and copied successfully.


In [None]:
# config
# Settings are grouped in nested EasyDict objects so they can be read with
# attribute access (e.g. config.TRAIN.lr). Only TRAIN is populated in this span.
config = edict()
config.TRAIN = edict()
config.TEST = edict()
config.VAL = edict()

# TRAIN config
config.TRAIN.batch_size = 7
config.TRAIN.early_stopping_num = 5  # presumably the early-stopping patience — confirm against the training loop
config.TRAIN.save_every_epoch = 2  # presumably the checkpoint interval in epochs — confirm
config.TRAIN.save_img_every_val_step = 5  # presumably how often validation images are saved — confirm
config.TRAIN.lr = 0.0001  # initial learning rate
config.TRAIN.lr_decay = 0.5  # learning rate decay rate
config.TRAIN.lr_decay_every = 5  # decay interval (presumably in epochs — confirm)
config.TRAIN.beta1 = 0.5  # beta1 in Adam optimiser
config.TRAIN.n_epoch = 25  # total number of epochs

# NOTE(review): the g_* weights below refer to pixel / perceptual / frequency /
# adversarial loss terms; confirm they are actually consumed by this
# classifier's training code rather than carried over from another project.
config.TRAIN.g_alpha = 15  # weight for pixel loss
config.TRAIN.g_gamma = 0.0025  # weight for perceptual loss
config.TRAIN.g_beta = 0.1  # weight for frequency loss
config.TRAIN.g_adv = 1  # weight for adversarial loss