In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns  # data visualization library
import zipfile as zf  # used for extracting the dataset from the zip file
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix  # evaluation metrics

from sklearn.datasets import fetch_openml 
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

# image representation 
from numpy import asarray
from PIL import Image

# for calculating the size of the dataset
import os

Before continuing, unzip the dataset zip file into the project directory so that the `character_set_A_to_Z/` folder sits next to this notebook (unzipping from the terminal is suggested).

In [None]:
charset = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" # the character set in read order

# One single-character label per class, i.e. ["A", "B", ... , "Z"];
# a string is already an iterable of its characters, so no index loop is needed.
char_labels = list(charset)

After running the cells above, you can either continue with the PCA model or jump straight to the CNN model without running the PCA analysis cells (some of the PCA cells can take a while to execute).

# Method 1: PCA (principal component analysis)

In [None]:
# Load every character image into a flat feature matrix X (one row per image)
# with the matching letter in y.
first_folder = 33  # dataset folders 33..58 hold the letters A..Z in order

features = []
labels = []
total_count = 0 # count the number of image files altogether

for class_idx, letter in enumerate(char_labels): # one folder per character
    folder = first_folder + class_idx
    dir_path = f'character_set_A_to_Z/{folder}'

    # Count only PNG files: a stray entry (e.g. .DS_Store or Thumbs.db) would
    # otherwise inflate the count and make the loop below open a missing file.
    file_count = sum(
        1
        for entry in os.listdir(dir_path)
        if entry.lower().endswith('.png') and os.path.isfile(os.path.join(dir_path, entry))
    )
    total_count += file_count

    # Images are assumed to be named "<letter> (<k>).png" with k = 1..file_count
    for n in range(file_count):
        # The context manager releases the file handle as soon as the pixels are copied out
        with Image.open(os.path.join(dir_path, f"{letter} ({n+1}).png")) as img:
            features.append(asarray(img))
        labels.append(letter)

X = np.array(features).reshape(total_count, 64*64) # "flatten" each 64x64 image into one row
y = np.array(labels)

print(X.shape)
print(y.shape)

In [None]:
# Hold out 22% of the samples for validation (a 78/22 split); the fixed
# random_state keeps the split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.22,
    random_state=1,
)

for subset in (X_train, X_val):
    print(subset.shape)

In [None]:
# Baseline: logistic regression fitted on the raw pixel features of the whole
# training split (fitting can take some time).
clf = LogisticRegression(solver='liblinear')
clf.fit(X_train, y_train)

In [None]:
def gen_conf_mat(y_true, y_pred):
    """Draw the confusion matrix of a prediction as an annotated heatmap.

    The matrix is computed from ``y_true`` and ``y_pred`` and drawn on the
    axes returned by ``plt.subplot()``; both tick axes are labelled with
    ``char_labels``. Nothing is returned and the figure is not shown here,
    so the caller can still resize it before calling ``plt.show()``.

    Parameters:
        y_true: ground-truth labels.
        y_pred: predicted labels, in the same order as ``y_true``.
    """
    text_size = 20
    axes = plt.subplot()

    sns.heatmap(
        confusion_matrix(y_true, y_pred),
        annot=True,
        fmt='d',
        ax=axes,
        xticklabels=char_labels,
        yticklabels=char_labels,
        cmap="mako",
    )

    axes.set_xlabel('Predicted labels', fontsize=text_size)
    axes.set_ylabel('True labels', fontsize=text_size)
    axes.set_title('Confusion Matrix', fontsize=text_size)

# Predict the validation split and visualise the per-class errors
y_pred = clf.predict(X_val)
gen_conf_mat(y_val, y_pred)
conf_fig = plt.gcf()
conf_fig.set_size_inches(15, 13)
plt.show()

# Share of validation samples that were classified correctly
multi_accuracy = accuracy_score(y_val, y_pred)
print(f"Prediction accuracy: {100*multi_accuracy:.2f}%")

In [None]:
# Fit the PCA on the training split only and project it onto the leading components
N = 50 # number of principal components to keep
# svd_solver='auto' may select the randomized solver for data of this size, so
# the decomposition is seeded to keep it reproducible between runs
# (same seed as the train/validation split).
pca = PCA(n_components=N, random_state=1)
X_train_reduced = pca.fit_transform(X_train)

In [None]:
# Left axis: explained variance ratio of each principal component (bars)
component_idx = 1 + np.arange(N)
variance_ratio = pca.explained_variance_ratio_

fig, ax1 = plt.subplots(figsize=(12, 5))
bar_color = 'darkgreen'
ax1.bar(component_idx, variance_ratio, color=bar_color)
ax1.set_xticks(10 + np.arange(N, step=10))
ax1.tick_params(axis='y', labelcolor=bar_color)
ax1.set_ylabel("Explained variance ratio", color=bar_color)
ax1.set_xlabel("Generated feature")
ax1.grid(linestyle='--')

# Right axis: cumulative explained variance ratio (line), sharing the x axis
line_color = 'navy'
ax2 = ax1.twinx()
ax2.tick_params(axis='y', labelcolor=line_color)
ax2.plot(component_idx, np.cumsum(variance_ratio), color=line_color)
ax2.set_ylabel("Cumulative explained variance ratio", color=line_color)

fig.tight_layout()
plt.show()


In [None]:
# Prediction accuracy with four different numbers of principal components.
# A separate name is used so that N (the component count of the PCA fitted
# earlier) is not overwritten and the explained-variance plot can be re-run.
component_counts = [5, 8, 10, 20]
# can take some time

for n_components in component_counts:
    # A fresh PCA per setting leaves the previously fitted `pca` object untouched
    pca_n = PCA(n_components=n_components, random_state=1)
    X_train_reduced = pca_n.fit_transform(X_train)

    # 'sag' is a stochastic solver, so it is seeded for reproducible scores
    clf_2 = LogisticRegression(solver='sag', random_state=1)
    clf_2.fit(X_train_reduced, y_train)

    # Project the validation data with the PCA fitted on the TRAINING data.
    # Refitting on X_val (fit_transform) would produce a different basis than
    # the one the classifier was trained on and leak validation statistics.
    X_val_reduced = pca_n.transform(X_val)
    y_pred = clf_2.predict(X_val_reduced) # compute the prediction on the validation set
    multi_accuracy = accuracy_score(y_val, y_pred) # compute the accuracy score

    # Uncomment to inspect the confusion matrix for this setting:
    # gen_conf_mat(y_val, y_pred)
    # plt.gcf().set_size_inches(15,13)
    # plt.show()

    print(f"Prediction accuracy with N = {n_components}: {100*multi_accuracy:.2f}%")

# Method 2: CNN (convolutional neural network)

In [None]:
# Load every character image as a 2-D array into X, with an integer class id in y.
first_folder = 33  # dataset folders 33..58 hold the letters A..Z in order

features = []
labels = []
total_count = 0 # count the number of image files altogether

for class_idx, letter in enumerate(char_labels): # one folder per character
    folder = first_folder + class_idx
    dir_path = f'character_set_A_to_Z/{folder}'

    # Count only PNG files: a stray entry (e.g. .DS_Store or Thumbs.db) would
    # otherwise inflate the count and make the loop below open a missing file.
    file_count = sum(
        1
        for entry in os.listdir(dir_path)
        if entry.lower().endswith('.png') and os.path.isfile(os.path.join(dir_path, entry))
    )
    total_count += file_count

    # Images are assumed to be named "<letter> (<k>).png" with k = 1..file_count
    for n in range(file_count):
        # The context manager releases the file handle as soon as the pixels are copied out
        with Image.open(os.path.join(dir_path, f"{letter} ({n+1}).png")) as img:
            features.append(asarray(img))
        # this time the labels are numbers from 0 to 25 in the order A = 0, B = 1, etc.
        # because the tensorflow.keras model below is trained with the
        # 'sparse_categorical_crossentropy' loss, which takes integer class ids
        labels.append(class_idx)

X = np.array(features)
y = np.array(labels)

print(X.shape)
print(y.shape)

In [None]:
# Hold out 22% of the samples for testing (a 78/22 split) with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.22,
    random_state=1,
)

input_shape = (64, 64, 1) # 64x64 pixels with a single channel

# Add the trailing channel axis (4D array for the CNN), switch to float32 and
# scale by 255, the max value for greyscale images.
X_train = X_train.reshape(len(X_train), *input_shape).astype('float32') / 255
X_test = X_test.reshape(len(X_test), *input_shape).astype('float32') / 255

for subset in (X_train, X_test):
    print(subset.shape)

In [None]:
from tensorflow.keras.models import Sequential # import the necessary libraries
from tensorflow.keras.layers import Conv2D, Dense, Dropout, Flatten, MaxPooling2D
import tensorflow as tf

# Building the CNN model: a single 3x3 convolution with 64 filters, 2x2 max
# pooling, then a flattened 128-unit ReLU layer with 20% dropout and a
# 26-way softmax output (one unit per character).
model = Sequential([
    Conv2D(64, kernel_size = (3, 3), input_shape = input_shape),
    MaxPooling2D(pool_size = (2, 2)),
    Flatten(),
    Dense(128, activation = tf.nn.relu),
    Dropout(0.2),
    Dense(26, activation = tf.nn.softmax),
])

In [None]:
# The optimiser, the loss function and the number of epochs can all be
# changed around for different results.
model.compile(
    optimizer = 'adam',
    loss = 'sparse_categorical_crossentropy',
    metrics = ['accuracy'],
)
model.fit(x = X_train, y = y_train, epochs = 10)

# Evaluate the trained model on the held-out test split
model.evaluate(X_test, y_test)