In [31]:
# -*- coding: utf-8 -*-
# https://github.com/gereshes/Blog-Companion/blob/master/keras2PyTorch-Hard/createKears.py

import numpy as np
# from keras.models import Sequential, load_model
# from keras.layers import Dense
# from keras.wrappers.scikit_learn import KerasRegressor
# from keras.callbacks import EarlyStopping, ModelCheckpoint
# from keras.preprocessing.image import ImageDataGenerator
# import matplotlib.pyplot as plt
# import random
# from scipy.integrate import odeint
# import scipy.io
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
# import torchvision.transforms as transforms
import torch.optim as optim # optimzer

import random
import time
import cv2
import os
from pathlib import Path

In [32]:
data_dir = 'mnist_train_imgs'

# TODO: Use all images
data_dirs = [ 
             'mnist_test_imgs',
             'mnist_train_imgs'
             'saved_digits'
             ]

category_labels = ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9']


train_dir = os.path.join('mnist_train_imgs')

test_dir = os.path.join('mnist_test_imgs')
# test_dir = os.path.join('saved_digits')


img_size = (28, 28)

In [33]:

# TODO: Add horizontal/vertical smearing to the images.
# TODO: Add A TINY BIT of random pixel noise to the images.
# TODO: Identify the min / max image fill of the dataset.

def random_alignment(img):
    ax1sums = np.sum(img, axis=0)
    up_shift = np.argmax(ax1sums>0)
    down_shift = np.argmax(ax1sums[::-1]>0)
    
    ax2sums = np.sum(img, axis=1)
    left_shift = np.argmax(ax2sums>0)
    right_shift = np.argmax(ax2sums[::-1]>0)

    random_y = random.randint(-up_shift, down_shift)
    random_x = random.randint(-left_shift, right_shift)
    
    return np.roll(img, (random_x, random_y), axis=(0, 1))

def transform(img, randomize=False):
    img = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    _, img = cv2.threshold(img, 127, 255, cv2.THRESH_BINARY)
    
    if randomize:
        img = random_alignment(img)
        
    img = torch.tensor(img)
    img = img.unsqueeze(0)
    img = img.float()
    img = img / 255
    return img

# TODO: Add up/down/left/right shift to the images.
# TODO: Flip images? Flip certain numbers?
# TODO: Rework to allow multiple data directories. (mnist + saved digits)
        
# Define a custom pytorch Dataset class
class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, root_dir, img_size, transform, labels, randomize=False):
        self.root_dir = root_dir
        self.image_files = [str(p.relative_to(root_dir)) for p in Path(root_dir).rglob("*.[Pp][Nn][Gg]")]
        self.randomize = randomize
        self.transform = transform
        self.img_size = img_size
        self.labels = labels # NOTE: Not enforced upon loaded images, may crash if extra directories are around
    
    # Define the length of the dataset
    def __len__(self):
        return len(self.image_files)
    
    # Define the getitem function to return images and labels
    def __getitem__(self, idx):
        image_path = os.path.join(self.root_dir, self.image_files[idx])

        # Open image, apply transforms.
        image = cv2.imread(image_path)
        if self.transform:
            image = self.transform(image, randomize=self.randomize)
            
        label = os.path.split(self.image_files[idx])[0]
        target = torch.tensor(self.labels.index(label))
        #Create a dictionary with the image and label
        sample = {'image': image, 'target': target, 'label': label}
        
        return sample

    # def shuffle(self):
    #     random.shuffle(self.image_files)
    
#Create the dataloader for training and test dat
train_data = CustomDataset(root_dir=train_dir, img_size=img_size, transform=transform, labels=category_labels)
test_data = CustomDataset(root_dir=test_dir, img_size=img_size, transform=transform, labels=category_labels, randomize=False)

train_loader = DataLoader(train_data, batch_size=32, shuffle=True)
test_loader = DataLoader(test_data, batch_size=32, shuffle=True)

In [34]:

class Digit_OCR_CNN(nn.Module):

    def __init__(self):
        super(Digit_OCR_CNN, self).__init__()

        # Convolutional layers
        self.conv1 = nn.Conv2d(1, 3, 3, padding=1)
        self.conv2 = nn.Conv2d(3, 6, 3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.drop1 = nn.Dropout(0.25)
        self.fc1 = nn.Linear(6 * 14 * 14, 128)
        self.drop2 = nn.Dropout(0.25)
        self.fc2 = nn.Linear(128, 128)
        self.drop3 = nn.Dropout(0.25)
        self.fc3 = nn.Linear(128, 10)
        
    def forward(self, state):
        x = F.relu(self.conv1(state))
        # x = self.pool(F.relu(self.conv1(state)))
        x = self.pool(F.relu(self.conv2(x)))
        # print(x.size())
        x = self.drop1(x)
        x = x.view(-1, 6 * 14 * 14)
        x = F.relu(self.fc1(x))
        x = self.drop2(x)

        x = F.relu(self.fc2(x))
        x = self.drop3(x)
        x = F.log_softmax(self.fc3(x), dim=1)

        return x



net = Digit_OCR_CNN()

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

# Assuming that we are on a CUDA machine, this should print a CUDA device:
print(device)
net.to(device)

cpu


Digit_OCR_CNN(
  (conv1): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(3, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (drop1): Dropout(p=0.25, inplace=False)
  (fc1): Linear(in_features=1176, out_features=128, bias=True)
  (drop2): Dropout(p=0.25, inplace=False)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (drop3): Dropout(p=0.25, inplace=False)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
)

In [37]:
# criterion = nn.CrossEntropyLoss()
criterion = nn.NLLLoss()
# criterion = nn.KLDivLoss() # Boolean values
# optimizer = optim.SGD(net.parameters(), lr=0.001, momentum=0.9)
optimizer = optim.AdamW(net.parameters(), lr=0.001)#, momentum=0.9)


In [40]:

def check_best(best_acc):
    net.eval()
    best_name = 'best_model.pt'
    with torch.no_grad():
        count = 0
        correct = 0
        for d in test_loader:
            i = d['image']
            l = d['label']
            r = net(i.to(device))
            _, predicted = torch.max(r, 1)
            for i, _r in enumerate(predicted):
                count += 1
                if category_labels[_r] == l[i]:
                    correct += 1
                else:
                    pass
        curr_acc = correct/count
        if curr_acc > best_acc:
            best_acc = curr_acc
            torch.save(net.state_dict(), best_name)
            print(f'\tNew best accuracy: {best_acc:.5f}\n\t\t> Saving model as', best_name)
    net.train()
    return best_acc

def train():
    
    # How low the loss must be to trigger early stopping.
    loss_end_thresh = 0.0
    # How many consecutive loss values must be below the threshold to trigger early stopping.
    consecutive_thresh = 5
    # Track how many consecutive loss values have been below the threshold.
    thresh_track = 0
    
    best_interval = int(len(train_loader) / 4) 
    best_acc = 0.95450

    loss_interval = 50
    loss_end_thresh = 0.02
    consecutive_thresh = 5
    thresh_track = 0
    for epoch in range(1):  # loop over the dataset multiple times
        net.train()

        running_loss = 0.0
        for i, data in enumerate(train_loader):

            inputs, targets = data['image'], data['target']

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            # print(outputs.size())

            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()
            
            # print statistics
            running_loss += loss.item()
            if i % loss_interval == loss_interval-1:    # print every 2000 mini-batches
                mean_loss = abs(running_loss / loss_interval)
                print('[%d, %5d] loss: %.3f' %
                    (epoch + 1, i + 1, mean_loss))
                running_loss = 0.0
                if mean_loss < loss_end_thresh:
                    print(f'Thresh at {thresh_track} of {consecutive_thresh}')
                    thresh_track += 1
                    if thresh_track > consecutive_thresh:
                        best_acc = check_best(best_acc=best_acc)
                        return
                else:
                    thresh_track = 0
            if i % best_interval == best_interval-1:
                # Check for best
                best_acc = check_best(best_acc=best_acc)

st = time.time()
train()
et = time.time()


print('Finished Training')
print('Time:', et - st)  # milliseconds


[1,    50] loss: 0.166
[1,   100] loss: 0.164
[1,   150] loss: 0.159
[1,   200] loss: 0.151
[1,   250] loss: 0.151
[1,   300] loss: 0.174
[1,   350] loss: 0.156
[1,   400] loss: 0.158
[1,   450] loss: 0.161
	New best accuracy: 0.97260
		> Saving model as best_model.pt
[1,   500] loss: 0.145
[1,   550] loss: 0.156
[1,   600] loss: 0.168
[1,   650] loss: 0.153
[1,   700] loss: 0.168
[1,   750] loss: 0.179
[1,   800] loss: 0.151
[1,   850] loss: 0.173
[1,   900] loss: 0.174
	New best accuracy: 0.97390
		> Saving model as best_model.pt
[1,   950] loss: 0.134
[1,  1000] loss: 0.177
[1,  1050] loss: 0.157
[1,  1100] loss: 0.159
[1,  1150] loss: 0.161
[1,  1200] loss: 0.170
[1,  1250] loss: 0.148
[1,  1300] loss: 0.140
[1,  1350] loss: 0.163
[1,  1400] loss: 0.162
	New best accuracy: 0.97430
		> Saving model as best_model.pt
[1,  1450] loss: 0.157
[1,  1500] loss: 0.151
[1,  1550] loss: 0.139
[1,  1600] loss: 0.147
[1,  1650] loss: 0.150
[1,  1700] loss: 0.150
[1,  1750] loss: 0.162
[1,  1800

In [12]:
net = Digit_OCR_CNN()
# load_name = 'models/ability_icon_1_2.pt'
load_name = 'best_model.pt'
net.load_state_dict(torch.load(load_name))
net.to(device)

Digit_OCR_CNN(
  (conv1): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (conv2): Conv2d(3, 6, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  (pool): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  (drop1): Dropout(p=0.25, inplace=False)
  (fc1): Linear(in_features=1176, out_features=128, bias=True)
  (drop2): Dropout(p=0.25, inplace=False)
  (fc2): Linear(in_features=128, out_features=128, bias=True)
  (drop3): Dropout(p=0.25, inplace=False)
  (fc3): Linear(in_features=128, out_features=10, bias=True)
)

In [51]:
# prepare to count predictions for each class
correct_pred = {classname: 0 for classname in category_labels}
total_pred = {classname: 0 for classname in category_labels}
incorrect_pred = {classname: [] for classname in category_labels}


# again no gradients needed
with torch.no_grad():
    net.eval()
    for data in test_loader:
        images, targets, labels = data['image'], data['target'], data['label']
        outputs = net(images.to(device))
        _, predictions = torch.max(outputs, 1)
        print(predictions)
        # collect the correct predictions for each class
        for label, target, prediction, output in zip(labels, targets, predictions, outputs):
            # print(label, target, prediction, output)
            if target == prediction:
                correct_pred[label] += 1
            else:
                incorrect_pred[label].append(category_labels[int(prediction)])

            total_pred[label] += 1

# print accuracy for each class
for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print(f'Accuracy for class: {classname:10s} is {accuracy:.1f} %')
    print(f'\tIncorrect predictions: {incorrect_pred[classname]}')
print(f'Overall Accuracy: {(100 * float(sum(correct_pred.values()) / sum(total_pred.values()))):.3f}')
del correct_pred
del total_pred
del correct_count

tensor([1, 0, 9, 1, 3, 4, 1, 6, 7, 3, 3, 1, 7, 0, 4, 7, 2, 9, 1, 1, 5, 1, 0, 9,
        4, 9, 3, 9, 1, 2, 2, 1])
tensor([3, 5, 1, 2, 3, 9, 6, 8, 7, 7, 4, 1, 0, 1, 4, 6, 1, 7, 8, 3, 2, 9, 9, 9,
        8, 0, 7, 2, 7, 6, 2, 4])
tensor([0, 0, 4, 7, 2, 4, 4, 9, 4, 3, 7, 3, 2, 0, 9, 4, 9, 6, 6, 5, 9, 5, 9, 1,
        6, 6, 2, 0, 7, 9, 2, 3])
tensor([6, 6, 5, 0, 9, 1, 7, 3, 0, 2, 4, 9, 9, 1, 8, 3, 6, 3, 3, 7, 2, 0, 3, 1,
        0, 3, 4, 7, 0, 1, 0, 0])
tensor([3, 3, 7, 9, 0, 3, 8, 4, 2, 9, 9, 2, 2, 2, 2, 1, 9, 2, 0, 2, 4, 1, 9, 9,
        7, 2, 3, 9, 7, 3, 4, 3])
tensor([9, 9, 9, 4, 8, 3, 9, 1, 3, 6, 3, 8, 4, 7, 1, 3, 4, 1, 4, 1, 3, 1, 0, 1,
        1, 1, 6, 8, 5, 2, 1, 5])
tensor([5, 9, 0, 5, 1, 4, 8, 6, 5, 3, 8, 3, 9, 9, 9, 4, 1, 8, 3, 2, 6, 0, 9, 2,
        8, 6, 3, 5, 6, 4, 4, 0])
tensor([8, 1, 3, 8, 5, 4, 1, 3, 2, 7, 6, 0, 5, 0, 0, 2, 2, 4, 2, 3, 5, 7, 9, 7,
        0, 1, 0, 1, 9, 7, 9, 9])
tensor([8, 3, 5, 1, 0, 0, 3, 0, 4, 8, 0, 3, 6, 4, 2, 4, 4, 1, 3, 4, 4, 3, 7, 8,
        7, 8, 3,

KeyboardInterrupt: 

In [58]:
#Create the dataloader for training and test dat
gui_data = CustomDataset(root_dir=os.path.join('saved_digits'),
                         img_size=img_size,
                         transform=transform,
                         labels=category_labels,
                         randomize=True)
gui_loader = DataLoader(gui_data, batch_size=32, shuffle=True)

# prepare to count predictions for each class
correct_pred = {classname: 0 for classname in category_labels}
total_pred = {classname: 0 for classname in category_labels}
incorrect_pred = {classname: [] for classname in category_labels}

# again no gradients needed
with torch.no_grad():
    net.eval()
    for data in gui_loader:
        images, targets, labels = data['image'], data['target'], data['label']
        outputs = net(images.to(device))
        _, predictions = torch.max(outputs, 1)

        # collect the correct predictions for each class
        for label, target, prediction, output in zip(labels, targets, predictions, outputs):
            # print(label, target, prediction, output)
            if target == prediction:
                correct_pred[label] += 1
            else:
                incorrect_pred[label].append(category_labels[int(prediction)])

            total_pred[label] += 1

# print accuracy for each class
for classname, correct_count in correct_pred.items():
    accuracy = 100 * float(correct_count) / total_pred[classname]
    print(f'Accuracy for class: {classname:10s} is {accuracy:.1f} %')
    print(f'\tIncorrect predictions: {incorrect_pred[classname]}')
print(f'Overall Accuracy: {(100 * float(sum(correct_pred.values()) / sum(total_pred.values()))):.3f}')
del correct_pred
del total_pred
del correct_count

Accuracy for class: 0          is 19.0 %
	Incorrect predictions: ['3', '6', '3', '2', '2', '8', '2', '2', '2', '7', '3', '9', '3', '3', '2', '3', '7']
Accuracy for class: 1          is 4.8 %
	Incorrect predictions: ['6', '2', '5', '4', '0', '0', '2', '0', '7', '7', '6', '7', '4', '7', '5', '7', '7', '6', '5', '2']
Accuracy for class: 2          is 42.9 %
	Incorrect predictions: ['3', '6', '4', '3', '3', '6', '6', '6', '0', '7', '0', '8']
Accuracy for class: 3          is 57.1 %
	Incorrect predictions: ['2', '9', '6', '2', '5', '9', '6', '2', '2']
Accuracy for class: 4          is 23.8 %
	Incorrect predictions: ['0', '2', '0', '0', '6', '5', '6', '2', '9', '2', '7', '1', '2', '5', '8', '0']
Accuracy for class: 5          is 19.0 %
	Incorrect predictions: ['6', '9', '6', '3', '3', '4', '9', '7', '2', '3', '7', '0', '6', '6', '3', '3', '8']
Accuracy for class: 6          is 50.0 %
	Incorrect predictions: ['9', '8', '2', '5', '5', '3', '8', '5', '3', '5']
Accuracy for class: 7          is 