In [6]:
import torch
from  torch.autograd import *
from  torch import nn,optim
from  torch.utils.data import DataLoader
from  torchvision import datasets,transforms
 
class simpleNet(nn.Module):
    """Plain three-layer perceptron: three stacked ``nn.Linear`` layers.

    No activation function is applied between the layers, so the whole
    network is a composition of affine maps.
    """

    def __init__(self, in_dim, n_hidden_1, n_hidden_2, out_dim):
        """Create the layers.

        Args:
            in_dim: number of input features.
            n_hidden_1: output width of the first linear layer.
            n_hidden_2: output width of the second linear layer.
            out_dim: number of output features.
        """
        super().__init__()
        self.layer1 = nn.Linear(in_dim, n_hidden_1)
        self.layer2 = nn.Linear(n_hidden_1, n_hidden_2)
        self.layer3 = nn.Linear(n_hidden_2, out_dim)

    def forward(self, x):
        """Pass ``x`` through layer1, layer2 and layer3 in that order."""
        return self.layer3(self.layer2(self.layer1(x)))
 
 
class Activation_Net(nn.Module):
    """Three-layer perceptron with in-place ReLU after the two hidden layers."""

    def __init__(self, in_dim, n_hidden_1, n_hidden_2, out_dim):
        """Create the three stages.

        Args:
            in_dim: number of input features.
            n_hidden_1: width of the first hidden layer.
            n_hidden_2: width of the second hidden layer.
            out_dim: number of output features.
        """
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(in_dim, n_hidden_1),
            nn.ReLU(inplace=True),
        )
        self.layer2 = nn.Sequential(
            nn.Linear(n_hidden_1, n_hidden_2),
            nn.ReLU(inplace=True),
        )
        # The output stage stays wrapped in a Sequential so its parameters keep
        # the "layer3.0.*" names.
        self.layer3 = nn.Sequential(nn.Linear(n_hidden_2, out_dim))

    def forward(self, x):
        """Run ``x`` through layer1, layer2 and layer3 in order; no activation on the output."""
        for stage in (self.layer1, self.layer2, self.layer3):
            x = stage(x)
        return x
 
class Batch_Net(nn.Module):
    """Three-layer perceptron whose hidden layers are Linear -> BatchNorm1d -> ReLU."""

    def __init__(self, in_dim, n_hidden_1, n_hidden_2, out_dim):
        """Create the three stages.

        Args:
            in_dim: number of input features.
            n_hidden_1: width of the first hidden layer (and of its BatchNorm1d).
            n_hidden_2: width of the second hidden layer (and of its BatchNorm1d).
            out_dim: number of output features.
        """
        super().__init__()
        self.layer1 = nn.Sequential(
            nn.Linear(in_dim, n_hidden_1),
            nn.BatchNorm1d(n_hidden_1),
            nn.ReLU(inplace=True),
        )
        self.layer2 = nn.Sequential(
            nn.Linear(n_hidden_1, n_hidden_2),
            nn.BatchNorm1d(n_hidden_2),
            nn.ReLU(inplace=True),
        )
        # Output stage: a bare linear layer, with no normalisation or activation.
        self.layer3 = nn.Sequential(nn.Linear(n_hidden_2, out_dim))

    def forward(self, x):
        """Apply layer1, layer2 and layer3 to ``x`` in that order."""
        hidden = self.layer2(self.layer1(x))
        return self.layer3(hidden)
 
# ---- Hyper-parameters ------------------------------------------------------
batch_size = 64
learning_rate = 1e-1
num_epoches = 20

# ---- Data ------------------------------------------------------------------
# Each image is converted to a tensor and then normalised with mean 0.5 / std 0.5.
data_tf = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5]),
])

# Only the training split requests a download; the test split reads from the
# same root directory.
train_dataset = datasets.MNIST(root='./data', train=True, transform=data_tf, download=True)
test_dataset = datasets.MNIST(root="./data", train=False, transform=data_tf)

train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

# ---- Model, loss and optimiser ---------------------------------------------
# 28*28 inputs because every image is flattened into one vector before it is
# fed to the network; 10 outputs, one per class.
model = Batch_Net(in_dim=28 * 28, n_hidden_1=300, n_hidden_2=100, out_dim=10)
if torch.cuda.is_available():
    model = model.cuda()

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=learning_rate)
 
# Train `model` for `num_epoches` passes over `train_loader` and print, per
# epoch, the summed batch loss, the number of correct predictions and the
# training accuracy.

# Send every batch to whatever device the model's parameters already live on,
# instead of re-checking CUDA availability for each batch.
device = next(model.parameters()).device

# Explicitly select training mode so BatchNorm keeps using batch statistics even
# when this code is re-run after `model.eval()` has been called.
model.train()

for epoch in range(num_epoches):
    # Accumulate plain Python numbers (via .item()) so the statistics do not
    # hold on to tensors and the epoch summary also works for an empty loader.
    loss_sum, cort_num_sum, acc = 0.0, 0, 0.0
    for img, label in train_loader:
        # Flatten each image into one feature vector per sample.
        inputs = img.view(img.size(0), -1).to(device)
        target = label.to(device)

        output = model(inputs)
        loss = criterion(output, target)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        loss_sum += loss.item()
        # Index of the largest logit along dim 1 is the predicted class.
        pred = output.detach().max(1)[1]
        cort_num_sum += pred.eq(target).sum().item()

    acc = cort_num_sum / len(train_dataset)
    print( "After %d epoch , training loss is %.2f , correct_number is %d  accuracy is %.6f. "%(epoch,loss_sum,cort_num_sum,acc))
 
 
# Evaluate `model` on `test_loader` and print the mean per-sample loss and the
# accuracy over the whole test set.

# Batches must be on the same device as the model's parameters; otherwise the
# forward pass fails with a device mismatch when the model was moved to CUDA.
device = next(model.parameters()).device

model.eval()  # BatchNorm switches to its running statistics
eval_loss = 0.0
eval_acc = 0

# torch.no_grad() replaces the removed `volatile=True` flag: no autograd graph
# is recorded while evaluating.
with torch.no_grad():
    for img, label in test_loader:
        img = img.view(img.size(0), -1).to(device)
        label = label.to(device)

        out = model(img)
        loss = criterion(out, label)

        # criterion returns the batch mean; weight it by the batch size so the
        # final division by the dataset size yields a per-sample mean even when
        # the last batch is smaller than the others.
        eval_loss += loss.item() * label.size(0)
        pred = out.max(1)[1]
        eval_acc += pred.eq(label).sum().item()

print('Test loss: {:.6f},ACC: {:.6f}'.format(eval_loss / len(test_dataset), eval_acc / len(test_dataset)))

After 0 epoch , training loss is 230.50 , correct_number is 56109  accuracy is 0.935150. 
After 1 epoch , training loss is 88.17 , correct_number is 58295  accuracy is 0.971583. 
After 2 epoch , training loss is 61.50 , correct_number is 58789  accuracy is 0.979817. 
After 3 epoch , training loss is 47.45 , correct_number is 59058  accuracy is 0.984300. 
After 4 epoch , training loss is 35.46 , correct_number is 59294  accuracy is 0.988233. 
After 5 epoch , training loss is 30.74 , correct_number is 59400  accuracy is 0.990000. 
After 6 epoch , training loss is 25.21 , correct_number is 59496  accuracy is 0.991600. 
After 7 epoch , training loss is 20.62 , correct_number is 59617  accuracy is 0.993617. 
After 8 epoch , training loss is 16.71 , correct_number is 59676  accuracy is 0.994600. 
After 9 epoch , training loss is 15.05 , correct_number is 59704  accuracy is 0.995067. 
After 10 epoch , training loss is 12.08 , correct_number is 59781  accuracy is 0.996350. 
After 11 epoch , tr



Test loss: 0.058028,ACC: 0.983500


To get a clear look at the images in the dataset, we can run the code below, which exports every training image as a PNG file into a folder named after its label.

In [5]:

import numpy as np
import struct
 
from PIL import Image
import os
 
# Export every MNIST training image as a PNG file under
# mnist_train/<label>/mnist_train_<index>.png.

# Magic numbers of the two IDX files: unsigned-byte data with 3 dimensions
# (images) and with 1 dimension (labels).
_IDX_IMAGE_MAGIC = 2051
_IDX_LABEL_MAGIC = 2049


def _load_idx(path, n_dims, expected_magic):
    """Read an IDX file holding unsigned bytes.

    Args:
        path: location of the IDX file.
        n_dims: number of dimension fields that follow the magic number.
        expected_magic: magic number the file must start with.

    Returns:
        (magic, dims, array), where ``dims`` is the list of dimension sizes
        read from the header and ``array`` is a read-only uint8 array of that
        shape backed by the file contents.

    Raises:
        ValueError: if the magic number is wrong or the file holds fewer
            payload bytes than its header announces.
    """
    # The context manager closes the file handle as soon as the bytes are read.
    with open(path, 'rb') as fh:
        buf = fh.read()

    header_fmt = '>' + 'I' * (1 + n_dims)  # big-endian unsigned 32-bit fields
    magic, *dims = struct.unpack_from(header_fmt, buf, 0)
    if magic != expected_magic:
        raise ValueError('%s: unexpected magic number %d (expected %d)'
                         % (path, magic, expected_magic))

    # Derive the payload size from the header instead of hard-coding the file size.
    offset = struct.calcsize(header_fmt)
    count = int(np.prod(dims, dtype=np.int64))
    if len(buf) - offset < count:
        raise ValueError('%s: header announces %d bytes but only %d are present'
                         % (path, count, len(buf) - offset))

    # frombuffer views the bytes directly rather than materialising one Python
    # int per pixel.
    payload = np.frombuffer(buf, dtype=np.uint8, count=count, offset=offset)
    return magic, dims, payload.reshape(dims)


# NOTE(review): these paths assume torchvision stored the raw files under
# ./data/raw — confirm against the installed torchvision version.
data_file = './data/raw/train-images-idx3-ubyte'
magic, (numImages, numRows, numColumns), datas = _load_idx(
    data_file, 3, _IDX_IMAGE_MAGIC)
# Insert a channel axis; copy() makes the array writable and independent of the
# raw file buffer.
datas = datas.reshape(numImages, 1, numRows, numColumns).copy()

label_file = './data/raw/train-labels-idx1-ubyte'
magic, (numLabels,), labels = _load_idx(label_file, 1, _IDX_LABEL_MAGIC)
labels = labels.astype(np.int64)

datas_root = 'mnist_train'
# One sub-directory per digit class; makedirs also creates datas_root itself and
# does nothing when the directories already exist.
for i in range(10):
    os.makedirs(os.path.join(datas_root, str(i)), exist_ok=True)

# zip stops at the shorter of the two arrays, so a mismatch between the image
# and label counts cannot index out of range.
for ii, (pixels, label) in enumerate(zip(datas[:, 0], labels)):
    img = Image.fromarray(pixels)
    file_name = os.path.join(
        datas_root, str(label), 'mnist_train_' + str(ii) + '.png')
    img.save(file_name)

After trying different learning rates, the MLP model achieves a good result on the test set: a loss of 0.058028 and an accuracy of 0.983500, which is quite effective for recognizing single handwritten digits. The next step I'd like to take is to extend this algorithm so that it can segment the digits and letters in sentences and articles, and ultimately recognize whole handwritten texts.