<a href="https://colab.research.google.com/github/Hramchenko/Handwritting/blob/master/HTR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [15]:
import torch

# Select the compute device: use the first CUDA GPU when one is visible,
# otherwise fall back to the CPU so the notebook still runs on CPU-only
# runtimes (torch.cuda.get_device_name raises when CUDA is unavailable).
if torch.cuda.is_available():
    print("Device " + torch.cuda.get_device_name(0))
    device = torch.device("cuda:0")
else:
    print("Device CPU (CUDA not available)")
    device = torch.device("cpu")
print(device)

Device Tesla K80
cuda:0


In [0]:
# Number of samples per batch; passed to both the train and test datasets.
batch_size = 10

In [17]:
import sys

# The IAMWords helper lives in the project checkout, so put it on the path first.
sys.path.append("./Handwritting/")
from IAMWords import IAMWords

# Build the train split first, then the test split, with the same root and batch size.
train, test = (
    IAMWords(split, "./IAM/", batch_size=batch_size)
    for split in ("train", "test")
)

Reading ./IAM/words.train.pkl...
Reading finished
Reading ./IAM/words.test.pkl...
Reading finished


In [0]:
import torch.nn as nn
import torch.nn.functional as F
import torch


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    Main path: conv -> batch norm -> ReLU -> conv -> batch norm. The input is
    added back through ``shortcut`` before the final ReLU. ``shortcut`` is an
    empty ``nn.Sequential`` (identity) unless the stride is not 1 or the
    channel count changes, in which case it is a 1x1 convolution with the same
    stride followed by batch norm.

    Args:
        in_planes: Number of input channels.
        planes: Number of channels produced by both 3x3 convolutions.
        stride: Stride of the first 3x3 convolution and of the projection
            shortcut (default 1).
    """

    # Output channels of the block are ``expansion * planes``.
    expansion = 1

    def __init__(self, in_planes, planes, stride=1):
        super(BasicBlock, self).__init__()
        out_planes = self.expansion * planes

        # Main path (layers are created in the same order as they are applied).
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3, stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3, stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # A projection is only needed when the residual and the main path
        # would otherwise disagree in spatial size or channel count.
        shape_preserved = stride == 1 and in_planes == out_planes
        if shape_preserved:
            shortcut_layers = []
        else:
            shortcut_layers = [
                nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(out_planes),
            ]
        self.shortcut = nn.Sequential(*shortcut_layers)

    def forward(self, x):
        """Apply the block to ``x`` and return the activated residual sum."""
        main = self.conv1(x)
        main = F.relu(self.bn1(main))
        main = self.bn2(self.conv2(main))
        main = main + self.shortcut(x)
        return F.relu(main)

class HTRNet(nn.Module):
    """Convolutional feature extractor followed by a bidirectional LSTM and an MLP head.

    Args:
        cnn_cfg: Sequence describing the convolutional stack. Each entry is
            either the string ``'M'`` (a 2x2 max-pool with stride 2) or a pair
            ``(repeats, channels)`` that adds ``repeats`` blocks of
            3x3 conv -> batch norm -> ReLU with ``channels`` output channels,
            e.g. ``[(2, 16), 'M', (4, 32), 'M', (6, 64), 'M', (2, 128)]``.
        rnn_cfg: Pair ``(hidden, num_layers)`` for the bidirectional LSTM.
        nclasses: Size of the final linear layer's output.
    """

    def __init__(self, cnn_cfg, rnn_cfg, nclasses):
        super(HTRNet, self).__init__()

        in_channels = 1  # the first convolution is built for single-channel input
        self.features = nn.ModuleList([])
        cntm = 0  # running index used to name the pooling layers
        cnt = 1   # running index used to name the conv / non-linearity layers
        for m in cnn_cfg:
            if m == 'M':
                self.features.add_module('mxp' + str(cntm), nn.MaxPool2d(kernel_size=2, stride=2))
                cntm += 1
            else:
                x = m[1]
                for _ in range(m[0]):
                    # A BasicBlock(in_channels, x) could be substituted for the plain conv here.
                    self.features.add_module('cnv' + str(cnt), nn.Conv2d(in_channels, x, 3, 1, 1, bias=True))
                    in_channels = x
                    self.features.add_module('nl' + str(cnt), nn.Sequential(nn.BatchNorm2d(x, momentum=.5), nn.ReLU()))
                    cnt += 1

        # The LSTM consumes the channels produced by the last convolution that
        # was actually built. Reading cnn_cfg[-1][-1] instead would yield 'M'
        # whenever the configuration ends with a pooling entry.
        rnn_in = in_channels
        hidden, num_layers = rnn_cfg

        self.rec = nn.LSTM(rnn_in, hidden, num_layers=num_layers, bidirectional=True)

        # 2 * hidden: forward and backward LSTM outputs are concatenated.
        self.fnl = nn.Sequential(nn.Linear(2*hidden, 400), nn.ReLU(), nn.Dropout(.5), nn.Linear(400, nclasses))

    def forward(self, x):
        """Run the network on a 4-D batch ``x`` of shape (batch, 1, height, width).

        Returns:
            Tensor of shape (seq_len, batch, nclasses), where ``seq_len`` is
            the width of the final feature map.
        """
        y = x
        for nn_module in self.features:
            y = nn_module(y)

        # Collapse the whole height axis with a max so each column becomes one time step.
        y = F.max_pool2d(y, [y.size(2), 1], padding=[0, 0])
        # (batch, feat, 1, seq_len) -> (1, seq_len, batch, feat) -> (seq_len, batch, feat)
        y = y.permute(2, 3, 0, 1)[0]
        y = self.rec(y)[0]  # keep the per-step outputs, drop the (h_n, c_n) state
        y = self.fnl(y)

        return y


In [0]:
# CNN layout: each (repeats, channels) pair adds that many conv blocks, 'M' adds a 2x2 max-pool.
cnn_cfg = [(2, 32), 'M', (4, 64), 'M', (6, 128), 'M', (2, 256)]
rnn_cfg = (256, 1)  # (hidden , num_layers)
# One output class per entry of the training alphabet; the model is moved to the selected device.
net = HTRNet(cnn_cfg, rnn_cfg, len(train.alphabet)).to(device)

In [0]:
# CTC loss from the warp-ctc PyTorch binding.
from warpctc_pytorch import CTCLoss
loss = CTCLoss()
# NOTE(review): net.parameters() returns an iterator, so net_parameters is
# presumably exhausted once the optimizer has consumed it — do not reuse it.
net_parameters = net.parameters()
nlr = 1e-4  # learning rate passed to Adam
# Adam over all network parameters with a small weight decay.
optimizer = torch.optim.Adam(net_parameters, nlr, weight_decay=0.00005)

In [0]:
# Fetch one training batch, reshape the images to (batch, 1, 128, 400)
# and move them to the compute device.
data, target = train.make_batch()
data = data.view(batch_size, 1, 128, 400).to(device)

In [0]:
# Forward pass of the network on the current batch.
output = net(data)

In [29]:
# Inspect the shape of the network output.
output.shape

torch.Size([50, 10, 80])

In [30]:
# Size of the test dataset's alphabet.
len(test.alphabet)

80

    # acts: Tensor of (seqLength x batch x outputDim) containing output activations from network (before softmax)
    # labels: 1 dimensional Tensor containing all the targets of the batch in one large sequence
    # act_lens: Tensor of size (batch) containing size of each output sequence from the network
    # label_lens: Tensor of (batch) containing label length of each example

In [0]:
# NOTE(review): img, Variable, cdict and transcr are not defined in any cell
# visible in this notebook, so this cell raises NameError on a fresh kernel.
# Presumably it was pasted from another script — confirm, adapt or remove.
# One activation length per sample, each equal to the output's first dimension.
act_lens = torch.IntTensor(img.size(0)*[output.size(0)])
# All transcriptions joined into one string and mapped to class indices through cdict.
labels = Variable(torch.IntTensor([cdict[c] for c in ''.join(transcr)]))
# Length of each individual transcription.
label_lens = torch.IntTensor([len(t) for t in transcr])

# The loss is evaluated on a CPU copy of the network output.
loss_val = loss(output.cpu(), labels, act_lens, label_lens)

In [33]:
# Alias the network output; the permute is left disabled, so o keeps the output's layout.
o = output#.permute(1, 0, 2)
o.shape

torch.Size([50, 10, 80])

In [32]:
# Flatten the batch targets into a single 1-D tensor and inspect its shape.
t = target.flatten()
t.shape

torch.Size([300])

In [49]:
# Activation lengths for the CTC loss: one entry per sample, each set to the output's first dimension.
al = torch.IntTensor(batch_size*[o.shape[0]])
al 

tensor([50, 50, 50, 50, 50, 50, 50, 50, 50, 50], dtype=torch.int32)

In [50]:
# Label lengths for the CTC loss: every sample is assigned target.shape[1].
# NOTE(review): this is presumably the padded target width rather than each
# word's true length — confirm against IAMWords.make_batch.
ll = torch.IntTensor(batch_size*[target.shape[1]])
ll

tensor([30, 30, 30, 30, 30, 30, 30, 30, 30, 30], dtype=torch.int32)

In [54]:
# CTC loss of the current batch: network output, flattened targets, activation lengths, label lengths.
lss = loss(output, target.flatten(), al, ll)
lss

tensor([2036.5027], grad_fn=<_CTCBackward>)

In [53]:
# Display the raw network output.
# NOTE(review): this prints the whole tensor; output.shape or a small slice would keep the notebook compact.
output

tensor([[[-9.2984e-02,  2.6708e-02,  1.6410e-01,  ...,  1.6956e-02,
           5.7450e-02,  5.7150e-02],
         [ 2.1227e-02,  1.3474e-01,  2.2912e-01,  ...,  8.8107e-02,
           7.1193e-02, -7.4169e-02],
         [ 9.0972e-02,  8.3708e-02,  1.6955e-01,  ...,  5.8192e-02,
           1.6916e-01, -6.9750e-03],
         ...,
         [ 1.9357e-02,  6.4526e-02,  2.3440e-01,  ..., -2.2923e-02,
           5.7868e-02,  9.0086e-03],
         [-4.8007e-02,  6.1810e-02,  2.5245e-01,  ...,  3.5174e-02,
           1.3194e-01, -2.8149e-02],
         [ 1.9059e-03,  5.8252e-02,  8.1492e-02,  ..., -7.4695e-03,
           1.2975e-01, -2.8046e-02]],

        [[-2.3815e-02,  7.2186e-02,  1.4510e-01,  ..., -4.8895e-02,
           6.1914e-02,  4.6696e-02],
         [-1.1655e-01, -5.6488e-02,  1.1558e-01,  ...,  9.0759e-02,
           1.7135e-01, -7.0518e-03],
         [ 6.0257e-02,  7.9678e-02,  2.5769e-01,  ...,  7.5825e-02,
           1.0033e-01, -6.1019e-02],
         ...,
         [-3.2652e-02,  1

In [41]:
# Show the current value of the loss tensor.
lss

tensor([0.], grad_fn=<_CTCBackward>)

In [0]:
# Backpropagate the loss through the network.
lss.backward()

In [43]:
# Show the loss tensor again after the backward pass.
lss

tensor([0.], grad_fn=<_CTCBackward>)

In [0]:
import ctcdecode

# Beam-search CTC decoder over the test alphabet, keeping 100 beams.
alphabet_symbols = list(test.alphabet)
decoder = ctcdecode.CTCBeamDecoder(alphabet_symbols, beam_width=100)

In [55]:
# Iterate once over the training set and print the CTC loss of every batch.
# NOTE: the loop only evaluates the loss. loss_val.backward() and
# optimizer.step() are not called, so the network weights are not updated.
batch_idx = 0
train.to_start()
while True:
  batch = train.make_batch()

  # make_batch() returns None once the dataset is exhausted.
  if batch is None:
    break
  data, target = batch
  # Infer the batch dimension so a shorter final batch does not break the reshape.
  data = data.view(-1, 1, 128, 400)
  data = data.to(device)
  print(data.shape)
  output = net(data)

  # Build the CTC length tensors from this batch, not from tensors left over
  # by earlier cells: one activation length and one label length per sample.
  # assumes target is 2-D (samples x label positions) — TODO confirm
  n_samples = data.size(0)
  al = torch.IntTensor(n_samples * [output.size(0)])
  ll = torch.IntTensor(n_samples * [target.shape[1]])

  loss_val = loss(output, target.flatten(), al, ll)
  print(loss_val)

  batch_idx += 1
  if (batch_idx % 10 == 0):
    print(batch_idx)
  

torch.Size([10, 1, 128, 400])
tensor([2039.3319], grad_fn=<_CTCBackward>)
torch.Size([10, 1, 128, 400])
tensor([2038.7579], grad_fn=<_CTCBackward>)
torch.Size([10, 1, 128, 400])
tensor([2038.5931], grad_fn=<_CTCBackward>)
torch.Size([10, 1, 128, 400])
tensor([2037.3873], grad_fn=<_CTCBackward>)
torch.Size([10, 1, 128, 400])
tensor([2038.4689], grad_fn=<_CTCBackward>)
torch.Size([10, 1, 128, 400])
tensor([2036.3082], grad_fn=<_CTCBackward>)
torch.Size([10, 1, 128, 400])
tensor([2036.7057], grad_fn=<_CTCBackward>)
torch.Size([10, 1, 128, 400])
tensor([2038.3156], grad_fn=<_CTCBackward>)
torch.Size([10, 1, 128, 400])
tensor([2038.1860], grad_fn=<_CTCBackward>)
torch.Size([10, 1, 128, 400])
tensor([2039.3909], grad_fn=<_CTCBackward>)
10
torch.Size([10, 1, 128, 400])
tensor([2038.5283], grad_fn=<_CTCBackward>)
torch.Size([10, 1, 128, 400])
tensor([2037.3938], grad_fn=<_CTCBackward>)
torch.Size([10, 1, 128, 400])
tensor([2036.8712], grad_fn=<_CTCBackward>)
torch.Size([10, 1, 128, 400])
tenso

KeyboardInterrupt: ignored

In [48]:
# Minimal stand-alone CTC loss example: one sequence of two time steps over a five-symbol alphabet.
import torch
from warpctc_pytorch import CTCLoss
ctc_loss = CTCLoss()
# expected shape of seqLength x batchSize x alphabet_size
# Built as (1, 2, 5) and transposed to (2, 1, 5): two time steps, a batch of one.
probs = torch.FloatTensor([[[0.1, 0.6, 0.1, 0.1, 0.1], [0.1, 0.1, 0.6, 0.1, 0.1]]]).transpose(0, 1).contiguous()
print(probs.shape)
# Target sequence for the single sample, with its length and the number of time steps.
labels = torch.IntTensor([1, 2])
label_sizes = torch.IntTensor([2])
probs_sizes = torch.IntTensor([2])
probs.requires_grad_(True)  # tells autograd to compute gradients for probs
cost = ctc_loss(probs, labels, probs_sizes, label_sizes)
print(cost)
# Backpropagate the cost to probs.
cost.backward()

torch.Size([2, 1, 5])
tensor([2.4629], grad_fn=<_CTCBackward>)
