<a href="https://colab.research.google.com/github/Hramchenko/Handwritting/blob/master/HTR_tf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
print("Device " + torch.cuda.get_device_name(0))
#device = torch.device("cuda:0")
device = torch.device("cpu")
print(device)

Device Tesla K80
cpu


In [0]:
batch_size = 30

In [3]:
import sys
sys.path.append("./Handwritting/")
from IAMWords import IAMWords
train_set = IAMWords("train", "./IAM/", batch_size=batch_size)
test_set = IAMWords("test", "./IAM/", batch_size=batch_size)

Reading ./IAM/words.train.pkl...
Reading finished
Reading ./IAM/words.test.pkl...
Reading finished


In [0]:
def modify_dataset(dataset):
  l = len(dataset.codes)
  s = "<START>"
  dataset.codes[s] = l
  dataset.inv_codes[l] = s
  return dataset

train_set = modify_dataset(train_set)
test_set = modify_dataset(test_set)


In [0]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [0]:
class ConvLayer(nn.Module):
    def __init__(self, size, padding=1, pool_layer=nn.MaxPool2d(2, stride=2),
                 bn=False, dropout=False, activation_fn=nn.ReLU(), stride=1):
        super(ConvLayer, self).__init__()
        layers = []
        layers.append(nn.Conv2d(size[0], size[1], size[2], padding=padding, stride=stride))
        if pool_layer is not None:
            layers.append(pool_layer)
        if bn:
            layers.append(nn.BatchNorm2d(size[1]))
        if dropout:
            layers.append(nn.Dropout2d())
        layers.append(activation_fn)
        
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x)

In [0]:
class DeconvLayer(nn.Module):
    def __init__(self, size, padding=1, stride=1, 
                 bn=False, dropout=False, activation_fn=nn.ReLU(), output_padding=0):
        super(DeconvLayer, self).__init__()
        layers = []
        layers.append(nn.ConvTranspose2d(size[0], size[1], size[2], padding=padding, 
                                         stride=stride, output_padding=output_padding))
        if bn:
            layers.append(nn.BatchNorm2d(size[1]))
        if dropout:
            layers.append(nn.Dropout2d())
        layers.append(activation_fn)
        
        self.model = nn.Sequential(*layers)
    
    def forward(self, x):
        return self.model(x)

In [0]:
class FullyConnected(nn.Module):
    def __init__(self, sizes, dropout=False, activation_fn=nn.Tanh):
        super(FullyConnected, self).__init__()
        layers = []
        
        for i in range(len(sizes) - 2):
            layers.append(nn.Linear(sizes[i], sizes[i+1]))
            if dropout:
                layers.append(nn.Dropout())
            layers.append(activation_fn())
        else: # нам не нужен дропаут и фнкция активации в последнем слое
            layers.append(nn.Linear(sizes[-2], sizes[-1]))
        
        self.model = nn.Sequential(*layers)
        
    def forward(self, x):
        return self.model(x)

In [0]:
batch = train_set.make_batch()
data, target = batch
target = target.to(device)
data = data/255.0
data = data.view(batch_size, 1, 128, 400).to(device)

In [0]:
class HTREncoder(nn.Module):
    def __init__(self, batchnorm=True, dropout=False):
        super(HTREncoder, self).__init__()
        
        self.convolutions = nn.Sequential(
        ConvLayer([1, 16, 3], padding=0, bn=batchnorm),
        ConvLayer([16, 32, 3], padding=0, bn=batchnorm),
        ConvLayer([32, 50, 3], padding=0, bn=batchnorm),
        ConvLayer([50, 64, 3], padding=0, stride=2, bn=batchnorm, pool_layer=None))
    
    def forward(self, x):
        h = self.convolutions(x)
        #print(h.shape)
        h = F.max_pool2d(h, [h.size(2), 1], padding=[0, 0])
        #print("MP " + str(h.shape))
        h = h.permute([2, 3, 0, 1])[0]
        #h = h.permute([0, 3, 1, 2])
        #print("P " + str(h.shape))
        #h = h.flatten(start_dim=2)
        #print("P[0] " + str(h.shape))
        return h
    

In [0]:
encoder = HTREncoder().to(device)

In [0]:
class HTRDecoder(nn.Module):
    def __init__(self, ntoken, encoded_width=23, encoded_height=64, batchnorm=True, dropout=False):
        super(HTRDecoder, self).__init__()
        self.ntoken = ntoken
        self.encoded_height = encoded_height
        self.lstm_size = 128
        lstm_layers = 2
        self.rnn = nn.LSTM(self.encoded_height*encoded_width + ntoken, self.lstm_size, lstm_layers, dropout=0.3, bidirectional=True)
        self.embedding = nn.Embedding(ntoken, ntoken)
        self.decoder = nn.Linear(1*self.lstm_size*2, ntoken)#*batch_size)
        self.drop = nn.Dropout(0.3)
        self.concatenated = torch.FloatTensor(24, )
    
    def forward(self, x, prev, hidden=None):
        x = self.drop(x)
        emb = self.embedding(prev)
        emb = emb.permute([1, 0, 2])
        #print(x.shape)
        #print(emb.shape)
        x = torch.cat([x, emb], dim=2)
        #print(x.shape)
        x, hidden = self.rnn(x, hidden)
        #print("rnn")
        #print(x.shape)
        #print("rnn " + str(x.shape))
        x = x.permute(1, 0, 2)
        x = x.flatten(start_dim=1)
        #print("x" + str(x.shape))
        x = self.drop(x)
        x = self.decoder(x)
        return x, hidden  
      
    def makeHidden(self):
      h1 = torch.zeros(4, batch_size, self.lstm_size).to(device)
      h2 = torch.zeros(4, batch_size, self.lstm_size).to(device)
      return (h1, h2)

In [0]:
decoder = HTRDecoder(len(train_set.codes)).to(device)

In [0]:
START = train_set.codes['<START>']
current_symbol = torch.LongTensor(batch_size, 1).to(device)
current_symbol[:, :] = START

In [38]:
len(train_set.codes)

81

In [0]:
#decoder.embedding(current_symbol).shape

In [47]:
enc = encoder(data)
print("enc")
print(enc.shape)
#s = enc.contiguous().view(1, batch_size, -1)#[1, :, :].view(1, batch_size, -1)
s = enc.permute(1, 0, 2)
s = s.flatten(start_dim=1).view(1, 30, 1472)
print("xx")
print(s.shape)
print(current_symbol.shape)
hidden = decoder.makeHidden()
dec, h = decoder(s, current_symbol)
#print(dec.shape)


enc
torch.Size([23, 30, 64])
xx
torch.Size([1, 30, 1472])
torch.Size([30, 1])


In [41]:
print(dec.shape)
ss = torch.multinomial(dec.exp(), 1)

torch.Size([30, 81])


In [42]:
ss

tensor([[14],
        [65],
        [80],
        [58],
        [79],
        [67],
        [23],
        [59],
        [30],
        [ 7],
        [74],
        [64],
        [73],
        [ 2],
        [39],
        [36],
        [ 6],
        [69],
        [54],
        [79],
        [66],
        [13],
        [28],
        [ 3],
        [50],
        [38],
        [13],
        [ 9],
        [ 6],
        [ 5]])

In [43]:
hidden.shape


AttributeError: ignored

In [44]:
h[0].shape

torch.Size([4, 30, 128])

In [51]:
encoder_optimizer = optim.Adam(encoder.parameters(), lr=1e-4, weight_decay=0.00005)
decoder_optimizer = optim.Adam(decoder.parameters(), lr=1e-4, weight_decay=0.00005)

criterion = nn.CrossEntropyLoss()


def train(epoch):
  print("Training epoch " + str(epoch) + "...")
  train_set.to_start()
  batch_idx = 0
  c_loss = 0
  START = train_set.codes['<START>']
  current_symbol = torch.LongTensor(batch_size, 30+1).to(device)
  while True:
    batch = train_set.make_batch()
    if batch is None:
      break
    encoder.zero_grad()
    decoder.zero_grad()
    
    data, target = batch
    data = data.view(batch_size, 1, 128, 400)/255.0
    data = data.to(device)
    target = target.to(device)
    hidden = decoder.makeHidden()    

    loss = 0
    enc = encoder(data)
    #s = enc.contiguous().view(1, batch_size, -1)
    
    s = enc.permute(1, 0, 2)
    s = s.flatten(start_dim=1).view(1, 30, 1472)
    
    #print("---------")
    #print(s.shape)
    current_symbol[:, 0] = START
    #print(current_symbol[:, 0].shape)
    for i in range(0, target.shape[1]):
      #enc_i = enc[i, :, :].view(1, batch_size, -1)
      symb = current_symbol[:, i].view(batch_size, 1).contiguous()
      dec, hidden = decoder(s, symb, hidden)
      #print("dec " + str(dec.shape))
      sampled = torch.multinomial(dec.exp(), 1)
      current_symbol[:, i+1] = sampled.squeeze()
      o = dec.view(30, 1, 81).flatten(start_dim=0,end_dim=1)
#      o = dec.view(30, -1, 80).flatten(start_dim=0,end_dim=1)
      t = target[:, i].flatten()
      
      
      #print(o.shape)
      #print(t.shape)
      loss += criterion(o, t)
    
    #output = net.forward(data)
    #print(output.shape)
    #print(target.shape)
    #o = output.view(30, -1, 80).flatten(start_dim=0,end_dim=1)
    #t = target.flatten()

    #loss = criterion(o, t)
    c_loss += loss.item()
    freq = 1
    if (batch_idx % freq == 0) and (batch_idx != 0):
      c_loss /= freq 
      print("  " + str(batch_idx) + " " + str(c_loss))
      c_loss = 0
    loss.backward()
    grad_clip = 0.1
    torch.nn.utils.clip_grad_norm_(decoder.parameters(), grad_clip)
    encoder_optimizer.step()
    decoder_optimizer.step()
    batch_idx += 1

for i in range(0, 100):
  train(i)


Training epoch 0...
  1 239.6279754638672
  2 117.57829284667969
  3 116.05209350585938
  4 114.99407958984375
  5 112.21305084228516
  6 110.7999267578125
  7 109.36397552490234
  8 108.42613983154297
  9 107.76636505126953
  10 103.55252838134766
  11 102.61219787597656
  12 99.52122497558594
  13 98.07105255126953
  14 95.46385955810547
  15 92.95638275146484
  16 90.49456024169922
  17 90.5299072265625
  18 86.03430938720703
  19 83.0023193359375
  20 81.03985595703125
  21 77.54261779785156
  22 75.4160385131836
  23 74.32049560546875
  24 70.83709716796875
  25 66.58506774902344
  26 62.502960205078125
  27 62.600425720214844
  28 61.00706100463867
  29 56.83729934692383
  30 52.731632232666016
  31 50.34827423095703
  32 48.70368194580078
  33 46.856773376464844
  34 44.66219711303711
  35 43.779457092285156
  36 40.0951042175293
  37 38.531044006347656
  38 38.55298614501953
  39 35.166114807128906
  40 36.9328498840332
  41 31.358367919921875
  42 31.54657745361328
  43 27.664

KeyboardInterrupt: ignored

In [0]:
class Net(nn.Module):
    def __init__(self, ntoken, latent_size=64, batchnorm=True, dropout=False):
        super(Net, self).__init__()
    
        self.latent_size = latent_size
        
        self.convolutions = nn.Sequential(
        ConvLayer([1, 16, 3], padding=0, bn=batchnorm),
        ConvLayer([16, 32, 3], padding=0, bn=batchnorm),
        ConvLayer([32, 50, 3], padding=0, bn=batchnorm),
        ConvLayer([50, 64, 3], padding=0, stride=2, bn=batchnorm, pool_layer=None))
        
        

        lstm_size = 128
        lstm_layers = 2
        # (batch_size, timesteps, input_dim)
        self.rnn = nn.LSTM(latent_size, lstm_size, lstm_layers, dropout=0.3, bidirectional=True)
        self.decoder = nn.Linear(23*lstm_size*2, ntoken*30)#nn.Linear(lstm_size, ntoken)
#        self.fcd = FullyConnected([latent_size, 32*2*2])
        self.drop = nn.Dropout(0.3)
        
#         nlayers = 2
        
#         self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=0.3)
#         self.decoder = nn.Linear(nhid, ntoken)

#         self.init_weights()

#         self.nhid = nhid
#         self.nlayers = nlayers

        
#     def forward(self, x, hidden=None):
#         emb = self.drop(self.encoder(x))
#         output, hidden = self.rnn(emb, hidden)
#         output = self.drop(output)
#         decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
#         return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return (weight.new(self.nlayers, bsz, self.nhid).zero_(),
                weight.new(self.nlayers, bsz, self.nhid).zero_())

        
        
        
    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)
                

    
    def encode(self, x):
        h = self.convolutions(x)
        #print(h.shape)
        h = F.max_pool2d(h, [h.size(2), 1], padding=[0, 0])
        #print("MP " + str(h.shape))
        h = h.permute([2, 3, 0, 1])[0]
        #h = h.permute([0, 3, 1, 2])
        #print("P " + str(h.shape))
        #h = h.flatten(start_dim=2)
        #print("P[0] " + str(h.shape))
        
        
        return h
      
    def decode(self, h):
        x = self.drop(h)
        x, hidden = self.rnn(x)
        #print("rnn " + str(x.shape))
        x = x.permute(1, 0, 2)
        x = x.flatten(start_dim=1)
        #print("x" + str(x.shape))
        x = self.drop(x)
        x = self.decoder(x)
        return x
    
#     def decode(self, h):
#         flatten = self.fcd(h)
        
#         output, hidden = self.rnn(flatten, hidden)
#         output = self.drop(output)
#         decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
#         x = decoded.view(output.size(0), output.size(1), decoded.size(1))
#         return x, hidden
    
    def forward(self, x):
        h = self.encode(x)
        #print(h.shape)
        r = self.decode(h)
        return r
    
    

In [0]:
net = Net(len(train_set.alphabet)).to(device)

In [0]:
output=net.forward(data)

RuntimeError: ignored

In [0]:
output.shape

In [0]:
criterion = nn.CrossEntropyLoss()

In [0]:
o = output.view(30, -1, 80).flatten(start_dim=0,end_dim=1)
o.shape

In [0]:
target.shape

In [0]:
t = target.flatten()
t.shape

In [0]:
2400/900


In [0]:
import numpy as np
def to_onehot(x, n, device=None):
    if isinstance(x, np.ndarray):
        x = torch.Tensor(x).to(torch.long)
    one_hot = torch.zeros((x.shape[0], n))
    one_hot.scatter_(1, x[:, None], 1.)
    if device is not None:
        one_hot = one_hot.to(device)
    return one_hot  

In [0]:
def test(epoch):
  with torch.no_grad():
    test_set.to_start()
    batch_idx = 0
    c_loss = 0
    while True:
      batch = test_set.make_batch()
      if batch is None:
        break
      data, target = batch
      data = data.view(batch_size, 1, 128, 400)/255.0
      data = data.to(device)
      target = target.to(device)
      output = net.forward(data)
      o = output.view(30, -1, 80).flatten(start_dim=0,end_dim=1)
      t = target.flatten()
      loss = criterion(o, t)
      c_loss += loss.item()
      batch_idx += 1
    print("  Test loss: " + str(c_loss/batch_idx))   

In [0]:
test(1)

In [0]:
optimizer = optim.Adam(net.parameters(), lr=1e-4, weight_decay=0.00005)
criterion = nn.CrossEntropyLoss()

teacher_forcing_ratio = 0.5

def forcing_train(epoch):
  print("Training epoch " + str(epoch) + "...")
  train_set.to_start()
  batch_idx = 0
  c_loss = 0
  while True:
    batch = train_set.make_batch()
    if batch is None:
      break
    data, target = batch
    data = data.view(batch_size, 1, 128, 400)/255.0
    data = data.to(device)
    target = target.to(device)
    net.zero_grad()
    
    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
    
    output = net.forward(data)
    #print(output.shape)
    #print(target.shape)
    o = output.view(30, -1, 80).flatten(start_dim=0,end_dim=1)
    t = target.flatten()

    loss = criterion(o, t)
    c_loss += loss.item()
    freq = 200
    if (batch_idx % freq == 0) and (batch_idx != 0):
      c_loss /= freq 
      print("  " + str(batch_idx) + " " + str(c_loss))
      c_loss = 0
    loss.backward()
    grad_clip = 0.1
    torch.nn.utils.clip_grad_norm_(net.parameters(), grad_clip)
    optimizer.step()
    batch_idx += 1

for i in range(0, 100):
  train(i)
  test(i)

In [0]:
optimizer = optim.Adam(net.parameters(), lr=1e-4, weight_decay=0.00005)
criterion = nn.CrossEntropyLoss()

def train(epoch):
  print("Training epoch " + str(epoch) + "...")
  train_set.to_start()
  batch_idx = 0
  c_loss = 0
  while True:
    batch = train_set.make_batch()
    if batch is None:
      break
    data, target = batch
    data = data.view(batch_size, 1, 128, 400)/255.0
    data = data.to(device)
    target = target.to(device)
    net.zero_grad()
    output = net.forward(data)
    #print(output.shape)
    #print(target.shape)
    o = output.view(30, -1, 80).flatten(start_dim=0,end_dim=1)
    t = target.flatten()

    loss = criterion(o, t)
    c_loss += loss.item()
    freq = 200
    if (batch_idx % freq == 0) and (batch_idx != 0):
      c_loss /= freq 
      print("  " + str(batch_idx) + " " + str(c_loss))
      c_loss = 0
    loss.backward()
    grad_clip = 0.1
    torch.nn.utils.clip_grad_norm_(net.parameters(), grad_clip)
    optimizer.step()
    batch_idx += 1

for i in range(0, 100):
  train(i)
  test(i)

Training epoch 0...
  200 0.40906761199235914
  400 0.39776449725031854
  600 0.3906879594922066
  800 0.3917499200254679
  1000 0.38672335997223856
  1200 0.38541785448789595
  1400 0.3825364875793457
  1600 0.37158095367252825
  1800 0.3702904235571623
  2000 0.36803763136267664
  2200 0.36549833960831163
  2400 0.36047813162207604
  Test loss: 0.36128279687459597
Training epoch 1...
  200 0.35956719525158404
  400 0.3510160192847252
  600 0.34734406001865864
  800 0.35050836570560934
  1000 0.34704456470906736
  1200 0.34642483055591583
  1400 0.34863717637956143
  1600 0.3386073859035969
  1800 0.3388120226562023
  2000 0.33854374147951605
  2200 0.33705330215394497
  2400 0.3338170325756073
  Test loss: 0.3369936909631034
Training epoch 2...
  200 0.3353282096982002
  400 0.3285587485134602
  600 0.32515278153121474
  800 0.3304219239205122
  1000 0.3270692164450884
  1200 0.3259670868515968
  1400 0.3295812138915062
  1600 0.3206078252196312
  1800 0.32301347233355043
  2000 0.32

In [0]:
class HTRModel(nn.Module):

    def __init__(self, ntoken, ninp, nhid, nlayers, dropout=0.5):
        super(HTRModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        
        self.rnn = nn.LSTM(ninp, nhid, nlayers, dropout=dropout)
        self.decoder = nn.Linear(nhid, ntoken)

        self.init_weights()

        self.nhid = nhid
        self.nlayers = nlayers

    def init_weights(self):
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, x, hidden=None):
        emb = self.drop(self.encoder(x))
        output, hidden = self.rnn(emb, hidden)
        output = self.drop(output)
        decoded = self.decoder(output.view(output.size(0)*output.size(1), output.size(2)))
        return decoded.view(output.size(0), output.size(1), decoded.size(1)), hidden

    def init_hidden(self, bsz):
        weight = next(self.parameters()).data
        return (weight.new(self.nlayers, bsz, self.nhid).zero_(),
                weight.new(self.nlayers, bsz, self.nhid).zero_())


In [0]:
ntokens = len(corpus.dictionary)
model = RNNModel('LSTM', ntokens, 128, 128, 2, 0.3).to(device)
criterion = nn.CrossEntropyLoss()

In [0]:
def train():
    model.train()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    for batch, (data, targets) in enumerate(train_loader):
        data = data.to(device)
        targets = targets.to(device)
        model.zero_grad()
        output, hidden = model(data)
        loss = criterion(output.view(-1, ntokens), targets)
        loss.backward()

        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)
        for p in model.parameters():
            p.data.add_(-lr, p.grad.data)

        total_loss += loss.item()

        if batch % log_interval == 0 and batch > 0:
            cur_loss = total_loss / log_interval
            print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:02.2f} | loss {:5.2f} | ppl {:8.2f}'.format(
                epoch, batch, len(train_loader) // sequence_length, lr, cur_loss, math.exp(cur_loss)))
            total_loss = 0

In [0]:
def evaluate(data_loader):
    model.eval()
    total_loss = 0
    ntokens = len(corpus.dictionary)
    hidden = model.init_hidden(eval_batch_size)
    for i, (data, targets) in enumerate(data_loader):
        data = data.to(device)
        targets = targets.to(device)
        output, hidden = model(data)
        output_flat = output.view(-1, ntokens)
        total_loss += len(data) * criterion(output_flat, targets).item()
    return total_loss / len(data_loader)

In [0]:
# https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html

teacher_forcing_ratio = 0.5


def train(input_tensor, target_tensor, encoder, decoder, encoder_optimizer, decoder_optimizer, criterion, max_length=MAX_LENGTH):
    encoder_hidden = encoder.initHidden()

    encoder_optimizer.zero_grad()
    decoder_optimizer.zero_grad()

    input_length = input_tensor.size(0)
    target_length = target_tensor.size(0)

    encoder_outputs = torch.zeros(max_length, encoder.hidden_size, device=device)

    loss = 0

    for ei in range(input_length):
        encoder_output, encoder_hidden = encoder(
            input_tensor[ei], encoder_hidden)
        encoder_outputs[ei] = encoder_output[0, 0]

    decoder_input = torch.tensor([[SOS_token]], device=device)

    decoder_hidden = encoder_hidden

    use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False

    if use_teacher_forcing:
        # Teacher forcing: Feed the target as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            loss += criterion(decoder_output, target_tensor[di])
            decoder_input = target_tensor[di]  # Teacher forcing

    else:
        # Without teacher forcing: use its own predictions as the next input
        for di in range(target_length):
            decoder_output, decoder_hidden, decoder_attention = decoder(
                decoder_input, decoder_hidden, encoder_outputs)
            topv, topi = decoder_output.topk(1)
            decoder_input = topi.squeeze().detach()  # detach from history as input

            loss += criterion(decoder_output, target_tensor[di])
            if decoder_input.item() == EOS_token:
                break

    loss.backward()

    encoder_optimizer.step()
    decoder_optimizer.step()

    return loss.item() / target_length

  
  
  
  def trainIters(encoder, decoder, n_iters, print_every=1000, plot_every=100, learning_rate=0.01):
    start = time.time()
    plot_losses = []
    print_loss_total = 0  # Reset every print_every
    plot_loss_total = 0  # Reset every plot_every

    encoder_optimizer = optim.SGD(encoder.parameters(), lr=learning_rate)
    decoder_optimizer = optim.SGD(decoder.parameters(), lr=learning_rate)
    training_pairs = [tensorsFromPair(random.choice(pairs))
                      for i in range(n_iters)]
    criterion = nn.NLLLoss()

    for iter in range(1, n_iters + 1):
        training_pair = training_pairs[iter - 1]
        input_tensor = training_pair[0]
        target_tensor = training_pair[1]

        loss = train(input_tensor, target_tensor, encoder,
                     decoder, encoder_optimizer, decoder_optimizer, criterion)
        print_loss_total += loss
        plot_loss_total += loss

        if iter % print_every == 0:
            print_loss_avg = print_loss_total / print_every
            print_loss_total = 0
            print('%s (%d %d%%) %.4f' % (timeSince(start, iter / n_iters),
                                         iter, iter / n_iters * 100, print_loss_avg))

        if iter % plot_every == 0:
            plot_loss_avg = plot_loss_total / plot_every
            plot_losses.append(plot_loss_avg)
            plot_loss_total = 0

    showPlot(plot_losses)