In [1]:
from model import *
import re
import torch
import numpy as np
import torch.nn as nn
from tqdm import tqdm
from torch.utils.data.sampler import SubsetRandomSampler
from matplotlib import pyplot as plt
import yaml
import pickle
# Gaoyj1019

In [2]:
with open('deu.txt') as f:
    sentences = f.readlines()
# print(sentences)

In [3]:
NUM_INSTANCES = 10000
MAX_SENT_LEN = 10
eng_sentences, deu_sentences = [], []
eng_words, deu_words = set(), set()
for i in tqdm(range(NUM_INSTANCES)):
  rand_idx = np.random.randint(len(sentences))
  # find only letters in sentences
  eng_sent, deu_sent = ["<sos>"], ["<sos>"]

  eng_sent += re.findall(r"\w+", sentences[rand_idx].split("\t")[1]) 
  deu_sent += re.findall(r"\w+", sentences[rand_idx].split("\t")[0])

  # change to lowercase
  eng_sent = [x.lower() for x in eng_sent]
  deu_sent = [x.lower() for x in deu_sent]
  eng_sent.append("<eos>")
  deu_sent.append("<eos>")

  if len(eng_sent) >= MAX_SENT_LEN:
    eng_sent = eng_sent[:MAX_SENT_LEN]
  else:
    for _ in range(MAX_SENT_LEN - len(eng_sent)):
      eng_sent.append("<pad>")

  if len(deu_sent) >= MAX_SENT_LEN:
    deu_sent = deu_sent[:MAX_SENT_LEN]
  else:
    for _ in range(MAX_SENT_LEN - len(deu_sent)):
      deu_sent.append("<pad>")

  # add parsed sentences
  eng_sentences.append(eng_sent)
  deu_sentences.append(deu_sent)

  # update unique words
  eng_words.update(eng_sent)
  deu_words.update(deu_sent)

eng_words, deu_words = list(eng_words), list(deu_words)

# encode each token into index
for i in tqdm(range(len(eng_sentences))):
  eng_sentences[i] = [eng_words.index(x) for x in eng_sentences[i]]
  deu_sentences[i] = [deu_words.index(x) for x in deu_sentences[i]]

idx = 10
print(eng_sentences[idx])
print([eng_words[x] for x in eng_sentences[idx]])
print(deu_sentences[idx])
print([deu_words[x] for x in deu_sentences[idx]])

with open("input_vocab", "wb") as fp:   #Pickling
    pickle.dump(eng_sentences, fp)
with open("target_vocab", "wb") as fp:   #Pickling
    pickle.dump(deu_sentences, fp)
with open("input_words", "wb") as fp:   #Pickling
    pickle.dump(eng_words, fp)
with open("target_words", "wb") as fp:   #Pickling
    pickle.dump(deu_words, fp)

100%|██████████| 10000/10000 [00:00<00:00, 54135.28it/s]
100%|██████████| 10000/10000 [00:00<00:00, 36042.46it/s]

[353, 306, 278, 119, 273, 310, 196, 196, 196, 196]
['<sos>', '824', '0', '767', '862', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>']
[9, 15, 15, 2, 18, 18, 18, 18, 18, 18]
['<sos>', 'c', 'c', '<eos>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>', '<pad>']





In [4]:
ENG_VOCAB_SIZE = len(eng_words)
DEU_VOCAB_SIZE = len(deu_words)
NUM_EPOCHS = 20
HIDDEN_SIZE = 16
EMBEDDING_DIM = 30
BATCH_SIZE = 128
NUM_HEADS = 2
NUM_LAYERS = 3
LEARNING_RATE = 1e-3
DROPOUT = .3
DEVICE = torch.device('cpu') 

In [5]:
np.random.seed(777)   # for reproducibility
dataset = MTDataset(eng_sentences, deu_sentences)
NUM_INSTANCES = len(dataset)
TEST_RATIO = 0.3
TEST_SIZE = int(NUM_INSTANCES * 0.3)

indices = list(range(NUM_INSTANCES))

test_idx = np.random.choice(indices, size = TEST_SIZE, replace = False)
train_idx = list(set(indices) - set(test_idx))
train_sampler, test_sampler = SubsetRandomSampler(train_idx), SubsetRandomSampler(test_idx)

train_loader = torch.utils.data.DataLoader(dataset, batch_size = BATCH_SIZE, sampler = train_sampler)
test_loader = torch.utils.data.DataLoader(dataset, batch_size = BATCH_SIZE, sampler = test_sampler)

In [6]:
model = TransformerNet(ENG_VOCAB_SIZE, DEU_VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, NUM_HEADS, NUM_LAYERS, MAX_SENT_LEN, MAX_SENT_LEN, DROPOUT).to(DEVICE)
criterion = nn.NLLLoss()
optimizer = torch.optim.Adam(model.parameters(), lr = LEARNING_RATE)


print(ENG_VOCAB_SIZE, DEU_VOCAB_SIZE, EMBEDDING_DIM, HIDDEN_SIZE, NUM_HEADS, NUM_LAYERS, MAX_SENT_LEN, MAX_SENT_LEN, DROPOUT)



475 25 30 16 2 3 10 10 0.3


In [8]:
%%time
loss_trace = []
for epoch in tqdm(range(NUM_EPOCHS)):
  current_loss = 0
  for i, (x, y) in enumerate(train_loader):
    x, y  = x.to(DEVICE), y.to(DEVICE)
    print(x,y)
    outputs = model(x, y)
    loss = criterion(outputs.permute(1, 2, 0), y)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    current_loss += loss.item()
  loss_trace.append(current_loss)
torch.save(model.state_dict(), 'mymodel.mp')

# loss curve
plt.plot(range(1, NUM_EPOCHS+1), loss_trace, 'r-')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.show()

  0%|          | 0/20 [00:00<?, ?it/s]

tensor([[353, 214, 278,  ..., 196, 196, 196],
        [353, 303, 278,  ..., 196, 196, 196],
        [353, 276, 278,  ..., 196, 196, 196],
        ...,
        [353, 390, 361,  ..., 310, 196, 196],
        [353,   0, 278,  ..., 196, 196, 196],
        [353, 289, 278,  ..., 196, 196, 196]]) tensor([[ 9,  6, 14,  ..., 18, 18, 18],
        [ 9,  6,  5,  ..., 18, 18, 18],
        [ 9, 19, 12,  ..., 18, 18, 18],
        ...,
        [ 9,  8, 15,  ..., 18, 18, 18],
        [ 9, 17,  5,  ..., 18, 18, 18],
        [ 9, 14, 19,  ..., 18, 18, 18]])


  return self._call_impl(*args, **kwargs)


tensor([[353,  18, 278,  ..., 196, 196, 196],
        [353, 214, 278,  ..., 196, 196, 196],
        [353, 289, 278,  ..., 196, 196, 196],
        ...,
        [353, 403, 278,  ..., 196, 196, 196],
        [353, 391, 442,  ..., 310, 196, 196],
        [353, 337, 278,  ..., 196, 196, 196]]) tensor([[ 9, 12, 16,  ..., 18, 18, 18],
        [ 9,  3, 23,  ..., 18, 18, 18],
        [ 9, 23, 19,  ..., 18, 18, 18],
        ...,
        [ 9,  0, 13,  ..., 18, 18, 18],
        [ 9, 11, 15,  ..., 18, 18, 18],
        [ 9, 21, 16,  ..., 18, 18, 18]])
tensor([[353, 399, 278,  ..., 196, 196, 196],
        [353,  99, 436,  ..., 196, 196, 196],
        [353,  59, 278,  ..., 196, 196, 196],
        ...,
        [353, 469, 278,  ..., 196, 196, 196],
        [353,  18, 278,  ..., 196, 196, 196],
        [353, 274, 278,  ..., 196, 196, 196]]) tensor([[ 9,  7, 17,  ..., 18, 18, 18],
        [ 9, 14, 22,  ..., 18, 18, 18],
        [ 9,  0,  0,  ..., 18, 18, 18],
        ...,
        [ 9, 16, 23,  ..., 18, 18

  0%|          | 0/20 [00:15<?, ?it/s]


KeyboardInterrupt: Interrupted by user