# Preparing Data¶

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim

from torchtext.datasets import TranslationDataset, Multi30k
from torchtext.data import Field, BucketIterator

import spacy

import random
import math
import time
import pickle
import os

In [25]:
spacy_en = spacy.load('en')

In [68]:
# define the path variables
BASE_DIR = os.path.abspath('')
DATA_DIR = os.path.join(BASE_DIR, 'data')
REQUEST_DATA_PATH = os.path.join(DATA_DIR, 'clean_requests.pickle')
TRAIN_EN_PATH = os.path.join(DATA_DIR, 'multi30k/train.en')
VAL_EN_PATH = os.path.join(DATA_DIR, 'multi30k/val.en')
TEST_EN_PATH = os.path.join(DATA_DIR, 'multi30k/test2016.en')
# TRAIN_SRC_PATH = os.path.join(DATA_DIR, 'multi30k/train.src')
# VAL_SRC_PATH = os.path.join(DATA_DIR, 'multi30k/val.src')
# TEST_SRC_PATH = os.path.join(DATA_DIR, 'multi30k/test2016.src')
TRAIN_TOKEN_PATH = os.path.join(DATA_DIR, 'tokens.pickle')

In [69]:
# set the random seeds for deterministic results
SEED = 1234

random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [70]:
"""
Tokenizes English text from a string into a list of strings (tokens)
"""
def tokenize_en(text):
    return [tok.text for tok in spacy_en.tokenizer(text)]

In [160]:
# define the source text and target text of the seq2seq network
# use 'tokenize_en' defined above as the tokenizer
# <sos>: start of the sentence
# <eos>: end of the sentence
# lower case char only
SRC = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

TRG = Field(tokenize = tokenize_en, 
            init_token = '<sos>', 
            eos_token = '<eos>', 
            lower = True)

In [174]:
# load the text data (an array of strings)
requests = pickle.load(open(REQUEST_DATA_PATH, "rb"))
requests = requests[:500]

# devide the text data into training, validating, and testing set
#random.shuffle(requests)
request_number = len(requests)
test = requests[:int(request_number*0.03)]
validation = requests[int(request_number*0.03):int(request_number*0.06)]
train = requests[int(request_number*0.06):]

# save the data for Multi30k
with open(TEST_EN_PATH, 'w') as target:
    for t in test:
        line = t + '\n'
        target.write(line)
target.close()
with open(VAL_EN_PATH, 'w') as target:
    for v in validation:
        line = v + '\n'
        target.write(line)
target.close()
with open(TRAIN_EN_PATH, 'w') as target:
    for t in train:
        line = t + '\n'
        target.write(line)
target.close()

In [175]:
# tokenize the text
# exts = ('.en', '.en'): source and target are both in English
# fields = (SRC, TRG): source and target are actually the same
train_data, valid_data, test_data = Multi30k.splits(exts = ('.en', '.en'), 
                                                    fields = (SRC, TRG), root=DATA_DIR)

print(f"Number of training examples: {len(train_data.examples)}")
print(f"Number of validation examples: {len(valid_data.examples)}")
print(f"Number of testing examples: {len(test_data.examples)}")

# with open(TRAIN_TOKEN_PATH, 'wb') as f:
#     pickle.dump(train_data, f)
# f.close()
# vars(test_data[0])

Number of training examples: 470
Number of validation examples: 15
Number of testing examples: 15


In [176]:
# build the vocabulary from the training set
# each word is represented by a unique number(maybe it's the order they appear in the corpus?)
# min_freq = 2: a valid word must appear at least twice
SRC.build_vocab(train_data, min_freq = 2)
TRG.build_vocab(train_data, min_freq = 2)
print(f"Unique tokens in request vocabulary: {len(TRG.vocab)}")
vars(TRG.vocab)

Unique tokens in request vocabulary: 1721


{'freqs': Counter({'raise': 3,
          'the': 633,
          'approval': 11,
          'limit': 3,
          'of': 179,
          'wehrmeier': 2,
          'christoph': 2,
          'in': 242,
          'coupa': 3,
          'please': 262,
          'to': 871,
          'njm': 579,
          'usd': 1,
          'reason': 6,
          'for': 361,
          'that': 74,
          'is': 208,
          'recent': 4,
          'invoices': 9,
          'have': 140,
          'been': 36,
          'going': 6,
          'his': 16,
          'manager': 19,
          'wendy': 2,
          'allardes': 1,
          'director': 2,
          'since': 12,
          'he': 17,
          'only': 21,
          'has': 67,
          'ajn': 281,
          'approvel': 1,
          'past': 5,
          'this': 151,
          'was': 38,
          'working': 24,
          'without': 5,
          'having': 5,
          'include': 2,
          'new': 106,
          'badge': 2,
          'request': 78,
          '

In [177]:
# cuda is not available on Mac GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("I'm using", device)

# split the data into mini batches
BATCH_SIZE = 128
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data), 
    batch_size = BATCH_SIZE, 
    device = device)

I'm using cpu


In [141]:
# build the encoder that generates the context vector of the sentences
class Encoder(nn.Module):
    def __init__(self, input_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()
        
        self.input_dim = input_dim
        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(input_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):
        
        #src = [src sent len, batch size]
        
        embedded = self.dropout(self.embedding(src))
        
        #embedded = [src sent len, batch size, emb dim]
        
        outputs, (hidden, cell) = self.rnn(embedded)
        
        #outputs = [src sent len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #outputs are always from the top hidden layer
        return hidden, cell

In [142]:
# buid the decoder that deciphers the context vectors to source sentences
class Decoder(nn.Module):
    def __init__(self, output_dim, emb_dim, hid_dim, n_layers, dropout):
        super().__init__()

        self.emb_dim = emb_dim
        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.n_layers = n_layers
        self.dropout = dropout
        
        self.embedding = nn.Embedding(output_dim, emb_dim)
        
        self.rnn = nn.LSTM(emb_dim, hid_dim, n_layers, dropout = dropout)
        
        self.out = nn.Linear(hid_dim, output_dim)
        
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, input, hidden, cell):
        
        #input = [batch size]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #n directions in the decoder will both always be 1, therefore:
        #hidden = [n layers, batch size, hid dim]
        #context = [n layers, batch size, hid dim]
        
        input = input.unsqueeze(0)
        
        #input = [1, batch size]
        
        embedded = self.dropout(self.embedding(input))
        
        #embedded = [1, batch size, emb dim]
                
        output, (hidden, cell) = self.rnn(embedded, (hidden, cell))
        
        #output = [sent len, batch size, hid dim * n directions]
        #hidden = [n layers * n directions, batch size, hid dim]
        #cell = [n layers * n directions, batch size, hid dim]
        
        #sent len and n directions will always be 1 in the decoder, therefore:
        #output = [1, batch size, hid dim]
        #hidden = [n layers, batch size, hid dim]
        #cell = [n layers, batch size, hid dim]
        
        prediction = self.out(output.squeeze(0))
        
        #prediction = [batch size, output dim]
        
        return prediction, hidden, cell

In [143]:
# build a sequence to sequence network that train the encoders to produce context vectors that represent the sentences well
class Seq2Seq(nn.Module):
    def __init__(self, encoder, decoder, device):
        super().__init__()
        
        self.encoder = encoder
        self.decoder = decoder
        self.device = device
        self.context_vectors = []
        
        assert encoder.hid_dim == decoder.hid_dim, \
            "Hidden dimensions of encoder and decoder must be equal!"
        assert encoder.n_layers == decoder.n_layers, \
            "Encoder and decoder must have equal number of layers!"
        
    def forward(self, src, trg, teacher_forcing_ratio = 0.5):
        
        #src = [src sent len, batch size]
        #trg = [trg sent len, batch size]
        #teacher_forcing_ratio is probability to use teacher forcing
        #e.g. if teacher_forcing_ratio is 0.75 we use ground-truth inputs 75% of the time
        
        batch_size = trg.shape[1]  # the number of sentences in the batch
        max_len = trg.shape[0]  # length of the longest sentence in the batch
        trg_vocab_size = self.decoder.output_dim
        
        #tensor to store decoder outputs
        outputs = torch.zeros(max_len, batch_size, trg_vocab_size).to(self.device)
        
        #last hidden state of the encoder is used as the initial hidden state of the decoder
        hidden, cell = self.encoder(src)
        self.context_vectors.append((src, hidden))
        
        #first input to the decoder is the <sos> tokens
        input = trg[0,:]
        
        for t in range(1, max_len):
            
            output, hidden, cell = self.decoder(input, hidden, cell)
            outputs[t] = output
            teacher_force = random.random() < teacher_forcing_ratio
            top1 = output.max(1)[1]
            input = (trg[t] if teacher_force else top1)
        
        return outputs

In [190]:
# create a model
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TRG.vocab)
ENC_EMB_DIM = 256
DEC_EMB_DIM = 256
HID_DIM = 512
N_LAYERS = 2
ENC_DROPOUT = 0.5
DEC_DROPOUT = 0.5

enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS, ENC_DROPOUT)
dec = Decoder(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, DEC_DROPOUT)

model = Seq2Seq(enc, dec, device).to(device)

# initialize the weights with normal distribution
def init_weights(m):
    for name, param in m.named_parameters():
        nn.init.uniform_(param.data, -0.08, 0.08)
        
model.apply(init_weights)

# count the number of trainable parameters in the model
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

# define the optimizer as Adam
optimizer = optim.Adam(model.parameters())

# loss function
PAD_IDX = TRG.vocab.stoi['<pad>']
criterion = nn.CrossEntropyLoss(ignore_index = PAD_IDX)

# train
def train(model, iterator, optimizer, criterion, clip):
    
    model.train()
    
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        
        # this is all the sentences in a batch
        src = batch.src
        trg = batch.trg
        
        optimizer.zero_grad()
        
        # train the model on src sentences and target sentences
        output = model(src, trg)
        
        #trg = [trg sent len, batch size]
        #output = [trg sent len, batch size, output dim]
        
        output = output[1:].view(-1, output.shape[-1])
        trg = trg[1:].view(-1)
        
        #trg = [(trg sent len - 1) * batch size]
        #output = [(trg sent len - 1) * batch size, output dim]
        
        loss = criterion(output, trg)
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

# evaluation
def evaluate(model, iterator, criterion):
    
    model.eval()
    
    epoch_loss = 0
    
    with torch.no_grad():
    
        for i, batch in enumerate(iterator):

            src = batch.src
            trg = batch.trg

            output = model(src, trg, 0) #turn off teacher forcing

            #trg = [trg sent len, batch size]
            #output = [trg sent len, batch size, output dim]

            output = output[1:].view(-1, output.shape[-1])
            trg = trg[1:].view(-1)

            #trg = [(trg sent len - 1) * batch size]
            #output = [(trg sent len - 1) * batch size, output dim]

            loss = criterion(output, trg)
            
            epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

# epoch timing
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

The model has 9,120,441 trainable parameters


In [145]:
# train
N_EPOCHS = 10
CLIP = 1

best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save (model.state_dict(), 'tut1-model.pt')
    
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 0m 4s
	Train Loss: 5.605 | Train PPL: 271.662
	 Val. Loss: 5.345 |  Val. PPL: 209.557
Epoch: 02 | Time: 0m 4s
	Train Loss: 5.436 | Train PPL: 229.483
	 Val. Loss: 4.810 |  Val. PPL: 122.720
Epoch: 03 | Time: 0m 3s
	Train Loss: 5.118 | Train PPL: 166.995
	 Val. Loss: 3.960 |  Val. PPL:  52.448
Epoch: 04 | Time: 0m 3s
	Train Loss: 4.701 | Train PPL: 110.033
	 Val. Loss: 3.706 |  Val. PPL:  40.706
Epoch: 05 | Time: 0m 3s
	Train Loss: 4.537 | Train PPL:  93.415
	 Val. Loss: 3.736 |  Val. PPL:  41.912
Epoch: 06 | Time: 0m 4s
	Train Loss: 4.456 | Train PPL:  86.185
	 Val. Loss: 3.685 |  Val. PPL:  39.827
Epoch: 07 | Time: 0m 3s
	Train Loss: 4.407 | Train PPL:  82.041
	 Val. Loss: 3.563 |  Val. PPL:  35.280
Epoch: 08 | Time: 0m 4s
	Train Loss: 4.352 | Train PPL:  77.619
	 Val. Loss: 3.505 |  Val. PPL:  33.271
Epoch: 09 | Time: 0m 4s
	Train Loss: 4.340 | Train PPL:  76.732
	 Val. Loss: 3.517 |  Val. PPL:  33.698
Epoch: 10 | Time: 0m 7s
	Train Loss: 4.332 | Train PPL:  76.123


In [189]:
for i, batch in enumerate(train_iterator):
    if i == 0:
        print(batch.src.transpose(0,1))

tensor([[  2,   0,  16,  ...,   1,   1,   1],
        [  2,   6,   0,  ...,   1,   1,   1],
        [  2,  27, 147,  ...,   1,   1,   1],
        ...,
        [  2,   0,   4,  ...,   1,   1,   1],
        [  2,  43,  27,  ...,   1,   1,   1],
        [  2, 611,   4,  ...,   1,   1,   1]])


In [172]:
vars(train_iterator)

{'batch_size': 128,
 'train': True,
 'dataset': <torchtext.datasets.translation.Multi30k at 0x7faefea0fef0>,
 'batch_size_fn': None,
 'iterations': 1,
 'repeat': False,
 'shuffle': True,
 'sort': False,
 'sort_within_batch': False,
 'sort_key': <function torchtext.datasets.translation.TranslationDataset.sort_key(ex)>,
 'device': device(type='cpu'),
 'random_shuffler': <torchtext.data.utils.RandomShuffler at 0x7faf2fe5b898>,
 '_iterations_this_epoch': 1,
 '_random_state_this_epoch': (3,
  (3297715018,
   58426169,
   253714516,
   2099196304,
   3733172384,
   4212010110,
   2783653618,
   1768284441,
   4209158285,
   744410686,
   2103605509,
   2842406700,
   2289981335,
   212922146,
   2778221656,
   1053740017,
   1424792856,
   4224842428,
   3731841819,
   2988451423,
   627018369,
   1607112236,
   3657240987,
   2038364475,
   1298114200,
   2120695671,
   284817816,
   3883902282,
   2896043057,
   1831934767,
   39630914,
   1666895466,
   548679849,
   117588422,
   7673995

In [171]:

max(len(sent.split()) for sent in requests[6:100])

155

In [157]:
len(vars(train_data.examples[0])['src'])

21

In [133]:
vars(train_iterator.dataset[0])

{'src': ['bryan',
  'poltilove',
  'has',
  'transfered',
  'to',
  'thermo',
  'fisher',
  'middletown',
  'va',
  'site',
  'from',
  'lsg',
  'fredrick',
  'md',
  'site',
  'he',
  'will',
  'need',
  'access',
  'to',
  'master',
  'control',
  'please',
  'activate',
  'bryan',
  'poltilove',
  'in',
  'master',
  'control',
  'at',
  'the',
  'middletown',
  'site'],
 'trg': ['bryan',
  'poltilove',
  'has',
  'transfered',
  'to',
  'thermo',
  'fisher',
  'middletown',
  'va',
  'site',
  'from',
  'lsg',
  'fredrick',
  'md',
  'site',
  'he',
  'will',
  'need',
  'access',
  'to',
  'master',
  'control',
  'please',
  'activate',
  'bryan',
  'poltilove',
  'in',
  'master',
  'control',
  'at',
  'the',
  'middletown',
  'site']}

In [122]:
model.context_vectors[0][1][0].shape

torch.Size([47, 512])

In [114]:
model.context_vectors

[(tensor([[  2,   2,   2,  ...,   2,   2,   2],
          [195,  15,  11,  ...,  81, 263,  15],
          [  0,  20,  36,  ...,   7,   0, 243],
          ...,
          [  1,   1,   1,  ...,   1,   1,   1],
          [  1,   1,   1,  ...,   1,   1,   1],
          [  1,   1,   1,  ...,   1,   1,   1]]),
  tensor([[[ 0.1163,  0.0141, -0.0396,  ...,  0.0279, -0.0065,  0.0345],
           [ 0.1063,  0.0249, -0.0401,  ...,  0.0257, -0.0165,  0.0461],
           [ 0.1035,  0.0244, -0.0268,  ...,  0.0287, -0.0045,  0.0468],
           ...,
           [ 0.1057, -0.0015, -0.0231,  ...,  0.0470,  0.0095,  0.0676],
           [ 0.1120,  0.0086, -0.0446,  ...,  0.0241, -0.0286,  0.0554],
           [ 0.1185,  0.0231, -0.0215,  ...,  0.0435, -0.0097,  0.0584]],
  
          [[ 0.1080, -0.0260, -0.0554,  ..., -0.0691, -0.0613,  0.0088],
           [ 0.1128, -0.0254, -0.0374,  ..., -0.0831, -0.0769, -0.0247],
           [ 0.0923, -0.0444, -0.0339,  ..., -0.0697, -0.0691, -0.0286],
           ...,
  

In [50]:
model.load_state_dict(torch.load('tut1-model.pt'))

test_loss = evaluate(model, test_iterator, criterion)

print(f'| Test Loss: {test_loss:.3f} | Test PPL: {math.exp(test_loss):7.3f} |')

| Test Loss: 5.706 | Test PPL: 300.713 |
