<a href="https://colab.research.google.com/github/Jackwitt77/Transformer-Implementation/blob/main/attention_from_scratch_may_2023.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install tokenizers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting tokenizers
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m25.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: tokenizers
Successfully installed tokenizers-0.13.3


In [2]:
import re
import string
import os
import pickle
from unicodedata import normalize
from collections import Counter

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset
from torchvision import datasets
from torch.utils.data import DataLoader
from torch.nn.functional import log_softmax, pad

from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer

import random

import numpy as np
import math
import matplotlib.pyplot as plt

In [3]:
device = torch.device('cuda')

In [4]:
random.seed("25")

In [5]:
enRawName = "drive/MyDrive/colab data/multi30kEnTrain.txt"
deRawName = "drive/MyDrive/colab data/multi30kDeTrain.txt"
en30kVal = "drive/MyDrive/colab data/multi30kEnVal.txt"
de30kVal = "drive/MyDrive/colab data/multi30kDeVal.txt"
englishCleanName = "data/english_tokens.pkl"
germanCleanName = "data/german_tokens.pkl"
englishSortedName = "data/englishSorted.pkl"
germanSortedName = "data/germanSorted.pkl"

truncEn = "drive/MyDrive/colab data/truncEn.pkl"
truncDe = "drive/MyDrive/colab data/truncDe.pkl"

enTokenizerName = "drive/MyDrive/colab data/enTokenizer.pkl"
deTokenizerName = "drive/MyDrive/colab data/deTokenizer.pkl"
pairsName = "drive/MyDrive/colab data/pairs.pkl"
folder = "drive/MyDrive/colab data/"

enTrainingFileName = folder + "enTraining"
deTrainingFileName = folder + "deTraining"
enTestFileName = folder + "enTest"
deTestFileName = folder + "deTest"
enValFileName = folder + "enValidation"
deValFileName = folder + "deValidation"

enCombinedFileName = folder + "enCombined"
deCombinedFileName = folder + "deCombined"

In [6]:
# load doc into memory
def load_doc(filename):
    # open the file as read only
    file = open(filename, mode='rt')
    # read all text
    text = file.read()
    # close the file
    file.close()
    return text

In [7]:
# split a loaded document into sentences
def to_sentences(doc):
    return doc.strip().split('\n')

In [8]:
# clean a list of lines
def clean_lines(lines):
    cleaned = list()
    # prepare regex for char filtering
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    # prepare translation table for removing punctuation
    #table = str.maketrans('', '', string.punctuation)
    for line in lines:
        # normalize unicode characters
        line = normalize('NFD', line).encode('ascii', 'ignore')
        line = line.decode('UTF-8')
        # tokenize on white space
        line = line.split()
        # convert to lower case
        #line = [word.lower() for word in line]
        # remove punctuation from each token
        #line = [word.translate(table) for word in line]
        # remove non-printable chars form each token
        line = [re_print.sub('', w) for w in line]
        # remove tokens with numbers in them
        line = [word for word in line if word.isalpha()]
        # store as string
        cleaned.append(' '.join(line))
    return cleaned

In [9]:
def cleanLine(line, addSOS=False):
    # prepare regex for char filtering
    #re_print = re.compile('[^%s]' % re.escape(string.printable))
    # normalize unicode characters
    #line = normalize('NFD', line).encode('ascii', 'ignore')
    #line = line.decode('UTF-8')
    # tokenize on white space
    line = line.split()
    # convert to lower case
    #line = [word.lower() for word in line]
    # remove punctuation from each token
    #line = [word.translate(table) for word in line]
    # remove non-printable chars form each token
    #line = [re_print.sub('', w) for w in line]
    if addSOS:
        line.insert(0, "[SOS]")
    line.append("[EOS]")
    line = (' '.join(line))
    return line

In [10]:
def pairSentences(englishSentences, germanSentences):
    skips = ["", "."]
    pairs = []
    for i in range(len(englishSentences)):
        englishSentences[i] = cleanLine(englishSentences[i])
        germanSentences[i] = cleanLine(germanSentences[i], False)
        if englishSentences[i] in skips or germanSentences[i] in skips:
            continue
        enLen = len(englishSentences[i])
        deLen = len(germanSentences[i])
        
        if (enLen / deLen) > 10 or (deLen / enLen) > 10:
            print(len(englishSentences[i]))
            print(len(germanSentences[i]))
            print("English:", englishSentences[i])
            print("German:", germanSentences[i])
            continue
        if enLen <= 3 or deLen <= 3:
            continue
        pairs.append((englishSentences[i], germanSentences[i]))
    pairs = sorted(pairs, key=lambda x: len(x[0].split(" ")))
    return pairs

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [12]:
def writeLinesWithNewLines(lines, fileName):
  file = open(fileName, "w")
  for line in lines:
    file.write(line + "\n")

In [13]:
def createDatasets(suffix = ".txt", size=10000):
  enLines = to_sentences(load_doc(englishRawName))
  deLines = to_sentences(load_doc(germanRawName))

  temp = list(zip(enLines, deLines))
  random.shuffle(temp)
  res1, res2 = zip(*temp)
  # res1 and res2 come out as tuples, and so must be converted to lists.
  enLines, deLines = list(res1)[:size], list(res2)[:size]

  trainEnLines = enLines[:(int)(0.8*len(enLines))]
  testEnLines = enLines[(int)(0.8*len(enLines)):(int)(0.9*len(enLines))]
  valEnLines = enLines[(int)(0.9*len(enLines)):]

  trainDeLines = deLines[:(int)(0.8*len(deLines))]
  testDeLines = deLines[(int)(0.8*len(deLines)):(int)(0.9*len(deLines))]
  valDeLines = deLines[(int)(0.9*len(deLines)):]

  enFileNames = [enTrainingFileName, enTestFileName, enValFileName, enCombinedFileName]
  deFileNames = [deTrainingFileName, deTestFileName, deValFileName, deCombinedFileName]

  enFileNames = [name + suffix for name in enFileNames]
  deFileNames = [name + suffix for name in deFileNames]
  enDatasets = [trainEnLines, testEnLines, valEnLines, enLines]
  deDatasets = [trainDeLines, testDeLines, valDeLines, deLines]

  enPair = (enDatasets, enFileNames)
  dePair = (deDatasets, deFileNames)

  for pair in [enPair, dePair]:
    for i in range(len(pair[0])):
      writeLinesWithNewLines(pair[0][i], pair[1][i])
  

In [14]:
def createTrainTestVal(numLines):
  trainEnLines = to_sentences(load_doc(enTrainingFileName))
  testEnLines = to_sentences(load_doc(enTestFileName))
  valEnLines = to_sentences(load_doc(enValFileName))

  trainDeLines = to_sentences(load_doc(deTrainingFileName))
  testDeLines = to_sentences(load_doc(deTestFileName))
  valDeLines = to_sentences(load_doc(deValFileName))

  trainEnLines, trainDeLines = \
              zip(*random.sample(list(zip(trainEnLines, trainDeLines)), (int) (0.8 * numLines)))
  testEnLines, testDeLines = \
              zip(*random.sample(list(zip(testEnLines, testDeLines)), (int)(0.1* numLines)))
  valEnLines, valDeLines = \
              zip(*random.sample(list(zip(valEnLines, valDeLines)), (int)(0.1*numLines)))

  enAll = trainEnLines + testEnLines + valEnLines
  deAll = trainDeLines + testDeLines + valDeLines

  return trainEnLines, trainDeLines, testEnLines, testDeLines, valEnLines, valDeLines, enAll, deAll


In [None]:
# englishSentences = to_sentences(load_doc(englishRawName))
# germanSentences = to_sentences(load_doc(germanRawName))

# temp = list(zip(englishSentences, germanSentences))
# random.shuffle(temp)
# res1, res2 = zip(*temp)
# # res1 and res2 come out as tuples, and so must be converted to lists.
# englishSentences, germanSentences = list(res1), list(res2)

# writeLinesWithNewLines(englishSentences[:(int)(0.8*len(englishSentences))], folder + "training.en")
# writeLinesWithNewLines(germanSentences[:(int)(0.8*len(germanSentences))], folder + "training.de")

In [None]:
# writeLinesWithNewLines(englishSentences[(int)(0.8*len(englishSentences)):(int)(0.9*len(englishSentences))], folder + "test.en")
# writeLinesWithNewLines(germanSentences[(int)(0.8*len(germanSentences)):(int)(0.9*len(germanSentences))], folder + "test.de")

In [None]:
# writeLinesWithNewLines(englishSentences[(int)(0.9*len(englishSentences)):], enVal)
# writeLinesWithNewLines(germanSentences[(int)(0.9*len(germanSentences)):], deVal)

In [15]:
import random
class SentenceDataset(Dataset):
    def __init__(self, enFileName, deFileName, newPairs = True):
        englishSentences = to_sentences(load_doc(enFileName))
        germanSentences = to_sentences(load_doc(deFileName))
        if not os.path.exists(pairsName) or newPairs:
            print("Creating pairs...")
            #self.englishSentences, self.germanSentences = \
             # zip(*random.sample(list(zip(self.englishSentences, self.germanSentences)), numLines))
            englishSentences = list(englishSentences)
            germanSentences = list(germanSentences)
            self.pairs = pairSentences(englishSentences, germanSentences)
            print(len(self.pairs))
            pickle.dump(self.pairs, open(pairsName, "wb"))
            # pickle.dump(englishSentences, open(truncEn, "wb"))
            # pickle.dump(germanSentences, open(truncDe, "wb"))
        else:
            # self.englishSentences = pickle.load(open(truncEn, "rb"))
            # self.germanSentences = pickle.load(open(truncDe, "rb"))
            print("Loading pairs...")
            self.pairs = pickle.load(open(pairsName, "rb"))
        
    def __len__(self):
        return len(self.pairs)
    
    def __getitem__(self, index):
        return self.pairs[index]

In [16]:
def setupTokenizers(enDataName, deDataName, newTokenizer = False, vocab_size_src=60000, vocab_size_tgt = 37000, enTokenizerName = folder+"en10ktokenizer.pkl", deTokenizerName = folder + "de10ktokenizer.pkl"):
  if not os.path.exists(enTokenizerName) or newTokenizer:
      print("creating en tokenizer...")
      enTokenizer = Tokenizer(BPE(unk_token="[UNK]"))
      enTokenizer.pre_tokenizer = Whitespace()
      trainer = BpeTrainer(vocab_size = vocab_size_src, special_tokens=["[SOS]", "[EOS]", "[PAD]", "[UNK]"])
      enTokenizer.train([enDataName], trainer=trainer)
      pickle.dump(enTokenizer, open(enTokenizerName, "wb"))
  if not os.path.exists(deTokenizerName) or newTokenizer:
      print("creating de tokenizer...")
      deTokenizer = Tokenizer(BPE())
      deTokenizer.pre_tokenizer = Whitespace()
      trainer = BpeTrainer(vocab_size = vocab_size_tgt, special_tokens=["[SOS]", "[EOS]", "[PAD]", "[UNK]"])
      deTokenizer.train([deDataName], trainer=trainer)
      pickle.dump(deTokenizer, open(deTokenizerName, "wb"))
  else:
      print("Loading tokenizer...")
      enTokenizer = pickle.load(open(enTokenizerName, "rb"))
      deTokenizer = pickle.load(open(deTokenizerName, "rb"))
  return enTokenizer, deTokenizer

In [17]:
import time
PADDING_IDX = 2
class PadCollate:
    """
    a variant of callate_fn that pads according to the longest sequence in
    a batch of sequences
    """

    def __init__(self, enTokenizer, deTokenizer):
          self.enTokenizer = enTokenizer
          self.deTokenizer = deTokenizer

    def pad_collate(self, batch):
        """
        args:
            batch - list of (String, String)

        reutrn:
            xs - a tensor of all examples in 'batch' after padding
            ys - a LongTensor of all labels in batch
        """
        startTime = time.time()
        # find longest sequence
        Enlengths = [len(x[0].split(" ")) for x in batch]
        #print("english lengths:", Enlengths)
        Delengths = [len(x[1].split(" ")) for x in batch]
        #print("german lengths:", Delengths)
        max_len1 = max(Enlengths)
        max_len2 = max(Delengths)
        max_len = max([max_len1, max_len2])
        self.enTokenizer.enable_padding(pad_id = PADDING_IDX, pad_token="[PAD]", length=max_len)
        self.enTokenizer.enable_truncation(max_length=max_len)
        self.deTokenizer.enable_padding(pad_id = PADDING_IDX, pad_token="[PAD]", length=max_len)
        self.deTokenizer.enable_truncation(max_length=max_len)
        english = [pair[0] for pair in batch]
        german = [pair[1] for pair in batch]
        enTokens = [line.ids for line in self.enTokenizer.encode_batch(english)]
        deTokens = [line.ids for line in self.deTokenizer.encode_batch(german)]
        for i in range(len(deTokens)):
            deTokens[i].insert(0,0)
#         print(english[0])
#         print(german[0])
#         print(enTokens[0])
#         print(deTokens[0])
        englishEncoded = torch.IntTensor(enTokens)
        germanEncoded = torch.IntTensor(deTokens)
        #print("tokenization took", time.time()-startTime)
        #print("englishEncoded", englishEncoded)
        return englishEncoded, germanEncoded

    def __call__(self, batch):
        return self.pad_collate(batch)


In [18]:
class EmbeddingLayer(nn.Module):
    def __init__(self, vocabSize, embeddingSize, padding_idx):
        super().__init__()
        self.embedding = nn.Embedding(vocabSize, embeddingSize, padding_idx = padding_idx)
        self.embeddingSize = embeddingSize
        
    def forward(self, sentenceBatch):
        return self.embedding(sentenceBatch) * math.sqrt(self.embeddingSize)

In [19]:
class PositionalEncoding(nn.Module):

    def __init__(self, d_model, dropout = 0.1, max_len=5000):
        super().__init__()
        self.dropout = nn.Dropout(p=dropout)

        # Compute the positional encodings once in log space.
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len).unsqueeze(1)
        div_term = torch.exp(
            torch.arange(0, d_model, 2) * -(math.log(10000.0) / d_model)
        )
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0)
        self.register_buffer("pe", pe)

    def forward(self, x):
        x = x + self.pe[:, : x.size(1)].requires_grad_(False)
        return self.dropout(x)


In [20]:
def attention(queries, keys, values, dropout, mask=None):
    keysTransposed = torch.transpose(keys, -2, -1)
    dotProduct = torch.matmul(queries, keysTransposed)
    scaledDotProduct = dotProduct / math.sqrt(keys.shape[-1])
    if mask is not None:
        weights = scaledDotProduct.masked_fill(mask == 0, -1e9)
    else:
        weights = scaledDotProduct
    
    weights = torch.softmax(weights, -1)
    weights = dropout(weights)
    finalProduct = torch.matmul(weights, values)
    return finalProduct

In [21]:
class MultiHeadedAttention(nn.Module):
    def __init__(self, d_model, numHeads, dropout=0.1): # h is the number of heads
        super().__init__()
        self.d_model = d_model
        self.qWeight = nn.Linear(d_model, d_model)
        self.kWeight = nn.Linear(d_model, d_model)
        self.vWeight = nn.Linear(d_model, d_model)
        self.finalWeight = nn.Linear(d_model, d_model)
        self.numHeads = numHeads
        self.d_head = d_model // numHeads # we treat the input matrix as numHead d_head length heads
        self.dropout = nn.Dropout(p = dropout)
        # ex: 8 heads of length 64 for 512 embedding
        
    def forward(self, queries, keys, values, mask):
        #print("mask shape before squeeze:", mask.shape)
        if mask is not None:
            # Same mask applied to all h heads.
            mask = mask.unsqueeze(1)
        #print("mask shape after squeeze:", mask.shape)
        batchSize = queries.shape[0]
        # Send the queries, keys, and values through their corresponding linear layer
        # The dimensions should be unchanged
        queries = self.qWeight(queries).to(device)
        keys = self.kWeight(keys).to(device)
        values = self.vWeight(values).to(device)
        
        # Then treat the outputs of each linear layer as if they are numHead parts
        # ex: treat a batch of 50 sequence length x 512 embedding size matrices as 
        # a batch of 50 sequence length x (8 heads x 64 head length) size matrices
        
        # the -1 is the sequence length, which is inferred
        queries = queries.reshape(batchSize, -1, self.numHeads, self.d_head)
        keys = keys.reshape(batchSize, -1, self.numHeads, self.d_head)
        values = values.reshape(batchSize, -1, self.numHeads, self.d_head)
        
        #print("query lin", queries.shape)
        
        # Then swap the sequence length and number of heads dimensions
        queries = queries.transpose(1, 2)
        keys = keys.transpose(1, 2)
        values = values.transpose(1, 2)
        
        # Apply attention to this set of queries, keys, and values
        x = attention(queries, keys, values, self.dropout, mask)
        
        # Because we divided the queries, keys, and values into some number of heads, each head has a different value now
        # We need to combine all of these heads back into one matrix again before moving on
        x = x.transpose(1, 2) # put sequence length back into dimension 1 and num heads into dimension 2
        x = x.reshape(batchSize, -1, (self.numHeads * self.d_head)) # we're now back to our d_model embedding shape
        #print("shape after attention:", x.shape)
        del queries
        del keys
        del values
        
        # Finally, put x through one last linear layer, just because
        x = self.finalWeight(x)
        return x
        

In [22]:
class FeedForwardNN(nn.Module):
    # This is just a two layer neural net.
    # The input and outputs are d_model, with one d_feedForward hidden layer
    # It uses relu for the hidden layer and linear for the output
    def __init__(self, d_model, d_feedForward, dropout=0.1):
        super().__init__()
        self.linear1 = nn.Linear(d_model, d_feedForward)
        self.linear2 = nn.Linear(d_feedForward, d_model)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x):
        x = self.linear1(x)
        x = x.relu()
        x = self.dropout(x)
        x = self.linear2(x)
        return x
        

In [23]:
class LayerNorm(nn.Module):
    "Construct a layernorm module (See citation for details)."
    # Taken from http://nlp.seas.harvard.edu/annotated-transformer/
    # because I didn't feel like learning about layer normalization

    def __init__(self, features, eps=1e-6):
        super(LayerNorm, self).__init__()
        self.a_2 = nn.Parameter(torch.ones(features))
        self.b_2 = nn.Parameter(torch.zeros(features))
        self.eps = eps

    def forward(self, x):
        mean = x.mean(-1, keepdim=True)
        std = x.std(-1, keepdim=True)
        return self.a_2 * (x - mean) / (std + self.eps) + self.b_2

In [24]:
class Sublayer(nn.Module):
    # A sublayer is a wrapper around either a feed forward layer or multiheaded attention layer
    # It takes the output of either one of those and then adds the original input to it
    # It then does layer normalization to this sum
    def __init__(self, d_model, function, isAttentionLayer, dropout = 0.1):
        super().__init__()
        self.function = function # function is either feed foward or multiheaded attention
        self.layerNorm = LayerNorm(d_model)
        self.isAttentionLayer = isAttentionLayer
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, x, mask=None, encoding=None):
        # The attention layer expects three inputs (queries, keys, values), not 1
        if self.isAttentionLayer:
            if encoding is None:
                sum_ = x + self.dropout(self.function(x, x, x, mask))
            else:
                sum_ = x + self.dropout(self.function(x, encoding, encoding, mask))
        else:
            sum_ = x + self.dropout(self.function(x))
            
        return self.layerNorm(sum_)

In [25]:
class EncoderLayer(nn.Module):
    # an encoder layer is two sublayers.
    # It goes through a sublayer containing the multiheaded attention and then a sublayer containing the feed forward layer
    def __init__(self, d_model, selfAttention, feedForward, dropout = 0.1):
        super().__init__()
        self.selfAttention = selfAttention
        self.feedForward = feedForward
        self.sublayer1 = Sublayer(d_model, self.selfAttention, True, dropout)
        self.sublayer2 = Sublayer(d_model, self.feedForward, False, dropout)
        
    def forward(self, x, mask):
        # put x through both sublayers
        x = self.sublayer1(x, mask)
        x = self.sublayer2(x, mask)
        return x

In [26]:
class Encoder(nn.Module):
    # The encoder is a sequence of encoder layers
    def __init__(self, numLayers, d_model):
        self.d_model = d_model
        super().__init__()
        self.layers = []
        for i in range(numLayers):
            selfAttentionLayer = MultiHeadedAttention(d_model, 8)
            feedForwardLayer = FeedForwardNN(d_model, 2048)
            encoderLayer = EncoderLayer(d_model, selfAttentionLayer, feedForwardLayer)
            self.layers.append(encoderLayer)
        self.layers = nn.ModuleList(self.layers)
    
    def forward(self, x, mask):
        for layer in self.layers:
            x = layer(x, mask)
        return x

In [27]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, selfAttention, encoderAttention, feedForward, dropout=0.1):
        super().__init__()
        self.selfAttention = selfAttention
        self.feedForward = feedForward
        self.encoderAttention = encoderAttention
        self.sublayer1 = Sublayer(d_model, self.selfAttention, True, dropout)
        self.sublayer2 = Sublayer(d_model, self.encoderAttention, True, dropout)
        self.sublayer3 = Sublayer(d_model, self.feedForward, False, dropout)
        
    def forward(self, x, encoding, src_mask, tgt_mask):
        x = self.sublayer1(x, tgt_mask)
        x = self.sublayer2(x, src_mask, encoding)
        x = self.sublayer3(x)
        return x

In [28]:
class Decoder(nn.Module):
    # The decoder is a sequence of decoder layers
    def __init__(self, numLayers, d_model):
        self.d_model = d_model
        super().__init__()
        self.layers = []
        for i in range(numLayers):
            selfAttentionLayer = MultiHeadedAttention(d_model, 8)
            encoderAttentionLayer = MultiHeadedAttention(d_model, 8)
            feedForwardLayer = FeedForwardNN(d_model, 2048)
            decoderLayer = DecoderLayer(d_model, selfAttentionLayer, encoderAttentionLayer, feedForwardLayer)
            self.layers.append(decoderLayer)
        self.layers = nn.ModuleList(self.layers)
        
    def forward(self, x, encoding, src_mask, tgt_mask):
        for layer in self.layers:
            x = layer(x, encoding, src_mask, tgt_mask)
            return x

In [29]:
class Generator(nn.Module):
    def __init__(self, d_model, vocab):
        super().__init__()
        self.proj = nn.Linear(d_model, vocab)

    def forward(self, x):
        return log_softmax(self.proj(x), dim=-1)

In [30]:
class WholeNetwork(nn.Module):
    def __init__(self, d_model, d_vocab_src, d_vocab_tgt, padding_idx):
        super().__init__()
        self.embeddingSrc = EmbeddingLayer(d_vocab_src, d_model, padding_idx)
        self.embeddingTgt = EmbeddingLayer(d_vocab_tgt, d_model, padding_idx)
        self.posEncoding = PositionalEncoding(d_model, 0.1)
        self.encoder = Encoder(3, d_model)
        self.decoder = Decoder(3, d_model)
        self.generator = Generator(d_model, d_vocab_tgt)

    def encode(self, src, src_mask):
        src = self.embeddingSrc(src)
        src = self.posEncoding(src)
        encodings = self.encoder(src, src_mask)
        return encodings
    
    def decode(self, encodings, tgt, src_mask, tgt_mask):
        tgt = self.embeddingTgt(tgt)
        tgt = self.posEncoding(tgt)
        output = self.decoder(tgt, encodings, src_mask, tgt_mask)
        return self.generator(output)
        
    def forward(self, src, tgt, src_mask, tgt_mask):
        encodings = self.encode(src, src_mask)
        return self.decode(encodings, tgt, src_mask, tgt_mask)
        
        

In [31]:
def get_tgt_mask(size) -> torch.tensor:
    # Generates a squeare matrix where the each row allows one word more to be seen
    mask = torch.tril(torch.ones(size, size) == 1) # Lower triangular matrix
    mask = mask.float()
    mask = mask.masked_fill(mask == 0, float('-inf')) # Convert zeros to -inf
    mask = mask.masked_fill(mask == 1, float(0.0)) # Convert ones to 0
    return mask
    # EX for size=5:
    # [[0., -inf, -inf, -inf, -inf],
    #  [0.,   0., -inf, -inf, -inf],
    #  [0.,   0.,   0., -inf, -inf],
    #  [0.,   0.,   0.,   0., -inf],
    #  [0.,   0.,   0.,   0.,   0.]]

In [32]:
def decodeSentences(intTensor, tokenizer):
    intList = intTensor.tolist()
    #print(intList)
    sentences = tokenizer.decode_batch(intList)
    return sentences

In [33]:
def subsequent_mask(size):
    "Mask out subsequent positions."
    attn_shape = (1, size, size)
    subsequent_mask = torch.triu(torch.ones(attn_shape), diagonal=1).type(
        torch.uint8
    )
    return subsequent_mask == 0

In [34]:
def makeStdMask(tgt):
    tgt_mask = (tgt != PADDING_IDX).unsqueeze(-2)
    tgt_mask = tgt_mask & subsequent_mask(tgt.size(-1)).type_as(tgt_mask.data)
    return tgt_mask

In [35]:
def getLearningRate(d_model, stepNum, warmupSteps):
  stepNum += 19430 #67 epochs
  d_model = d_model ** -0.5
  min1 = stepNum ** -0.5 # was -0.5
  min2 = stepNum * (warmupSteps ** -1.5)
  return d_model * min(min1, min2)

In [36]:
class LabelSmoothing(nn.Module):
    "Implement label smoothing."

    def __init__(self, size, padding_idx, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.criterion = nn.KLDivLoss(reduction="sum")
        self.padding_idx = padding_idx
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing
        self.size = size
        self.true_dist = None

    def forward(self, x, target):
        assert x.size(1) == self.size
        true_dist = x.data.clone()
        true_dist.fill_(self.smoothing / (self.size - 2))
        true_dist.scatter_(1, target.data.unsqueeze(1).long(), self.confidence)
        true_dist[:, self.padding_idx] = 0
        mask = torch.nonzero(target.data == self.padding_idx)
        if mask.dim() > 0:
            true_dist.index_fill_(0, mask.squeeze(), 0.0)
        self.true_dist = true_dist
        return self.criterion(x, true_dist.clone().detach())

In [37]:
#enTrain, deTrain, enTest, deTest, enVal, deVal, enAll, deAll = createTrainTestVal(10000)
#createDatasets("100k.txt", 100000)

In [38]:
suffix = "100k.txt"
#enTokenizer = pickle.load(open("en10ktokenizer.pkl", "rb"))
#deTokenizer = pickle.load(open("de10ktokenizer.pkl", "rb"))
enName = folder+"enTokenizer.pkl"
deName = folder+"deTokenizer.pkl"
enTokenizer, deTokenizer = setupTokenizers(enRawName, deRawName, False, 10000, 10000, enName, deName)

Loading tokenizer...


In [39]:

def trainModel(epochs = 999, vocab_size_src = 10000, vocab_size_tgt = 10000, d_model=512, warmupSteps=4000, batchSize=128):
    suffix = "100k.txt"
    dataset = SentenceDataset(enRawName, deRawName, True)

    train_dataloader = DataLoader(dataset, batch_size=batchSize, collate_fn =
                                  PadCollate(enTokenizer, deTokenizer))
    valDataset = SentenceDataset(en30kVal, de30kVal, True)
    val_dataloader = DataLoader(valDataset, batch_size=batchSize, collate_fn =
                                  PadCollate(enTokenizer, deTokenizer))
    val_dataloader_iter = iter(val_dataloader)
    dataloader_iter = iter(train_dataloader)
    warmupSteps = 3*len(train_dataloader)
    losses = []
    valLosses = []
    startTime = time.time()
    i = 0
    model = WholeNetwork(d_model, vocab_size_src, vocab_size_tgt, PADDING_IDX)
    #model.load_state_dict(torch.load(folder + "model_2vocab_11_12"))
    model = model.to(device)
    # This was important from their code.
    # Initialize parameters with Glorot / fan_avg.
    for p in model.parameters():
        if p.dim() > 1:
            nn.init.xavier_uniform_(p)
    opt = optim.Adam(model.parameters(), lr=0.5, betas=(0.9, 0.98), eps=1e-9)
    scheduler = optim.lr_scheduler.LambdaLR(opt, lambda step: getLearningRate(d_model, step+1, warmupSteps))
    labelSmoothing = LabelSmoothing(vocab_size_tgt, PADDING_IDX, 0.1)
    lowestAvgLoss = 100
    numFails = 0
    stepNum = 0
    print("Beginning training...")
    for j in range(epochs):
        avgLossAtEpoch = 100
        lossSumForEpoch = 0
        dataloader_iter = iter(train_dataloader)
        val_dataloader_iter = iter(val_dataloader)
        i = 0
        startTime = time.time()
        for src, tgt in dataloader_iter:
            
            #print("Everything else time:", time.time()-startTime)
            src, tgt = src.to(device), tgt.to(device)
            #tgt_x = tgt[:, :-1]
            #tgt_y = tgt[:, 1:]
            #print(tgt.shape)
            #src_mask = (src != PADDING_IDX).unsqueeze(-2)
            #tgt_mask = 
            #print(tgt_mask[0])
            #print(tgt_mask.shape)
            #print(src.shape)
            #tgt_mask = get_tgt_mask(src.shape[1]).to(device)
            output = model(src, tgt[:, :-1], (src != PADDING_IDX).unsqueeze(-2), makeStdMask(tgt[:, :-1]))
            #output = torch.transpose(output, 1, 2)
            #print(output.shape)
            
            #loss = lossFunction(output.reshape((output.shape[0], output.shape[2], output.shape[1])), tgt_y.long())
            nonPadTokens = (tgt[:, 1:] != PADDING_IDX).data.sum()
            #print(output.shape)
            loss = labelSmoothing(output.contiguous().view(-1, output.size(-1)), tgt[:, 1:].contiguous().view(-1))/nonPadTokens

            #lossMask = tgt_y != PADDING_IDX
            #print("loss before mask", loss)
            #loss = loss.masked_fill(lossMask == 0, 0)
            #print("loss after mask", loss)
            #loss = loss.sum()/lossMask.sum()
            #print("loss after sum", loss)
            #print(lossMask)
            #loss *= lossMask
            #print("Loss calculation time:", time.time()-startTime)
            #print("Loss:", loss.item())
            losses.append(loss.item())
            lossSumForEpoch += loss.item()
            #startTime = time.time()
            #print("Model weights:")
#             for param in model.parameters():
#                 print(param.data)
#                 break
            opt.zero_grad()
            loss.backward()
            opt.step()
            scheduler.step()
            #print("opt step time:", time.time()-startTime)
            startTime = time.time()
            argmaxOutput = torch.argmax(output, 2)
            stepNum += 1
            del loss
            #print("ARGMAX")
            #print(argmaxOutput)
            i+=1
    #         if i > 20:
    #             break
        k = 0
        valLossSumForEpoch = 0
        for src, tgt2 in val_dataloader_iter:
            src, tgt2 = src.to(device), tgt2.to(device)
            output = model(src, tgt2[:, :-1], (src != PADDING_IDX).unsqueeze(-2), makeStdMask(tgt2[:, :-1]))
            nonPadTokens = (tgt2[:, 1:] != PADDING_IDX).data.sum()
            loss = labelSmoothing(output.contiguous().view(-1, output.size(-1)), tgt2[:, 1:].contiguous().view(-1))/nonPadTokens
            valLosses.append(loss.item())
            valLossSumForEpoch += loss.item()
            del loss
            k += 1

        if i == 0:
          continue

        avgLossAtEpoch = lossSumForEpoch / i
        avgValLossAtEpoch = valLossSumForEpoch / k
        if avgLossAtEpoch >= lowestAvgLoss:
            numFails += 1
        else:
            numFails = 0
            lowestAvgLoss = avgLossAtEpoch
        
        print("finished epoch", j)
        #print("epoch time:", time.time()-startTime)
        print("Average loss", avgLossAtEpoch)
        print("Average val loss", avgValLossAtEpoch)
        print("num fails:", numFails)
        print("Predicted output: ")
        print(decodeSentences(argmaxOutput, deTokenizer)[0])
        print("Correct output: ")
        print(decodeSentences(tgt, deTokenizer)[0])
        print()
        print()

        if numFails >= 5:
            break
        
        torch.save(model.state_dict(), folder + "model_2vocab_5_7")
        if stepNum >= 100000: # the model needs to stop training eventually...
          break
    plt.plot(losses)
    plt.show()
    return model
        

In [None]:
model = trainModel()

Creating pairs...
29000
Creating pairs...
1014
Beginning training...
finished epoch 0
Average loss 4.199629508451218
Average val loss 3.7187726199626923
num fails: 0
Predicted output: 
Eine Gruppe von Menschen sitzt die in und und mit einem ein mit mit und mit weißen mit einem Haaren und , mit Kamera in weißen . mit mit . einem weißen und in und und , ,
Correct output: 
Eine Gruppe von Menschen , die nebeneinander sitzen und von denen einige weiße Hemden und blaue Westen mit gelben Schärpen tragen und die andere große flausch ige weiße Hüte mit einem pinkfarbenen flauschigen Oberteil .


finished epoch 1
Average loss 3.0582602087096498
Average val loss 3.0830485075712204
num fails: 0
Predicted output: 
Eine Gruppe von Menschen sitzt die in , neben mit einem zwei andere weißen mit weißen weißen , weißen weißen mit , weißen andere in weißen . weißen weißen . weißen weißen weißen weißen in , . .
Correct output: 
Eine Gruppe von Menschen , die nebeneinander sitzen und von denen einige weiß

KeyboardInterrupt: ignored

In [None]:
# save the model
#pickle.dump(model, open(folder + "size10kModel.pkl", "wb"))
# enTokenizer = pickle.load(open(enTokenizerName, "rb"))
# deTokenizer = pickle.load(open(deTokenizerName, "rb"))
# pickle.dump(enTokenizer, open(folder + "en10ktokenizer.pkl", "wb"))
# pickle.dump(deTokenizer, open(folder + "de10ktokenizer.pkl", "wb"))
#torch.save(model.state_dict(), folder + "size10Modell")

In [None]:
#model = WholeNetwork(512, 15000, 1).to(device)
#model.load_state_dict(torch.load(folder + "size10Modell"))

#model = pickle.load(open(folder + "size10kModel.pkl", "rb"))

In [50]:
import sys
from nltk.translate.bleu_score import sentence_bleu
from nltk.translate.bleu_score import corpus_bleu

def evalModel(srcVocab=10000, tgtVocab=10000):
  with torch.no_grad():
    model = WholeNetwork(512, srcVocab, tgtVocab, PADDING_IDX)
    model.load_state_dict(torch.load(folder + "model_2vocab_5_7", map_location=device))
    model = model.to(device)
    model.eval()
    #suffix = "10k.txt"
    bleuScores = []
    dataset = SentenceDataset(en30kVal, de30kVal, True)

    train_dataloader = DataLoader(dataset, batch_size=128, collate_fn =
                                  PadCollate(enTokenizer, deTokenizer))
    dataloader_iter = iter(train_dataloader)
    i = 0
    numFails = 0
    print("Beginning training...")
    avgLossAtEpoch = 100
    dataloader_iter = iter(train_dataloader)
    i = 0
    for src, tgt in dataloader_iter:
        
        #print("Everything else time:", time.time()-startTime)
        src, tgt = src.to(device), tgt.to(device)
        tgt_x = tgt[:, :-1]
        tgt_y = tgt[:, 1:]
        #print(tgt.shape)
        src_mask = (src != PADDING_IDX).unsqueeze(-2)
        tgt_mask = makeStdMask(tgt_x)
        #print(tgt_mask[0])
        #print(tgt_mask.shape)
        #print(src.shape)
        #tgt_mask = get_tgt_mask(src.shape[1]).to(device)
        hyps = predictFromTokens(model, src, srcVocab, tgtVocab, 100)
        refs = decodeSentences(tgt_y, deTokenizer)
        refs = [[ref.split(" ")] for ref in refs]
        hyps = [hyp.split(" ") for hyp in hyps]
        
        print(refs)
        print(hyps)
        
        # print(len(argmaxOutput), len(tgt_y))
        bleu = corpus_bleu(refs, hyps)
        print(bleu)
        bleuScores.append(bleu)
        i+=1
  print(bleuScores)
  print(np.mean(bleuScores))

        #print("ARGMAX")
        #print(argmaxOutput)
        
#         if i > 20:
#             break
        # predictions = decodeSentences(argmaxOutput, deTokenizer)[:5]
        # actual = decodeSentences(tgt, deTokenizer)[:5]
        # print("Predicted output: ")
        # print(predictions)
        # print("Correct output: ")
        # print(actual)
        

In [51]:
#del model

evalModel()

Creating pairs...
1014
Beginning training...
[[['Ein', 'Mann', 'box', 't']], [['Ein', 'Junge', 'sitzt', 'auf', 'einer', 'Schaukel', '.']], [['Ein', 'Künstler', 'malt', 'im', 'Freien', '.']], [['Drei', 'kleine', 'Hunde', 'schnüff', 'eln', 'an', 'etwas', '.']], [['Zwei', 'Mädchen', 'gehen', 'eine', 'Straße', 'entlang', '.']], [['Drei', 'Frauen', 'sitzen', 'da', 'und', 'lächeln', '.']], [['Zwei', 'Menschen', 'überqueren', 'eine', 'Straße', '.']], [['Ein', 'kleiner', 'Junge', 'springt', 'in', 'das', 'Wasser', '.']], [['Drei', 'Hunde', 'spielen', 'im', 'Schnee', '.']], [['Sie', 'posieren', 'für', 'ein', 'Bild', '.']], [['Eine', 'Band', 'spielt', 'auf', 'dem', 'Gehweg', '.']], [['Junge', 'macht', 'Kunststücke', 'auf', 'einem', 'Skateboard']], [['Eine', 'Person', 'trägt', 'viele', 'Taschen', '.']], [['Ein', 'Mann', 'beim', 'Wake', 'boarden', 'im', 'Wasser', '.']], [['Ein', 'Vogel', 'fliegt', 'über', 'das', 'Wasser', '.']], [['Ein', 'Snowboarder', 'vollführt', 'ein', 'Kunststück', '.']], [['Di

In [46]:
def predictFromTokens(model, input, srcVocab, tgtVocab, maxLength):
    with torch.no_grad():
        predictions = []
        for line in input:
            line = line.reshape(1, line.shape[0])
            src_mask = (line != PADDING_IDX).unsqueeze(-2)
            encodings = model.encode(line, src_mask)
            output = "[SOS]"
            output = torch.IntTensor(deTokenizer.encode(output).ids)
            output = output.reshape(1, output.shape[0]).to(device)
            i = 0
            #print(input)
            # print(output)
            while i < maxLength: # while its not [EOS]
                tgt_mask = (output != PADDING_IDX).unsqueeze(-2)
                # print(line.shape)
                # print(output.shape)
                prediction = model.decode(encodings, output, src_mask, tgt_mask)[0, -1]
                argmaxOutput = torch.argmax(prediction, -1)
                #print(argmaxOutput)
                #print(argmaxOutput.shape)
                #print(argmaxOutput)
                if argmaxOutput == 1:
                    break
            
                #print(output.shape)
                
                output = torch.cat((output, argmaxOutput.reshape((1,1))), -1)
                #print(decodeSentences(output, deTokenizer))
                #print(output.shape)
                i+=1
            predictions.append(decodeSentences(output, deTokenizer)[0])
    return predictions


In [None]:
def predict(model, input, maxLength):
  with torch.no_grad():

    input += "[EOS]"
    input = torch.IntTensor(enTokenizer.encode(input).ids)
    print(input)
    input = input.reshape(1, input.shape[0]).to(device)
    output = "[SOS]"
    output = torch.IntTensor(deTokenizer.encode(output).ids)
    output = output.reshape(1, output.shape[0]).to(device)
    i = 0
    #print(input)
   # print(output)
    while i < maxLength: # while its not [EOS]
      src_mask = (input != PADDING_IDX).unsqueeze(-2)
      tgt_mask = makeStdMask(output)
      prediction = model(input, output, src_mask, tgt_mask)[0, -1]
      argmaxOutput = torch.argmax(prediction, -1)
      #print(argmaxOutput.shape)
      #print(argmaxOutput)
      if argmaxOutput == 1:
        print(decodeSentences(output, deTokenizer))
        break
      i+=1
      #print(output.shape)
      
      output = torch.cat((output, argmaxOutput.reshape((1,1))), -1)
      print(decodeSentences(output, deTokenizer))
      #print(output.shape)


In [None]:
model = WholeNetwork(512, 10000, 10000, PADDING_IDX)
model.load_state_dict(torch.load(folder + "model_2vocab_5_7", map_location=device))
model = model.to(device)
model.eval()

WholeNetwork(
  (embeddingSrc): EmbeddingLayer(
    (embedding): Embedding(10000, 512, padding_idx=2)
  )
  (embeddingTgt): EmbeddingLayer(
    (embedding): Embedding(10000, 512, padding_idx=2)
  )
  (posEncoding): PositionalEncoding(
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Encoder(
    (layers): ModuleList(
      (0-2): 3 x EncoderLayer(
        (selfAttention): MultiHeadedAttention(
          (qWeight): Linear(in_features=512, out_features=512, bias=True)
          (kWeight): Linear(in_features=512, out_features=512, bias=True)
          (vWeight): Linear(in_features=512, out_features=512, bias=True)
          (finalWeight): Linear(in_features=512, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inplace=False)
        )
        (feedForward): FeedForwardNN(
          (linear1): Linear(in_features=512, out_features=2048, bias=True)
          (linear2): Linear(in_features=2048, out_features=512, bias=True)
          (dropout): Dropout(p=0.1, inpla

In [None]:
predict(model, "A man sings a song.", 30)

tensor([  30,   93, 1561,   57, 2705,   15,    1,    2,    2,    2,    2,    2,
           2,    2,    2,    2], dtype=torch.int32)
['Ein']
['Ein Mann']
['Ein Mann singt']
['Ein Mann singt ein']
['Ein Mann singt ein Lied']
['Ein Mann singt ein Lied .']
['Ein Mann singt ein Lied .']
