In [1]:
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
cd "/content/gdrive/My Drive/Sequential_model/my_machine_translator"

/content/gdrive/My Drive/Sequential_model/my_machine_translator


In [3]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
"""
Ref:
https://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html
https://pytorch.org/tutorials/beginner/translation_transformer.html
Download the data from https://download.pytorch.org/tutorial/data.zip
"""

'\nRef: \nhttps://pytorch.org/tutorials/intermediate/seq2seq_translation_tutorial.html\nhttps://pytorch.org/tutorials/beginner/translation_transformer.html\nDownload the data from https://download.pytorch.org/tutorial/data.zip\n'

In [5]:
# Reserved vocabulary ids. They mirror the "<PAD>"/"<SOS>"/"<EOS>" entries
# that Lang.__init__ pre-registers, so both languages share the same ids.
SOS_token = 1  # start-of-sentence marker
EOS_token = 2  # end-of-sentence marker
PAD_token = 0  # padding id

class Lang:
    """Word <-> index vocabulary for a single language.

    The three special tokens "<PAD>", "<SOS>" and "<EOS>" are registered up
    front with ids 0, 1 and 2 and an initial count of 1 each.
    """

    def __init__(self, name):
        specials = ("<PAD>", "<SOS>", "<EOS>")
        self.name = name
        self.word2index = {token: idx for idx, token in enumerate(specials)}
        self.word2count = {token: 1 for token in specials}
        self.index2word = {idx: token for idx, token in enumerate(specials)}
        self.n_words = len(specials)  # PAD, SOS and EOS are already counted

    def addSentence(self, sentence):
        """Register every single-space-separated token of `sentence`."""
        for token in sentence.split(' '):
            self.addWord(token)

    def addWord(self, word):
        """Bump the count of a known word, or assign the next free id to a new one."""
        if word in self.word2index:
            self.word2count[word] += 1
            return
        new_index = self.n_words
        self.word2index[word] = new_index
        self.word2count[word] = 1
        self.index2word[new_index] = word
        self.n_words = new_index + 1

In [6]:
# ref: https://stackoverflow.com/a/518232/2809427
# The files are all in Unicode, to simplify we will turn Unicode characters to ASCII, make everything lowercase, and trim most punctuation.
# Turn a Unicode string to plain ASCII
def unicodeToAscii(s):
    """Strip combining marks from `s` (e.g. "é" -> "e").

    The string is NFD-normalized so that accents become separate code points,
    then every character in Unicode category 'Mn' is dropped.
    """
    decomposed = unicodedata.normalize('NFD', s)
    kept = [ch for ch in decomposed if unicodedata.category(ch) != 'Mn']
    return ''.join(kept)

"""
output_string = unicodeToAscii("Café au Lait")
    Input string: Café au Lait
    Output string: Cafe au Lait
"""

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    """Lowercase, trim and reduce `s` to letters plus '!' and '?'.

    Steps: lowercase and strip, remove accents with unicodeToAscii, put a
    space in front of '.', '!' and '?', then collapse every run of characters
    other than ASCII letters, '!' and '?' into a single space (this also
    removes '.', digits and apostrophes) and strip the result.
    """
    text = unicodeToAscii(s.lower().strip())
    # Detach sentence punctuation from the preceding word.
    text = re.sub(r"([.!?])", r" \1", text)
    # Anything that is not a letter, '!' or '?' becomes one space.
    text = re.sub(r"[^a-zA-Z!?]+", r" ", text)
    return text.strip()

"""
s = re.sub(r"([.!?])", r" \1", s):
    Input string: Hello! How are you? I'm fine.
    Output string: Hello ! How are you ? I'm fine .

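s = re.sub(r"[^a-zA-Z!?]+", r" ", s):
    Input string: Hello123! How are you? I'm fine.
    Output string: Hello ! How are you? I m fine   (digits, the apostrophe and the final '.' each become a space; the trailing space is removed by strip())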
"""

'\ns = re.sub(r"([.!?])", r" \x01", s):\n    Input string: Hello! How are you? I\'m fine.\n    Output string: Hello ! How are you ? I\'m fine .\n\ns = re.sub(r"[^a-zA-Z!?]+", r" ", s):\n    Input string: Hello123! How are you? I\'m fine.\n    Output string: Hello ! How are you ? I m fine .\n'

In [7]:
def readLangs(lang1, lang2, reverse=False):
    """Read the tab-separated corpus 'data/<lang1>-<lang2>.txt'.

    Args:
        lang1: name of the language in the first column of the file.
        lang2: name of the language in the second column of the file.
        reverse: when True, swap the columns so that lang2 becomes the input
            language and lang1 the output language.

    Returns:
        (input_lang, output_lang, pairs): two empty Lang vocabularies and the
        list of normalized sentence pairs (each a list of strings).
    """
    print("Reading lines...")

    # Read the file and split into lines; the context manager closes the
    # handle as soon as the content has been read.
    with open('data/%s-%s.txt' % (lang1, lang2), encoding='utf-8') as corpus:
        lines = corpus.read().strip().split('\n')

    # Split every line into pairs and normalize
    pairs = [[normalizeString(s) for s in l.split('\t')] for l in lines]

    # Reverse pairs, make Lang instances
    if reverse:
        pairs = [list(reversed(p)) for p in pairs]
        input_lang = Lang(lang2)
        output_lang = Lang(lang1)
    else:
        input_lang = Lang(lang1)
        output_lang = Lang(lang2)

    return input_lang, output_lang, pairs

Since there are a *lot* of example sentences and we want to train
something quickly, we'll trim the data set to only relatively short and
simple sentences. Here each sentence must have fewer than `MAX_LENGTH = 10`
words, i.e. at most 9 (that includes ending punctuation), and we're
filtering to sentences that translate to the form "I am" or "He is" etc.
(accounting for apostrophes replaced earlier).




In [8]:
# Length cutoff used by filterPair: a sentence is kept only when it has
# fewer than MAX_LENGTH space-separated tokens.
MAX_LENGTH = 10

# English sentence openings accepted by filterPair (passed to str.startswith).
# normalizeString has already turned apostrophes into spaces, hence "i m "
# for "i'm".
# NOTE(review): some prefixes end with a space and some do not ("he is" vs
# "i am "), so the latter also match longer words — confirm this is intended.
eng_prefixes = (
    "i am ", "i m ",
    "he is", "he s ",
    "she is", "she s ",
    "you are", "you re ",
    "we are", "we re ",
    "they are", "they re "
)

def filterPair(p):
    """Return True when pair `p` is short enough and starts with an accepted prefix.

    Both p[0] and p[1] must have fewer than MAX_LENGTH space-separated tokens,
    and p[1] must start with one of eng_prefixes.
    """
    if len(p[0].split(' ')) >= MAX_LENGTH:
        return False
    second = p[1]
    if len(second.split(' ')) >= MAX_LENGTH:
        return False
    return second.startswith(eng_prefixes)


def filterPairs(pairs):
    """Keep only the pairs accepted by filterPair, preserving their order."""
    return list(filter(filterPair, pairs))

The full process for preparing the data is:

-  Read text file and split into lines, split lines into pairs
-  Normalize text, filter by length and content
-  Make word lists from sentences in pairs




In [9]:
def prepareData(lang1, lang2, reverse=False):
    """Load the corpus, filter it and build both vocabularies.

    Reads the pairs with readLangs, keeps those accepted by filterPairs and
    feeds pair[0] into the input vocabulary and pair[1] into the output one.
    Progress and final vocabulary sizes are printed along the way.

    Returns:
        (input_lang, output_lang, pairs) with populated Lang objects and the
        filtered pair list.
    """
    input_lang, output_lang, all_pairs = readLangs(lang1, lang2, reverse)
    print("Read %s sentence pairs" % len(all_pairs))

    kept_pairs = filterPairs(all_pairs)
    print("Trimmed to %s sentence pairs" % len(kept_pairs))

    print("Counting words...")
    for entry in kept_pairs:
        input_lang.addSentence(entry[0])
        output_lang.addSentence(entry[1])

    print("Counted words:")
    for vocab in (input_lang, output_lang):
        print(vocab.name, vocab.n_words)
    return input_lang, output_lang, kept_pairs

# reverse=True swaps the columns of data/eng-fra.txt, so French becomes the
# input language and English the output language.
input_lang, output_lang, pairs = prepareData('eng', 'fra', True)
# Show one random [french, english] pair as a sanity check.
print(random.choice(pairs))

Reading lines...
Read 135842 sentence pairs
Trimmed to 11445 sentence pairs
Counting words...
Counted words:
fra 4602
eng 2992
['il ne compte pas au nombre de mes amis', 'he is no friend of mine']


In [10]:
def indexesFromSentence(lang, sentence):
    """Map each single-space-separated token of `sentence` to its id in `lang`.

    Raises KeyError for a token that is missing from lang.word2index.
    """
    lookup = lang.word2index
    ids = []
    for token in sentence.split(' '):
        ids.append(lookup[token])
    return ids

def tensorFromSentence(lang, sentence):
    """Encode `sentence` as a (1, t) long tensor on `device`.

    The word ids are wrapped with SOS_token at the front and EOS_token at the
    end before being turned into a tensor.
    """
    token_ids = [SOS_token] + indexesFromSentence(lang, sentence) + [EOS_token]
    encoded = torch.tensor(token_ids, dtype=torch.long, device=device)
    return encoded.view(1, -1)  # (1, t)

def tensorsFromPair(pair):
    """Encode pair[0] with input_lang and pair[1] with output_lang.

    Returns a tuple (input_tensor, target_tensor) of (1, t) long tensors.
    """
    source_sentence, target_sentence = pair[0], pair[1]
    return (
        tensorFromSentence(input_lang, source_sentence),
        tensorFromSentence(output_lang, target_sentence),
    )

def get_dataloader(batch_size):
    """Build the vocabularies and a shuffled DataLoader over the whole corpus.

    The corpus is re-read with prepareData('eng', 'fra', True), so the input
    language is French and the output language is English.

    Args:
        batch_size: number of sentence pairs per batch.

    Returns:
        (input_lang, output_lang, train_dataloader). Every batch is a pair of
        long tensors of shape (b, MAX_LENGTH + 1) that already live on
        `device`; each row reads <SOS> w1 ... wk <EOS> <PAD> ... <PAD>.
    """
    input_lang, output_lang, pairs = prepareData('eng', 'fra', True)

    n = len(pairs)
    # filterPair keeps sentences with at most MAX_LENGTH - 1 words; adding
    # <SOS> and <EOS> gives at most MAX_LENGTH + 1 ids per row.
    seq_len = MAX_LENGTH + 1
    # Pre-fill with PAD_token explicitly: the padding masks and the loss'
    # ignore_index are both defined in terms of PAD_token.
    input_ids = np.full((n, seq_len), PAD_token, dtype=np.int32)
    target_ids = np.full((n, seq_len), PAD_token, dtype=np.int32)

    for idx, (inp, tgt) in enumerate(pairs):
        # Sentence -> ids, wrapped in <SOS> ... <EOS>
        inp_ids = [SOS_token] + indexesFromSentence(input_lang, inp) + [EOS_token]
        tgt_ids = [SOS_token] + indexesFromSentence(output_lang, tgt) + [EOS_token]
        # Only the leading slots are overwritten; the tail stays <PAD>.
        input_ids[idx, :len(inp_ids)] = inp_ids
        target_ids[idx, :len(tgt_ids)] = tgt_ids

    train_data = TensorDataset(torch.LongTensor(input_ids).to(device),
                               torch.LongTensor(target_ids).to(device))

    # RandomSampler reshuffles the examples on every pass over the loader.
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)
    return input_lang, output_lang, train_dataloader

In [11]:
# Smoke test: pull one batch of two examples and decode the target rows back
# into words to check the <SOS> ... <EOS> <PAD> layout.
input_lang, output_lang, train_dataloader = get_dataloader(2)
for inp, tgt in train_dataloader:
    print(tgt)
    # First target row of the batch, id by id.
    indexs = tgt[0].tolist()
    print(" ".join([output_lang.index2word[idx] for idx in indexs]))
    # Second target row of the batch.
    indexs = tgt[1].tolist()
    print(" ".join([output_lang.index2word[idx] for idx in indexs]))
    # One batch is enough for the check.
    break

Reading lines...
Read 135842 sentence pairs
Trimmed to 11445 sentence pairs
Counting words...
Counted words:
fra 4602
eng 2992
tensor([[   1,    3,    4,  147, 2333,  378,  532, 1663,    2,    0,    0],
        [   1,  129,   78,   42,  841,  469,    2,    0,    0,    0,    0]],
       device='cuda:0')
<SOS> i m not comparing tom to mary <EOS> <PAD> <PAD>
<SOS> you re a wonderful friend <EOS> <PAD> <PAD> <PAD> <PAD>


----
# Model

In [12]:
#ref: https://pytorch.org/tutorials/beginner/translation_transformer.html

from torch import Tensor
import torch
import torch.nn as nn
from torch.nn import Transformer
import math
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# helper Module that adds positional encoding to the token embedding to introduce a notion of word order.
class PositionalEncoding(nn.Module):
    """Add fixed sinusoidal position information to token embeddings.

    A (1, maxlen, emb_size) table is precomputed once: even channels hold
    sin(pos * den) and odd channels hold cos(pos * den). It is stored as the
    buffer 'pos_embedding', so it follows the module across devices without
    being a trainable parameter.
    """

    def __init__(self, emb_size, dropout, maxlen = 5000):
        super().__init__()
        positions = torch.arange(0, maxlen).reshape(maxlen, 1)
        # One frequency per (sin, cos) channel pair.
        den = torch.exp(- torch.arange(0, emb_size, 2)* math.log(10000) / emb_size)
        angles = positions * den  # (maxlen, number of even channels)

        table = torch.zeros((maxlen, emb_size))
        table[:, 0::2] = torch.sin(angles)
        table[:, 1::2] = torch.cos(angles)

        self.dropout = nn.Dropout(dropout)
        # Leading batch axis of size 1 so the table broadcasts over (b, t, c).
        self.register_buffer('pos_embedding', table.unsqueeze(0))

    def forward(self, token_embedding):
        """Return dropout(token_embedding + positions) for a (b, t, c) input."""
        steps = token_embedding.size(1)
        positioned = token_embedding + self.pos_embedding[:, :steps]
        return self.dropout(positioned)

# helper Module to convert tensor of input indices into corresponding tensor of token embeddings
class TokenEmbedding(nn.Module):
    """Look up token embeddings and scale them by sqrt(emb_size)."""

    def __init__(self, vocab_size: int, emb_size):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.emb_size = emb_size

    def forward(self, tokens: Tensor):
        """Embed `tokens` (cast to long); the result gains a trailing emb_size axis."""
        scale = math.sqrt(self.emb_size)
        embedded = self.embedding(tokens.long())  # tokens.shape + (emb_size,)
        return embedded * scale

# Seq2Seq Network
class Seq2SeqTransformer(nn.Module):
    """Encoder-decoder translator built around torch.nn.Transformer.

    Source and target tokens have separate embedding tables but share one
    positional encoding; a linear `generator` maps decoder outputs to
    target-vocabulary logits. The transformer is created with
    batch_first=True, so sequences are laid out as (b, t, c).
    """

    def __init__(self, num_encoder_layers, num_decoder_layers,
                 emb_size, nhead, src_vocab_size, tgt_vocab_size,
                 dim_feedforward = 512, dropout = 0.1):
        super().__init__()
        # The sub-modules are created in this exact order: it determines the
        # order of model.parameters(), which seeded initialization walks.
        self.transformer = Transformer(
            d_model=emb_size,
            nhead=nhead,
            num_encoder_layers=num_encoder_layers,
            num_decoder_layers=num_decoder_layers,
            dim_feedforward=dim_feedforward,
            dropout=dropout,
            batch_first=True,
        )
        self.generator = nn.Linear(emb_size, tgt_vocab_size)
        self.src_tok_emb = TokenEmbedding(src_vocab_size, emb_size)
        self.tgt_tok_emb = TokenEmbedding(tgt_vocab_size, emb_size)
        self.positional_encoding = PositionalEncoding(emb_size, dropout=dropout)

    def _embed_source(self, src):
        # Token embedding followed by positional encoding for the source side.
        return self.positional_encoding(self.src_tok_emb(src))

    def _embed_target(self, tgt):
        # Token embedding followed by positional encoding for the target side.
        return self.positional_encoding(self.tgt_tok_emb(tgt))

    def forward(self, src, trg, src_mask, tgt_mask,
                src_padding_mask, tgt_padding_mask, memory_key_padding_mask):
        """Run the full encoder-decoder pass and return logits (b, t_tgt, vocab)."""
        src_emb = self._embed_source(src)
        tgt_emb = self._embed_target(trg)
        hidden = self.transformer(
            src_emb,
            tgt_emb,
            src_mask=src_mask,
            tgt_mask=tgt_mask,
            memory_mask=None,
            src_key_padding_mask=src_padding_mask,
            tgt_key_padding_mask=tgt_padding_mask,
            memory_key_padding_mask=memory_key_padding_mask,
        )
        # hidden: (b, t_tgt, c) -> logits: (b, t_tgt, vocab)
        return self.generator(hidden)

    def encode(self, src, src_mask):
        """Run only the encoder stack on the embedded source tokens."""
        return self.transformer.encoder(self._embed_source(src), src_mask)

    def decode(self, tgt, memory, tgt_mask):
        """Run only the decoder stack on the embedded target tokens and `memory`."""
        return self.transformer.decoder(self._embed_target(tgt), memory, tgt_mask)

In [13]:
def generate_square_subsequent_mask(size):
    """Build the (size, size) additive causal mask on `device`.

    Entry [i, j] is 0.0 when j <= i (position i may look at j) and -inf when
    j > i (future positions are hidden).
    """
    ones = torch.ones((size, size), device=device)
    # Upper triangle transposed = lower triangle including the diagonal.
    visible = torch.triu(ones).transpose(0, 1) == 1
    mask = visible.float()
    mask = mask.masked_fill(~visible, float('-inf'))
    mask = mask.masked_fill(visible, float(0.0))
    return mask  # (t_tgt, t_tgt)


def create_mask(src, tgt):
    """Create the attention and padding masks for a (b, t_src) / (b, t_tgt) batch.

    Returns:
        src_mask: (t_src, t_src) bool tensor of all False (nothing is hidden).
        tgt_mask: (t_tgt, t_tgt) causal mask from generate_square_subsequent_mask.
        src_padding_mask: (b, t_src) bool tensor, True where src == PAD_token.
        tgt_padding_mask: (b, t_tgt) bool tensor, True where tgt == PAD_token.
    """
    src_len, tgt_len = src.shape[1], tgt.shape[1]

    tgt_mask = generate_square_subsequent_mask(tgt_len)
    src_mask = torch.zeros((src_len, src_len), device=device).type(torch.bool)

    pad_in_src = src == PAD_token
    pad_in_tgt = tgt == PAD_token
    return src_mask, tgt_mask, pad_in_src, pad_in_tgt

In [40]:
# Seed torch's RNG before the model is built and initialized.
torch.manual_seed(0)

# Vocabulary sizes come from the Lang objects built earlier in the notebook.
SRC_VOCAB_SIZE = input_lang.n_words
TGT_VOCAB_SIZE = output_lang.n_words
EMB_SIZE = 512  # embedding width, also the transformer's d_model
NHEAD = 8  # number of attention heads
FFN_HID_DIM = 512  # passed as dim_feedforward to the transformer
NUM_ENCODER_LAYERS = 3
NUM_DECODER_LAYERS = 3

model = Seq2SeqTransformer(NUM_ENCODER_LAYERS, NUM_DECODER_LAYERS, EMB_SIZE, NHEAD, SRC_VOCAB_SIZE, TGT_VOCAB_SIZE, FFN_HID_DIM)
model = model.to(device)

# Xavier-initialize every parameter with more than one dimension (weight
# matrices, including the embedding tables); 1-D parameters keep their
# default initialization.
for p in model.parameters():
    if p.dim() > 1:
        nn.init.xavier_uniform_(p)

# Targets equal to PAD_token do not contribute to the loss.
loss_fn = torch.nn.CrossEntropyLoss(ignore_index=PAD_token)

optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=(0.9, 0.98), eps=1e-9)

In [47]:
from torch.utils.data import DataLoader

def train_epoch(model, train_dataloader, optimizer):
    """Train `model` for one pass over `train_dataloader`.

    Uses teacher forcing: the decoder is fed tgt[:, :-1] and is trained to
    predict tgt[:, 1:]. The loss comes from the module-level `loss_fn`.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    total_loss = 0
    num_batches = 0

    for src, tgt in train_dataloader:
        # src, tgt: (b, t)
        src, tgt = src.to(device), tgt.to(device)

        decoder_input = tgt[:, :-1]  # drop the last token -> (b, t - 1)
        expected = tgt[:, 1:]        # drop the leading <SOS> -> (b, t - 1)

        src_mask, tgt_mask, src_pad, tgt_pad = create_mask(src, decoder_input)

        optimizer.zero_grad()
        # The encoder padding mask is reused as memory_key_padding_mask.
        logits = model(src, decoder_input, src_mask, tgt_mask,
                       src_pad, tgt_pad, src_pad)
        # logits: (b, t - 1, vocab_size), flattened to one row per token.
        vocab_size = logits.shape[-1]
        loss = loss_fn(logits.reshape(-1, vocab_size), expected.reshape(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()
        num_batches += 1

    return total_loss / num_batches

from timeit import default_timer as timer
NUM_EPOCHS = 80
# Rebuild the vocabularies and the loader with batch size 64. This rebinds the
# input_lang/output_lang globals that were used to size the model.
# NOTE(review): presumably the vocabulary comes out identical on every run
# because the corpus and the filtering are fixed — confirm.
input_lang, output_lang, train_dataloader = get_dataloader(64)
for epoch in range(1, NUM_EPOCHS+1):
    start_time = timer()
    train_loss = train_epoch(model, train_dataloader, optimizer)
    end_time = timer()
    # TODO: no validation dataloader is built in this notebook, so
    # evaluate(model, val_dataloader) is not reported here.
    print((f"Epoch: {epoch}, Train loss: {train_loss:.3f} "f"Epoch time = {(end_time - start_time):.3f}s"))


Reading lines...
Read 135842 sentence pairs
Trimmed to 11445 sentence pairs
Counting words...
Counted words:
fra 4602
eng 2992
Epoch: 1, Train loss: 3.889 Epoch time = 5.680s
Epoch: 2, Train loss: 2.689 Epoch time = 10.686s
Epoch: 3, Train loss: 2.221 Epoch time = 5.737s
Epoch: 4, Train loss: 1.892 Epoch time = 7.001s
Epoch: 5, Train loss: 1.634 Epoch time = 6.378s
Epoch: 6, Train loss: 1.419 Epoch time = 6.492s
Epoch: 7, Train loss: 1.236 Epoch time = 6.262s
Epoch: 8, Train loss: 1.082 Epoch time = 6.510s
Epoch: 9, Train loss: 0.946 Epoch time = 5.910s
Epoch: 10, Train loss: 0.832 Epoch time = 5.878s
Epoch: 11, Train loss: 0.735 Epoch time = 5.939s
Epoch: 12, Train loss: 0.642 Epoch time = 5.883s
Epoch: 13, Train loss: 0.567 Epoch time = 6.725s
Epoch: 14, Train loss: 0.500 Epoch time = 6.653s
Epoch: 15, Train loss: 0.440 Epoch time = 6.941s
Epoch: 16, Train loss: 0.392 Epoch time = 5.719s
Epoch: 17, Train loss: 0.343 Epoch time = 6.408s
Epoch: 18, Train loss: 0.303 Epoch time = 5.994s

In [52]:
# Saves the whole model object (not only a state_dict) to myTranslator.pt in
# the current working directory.
torch.save(model, "myTranslator.pt")

In [42]:
def evaluate(model, val_dataloader):
    """Compute the mean per-batch loss of `model` over `val_dataloader`.

    Mirrors train_epoch without the optimizer step: the decoder is fed
    tgt[:, :-1] and scored against tgt[:, 1:] with the module-level `loss_fn`.

    Args:
        model: the Seq2SeqTransformer to score; it is switched to eval mode.
        val_dataloader: iterable of (src, tgt) batches shaped (b, t).

    Returns:
        The mean of the per-batch losses.
    """
    model.eval()
    losses = 0
    count = 0

    for src, tgt in val_dataloader:
        # src, tgt: (b, t)
        src = src.to(device)
        tgt = tgt.to(device)

        tgt_input = tgt[:, :-1]  # drop the last token -> (b, t - 1)

        src_mask, tgt_mask, src_padding_mask, tgt_padding_mask = create_mask(src, tgt_input)

        logits = model(src, tgt_input, src_mask, tgt_mask,src_padding_mask, tgt_padding_mask, src_padding_mask)

        # Batches are batch-first, so the leading <SOS> is removed along the
        # time axis (dim 1); this keeps targets aligned with the logits.
        tgt_out = tgt[:, 1:]
        loss = loss_fn(logits.reshape(-1, logits.shape[-1]), tgt_out.reshape(-1))
        losses += loss.item()
        count += 1

    return losses / count

In [18]:
# function to generate output sequence using greedy algorithm
def greedy_decode(model, src, src_mask, max_len, start_symbol):
    """Greedily generate target ids for a single source sentence.

    The source is encoded once; the decoder is then re-run on the growing
    prefix and the highest-scoring token is appended each time. Generation
    stops after EOS_token is appended or after max_len - 1 steps.

    Returns:
        A (1, t) long tensor that starts with `start_symbol`.
    """
    src = src.to(device)
    src_mask = src_mask.to(device)

    # Encoder output, (1, t_src, emb_size); computed once and reused.
    memory = model.encode(src, src_mask).to(device)

    ys = torch.ones(1, 1).fill_(start_symbol).type(torch.long).to(device)  # (1, 1)
    for _ in range(max_len - 1):
        prefix_len = ys.size(1)
        # Boolean causal mask: True marks future positions to hide.
        causal_mask = generate_square_subsequent_mask(prefix_len).type(torch.bool).to(device)
        hidden = model.decode(ys, memory, causal_mask)  # (1, prefix_len, emb_size)
        scores = model.generator(hidden[:, -1])         # (1, vocab_size)
        next_word = torch.max(scores, dim=1)[1].item()

        appended = torch.ones(1, 1).type_as(src.data).fill_(next_word)
        ys = torch.cat([ys, appended], dim=1)
        if next_word == EOS_token:
            break
    return ys


# actual function to translate input sentence into target language
def translate(model, src_sentence: str):
    """Translate one normalized source sentence with greedy decoding.

    The sentence is encoded with input_lang, decoded for at most
    num_tokens + 5 steps and mapped back to words with output_lang. The
    returned string keeps the "<SOS>" / "<EOS>" markers.
    """
    model.eval()
    src = tensorFromSentence(input_lang, src_sentence)  # (1, t)
    num_tokens = src.shape[1]
    # All-False mask: every source position may attend to every other one.
    src_mask = torch.zeros(num_tokens, num_tokens).type(torch.bool)
    decoded = greedy_decode(model, src, src_mask,
                            max_len=num_tokens + 5, start_symbol=SOS_token)
    token_ids = decoded.flatten().cpu().tolist()
    words = [output_lang.index2word[token] for token in token_ids]
    return " ".join(words)

In [14]:
# torch.load unpickles arbitrary Python objects: only load checkpoints you
# created yourself. map_location lets a checkpoint saved from a GPU session be
# restored on whatever `device` this runtime has (e.g. CPU-only).
# NOTE(review): recent PyTorch releases may default to weights_only=True, in
# which case loading a whole pickled model needs weights_only=False — confirm
# against the installed version.
myModel = torch.load('myTranslator.pt', map_location=device)

In [36]:
# Qualitative check: translate ten random pairs with the reloaded model and
# print them next to the reference. The pairs are drawn from the same list
# used to build the training data.
for i in range(10):
    french, english = random.choice(pairs)
    print(f"===== Example {i+1} =====")
    print("French, X \t=", french)
    print("English, Y \t=", english)
    print("Predicted, Yhat =", translate(myModel, french))

===== Example 1 =====
French, X 	= nous prenons l ascendant
English, Y 	= we re taking over
Predicted, Yhat = <SOS> we re taking over <EOS>
===== Example 2 =====
French, X 	= vous plaisantez bien sur
English, Y 	= you re joking of course
Predicted, Yhat = <SOS> you re joking of course <EOS>
===== Example 3 =====
French, X 	= vous commettez une grosse erreur
English, Y 	= you re making a big mistake
Predicted, Yhat = <SOS> you re making a big mistake <EOS>
===== Example 4 =====
French, X 	= je suis tres serieux
English, Y 	= i m very serious
Predicted, Yhat = <SOS> i m quite serious <EOS>
===== Example 5 =====
French, X 	= tu es bien plus jolie que dans mon souvenir
English, Y 	= you re much prettier than i remember
Predicted, Yhat = <SOS> you re much prettier than i remember <EOS>
===== Example 6 =====
French, X 	= c est bon pour moi pour l instant
English, Y 	= i m good for now
Predicted, Yhat = <SOS> i m good for now <EOS>
===== Example 7 =====
French, X 	= tu me fais marrer
English,

In [None]:
# Single-sentence check; uses the in-memory `model`, not the reloaded `myModel`.
print(translate(model, "je suis enchantee d etre ici"))

In [None]:
# Single-sentence check; uses the in-memory `model`, not the reloaded `myModel`.
print(translate(model, "je ne suis pas encore prete"))

In [None]:
# Single-sentence check; uses the in-memory `model`, not the reloaded `myModel`.
print(translate(model, "je commence a peine"))