In [2]:
"""
    Encoder-Decoder Model
    - 1. Generate the other side of a data pair from the given side
    when the data consists of two related targets.
    
    - 2. For example, when we have English and French sentences,
    we can create a model that translates English to French.
    
    - 3. We might also be able to create an automatic question-answering
    model when the two targets are Question and Answer.
    
    - 4. We can also generate explanations of images by combining it
    with a CNN, which means this is one of the important models
    among DL (Deep Learning) and NN (Neural Network) models.
    
    - 5. From here, we are going to create a Translation-Model that
    translates English to Spanish.
    
    - 6. Planning to convert this code into a standalone file, together
    with a new README.md file that contains these comments.
"""
import torch
from torch import nn, optim
from torch.utils.data import Dataset, TensorDataset, DataLoader
import tqdm

In [3]:
import re
import collections
import itertools

# Characters that are dropped entirely during normalisation: common
# punctuation, the Spanish inverted marks, and anything that looks like
# an <...> tag. Raw strings are used so the backslashes reach the regex
# engine untouched and Python does not warn about invalid string escapes.
remove_marks_regex = re.compile(
    r"[\,\(\)\[\]\*:;¿¡]|<.*?>"
)
# Sentence-final marks ('?', '!', '.') captured as group 1 so a blank can
# be inserted in front of them.
shift_marks_regex = re.compile(r"([?!\.])")

In [4]:
# Ids of the special tokens. They match the positions of "<UNK>",
# "<SOS>" and "<EOS>" at the head of the word list that build_vocab
# creates.
unk, sos, eos = 0, 1, 2

In [5]:
"""
    - 1. Convert everything to lower-case, delete the unnecessary
    characters, and then separate '?', '!' and '.' from the
    preceding word.
    
    - 2. Spanish in particular puts '¿' or '¡' in front of
    interrogative and exclamatory sentences. To simplify the symbols,
    only the marks that English also uses are kept, and the
    Spanish-specific symbols '¿' and '¡' are deleted.
"""
def normalize(text):
    """Return a cleaned, lower-cased copy of ``text``.

    Every match of ``remove_marks_regex`` is deleted, and every match of
    ``shift_marks_regex`` gets a single blank inserted in front of it so
    that a later whitespace split yields it as its own token.

    Args:
        text: The raw string to clean.

    Returns:
        The normalised string.
    """
    lowered = text.lower()
    # Delete the unwanted characters first ...
    cleaned = remove_marks_regex.sub("", lowered)
    # ... then detach the remaining sentence marks from the previous word.
    return shift_marks_regex.sub(r" \1", cleaned)

In [6]:
"""
    - 1. Convert one line of 'spa.txt' into two token lists,
    one for English and one for Spanish.
    
    - 2. 'build_vocab' creates the vocabulary.
    
    - 3. When building the vocabulary, make sure to add the special
    tags for padding, start and end.
"""
def parse_line(line):
    """Split one tab-separated line into source and target token lists.

    The line is stripped and passed through ``normalize`` before it is
    split on tab characters. The first field is the translation source
    and the second field is the translation target; any further fields
    are ignored.

    Args:
        line: One raw line of the parallel corpus.

    Returns:
        A ``(src_tokens, trg_tokens)`` tuple of whitespace-split tokens.

    Raises:
        ValueError: If the line has fewer than two tab-separated fields.
    """
    line = normalize(line.strip())

    # Only the first two columns are used; slicing keeps the unpacking
    # from failing when a line carries additional columns.
    src, trg = line.split("\t")[:2]
    src_tokens = src.strip().split()
    # The target tokens must come from the target field (the original
    # tokenised ``src`` twice, which made both sides identical).
    trg_tokens = trg.strip().split()
    return src_tokens, trg_tokens

In [7]:
def build_vocab(tokens):
    """Build a vocabulary from an iterable of tokens.

    Args:
        tokens: Iterable of hashable tokens (every occurrence in the
            corpus, duplicates included).

    Returns:
        A ``(word_list, word_dict)`` tuple. ``word_list`` starts with the
        three tags ``"<UNK>"``, ``"<SOS>"`` and ``"<EOS>"`` followed by
        the distinct tokens ordered from most to least frequent (ties
        keep first-seen order). ``word_dict`` maps each entry of
        ``word_list`` to its index.
    """
    # Count how often every token appears.
    counts = collections.Counter(tokens)

    # most_common() yields (token, count) pairs sorted by descending
    # count; this replaces the broken ``counts.item()`` call.
    sorted_counts = counts.most_common()

    # Index -> word list with the three special tags in front, and the
    # reverse word -> index dictionary.
    word_list = ["<UNK>", "<SOS>", "<EOS>"] \
        + [word for word, _ in sorted_counts]
    word_dict = {w: i for i, w in enumerate(word_list)}
    return word_list, word_dict

In [8]:
"""
    - 1. Convert a word list into a Tensor.
    
    - 2. Given a maximum length, pad the sequences that are shorter.
"""
def words_to_tensor(words, word_dict, max_len, padding=0):
    """Encode a list of words as an ``int64`` tensor of ids.

    ``"<EOS>"`` is appended to ``words``, each word is looked up in
    ``word_dict`` (words that are missing become id 0), and the result is
    right-padded with ``padding`` up to ``max_len + 1`` entries. Longer
    sequences are left as they are.

    Args:
        words: List of word strings.
        word_dict: Mapping from word to integer id.
        max_len: Maximum number of words, not counting ``"<EOS>"``.
        padding: Value used to fill the unused tail.

    Returns:
        A ``(tensor, seq_len)`` tuple, where ``seq_len`` is the number of
        ids before padding (``"<EOS>"`` included).
    """
    # Look up the ids for the words plus the closing tag in one pass.
    ids = [word_dict.get(token, 0) for token in words + ["<EOS>"]]
    seq_len = len(ids)

    # Fill the tail only when the sequence is shorter than the target.
    missing = max_len + 1 - seq_len
    if missing > 0:
        ids += [padding] * missing

    return torch.tensor(ids, dtype=torch.int64), seq_len

In [9]:
class TranslationPairDataset(Dataset):
    """Dataset of (source, target) sentence pairs read from a text file.

    Every non-blank line of the file is parsed with ``parse_line``. Pairs
    in which either side has more than ``max_len`` tokens are dropped.
    Separate vocabularies are built for the source and the target side
    with ``build_vocab``, and every sentence is encoded with
    ``words_to_tensor``.

    Attributes set by ``__init__``:
        src_word_list / src_word_dict: source-side vocabulary.
        trg_word_list / trg_word_dict: target-side vocabulary.
        src_data: list of ``(tensor, length)`` tuples, padded with 0.
        trg_data: list of ``(tensor, length)`` tuples, padded with -100.

    Args:
        path: Path of the tab-separated corpus file.
        max_len: Maximum number of tokens allowed on either side.
        encoding: Text encoding used to read the file.
    """

    def __init__(self, path, max_len=15, encoding="utf-8"):

        # Keep only the pairs whose two token lists both fit max_len.
        def filter_pair(p):
            return len(p[0]) <= max_len and len(p[1]) <= max_len

        # Read, parse and filter the file. The encoding is passed
        # explicitly so reading does not depend on the platform locale,
        # and blank lines are skipped because parse_line cannot unpack
        # them.
        with open(path, encoding=encoding) as fp:
            non_blank = (line for line in fp if line.strip())
            pairs = [p for p in map(parse_line, non_blank)
                     if filter_pair(p)]

        # Divide the pairs into source and target sentences.
        src = [p[0] for p in pairs]
        trg = [p[1] for p in pairs]

        # Build one vocabulary per side from all of its tokens.
        self.src_word_list, self.src_word_dict = \
            build_vocab(itertools.chain.from_iterable(src))
        self.trg_word_list, self.trg_word_dict = \
            build_vocab(itertools.chain.from_iterable(trg))

        # Encode every sentence as a padded tensor plus its length.
        self.src_data = [
            words_to_tensor(words, self.src_word_dict, max_len)
            for words in src
        ]
        # NOTE(review): -100 presumably matches the default ignore_index
        # of the loss used for training - confirm against the training
        # code.
        self.trg_data = [
            words_to_tensor(words, self.trg_word_dict, max_len, -100)
            for words in trg
        ]

    # __len__ and __getitem__ are defined at class level. In the original
    # they were nested inside __init__ and therefore never became methods.
    def __len__(self):
        """Return the number of sentence pairs."""
        return len(self.src_data)

    def __getitem__(self, idx):
        """Return ``(src, src_len, trg, trg_len)`` for pair ``idx``."""
        src, lsrc = self.src_data[idx]
        trg, ltrg = self.trg_data[idx]
        return src, lsrc, trg, ltrg

In [11]:
# Settings for loading the data: mini-batch size, maximum number of
# tokens per sentence, and the location of the corpus file.
batch_size, max_len = 64, 10
path = "../05/spa.txt"

# Dataset / loader construction, left disabled exactly as in the
# original cell:
# ds = TranslationPairDataset(path, max_len=max_len)
# loader = DataLoader(ds, batch_size=batch_size, shuffle=True,
#                    num_workers=4)