In [1]:
import torch
from torchtext import data, datasets, vocab
import spacy

In [13]:
import os

In [2]:
# Directory that holds the parallel training corpus (the train-infoq.* files).
# Set the MT_TRAIN_DIR environment variable to point the notebook at another
# location; when it is unset or empty, the original hard-coded path is used.
# os.path.join(..., '') guarantees a trailing separator, which matters because
# the dataset path is later built by plain concatenation (fileDir+'train-infoq').
fileDir = os.path.join(
    os.environ.get('MT_TRAIN_DIR')
    or '/Users/xinyi.ye/Documents/machine_translate/experiments/train4/',
    '',
)

In [3]:
# Special tokens used when numericalizing sentences.
BOS_WORD = '<s>'
EOS_WORD = '</s>'
BLANK_WORD = "<blank>"

# A Field describes how one column of raw text is processed.
# No `tokenize` argument is passed, so the default (string.split) applies.
# Source sentences only need a padding token.
SRC = data.Field(pad_token=BLANK_WORD)
# Target sentences additionally get begin- and end-of-sentence markers.
TGT = data.Field(
    init_token=BOS_WORD,
    eos_token=EOS_WORD,
    pad_token=BLANK_WORD,
)

In [4]:
# Load the parallel corpus: `path` is the prefix shared by both files and
# `exts` the per-language extensions, so train-infoq.en feeds SRC and
# train-infoq.zh feeds TGT.
traindataset = datasets.TranslationDataset(
    path=fileDir+'train-infoq',
    exts=('.en','.zh'),
    fields=(SRC, TGT),
)

In [5]:
# List the instance attributes stored on the dataset object.
vars(traindataset).keys()

dict_keys(['examples', 'fields'])

In [6]:
# Number of sentence pairs loaded from the training files.
len(traindataset.examples)

3412

In [7]:
# Maps each Example attribute name to the Field object that processes it.
traindataset.fields

{'src': <torchtext.data.field.Field at 0x1227d4190>,
 'trg': <torchtext.data.field.Field at 0x1227d41d0>}

In [8]:
# traindataset[0] is an Example object with two attributes: src and trg.

vars(traindataset[0]).keys()

dict_keys(['src', 'trg'])

In [9]:
# Source side of the first sentence pair, already split into a list of tokens.
traindataset[0].src

['A',
 'security',
 'vulnerability',
 'that',
 'has',
 'hit',
 'Internet',
 'Explorer',
 'through',
 '.NET',
 'has',
 'also',
 'hit',
 'Firefox',
 '.']

In [10]:
# Target side of the same sentence pair, also as a list of tokens.
traindataset[0].trg

['近日',
 '缘起于',
 '.',
 'NET',
 '的',
 '一',
 '个',
 '安全',
 '漏洞',
 '不仅',
 '对',
 'InternetExplorer',
 '造成',
 '了',
 '影响',
 '，',
 '甚至',
 '连',
 'Firefox',
 '也',
 '未',
 '能',
 '幸免',
 '。']

In [21]:
# Directory in which the pretrained embedding files are looked up / cached.
cache = '/Users/xinyi.ye/Documents/machine_translate/en_zh_transformer/.vector_cache'

# Pretrained 300-dimensional word vectors, one set per language
# (the cc.*.300.vec names are presumably the fastText Common Crawl releases).
srcVectors = vocab.Vectors(
    name='cc.en.300.vec',
    cache=cache,
)
tgtVectors = vocab.Vectors(
    name='cc.zh.300.vec',
    cache=cache,
)

In [22]:
# Look up the pretrained vector of every source token; one row per token.
srcVectors.get_vecs_by_tokens(traindataset[0].src)

tensor([[ 0.2200, -0.6874, -0.1392,  ...,  0.1886,  0.1140, -0.0196],
        [-0.0204, -0.0331,  0.0555,  ...,  0.0815,  0.0150, -0.0028],
        [ 0.0018,  0.0069, -0.0017,  ...,  0.0446, -0.0012,  0.0418],
        ...,
        [ 0.0264,  0.0273,  0.0430,  ..., -0.0021,  0.1013, -0.1356],
        [-0.0449,  0.0031, -0.0644,  ...,  0.0494, -0.0604,  0.0139],
        [ 0.0342, -0.0801,  0.1162,  ...,  0.5423, -0.0624,  0.0900]])

In [23]:
# Same lookup for the target tokens, using the Chinese vectors.
tgtVectors.get_vecs_by_tokens(traindataset[0].trg)

tensor([[-0.0364,  0.0463,  0.2820,  ..., -0.0539, -0.0206, -0.0342],
        [-0.0819,  0.0237,  0.1589,  ..., -0.0865, -0.0520, -0.0273],
        [-0.0178,  0.0263,  0.5518,  ...,  0.0122, -0.0048,  0.0595],
        ...,
        [ 0.0076,  0.0725,  0.4513,  ..., -0.0314, -0.0593, -0.0030],
        [ 0.2204,  0.2844,  0.4485,  ..., -0.2818,  0.0012,  0.1817],
        [ 0.0093,  0.0210,  0.7688,  ..., -0.0336,  0.0148, -0.0056]])

In [20]:
# Shape check: (number of source tokens, embedding dimension).
srcVectors.get_vecs_by_tokens(traindataset[0].src).shape

torch.Size([15, 300])

In [24]:
# Shape check: (number of target tokens, embedding dimension).
tgtVectors.get_vecs_by_tokens(traindataset[0].trg).shape

torch.Size([24, 300])

In [25]:
# Build the vocabularies: a mapping from each word to a unique integer.

# Positional arguments are the data sources the vocabulary is collected from.
# Keyword arguments are passed on to Vocab; here `vectors=` supplies the
# pretrained embeddings for each language.


SRC.build_vocab(traindataset, vectors=srcVectors)
TGT.build_vocab(traindataset, vectors=tgtVectors)

In [26]:
# stoi ("string to index") maps each vocabulary token to its integer id.
# It is a defaultdict, so a token missing from the vocabulary resolves to the
# default <unk> index instead of raising KeyError.
SRC.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x1579fbc10>>,
            {'<unk>': 0,
             '<blank>': 1,
             'the': 2,
             '.': 3,
             ',': 4,
             'to': 5,
             'and': 6,
             'a': 7,
             'of': 8,
             'is': 9,
             'in': 10,
             '-': 11,
             'that': 12,
             'for': 13,
             'on': 14,
             'be': 15,
             'it': 16,
             'with': 17,
             'are': 18,
             'as': 19,
             '.NET': 20,
             'can': 21,
             'The': 22,
             '(': 23,
             ':': 24,
             'this': 25,
             ')': 26,
             'an': 27,
             'not': 28,
             'will': 29,
             '"': 30,
             'by': 31,
             'you': 32,
             'or': 33,
             'has': 34,
             'have': 35,
             'from': 36,
             'which': 37,
        

In [29]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [30]:
# Iterator: defines an iterator that loads batches of data from a Dataset.
# BucketIterator: an Iterator that batches examples of similar lengths together.

train_iter = data.BucketIterator(
    traindataset,  # the dataset the iterator draws its examples from
    batch_size=64,
    # Length function the BucketIterator uses to group examples;
    # here only the source-sentence length is considered.
    sort_key=lambda example: len(example.src),
    sort_within_batch=False,
    # repeat=False because this iterator is meant to be wrapped by another layer.
    repeat=False,
    # torch.device on which the batch tensors are created.
    device=device,
)

In [31]:
# Echo the iterator object itself; this cell does not draw any batch.
train_iter

<torchtext.data.iterator.BucketIterator at 0x122263350>

In [32]:
# Iterator.data(): returns the examples in the dataset in order, sorted, or shuffled.
# Its length is therefore the number of examples, not the number of batches.

len(train_iter.data())

3412

In [33]:
# First example as seen by the iterator. It need not be traindataset[0],
# since data() may return the examples sorted or shuffled
# (presumably shuffled here, the usual default for training iterators - verify).
train_iter.data()[0].src

['References',
 'to',
 'immutable',
 'data',
 'need',
 'not',
 'be',
 'mutex',
 '-',
 'ed',
 '.']

In [34]:
# train_iter (a BucketIterator object) is iterable;
# every loop step yields one Batch object.
# Only the first batch is needed here, so the loop exits immediately.

for first_batch in train_iter:
    print(first_batch)
    batchsrc0, batchtrg0 = first_batch.src, first_batch.trg
    break


[torchtext.data.batch.Batch of size 64]
	[.src]:[torch.LongTensor of size 46x64]
	[.trg]:[torch.LongTensor of size 100x64]


In [35]:
# A Batch is a collection of Examples.
# Each column of a Batch tensor holds one Example; rows are token positions.

# Take column 0, i.e. the first sentence pair of the batch.
examplesrc0, exampletrg0 = batchsrc0[:, 0], batchtrg0[:, 0]
batchsrc0

tensor([[  22,  671,  139,  ...,  462,   22,   22],
        [  88,  470,  304,  ..., 1222, 1908,  101],
        [1823,  447, 3208,  ...,   13,    8,  378],
        ...,
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1],
        [   1,    1,    1,  ...,    1,    1,    1]])

In [36]:
# Print each position of the first sentence pair as "id token" for source and
# target side by side. Note that zip() stops at the shorter of the two columns.
for src_id, trg_id in zip(examplesrc0, exampletrg0):
    src_token = SRC.vocab.itos[src_id]
    trg_token = TGT.vocab.itos[trg_id]
    print(src_id,' ',src_token,'  ',trg_id,' ',trg_token)

tensor(22)   The    tensor(2)   <s>
tensor(88)   “    tensor(2923)   ReflectorPro
tensor(1823)   Pro    tensor(4)   的
tensor(92)   ”    tensor(45)   “
tensor(268)   part    tensor(4824)   Pro
tensor(8)   of    tensor(44)   ”
tensor(769)   Reflector    tensor(193)   部分
tensor(1823)   Pro    tensor(1179)   来自于
tensor(539)   comes    tensor(152)   其
tensor(36)   from    tensor(30)   对
tensor(102)   its    tensor(570)   调试
tensor(835)   debugging    tensor(4)   的
tensor(61)   support    tensor(42)   支持
tensor(3)   .    tensor(6)   。
tensor(1)   <blank>    tensor(3)   </s>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <bl