In [64]:
import os

import spacy
import torch
from torchtext import data, datasets

In [28]:
# Directory that holds the parallel corpus files.
# Set the MT_DATA_DIR environment variable to point at another location; when
# it is unset, the original local path is used, so existing runs are unchanged.
# os.path.join(..., '') guarantees a trailing path separator, which the later
# `fileDir + '<file prefix>'` concatenation relies on.
fileDir = os.path.join(
    os.environ.get(
        'MT_DATA_DIR',
        '/Users/xinyi.ye/Documents/machine_translate/experiments/train4/',
    ),
    '',
)

In [30]:
# Special tokens referenced by the fields below.
BOS_WORD, EOS_WORD, BLANK_WORD = '<s>', '</s>', "<blank>"

# A Field defines how a raw text column is processed. No tokenizer is passed,
# so the default (plain string split) applies to both sides.
# Source side: padding token only.
SRC = data.Field(pad_token=BLANK_WORD)
# Target side: padding token plus begin/end-of-sentence markers.
TGT = data.Field(
    pad_token=BLANK_WORD,
    init_token=BOS_WORD,
    eos_token=EOS_WORD,
)

In [31]:
# Build the parallel dataset from the 'train-infoq' file pair:
# the '.en' side is processed by SRC and the '.zh' side by TGT.
traindataset = datasets.TranslationDataset(
    path=fileDir + 'train-infoq',
    exts=('.en', '.zh'),
    fields=(SRC, TGT),
)

In [54]:
# Instance attributes stored on the dataset object.
vars(traindataset).keys()

dict_keys(['examples', 'fields'])

In [35]:
# Number of examples (src/trg sentence pairs) held by the dataset.
len(traindataset.examples)

3412

In [36]:
# Mapping from column name to the Field object that processes that column.
traindataset.fields

{'src': <torchtext.data.field.Field at 0x12fcae190>,
 'trg': <torchtext.data.field.Field at 0x12fcae1d0>}

In [37]:
# Sort key that TranslationDataset defines; presumably used to order examples
# by length when no explicit sort_key is given to an iterator - TODO confirm.
traindataset.sort_key

<function torchtext.datasets.translation.TranslationDataset.sort_key(ex)>

In [40]:
# traindataset[0] is an Example object with two attributes: src and trg.

vars(traindataset[0]).keys()

dict_keys(['src', 'trg'])

In [55]:
# Same lookup as in an earlier cell: the sort key defined on the dataset class.
traindataset.sort_key

<function torchtext.datasets.translation.TranslationDataset.sort_key(ex)>

In [41]:
# Source-side token list of the first example.
traindataset[0].src

['A',
 'security',
 'vulnerability',
 'that',
 'has',
 'hit',
 'Internet',
 'Explorer',
 'through',
 '.NET',
 'has',
 'also',
 'hit',
 'Firefox',
 '.']

In [53]:
# Target-side token list of the first example.
traindataset[0].trg

['近日',
 '缘起于',
 '.',
 'NET',
 '的',
 '一',
 '个',
 '安全',
 '漏洞',
 '不仅',
 '对',
 'InternetExplorer',
 '造成',
 '了',
 '影响',
 '，',
 '甚至',
 '连',
 'Firefox',
 '也',
 '未',
 '能',
 '幸免',
 '。']

In [46]:
# Build, for each field, a map from every word to a unique integer index.
# Only indices are assigned here; pretrained vectors can also be loaded
# at this step if needed.
for _vocab_field in (SRC, TGT):
    _vocab_field.build_vocab(traindataset)

In [49]:
# stoi: string-to-index mapping of the source vocabulary.
# NOTE(review): this displays the whole mapping (thousands of entries);
# consider showing only a slice when sharing the notebook.
SRC.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x130a66cd0>>,
            {'<unk>': 0,
             '<blank>': 1,
             'the': 2,
             '.': 3,
             ',': 4,
             'to': 5,
             'and': 6,
             'a': 7,
             'of': 8,
             'is': 9,
             'in': 10,
             '-': 11,
             'that': 12,
             'for': 13,
             'on': 14,
             'be': 15,
             'it': 16,
             'with': 17,
             'are': 18,
             'as': 19,
             '.NET': 20,
             'can': 21,
             'The': 22,
             '(': 23,
             ':': 24,
             'this': 25,
             ')': 26,
             'an': 27,
             'not': 28,
             'will': 29,
             '"': 30,
             'by': 31,
             'you': 32,
             'or': 33,
             'has': 34,
             'have': 35,
             'from': 36,
             'which': 37,
        

In [50]:
# Number of entries in the source string-to-index mapping.
len(SRC.vocab.stoi)

8890

In [51]:
# stoi: string-to-index mapping of the target vocabulary; unlike the source
# side it also contains the '<s>' and '</s>' markers.
# NOTE(review): this displays the whole mapping (thousands of entries);
# consider showing only a slice when sharing the notebook.
TGT.vocab.stoi

defaultdict(<bound method Vocab._default_unk_index of <torchtext.vocab.Vocab object at 0x130a66f50>>,
            {'<unk>': 0,
             '<blank>': 1,
             '<s>': 2,
             '</s>': 3,
             '的': 4,
             '，': 5,
             '。': 6,
             '在': 7,
             '一': 8,
             '是': 9,
             '了': 10,
             '个': 11,
             '和': 12,
             '中': 13,
             '、': 14,
             '这': 15,
             '使用': 16,
             '会': 17,
             '可以': 18,
             '（': 19,
             '）': 20,
             '：': 21,
             '不': 22,
             '.': 23,
             '它': 24,
             '将': 25,
             '有': 26,
             '应用': 27,
             '种': 28,
             '我们': 29,
             '对': 30,
             '你': 31,
             '上': 32,
             '就': 33,
             '代码': 34,
             '也': 35,
             '数据': 36,
             '程序': 37,
             '提供': 38,
             '开发': 39,
    

In [52]:
# Number of entries in the target string-to-index mapping.
len(TGT.vocab.stoi)

7800

In [97]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [71]:
# Iterator: loads batches of data from a Dataset.
# BucketIterator: an Iterator that batches examples of similar lengths together.

train_iter = data.BucketIterator(
    traindataset,                   # dataset the iterator draws its examples from
    batch_size=64,
    sort_key=lambda x: len(x.src),  # tells the iterator how to group examples: by source length
    sort_within_batch=False,
    repeat=False,                   # kept off because this iterator is meant to be wrapped by another layer
    device=device,                  # device (CPU or GPU) selected earlier in the notebook
)

In [90]:
# Show the iterator object itself (only its default repr is displayed).
train_iter

<torchtext.data.iterator.BucketIterator at 0x130e893d0>

In [95]:
# Iterator.data() returns the dataset's examples either in their original
# order, sorted, or shuffled; its length equals the number of examples.

len(train_iter.data())

3412

In [115]:
# Source tokens of the third example as returned by the iterator.
# NOTE(review): data() may return a shuffled order, so which example appears
# at index 2 may differ between calls - TODO confirm.
train_iter.data()[2].src

['While',
 'the',
 'GitHub',
 'repository',
 'shows',
 'that',
 'pull',
 'requests',
 'on',
 'the',
 'framework',
 'have',
 'continued',
 ',',
 'the',
 'velocity',
 'of',
 'the',
 'framework',
 'has',
 'stalled',
 '.']

In [126]:
# train_iter (a BucketIterator object) is iterable and yields one Batch
# object per step; only the first batch is fetched here, and its source and
# target tensors are kept for inspection.

i = next(iter(train_iter))
print(i)
batchsrc0, batchtrg0 = i.src, i.trg


[torchtext.data.batch.Batch of size 64]
	[.src]:[torch.LongTensor of size 72x64]
	[.trg]:[torch.LongTensor of size 91x64]


In [128]:
# A Batch is a collection of Examples:
# every column of a Batch tensor represents one Example.

# Take column 0, i.e. the first example of the batch, on both sides.
examplesrc0, exampletrg0 = batchsrc0[:, 0], batchtrg0[:, 0]
batchsrc0

tensor([[ 677, 3554, 5945,  ...,   39,   55,   22],
        [  65,  313, 8330,  ...,  619,    2,  256],
        [  21,    3,   44,  ...,    8,  364,  408],
        ...,
        [   1,    1,    1,  ...,    1,   19,    1],
        [   1,    1,    1,  ...,    1, 1311,    1],
        [   1,    1,    1,  ...,    1,    3,    1]])

In [129]:
# Print the first example position by position: each source index with its
# word, next to the target index and word at the same position.
# zip() stops at the shorter of the two columns.
for src_id, trg_id in zip(examplesrc0, exampletrg0):
    src_word = SRC.vocab.itos[src_id]
    trg_word = TGT.vocab.itos[trg_id]
    print(src_id, ' ', src_word, '  ', trg_id, ' ', trg_word)

tensor(677)   VB    tensor(2)   <s>
tensor(65)   developers    tensor(703)   VB
tensor(21)   can    tensor(39)   开发
tensor(419)   write    tensor(102)   人员
tensor(1645)   console    tensor(18)   可以
tensor(89)   applications    tensor(253)   针对
tensor(6)   and    tensor(23)   .
tensor(197)   class    tensor(56)   NET
tensor(267)   libraries    tensor(162)   Core
tensor(13)   for    tensor(12)   和
tensor(20)   .NET    tensor(1332)   .NETStandard
tensor(49)   Core    tensor(293)   编写
tensor(6)   and    tensor(1573)   控制台
tensor(20)   .NET    tensor(27)   应用
tensor(222)   Standard    tensor(37)   程序
tensor(3)   .    tensor(10)   了
tensor(1)   <blank>    tensor(6)   。
tensor(1)   <blank>    tensor(3)   </s>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(1)   <blank>
tensor(1)   <blank>    tensor(

In [131]:
# Size of the source vocabulary, taken from the Vocab object itself.
len(SRC.vocab)

8890