# 对pytorch的transformer的学习

# 设置的几个变量

In [1]:
PAD_TOKEN = '<PAD>'#变成编码后，应该填充是0
UNK_TOKEN = '<UNK>'#未知是1
START_TOKEN = '<StartSent>'#开始是2
END_TOKEN = '<EndSent>'#结束是3

In [2]:
# from os.path import dirname,abspath
# BASE_DIR = dirname(abspath(__file__))
# BASE_DIR
from os.path import dirname, abspath, join, exists
import os
BASE_DIR = os.getcwd() # 等价于这个，那种表示适用于脚本中获取当前的路径的

## 运行的代码
```
python prepare_datasets.py --train_source=data/example/raw/src-train.txt --train_target=data/example/raw/tgt-train.txt --val_source=data/example/raw/src-val.txt --val_target=data/example/raw/tgt-val.txt --save_data_dir=data/example/processed
```

In [3]:
train_source="data/example/raw/src-train.txt"
train_target="data/example/raw/tgt-train.txt"
val_source="data/example/raw/src-val.txt"
val_target="data/example/raw/tgt-val.txt"
save_data_dir="data/example/processed"

# 也要学一下这种表达
# from argparse import ArgumentParser

# parser = ArgumentParser('Prepare datasets')
# parser.add_argument('--train_source', type=str, default='data/example/raw/src-train.txt')
# parser.add_argument('--train_target', type=str, default='data/example/raw/tgt-train.txt')
# parser.add_argument('--val_source', type=str, default='data/example/raw/src-val.txt')
# parser.add_argument('--val_target', type=str, default='data/example/raw/tgt-val.txt')
# parser.add_argument('--save_data_dir', type=str, default='data/example/processed')
# parser.add_argument('--share_dictionary', type=bool, default=False)

# args = parser.parse_args()

In [4]:
class TranslationDataset:

    def __init__(self, data_dir, phase, limit=None):
        assert phase in ('train', 'val'), "Dataset phase must be either 'train' or 'val'"

        self.limit = limit

        self.data = []
        with open(join(data_dir, f'raw-{phase}.txt'), encoding='utf-8') as file:s
            #这里是save_data_dir的路径下，读取的事处理后的数据集
            for line in file:
                source, target = line.strip().split('\t')#source是这里面的英语
                self.data.append((source, target))#target是德语
            #训练集和测试集的数据都保存在data内了

    def __getitem__(self, item):
        if self.limit is not None and item >= self.limit:
            #获取元素的时候，这应该事索引，检查索引是否在人为设定的limit内
            raise IndexError()

        return self.data[item]
        #按照索引返回元素，这应该是重写了Dataset类，在其他项目中也能看见这种写法
        #要么取train，要么取val，只取一个
        #这个是data配对起来的，读取的是处理后的文件

    def __len__(self):
        if self.limit is None:
            return len(self.data)
        else:
            return self.limit

    @staticmethod #写成静态的方法，在调用的时候直接通过类来调用而不是需要设定实例
    def prepare(train_source, train_target, val_source, val_target, save_data_dir):

        if not exists(save_data_dir):
            makedirs(save_data_dir) #生成保存处理后的数据的路径

        for phase in ('train', 'val'):

            if phase == 'train':
                source_filepath = train_source
                target_filepath = train_target
            else:
                source_filepath = val_source
                target_filepath = val_target

            with open(source_filepath, encoding='utf-8') as source_file:
                #我默认的编码是gbk，会报错，直接指定字符集
                source_data = source_file.readlines()# 按行读取，得到结果列表

            with open(target_filepath, encoding='utf-8') as target_filepath:
                target_data = target_filepath.readlines()

            with open(join(save_data_dir, f'raw-{phase}.txt'), 'w', encoding='utf-8') as file:
                for source_line, target_line in zip(source_data, target_data):#匹配生成数据和标签
                    source_line = source_line.strip()#去掉换行符
                    target_line = target_line.strip()
                    line = f'{source_line}\t{target_line}\n'#以制表符为单位去区分数据和目标（label）
                    file.write(line)
        #这样之后，会生成处理好的文件“raw-train.txt”以及“raw-val.txt”

In [5]:
class TranslationDatasetOnTheFly:
    # TranslationDatasetOnTheFly是一个实时加载的数据集表示类，可以在调用getitem方法的时候动态地读取相应的数据文件。
    # 这种实时加载的方式对于数据量比较大的数据集比较适用，可以减少在数据预处理时对内存的要求。

    def __init__(self, phase, limit=None):
        assert phase in ('train', 'val'), "Dataset phase must be either 'train' or 'val'"

        self.limit = limit #默认限制为None

        if phase == 'train':
            source_filepath = join(BASE_DIR, 'data', 'example', 'raw', 'src-train.txt')# 读取文件的原始数据
            target_filepath = join(BASE_DIR, 'data', 'example', 'raw', 'tgt-train.txt')
        elif phase == 'val':
            source_filepath = join(BASE_DIR, 'data', 'example', 'raw', 'src-val.txt')
            target_filepath = join(BASE_DIR, 'data', 'example', 'raw', 'tgt-val.txt')
        else:
            raise NotImplementedError()

        with open(source_filepath, encoding='utf-8') as source_file:
            self.source_data = source_file.readlines()

        with open(target_filepath, encoding='utf-8') as target_filepath:
            self.target_data = target_filepath.readlines()

    def __getitem__(self, item):
        if self.limit is not None and item >= self.limit:
            #获取元素的时候，这应该事索引，检查索引是否在人为设定的limit内
            raise IndexError()

        source = self.source_data[item].strip()#去掉
        target = self.target_data[item].strip()
        return source, target
        #没有配对起来，读取的是处理前的文件，但其实感觉也没什么区别。

    def __len__(self):
        if self.limit is None:
            return len(self.source_data)
        else:
            return self.limit


# @staticmethod静态语法糖举例

In [6]:
class MyClass:
    @staticmethod
    def my_static_method():
        print("This is a static method")

MyClass.my_static_method()

This is a static method


# 来步进地去学习

In [7]:
TranslationDataset.prepare(train_source, train_target, val_source, val_target, save_data_dir)#处理生成目标文件
translation_dataset = TranslationDataset(save_data_dir, 'train')#读取train的数据集
translation_dataset_on_the_fly = TranslationDatasetOnTheFly('train')#一样，读取train的数据集

In [8]:
#也就来源不一样，一个需要数据预处理后才能运行，另一个不要，直接在源文件里面操作

In [9]:
translation_dataset[0]

('It is not acceptable that , with the help of the national bureaucracies , Parliament &apos;s legislative prerogative should be made null and void by means of implementing provisions whose content , purpose and extent are not laid down in advance .',
 'Es geht nicht an , dass über Ausführungsbestimmungen , deren Inhalt , Zweck und Ausmaß vorher nicht bestimmt ist , zusammen mit den nationalen Bürokratien das Gesetzgebungsrecht des Europäischen Parlaments ausgehebelt wird .')

In [10]:
translation_dataset_on_the_fly[0]

('It is not acceptable that , with the help of the national bureaucracies , Parliament &apos;s legislative prerogative should be made null and void by means of implementing provisions whose content , purpose and extent are not laid down in advance .',
 'Es geht nicht an , dass über Ausführungsbestimmungen , deren Inhalt , Zweck und Ausmaß vorher nicht bestimmt ist , zusammen mit den nationalen Bürokratien das Gesetzgebungsrecht des Europäischen Parlaments ausgehebelt wird .')

# 先文字分割，取的时候切割字符串

In [11]:
class TokenizedTranslationDataset:

    def __init__(self, data_dir, phase, limit=None):

        self.raw_dataset = TranslationDataset(data_dir, phase, limit)

    def __getitem__(self, item):
        raw_source, raw_target = self.raw_dataset[item]
        tokenized_source = raw_source.split()# 字符串切割，取的时候才切割的
        tokenized_target = raw_target.split()
        return tokenized_source, tokenized_target

    def __len__(self):
        return len(self.raw_dataset)

In [12]:
tokenized_dataset = TokenizedTranslationDataset(save_data_dir, 'train')
print(len(tokenized_dataset[0][0]))
tokenized_dataset[0]

42


(['It',
  'is',
  'not',
  'acceptable',
  'that',
  ',',
  'with',
  'the',
  'help',
  'of',
  'the',
  'national',
  'bureaucracies',
  ',',
  'Parliament',
  '&apos;s',
  'legislative',
  'prerogative',
  'should',
  'be',
  'made',
  'null',
  'and',
  'void',
  'by',
  'means',
  'of',
  'implementing',
  'provisions',
  'whose',
  'content',
  ',',
  'purpose',
  'and',
  'extent',
  'are',
  'not',
  'laid',
  'down',
  'in',
  'advance',
  '.'],
 ['Es',
  'geht',
  'nicht',
  'an',
  ',',
  'dass',
  'über',
  'Ausführungsbestimmungen',
  ',',
  'deren',
  'Inhalt',
  ',',
  'Zweck',
  'und',
  'Ausmaß',
  'vorher',
  'nicht',
  'bestimmt',
  'ist',
  ',',
  'zusammen',
  'mit',
  'den',
  'nationalen',
  'Bürokratien',
  'das',
  'Gesetzgebungsrecht',
  'des',
  'Europäischen',
  'Parlaments',
  'ausgehebelt',
  'wird',
  '.'])

In [13]:
#pipe.py`是一个工具模块，主要提供针对列表和迭代器的一些管道操作函数

def source_tokens_generator(dataset):
    for source, target in dataset:
        for token in source:
            yield token
            #`yield`是一个关键字，可以在函数内部将函数包装为迭代器。

def target_tokens_generator(dataset):
    for source, target in dataset:
        for token in target:
            yield token

In [14]:
source_generator = source_tokens_generator(tokenized_dataset)#生成一个迭代器
source_generator

#对迭代器的理解
for i, item in enumerate(source_generator):#迭代器大概就是这个意思
    if i < 50:
        print(i, item)
    else:
        break

0 It
1 is
2 not
3 acceptable
4 that
5 ,
6 with
7 the
8 help
9 of
10 the
11 national
12 bureaucracies
13 ,
14 Parliament
15 &apos;s
16 legislative
17 prerogative
18 should
19 be
20 made
21 null
22 and
23 void
24 by
25 means
26 of
27 implementing
28 provisions
29 whose
30 content
31 ,
32 purpose
33 and
34 extent
35 are
36 not
37 laid
38 down
39 in
40 advance
41 .
42 Federal
43 Master
44 Trainer
45 and
46 Senior
47 Instructor
48 of
49 the


In [15]:
from collections import Counter
# `Counter` 是一个 Python 内置类，用于统计可迭代对象中元素出现的次数。它返回一个字典，
# 字典的 key 是元素，value 是对应元素出现的次数。

class IndexDictionary:#词汇表类，用于生成词汇表

    def __init__(self, iterable=None, mode='shared', vocabulary_size=None):

        self.special_tokens = [PAD_TOKEN, UNK_TOKEN, START_TOKEN, END_TOKEN]
        #填充字符，超出词表的字符，开始的字符，结束的字符
        # On-the-fly mode
        if iterable is not None:
            #第一个读进来的参数时迭代器
            self.vocab_tokens, self.token_counts = self._build_vocabulary(iterable, vocabulary_size)
            #vocabulary_size是None，应该指的是无限制吧。获取到字符串的列表，也可以理解为做了一步set吧。还有一个是频数的列表
            self.token_index_dict = {token: index for index, token in enumerate(self.vocab_tokens)}
            #再将列表改为迭代器，主要是为了拿到索引，就是做了一步one-hot编码，
            #最后的结果是，写了个词汇表字典，将字符符号映射为数字
            self.vocabulary_size = len(self.vocab_tokens)
            #词汇表的大小

        self.mode = mode#就定义了模式

    def token_to_index(self, token):
        #用来映射字符串的，如果是没在词汇表里的字符串就映射为UNK，原来实际上是这么做的
        try:
            return self.token_index_dict[token]#从字典中，按照字符串取到映射的数字
        except KeyError:
            return self.token_index_dict[UNK_TOKEN]

    def index_to_token(self, index):
        if index >= self.vocabulary_size:
            return self.vocab_tokens[UNK_TOKEN]
        else:
            return self.vocab_tokens[index]

    def index_sentence(self, sentence):
        # 紧接其后的IndexedInputTargetTranslationDataset就用到了这一块的内容
        return [self.token_to_index(token) for token in sentence]
        #一个列表的分割后的字符串进来，一堆映射的数字出去

    def tokenify_indexes(self, token_indexes):
        return [self.index_to_token(token_index) for token_index in token_indexes]

    def _build_vocabulary(self, iterable, vocabulary_size):
        #应该是建立一个词汇表
        counter = Counter()#Counter类用来建立频数字典的
        for token in iterable:
            counter[token] += 1
            #频数字典
        if vocabulary_size is not None:
            most_commons = counter.most_common(vocabulary_size - len(self.special_tokens))
            frequent_tokens = [token for token, count in most_commons]
            vocab_tokens = self.special_tokens + frequent_tokens
            token_counts = [0] * len(self.special_tokens) + [count for token, count in most_commons]
        else:
            all_tokens = [token for token, count in counter.items()]#获得的是字典里的key，也就是字符本身
            vocab_tokens = self.special_tokens + all_tokens#把那四个人为规定的字符加进去
            token_counts = [0] * len(self.special_tokens) + [count for token, count in counter.items()]
            #那这个获得的是频数的列表

        return vocab_tokens, token_counts

    def save(self, data_dir):

        vocabulary_filepath = join(data_dir, f'vocabulary-{self.mode}.txt')#保存的路径
        with open(vocabulary_filepath, 'w', encoding='utf-8') as file:
            for vocab_index, (vocab_token, count) in enumerate(zip(self.vocab_tokens, self.token_counts)):
                file.write(str(vocab_index) + '\t' + vocab_token + '\t' + str(count) + '\n')
                # 这样source是英语的词汇表，target是德语的词汇表

    @classmethod #classmethod是用来指定一个类的方法为类方法，没有此参数指定的类的方法为实例方法，
    #这样的好处就是你以后重构类的时候不必要修改构造函数，只需要额外添加你要处理的函数，然后使用
    #装饰符 @classmethod 就可以了。
    def load(cls, data_dir, mode='shared', vocabulary_size=None):
        #python中cls代表的是类的本身，相对应的self则是类的一个实例对象。
        #因为cls等同于类本身，类方法中可以通过使用cls来实例化一个对象。
        vocabulary_filepath = join(data_dir, f'vocabulary-{mode}.txt')#文件

        vocab_tokens = {}
        token_counts = []
        with open(vocabulary_filepath, encoding='utf-8') as file:
            for line in file:
                vocab_index, vocab_token, count = line.strip().split('\t')
                vocab_index = int(vocab_index)
                vocab_tokens[vocab_index] = vocab_token
                token_counts.append(int(count))#读取文件

        if vocabulary_size is not None:
            vocab_tokens = {k: v for k, v in vocab_tokens.items() if k < vocabulary_size}
            token_counts = token_counts[:vocabulary_size]#截取特定长度的词汇表

        instance = cls(mode=mode)#就是创了个新实例，原来如此
        instance.vocab_tokens = vocab_tokens
        instance.token_counts = token_counts
        instance.token_index_dict = {token: index for index, token in vocab_tokens.items()}
        instance.vocabulary_size = len(vocab_tokens)

        return instance
        #懂了


In [16]:
share_dictionary=False
# 跑的是这段
source_dictionary = IndexDictionary(source_generator, mode='source')
#source_generator是个迭代器，这里的iterable为source_generator
source_dictionary#这就只是个类而已，定义了很多的方法
target_generator = target_tokens_generator(tokenized_dataset)
target_dictionary = IndexDictionary(target_generator, mode='target')
source_dictionary.save(save_data_dir)#这不用解释了，一目了然，保存为文件的代码
target_dictionary.save(save_data_dir)

#读取，其实只要这么读就可以了，处理完后只需要运行这一个就行了
source_dictionary = IndexDictionary.load(save_data_dir, mode='source')
target_dictionary = IndexDictionary.load(save_data_dir, mode='target')


In [17]:
source_dictionary = IndexDictionary.load(save_data_dir, mode='source')
target_dictionary = IndexDictionary.load(save_data_dir, mode='target')

source_dictionary.vocabulary_size,target_dictionary.vocabulary_size

(24995, 35820)

In [18]:
source_dictionary.token_index_dict

{'<PAD>': 0,
 '<UNK>': 1,
 '<StartSent>': 2,
 '<EndSent>': 3,
 'Federation': 4,
 'of': 5,
 'Aerobic': 6,
 'Fitness': 7,
 ',': 8,
 'Group': 9,
 'Postural': 10,
 'Gym': 11,
 'Stretching': 12,
 'and': 13,
 'Pilates;': 14,
 'from': 15,
 '2004': 16,
 'he': 17,
 'has': 18,
 'been': 19,
 'collaborating': 20,
 'with': 21,
 'Antiche': 22,
 'Terme': 23,
 'as': 24,
 'personal': 25,
 'Trainer': 26,
 'Instructor': 27,
 'Pilates': 28,
 '.': 29,
 '&quot;': 30,
 'Two': 31,
 'soldiers': 32,
 'came': 33,
 'up': 34,
 'to': 35,
 'me': 36,
 'told': 37,
 'that': 38,
 'if': 39,
 'I': 40,
 'refuse': 41,
 'sleep': 42,
 'them': 43,
 'they': 44,
 'will': 45,
 'kill': 46,
 'They': 47,
 'beat': 48,
 'ripped': 49,
 'my': 50,
 'clothes': 51,
 'Yes': 52,
 'we': 53,
 'also': 54,
 'say': 55,
 'the': 56,
 'European': 57,
 'budget': 58,
 'is': 59,
 'not': 60,
 'about': 61,
 'duplication': 62,
 'national': 63,
 'budgets': 64,
 'but': 65,
 'delivering': 66,
 'common': 67,
 'goals': 68,
 'beyond': 69,
 'capacity': 70,
 'nat

# 最后一块的，准备数据集的最后一块了，但完全没有看的干劲啊啊啊啊啊啊啊

In [19]:
class InputTargetTranslationDataset:
    #在IndexedInputTargetTranslationDataset的prepare中就用到了
    def __init__(self, data_dir, phase, limit=None):
        #phase值为'train' 或者 'val'
        self.tokenized_dataset = TokenizedTranslationDataset(data_dir, phase, limit)
        #对字符串进行切割的类，在获取元素的时候切割为list

    def __getitem__(self, item):
        tokenized_source, tokenized_target = self.tokenized_dataset[item]#获取切割后的字符list
        #target是德语，source是英语
        #source的词汇表映射的结果都是1，为什么呢。哦，是没有运行完23333，英语词汇表只有UNK啊，那不得都是1了
        #运行完全就正常了2333
        full_target = [START_TOKEN] + tokenized_target + [END_TOKEN]#加上开始和结束的字符
        inputs = full_target[:-1]#输入去掉了最后一个停止符
        targets = full_target[1:]#目标去掉了最前面的开始符号
        return tokenized_source, inputs, targets 
        #返回的是英语的切割后的字符串列表，去掉了最后的停止符的德语，去掉了开始符号的德语。
        #原因还没有理解

    def __len__(self):
        return len(self.tokenized_dataset)

In [20]:
UNK_INDEX = 1#词汇表未知的词的编码

class IndexedInputTargetTranslationDataset:
    #首先调用的是这个类
    def __init__(self, data_dir, phase, vocabulary_size=None, limit=None):
        #prepare完全后，得到编码后的结果文件，后就是从文件中获取数据了
        self.data = []#数据都在data里了，[(source,input,target)]

        unknownify = lambda index: index if index < vocabulary_size else UNK_INDEX
        #其输入参数为 index，表示一个单词在词典中的下标；返回值为 index，如果该下标小于词汇表大小，
        #否则返回 UNK_INDEX，表示未出现在词典中的单词。
        with open(join(data_dir, f'indexed-{phase}.txt'), encoding='utf-8') as file:#读取文件
            for line in file:
                sources, inputs, targets = line.strip().split('\t')#英文，带开始符的德文，带结束符的德文
                if vocabulary_size is not None:#正常就跑这段
                    indexed_sources = [unknownify(int(index)) for index in sources.strip().split(' ')]
                    #做词汇表的未知符号判断，返回都转为int的数字了
                    indexed_inputs = [unknownify(int(index)) for index in inputs.strip().split(' ')]
                    indexed_targets = [unknownify(int(index)) for index in targets.strip().split(' ')]
                else:
                    indexed_sources = [int(index) for index in sources.strip().split(' ')]
                    indexed_inputs = [int(index) for index in inputs.strip().split(' ')]
                    indexed_targets = [int(index) for index in targets.strip().split(' ')]
                self.data.append((indexed_sources, indexed_inputs, indexed_targets))
                if limit is not None and len(self.data) >= limit:
                    break

        self.vocabulary_size = vocabulary_size
        self.limit = limit

    def __getitem__(self, item):
        if self.limit is not None and item >= self.limit:
            raise IndexError()

        indexed_sources, indexed_inputs, indexed_targets = self.data[item]
        return indexed_sources, indexed_inputs, indexed_targets

    def __len__(self):
        if self.limit is None:
            return len(self.data)
        else:
            return self.limit

    @staticmethod
    def preprocess(source_dictionary):

        def preprocess_function(source):
            source_tokens = source.strip().split()
            indexed_source = source_dictionary.index_sentence(source_tokens)
            return indexed_source

        return preprocess_function

    @staticmethod
    def prepare(data_dir, source_dictionary, target_dictionary):
    # 最先看的是这里，最先运行的是这个
        join_indexes = lambda indexes: ' '.join(str(index) for index in indexes)
        #这个函数接受一个参数`indexes`，作用是将`indexes`中的元素转换为字符串并用空格拼接起来。
        for phase in ('train', 'val'):
            input_target_dataset = InputTargetTranslationDataset(data_dir, phase)#返回的是三个东西

            with open(join(data_dir, f'indexed-{phase}.txt'), 'w', encoding='utf-8') as file:
                #保存文件到“indexed-train.txt”类似这样的文件名里面
                for sources, inputs, targets in input_target_dataset:
                    #英语的切割后的字符串列表，去掉了最后的停止符的德语，去掉了开始符号的德语。
                    indexed_sources = join_indexes(source_dictionary.index_sentence(sources))#为什么source都是1
                    #.index_sentence(sources)这步字符串映射为数字，差不多one-hot编码
                    indexed_inputs = join_indexes(target_dictionary.index_sentence(inputs))
                    indexed_targets = join_indexes(target_dictionary.index_sentence(targets))
                    file.write(f'{indexed_sources}\t{indexed_inputs}\t{indexed_targets}\n')


In [21]:
class IndexedInputTargetTranslationDatasetOnTheFly:

    def __init__(self, phase, source_dictionary, target_dictionary, limit=None):

        self.input_target_dataset = InputTargetTranslationDatasetOnTheFly(phase, limit)
        self.source_dictionary = source_dictionary
        self.target_dictionary = target_dictionary

    def __getitem__(self, item):
        source, inputs, targets = self.input_target_dataset[item]
        indexed_source = self.source_dictionary.index_sentence(source)
        indexed_inputs = self.target_dictionary.index_sentence(inputs)
        indexed_targets = self.target_dictionary.index_sentence(targets)

        return indexed_source, indexed_inputs, indexed_targets

    def __len__(self):
        return len(self.input_target_dataset)

    @staticmethod
    def preprocess(source_dictionary):

        def preprocess_function(source):
            source_tokens = source.strip().split()
            indexed_source = source_dictionary.index_sentence(source_tokens)
            return indexed_source

        return preprocess_function

class InputTargetTranslationDatasetOnTheFly:

    def __init__(self, phase, limit=None):
        self.tokenized_dataset = TokenizedTranslationDatasetOnTheFly(phase, limit)

    def __getitem__(self, item):
        tokenized_source, tokenized_target = self.tokenized_dataset[item]
        full_target = [START_TOKEN] + tokenized_target + [END_TOKEN]
        inputs = full_target[:-1]
        targets = full_target[1:]
        return tokenized_source, inputs, targets

    def __len__(self):
        return len(self.tokenized_dataset)
    
class TokenizedTranslationDatasetOnTheFly:

    def __init__(self, phase, limit=None):

        self.raw_dataset = TranslationDatasetOnTheFly(phase, limit)

    def __getitem__(self, item):
        raw_source, raw_target = self.raw_dataset[item]
        tokenized_source = raw_source.split()
        tokenized_target = raw_target.split()
        return tokenized_source, tokenized_target

    def __len__(self):
        return len(self.raw_dataset)

In [22]:
IndexedInputTargetTranslationDataset.prepare(save_data_dir, source_dictionary, target_dictionary)
indexed_translation_dataset = IndexedInputTargetTranslationDataset(save_data_dir, 'train')
#就是获取到int数据类型的编码后的数据

indexed_translation_dataset_on_the_fly = IndexedInputTargetTranslationDatasetOnTheFly('train', source_dictionary, target_dictionary)
#这个做一样的东西，就是没有数据预处理的，动态得从源数据文件中取读取数据去处理
assert indexed_translation_dataset[0] == indexed_translation_dataset_on_the_fly[0]

In [23]:
indexed_translation_dataset.data[0]

([805,
  59,
  60,
  7624,
  38,
  8,
  21,
  56,
  1814,
  5,
  56,
  63,
  1,
  8,
  396,
  268,
  5497,
  1,
  116,
  89,
  223,
  9547,
  13,
  1,
  93,
  577,
  5,
  2978,
  423,
  1617,
  1920,
  8,
  6173,
  13,
  2409,
  180,
  60,
  4686,
  1377,
  101,
  4064,
  29],
 [2,
  4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  8,
  12,
  13,
  8,
  14,
  15,
  16,
  17,
  6,
  18,
  19,
  8,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32],
 [4,
  5,
  6,
  7,
  8,
  9,
  10,
  11,
  8,
  12,
  13,
  8,
  14,
  15,
  16,
  17,
  6,
  18,
  19,
  8,
  20,
  21,
  22,
  23,
  24,
  25,
  26,
  27,
  28,
  29,
  30,
  31,
  32,
  3])

# 呼，结束，到下一步的文件去学习
啦啦啦啦啦，四舍五入，结束1/3了，开森