# 数据处理模块

## 目录

* 数据集加载
* 读取双语语料
* 创建word2id
* 将数据转化成id

In [4]:
from torch.utils import data
import os
import nltk
import numpy as np
import pickle
from collections import Counter

In [5]:
# Load the dataset: the German (source) and English (target) sides of the
# IWSLT14 training corpus, one sentence (or markup line) per line.
# Each file is read inside a `with` block so its handle is closed as soon as
# the lines have been collected.
# Only a trailing "\n" is stripped. Slicing off the last character
# unconditionally would eat a real character from a final line that has no
# newline terminator.
with open("./data/iwslt14/train.tags.de-en.de", encoding="utf-8") as source_file:
    raw_source_data = [line.rstrip("\n") for line in source_file]
with open("./data/iwslt14/train.tags.de-en.en", encoding="utf-8") as target_file:
    raw_target_data = [line.rstrip("\n") for line in target_file]
# Sanity check: line counts of both sides, then the first few raw lines.
print (len(raw_target_data))
print (len(raw_source_data))
print (raw_source_data[0:5])
print (raw_target_data[0:5])

178526
178526
['<url>http://www.ted.com/talks/lang/de/stephen_palumbi_following_the_mercury_trail.html</url>', 'Das Meer kann ziemlich kompliziert sein.', 'Und was menschliche Gesundheit ist, kann auch ziemlich kompliziert sein.', 'Und diese zwei zusammen zu bringen, erscheint vielleicht wie eine gewaltige Aufgabe. Aber was ich Ihnen zu sagen versuche ist, dass es trotz dieser Komplexität einige einfache Themen gibt, von denen ich denke, wenn wir diese verstehen, können wir uns wirklich weiter entwickeln.', 'Und diese einfachen Themen sind eigentlich keine komplexen wissenschaftlichen Zusammenhänge, sondern Tatsachen,die wir alle gut kennen.']
['<url>http://www.ted.com/talks/stephen_palumbi_following_the_mercury_trail.html</url>', 'It can be a very complicated thing, the ocean.', 'And it can be a very complicated thing, what human health is.', "And bringing those two together might seem a very daunting task, but what I'm going to try to say is that even in that complexity, there's some

In [6]:
# Build the tokenized parallel corpus.
# A pair is dropped when either side is empty, when either side starts with
# "<" (markup lines such as <url>...</url>), or when either side has more
# than 100 tokens after tokenization.
if len(raw_source_data) != len(raw_target_data):
    # Pairs are matched purely by line position, so differing line counts
    # mean the two files are misaligned; fail loudly instead of mispairing.
    raise ValueError(
        "source/target line counts differ: %d vs %d"
        % (len(raw_source_data), len(raw_target_data))
    )
source_data = []
target_data = []
for source_line, target_line in zip(raw_source_data, raw_target_data):
    if not source_line or not target_line:
        continue
    if source_line.startswith("<") or target_line.startswith("<"):
        continue
    source_sentence = nltk.word_tokenize(source_line, language="german")
    target_sentence = nltk.word_tokenize(target_line, language="english")
    if len(source_sentence) <= 100 and len(target_sentence) <= 100:
        source_data.append(source_sentence)
        target_data.append(target_sentence)
print (source_data[0:5])
print (target_data[0:5])

[['Das', 'Meer', 'kann', 'ziemlich', 'kompliziert', 'sein', '.'], ['Und', 'was', 'menschliche', 'Gesundheit', 'ist', ',', 'kann', 'auch', 'ziemlich', 'kompliziert', 'sein', '.'], ['Und', 'diese', 'zwei', 'zusammen', 'zu', 'bringen', ',', 'erscheint', 'vielleicht', 'wie', 'eine', 'gewaltige', 'Aufgabe', '.', 'Aber', 'was', 'ich', 'Ihnen', 'zu', 'sagen', 'versuche', 'ist', ',', 'dass', 'es', 'trotz', 'dieser', 'Komplexität', 'einige', 'einfache', 'Themen', 'gibt', ',', 'von', 'denen', 'ich', 'denke', ',', 'wenn', 'wir', 'diese', 'verstehen', ',', 'können', 'wir', 'uns', 'wirklich', 'weiter', 'entwickeln', '.'], ['Und', 'diese', 'einfachen', 'Themen', 'sind', 'eigentlich', 'keine', 'komplexen', 'wissenschaftlichen', 'Zusammenhänge', ',', 'sondern', 'Tatsachen', ',', 'die', 'wir', 'alle', 'gut', 'kennen', '.'], ['Und', 'ich', 'werde', 'mit', 'dieser', 'hier', 'anfangen', ':', 'Wenn', 'die', 'Mama', 'nicht', 'glücklich', 'ist', ',', 'ist', 'keiner', 'glücklich', '.']]
[['It', 'can', 'be', '

In [8]:
# Source-language word2id
# Flatten every tokenized source sentence into a single token stream.
words = [word for sentence in source_data for word in sentence]
# Keep the 30000-4 most frequent tokens; the "- 4" leaves room for the four
# special tokens that occupy ids 0-3 below.
token_counts = Counter(words)
word_freq = dict(token_counts.most_common(30000-4))
source_word2id = {"<pad>":0,"<unk>":1,"<start>":2,"<end>":3}
# word_freq iterates in most_common() order, so more frequent tokens receive
# smaller ids. Each new token takes the current size of the mapping as its id.
for word in word_freq:
    source_word2id[word] = len(source_word2id)
source_word2id

{'<pad>': 0,
 '<unk>': 1,
 '<start>': 2,
 '<end>': 3,
 ',': 4,
 '.': 5,
 'die': 6,
 'und': 7,
 'der': 8,
 'ist': 9,
 'das': 10,
 'in': 11,
 'zu': 12,
 'ich': 13,
 'wir': 14,
 'es': 15,
 'Sie': 16,
 'sie': 17,
 'ein': 18,
 'Und': 19,
 'von': 20,
 'dass': 21,
 'nicht': 22,
 'eine': 23,
 'den': 24,
 'mit': 25,
 'Ich': 26,
 'auf': 27,
 'sich': 28,
 'wie': 29,
 'sind': 30,
 'haben': 31,
 '?': 32,
 'für': 33,
 ':': 34,
 '``': 35,
 "''": 36,
 'man': 37,
 'war': 38,
 'an': 39,
 'Das': 40,
 'als': 41,
 'so': 42,
 'Es': 43,
 'was': 44,
 'um': 45,
 'diese': 46,
 'dem': 47,
 'im': 48,
 'Wir': 49,
 'wenn': 50,
 'einen': 51,
 'uns': 52,
 'können': 53,
 'werden': 54,
 'oder': 55,
 'hat': 56,
 'aus': 57,
 'er': 58,
 'des': 59,
 'aber': 60,
 'Aber': 61,
 '–': 62,
 'auch': 63,
 'über': 64,
 'einer': 65,
 'kann': 66,
 'einem': 67,
 'sehr': 68,
 'Die': 69,
 'hier': 70,
 'wird': 71,
 'nur': 72,
 'Menschen': 73,
 'gibt': 74,
 'dann': 75,
 'mich': 76,
 'also': 77,
 'sein': 78,
 'etwas': 79,
 'sehen': 80,
 '-

In [9]:
# Target-language word2id
# Flatten every tokenized target sentence into a single token stream.
words = [word for sentence in target_data for word in sentence]
# Keep the 30000-4 most frequent tokens; the "- 4" leaves room for the four
# special tokens that occupy ids 0-3 below.
token_counts = Counter(words)
word_freq = dict(token_counts.most_common(30000-4))
target_word2id = {"<pad>":0,"<unk>":1,"<start>":2,"<end>":3}
# word_freq iterates in most_common() order, so more frequent tokens receive
# smaller ids. Each new token takes the current size of the mapping as its id.
for word in word_freq:
    target_word2id[word] = len(target_word2id)
target_word2id

{'<pad>': 0,
 '<unk>': 1,
 '<start>': 2,
 '<end>': 3,
 ',': 4,
 '.': 5,
 'the': 6,
 'to': 7,
 'of': 8,
 'a': 9,
 'and': 10,
 'that': 11,
 'I': 12,
 'in': 13,
 'is': 14,
 'you': 15,
 'it': 16,
 "'s": 17,
 'we': 18,
 'And': 19,
 'this': 20,
 'was': 21,
 '--': 22,
 'for': 23,
 'are': 24,
 'have': 25,
 'they': 26,
 'on': 27,
 'do': 28,
 'with': 29,
 'So': 30,
 '?': 31,
 "n't": 32,
 'what': 33,
 'can': 34,
 'about': 35,
 'be': 36,
 'at': 37,
 'not': 38,
 'as': 39,
 'all': 40,
 '``': 41,
 "'re": 42,
 "''": 43,
 'there': 44,
 'It': 45,
 'one': 46,
 'people': 47,
 'like': 48,
 'my': 49,
 'from': 50,
 'so': 51,
 'an': 52,
 'but': 53,
 'just': 54,
 'very': 55,
 'But': 56,
 'We': 57,
 'or': 58,
 'these': 59,
 'The': 60,
 'our': 61,
 'if': 62,
 'out': 63,
 'going': 64,
 'know': 65,
 'me': 66,
 'them': 67,
 'up': 68,
 'by': 69,
 ':': 70,
 'had': 71,
 'he': 72,
 'because': 73,
 'when': 74,
 'which': 75,
 'see': 76,
 'think': 77,
 'more': 78,
 'would': 79,
 'really': 80,
 'were': 81,
 'get': 82,
 'yo

In [10]:
# Convert source-language sentences to ids
# NOTE: source_data is overwritten in place, so this cell is not re-runnable
# on its own. On a second run the entries are already ints, none of which is
# a key of source_word2id, so every position (padding included) would turn
# into 1 (<unk>). Re-run the tokenization cell first.
source_data[:] = [
    (
        # Tokens missing from the vocabulary map to 1 (<unk>).
        [source_word2id.get(token, 1) for token in tokens][0:100]
        # Right-pad with 0 (<pad>) up to a fixed length of 100 ...
        + [0] * (100 - len(tokens))
    )[::-1]  # ... then reverse, so padding leads and the sentence runs back to front.
    for tokens in source_data
]
source_data[0:5]

[[0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  78,
  2207,
  256,
  66,
  1273,
  40],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  5,
  78,
  2207,
  256,
  63,
  66,
  4,
  9,


In [11]:
# Convert target-language sentences to ids
# Layout of each row: at most 99 token ids (tokens missing from the vocabulary
# map to 1, <unk>), then 3 (<end>), then 0 (<pad>) up to a fixed length of 100.
# target_data is overwritten in place with the id rows.
target_data[:] = [
    [target_word2id.get(token, 1) for token in tokens][0:99]
    + [3]
    + [0] * (99 - len(tokens))
    for tokens in target_data
]
target_data[0:5]

[[45,
  34,
  36,
  9,
  55,
  1055,
  122,
  4,
  6,
  616,
  5,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0],
 [19,
  16,
  34,
  36,
  9,
  55,
  1055,
  122,
  4,
  33,
  186,
  388,
  14,
  5,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  

<img src="./imgs/nmt_model.png"  width="700" height="700" align="bottom" />

In [12]:
# Shifted (offset-by-one) input for training: each row is 2 (<start>) followed
# by the corresponding target row without its last position, so both rows have
# the same length and position t of the input lines up with position t-1 of
# the target.
target_data_input = [[2, *sentence[:-1]] for sentence in target_data]
print(target_data_input[0])
print(target_data[0])

[2, 45, 34, 36, 9, 55, 1055, 122, 4, 6, 616, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[45, 34, 36, 9, 55, 1055, 122, 4, 6, 616, 5, 3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
