#### README
corpora_tools에서 만든 함수가 어떻게 작용하는지 확인하기

In [1]:
from corpora_tools import *

In [2]:
# Unpack the two values returned by retrieve_corpora(); the first is shown
# under the "DE:" label and the second under "EN:".
sen_l1, sen_l2 = retrieve_corpora()
print("# A sentence in the two lan")
# Show the entry at index 50 of each side so the pair can be compared by eye.
print(f"DE: {sen_l1[50]}")
print(f"EN: {sen_l2[50]}")

Retrieving corpora: alignment-de-en.txt
# A sentence in the two lan
DE: ['Die', 'Ablehnung', 'einer', 'Verlängerung', 'des', 'Embargos', 'seitens', 'der', 'EU-Mitgliedstaaten', 'ist', 'unverantwortlich', '.']
EN: ['It', 'is', 'irresponsible', 'of', 'EU', 'Member', 'States', 'to', 'refuse', 'to', 'renew', 'the', 'embargo', '.']


In [3]:
# Report how many entries the first corpus holds.
print("# Corpora length (i.e. number of sentences)")
print(len(sen_l1))
# Guard: both corpora must contain the same number of entries.
assert len(sen_l1) == len(sen_l2), "길이가 다름" # raises AssertionError if the condition is not true

# Corpora length (i.e. number of sentences)
33334


In [4]:
# Apply clean_sentences to every entry of each corpus, keeping the order.
clean_sen_l1 = list(map(clean_sentences, sen_l1))
clean_sen_l2 = list(map(clean_sentences, sen_l2))

# Re-print the entry at index 50 so it can be compared with the raw version.
print("# Same sentence as before, but chunked and cleand")
print(f"DE: {clean_sen_l1[50]}")
print(f"EN: {clean_sen_l2[50]}")

# Same sentence as before, but chunked and cleand
DE: ['die', 'ablehnung', 'einer', 'verlängerung', 'des', 'embargos', 'seitens', 'der', 'eu-mitgliedstaaten', 'ist', 'unverantwortlich', '.']
EN: ['it', 'is', 'irresponsible', 'of', 'eu', 'member', 'states', 'to', 'refuse', 'to', 'renew', 'the', 'embargo', '.']


In [5]:
# Pass both cleaned corpora through filter_sentence_length together with
# max_len=30 and unpack the two results.
filt_clean_sen_l1, filt_clean_sen_l2 = filter_sentence_length(clean_sen_l1, clean_sen_l2, max_len=30)
print("# Filtered Corpora length (i.e. number of sentences)")
print(len(filt_clean_sen_l1))
# Guard: the two filtered results must still have the same length.
assert len(filt_clean_sen_l1) == len(filt_clean_sen_l2), "길이가 다름"

# Filtered Corpora length (i.e. number of sentences)
25889


In [6]:
# Build one dictionary per language from the filtered sentences.
# The two sides use different dict_size values (25000 vs 15000) and each call
# gets its own storage_path under /tmp.
# NOTE(review): presumably the dictionary is also persisted at storage_path —
# confirm in corpora_tools.
dict_l1 = create_indexed_dictionary(filt_clean_sen_l1, dict_size=25000, storage_path='/tmp/l1_dict.p')
dict_l2 = create_indexed_dictionary(filt_clean_sen_l2, dict_size=15000, storage_path='/tmp/l2_dict.p')

In [7]:
# Convert each filtered corpus with the dictionary built for the same language.
# The call prints a "[sentences_to_indexs] Did not find N words" line per corpus.
idx_sentences_l1 = sentences_to_indexs(filt_clean_sen_l1, dict_l1)
idx_sentences_l2 = sentences_to_indexs(filt_clean_sen_l2, dict_l2)

[sentences_to_indexs] Did not find 1843 words
[sentences_to_indexs] Did not find 0 words


In [8]:
# Pair every word of the first filtered sentence with the element at the same
# position in its indexed counterpart.
print("# Same sentences as before, with their dictionary ID")
print(f"DE: {[*zip(filt_clean_sen_l1[0], idx_sentences_l1[0])]}")
print(f"EN: {[*zip(filt_clean_sen_l2[0], idx_sentences_l2[0])]}")

# Same sentences as before, with their dictionary ID
DE: [('wiederaufnahme', 2575), ('der', 7), ('sitzungsperiode', 1220)]
EN: [('resumption', 2736), ('of', 8), ('the', 4), ('session', 896)]


In [9]:
# Run extract_max_length on each indexed corpus; the results are printed in the
# next cell as the maximum sentence sizes and fed to prepare_sentences later.
max_length_l1 = extract_max_length(idx_sentences_l1)
max_length_l2 = extract_max_length(idx_sentences_l2)

In [10]:
# Display the value extracted for each language.
print("# Max sentence sizes:")
print(f"DE: {max_length_l1}")
print(f"EN: {max_length_l2}")

# Max sentence sizes:
DE: 30
EN: 30


In [11]:
# Combine both indexed corpora and their max lengths into data_set.
# NOTE(review): the sample printed below suggests the first side is left-padded
# with 0 and the second side is wrapped in marker IDs (1 ... 2) and right-padded
# with 0 — confirm against prepare_sentences in corpora_tools.
data_set = prepare_sentences(idx_sentences_l1, idx_sentences_l2, max_length_l1, max_length_l2)

In [13]:
# Show both halves of the first entry of data_set.
print("# Prepared minibatch with paddings and extra stuff")
print(f"DE: {data_set[0][0]}")
print(f"EN: {data_set[0][1]}")

# Prepared minibatch with paddings and extra stuff
DE: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2575, 7, 1220]
EN: [1, 2736, 8, 4, 896, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [14]:
# Compare the length of the first indexed sentence with the length of the
# matching half of the first data_set entry.
print("# The sentence pass from X to Y tokens")
print(f"DE: {len(idx_sentences_l1[0])} -> {len(data_set[0][0])}")
print(f"EN: {len(idx_sentences_l2[0])} -> {len(data_set[0][1])}")

# The sentence pass from X to Y tokens
DE: 3 -> 30
EN: 4 -> 32


In [14]:
# NLTK is brought in here for the word_tokenize check in the next cell.
import nltk
from nltk.tokenize import word_tokenize
# Download the 'punkt' package (presumably the tokenizer data word_tokenize
# needs — confirm against the NLTK docs).
nltk.download('punkt')

[nltk_data] Downloading package punkt to /Users/jiae/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [15]:
# Tokenize the original-case German sentence with NLTK.
# word_tokenize uses the English Punkt model unless told otherwise; the input
# is German, so select the German model explicitly to keep sentence-boundary
# handling (e.g. German abbreviations) consistent with the text's language.
word_tokenize('Wiederaufnahme der Sitzungsperiode', language='german')

['Wiederaufnahme', 'der', 'Sitzungsperiode']