In [1]:
import nltk
import string
import re
from numpy import array, argmax, random, take
import pandas as pd
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras.preprocessing.text import Tokenizer
from keras.callbacks import ModelCheckpoint
from keras.preprocessing.sequence import pad_sequences
from keras.models import load_model
from keras import optimizers
import matplotlib.pyplot as plt
%matplotlib inline
pd.set_option('display.max_colwidth', 200)

In [2]:
nltk.download('comtrans')

[nltk_data] Downloading package comtrans to
[nltk_data]     C:\Users\Garrett\AppData\Roaming\nltk_data...
[nltk_data]   Package comtrans is already up-to-date!


True

In [3]:
from nltk.corpus import comtrans
print(comtrans.aligned_sents('alignment-en-fr.txt')[3])

<AlignedSent: 'Please rise , then ,...' -> 'Je vous invite à vou...'>


In [4]:
len(comtrans.aligned_sents('alignment-en-fr.txt'))

33334

In [5]:
print(comtrans.aligned_sents('alignment-en-fr.txt')[3].words)

['Please', 'rise', ',', 'then', ',', 'for', 'this', 'minute', "'", 's', 'silence', '.']


In [6]:
print(comtrans.aligned_sents('alignment-en-fr.txt')[3].mots)

['Je', 'vous', 'invite', 'à', 'vous', 'lever', 'pour', 'cette', 'minute', 'de', 'silence', '.']


In [7]:
print(comtrans.aligned_sents('alignment-en-fr.txt')[3].alignment)

0-0 0-1 0-2 0-4 1-5 5-6 6-7 7-8 8-10 9-9 9-10 10-10 11-11


In [8]:
import pickle
import re
import string
from collections import Counter

In [9]:
english_sent = [sent.words for sent in comtrans.aligned_sents('alignment-en-fr.txt')]
french_sent = [sent.mots for sent in comtrans.aligned_sents('alignment-en-fr.txt')]

In [10]:
english_sent[0]

['Resumption', 'of', 'the', 'session']

In [11]:
french_sent[0]

['Reprise', 'de', 'la', 'session']

In [12]:
len(english_sent)

33334

In [13]:
len(french_sent)

33334

In [14]:
def clean_sentence(sentence):
    sentence2 = [s.translate(str.maketrans('','',string.punctuation)) for s in sentence]
    clean_words = [word.lower() for word in sentence2 if word != '']
    return clean_words

In [15]:
clean_sen_en = [clean_sentence(s) for s in english_sent]

In [16]:
clean_sen_fr = [clean_sentence(s) for s in french_sent]

In [17]:
def filter_sentence_length(sentences_l1, sentences_l2, min_len=0, max_len=20):
    filtered_sentences_l1=[]
    filtered_sentences_l2=[]
    for i in range(len(sentences_l1)):
        if min_len <= len(sentences_l1[i]) <= max_len and min_len <= len(sentences_l2[i]) <= max_len:
            filtered_sentences_l1.append(sentences_l1[i])
            filtered_sentences_l2.append(sentences_l2[i])
    return filtered_sentences_l1, filtered_sentences_l2

In [18]:
filt_clean_sen_en, filt_clean_sen_fr = filter_sentence_length(clean_sen_en, clean_sen_fr)

In [19]:
def create_indexed_dictionary(sentences, dict_size=10000, storage_path=None):
    count_words = Counter()
    dict_words = {}
    for sen in sentences:
        for word in sen:
            count_words[word] += 1
    
    for idx, item in enumerate(count_words.most_common(dict_size)):
        dict_words[item[0]] = idx +1
        
    if storage_path:
        pickle.dump(dict_words, open(storage_path, 'wb'))
    return dict_words
            

In [20]:
def sentences_to_indexes(sentences, indexed_dictionary):
    indexed_sentences = []
    not_found_counter = 0
    for sent in sentences:
        idx_sent = []
        for word in sent:
            try:
                idx_sent.append(indexed_dictionary[word])
            except KeyError:
                idx_sent.append(data_utils.UNK_ID)
                not_found_counter += 1
        indexed_sentences.append(idx_sent)

    print('[sentences_to_indexes] Did not find {} words'.format(not_found_counter))
    return indexed_sentences

In [21]:
dict_en = create_indexed_dictionary(filt_clean_sen_en, dict_size=15000, storage_path='en_dict.pickle')

In [22]:
dict_fr = create_indexed_dictionary(filt_clean_sen_fr, dict_size=15000, storage_path='fr_dict.pickle')

In [23]:
idx_sentences_en = sentences_to_indexes(filt_clean_sen_en, dict_en)

[sentences_to_indexes] Did not find 0 words


In [24]:
idx_sentences_fr = sentences_to_indexes(filt_clean_sen_fr, dict_fr)

[sentences_to_indexes] Did not find 0 words


In [25]:
def extract_max_length(corpora):
    return max([len(sentence) for sentence in corpora])

In [26]:
max_length_en = extract_max_length(idx_sentences_en)
max_length_fr = extract_max_length(idx_sentences_fr)

In [27]:
def prepare_sentences(sentences_en, sentences_fr, len_en, len_fr):
    assert len(sentences_en) == len(sentences_fr)
    data_set = []
    for i in range(len(sentences_en)):
        padding_en = len_en - len(sentences_en[i])
        pad_sentence_en = sentences_en[i] + ([0] * padding_en)
        padding_fr = len_fr - len(sentences_fr[i])
        pad_sentence_fr = sentences_fr[i] + ([0] * padding_fr)
        data_set.append([pad_sentence_en, pad_sentence_fr])
    return data_set

In [28]:
data_set = prepare_sentences(idx_sentences_en, idx_sentences_fr, max_length_en, max_length_fr)

In [29]:
data_set = array(data_set)

In [30]:
from sklearn.model_selection import train_test_split

In [31]:
en_encoded = []
fr_encoded = []
for i in range(len(data_set)):
    en_encoded.append(data_set[i][0])
    fr_encoded.append(data_set[i][1])

In [32]:
en_encoded = array(en_encoded)
fr_encoded = array(fr_encoded)

In [33]:
en_train, en_test, fr_train, fr_test = train_test_split(en_encoded, fr_encoded, random_state = 50)

In [34]:
model = Sequential()
model.add(Embedding(len(dict_fr), 256, input_length = len(fr_train[0]), mask_zero=True))
model.add(LSTM(256))
model.add(RepeatVector(len(en_train[0])))
model.add(LSTM(256, return_sequences=True))
model.add(Dense(len(dict_en), activation='softmax'))

In [35]:
from tensorflow.keras import optimizers

In [36]:
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy')

In [40]:
filename = 'model.h1.16_sep_21'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=False, mode='min')

In [None]:
history = model.fit(fr_train, en_train.reshape(en_train.shape[0], en_train.shape[1], 1),
                   epochs=30, batch_size=256, validation_split = 0.2, callbacks=[checkpoint],
                   verbose=1)

In [35]:
model = load_model('model.h1.16_sep_21')

In [36]:
preds = argmax(model.predict(fr_test.reshape((fr_test.shape[0],fr_test.shape[1]))), axis=-1)

ResourceExhaustedError: OOM when allocating tensor with shape[4209,20,10670] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:ConcatV2] name: concat

In [None]:
def get_word(n, dictionary):
    for word, value in dictionary.items():
    if value == n:
        return word
    return None

In [None]:
preds_text = []
for i in preds:
    temp = []
    for j in range(len(i)):
        t = get_word(i[j], dict_en)
        if j > 0:
            if (t == get_word(i[j-1], dict_en)) or (t == None):
                temp.append('')
            else:
                temp.append(t)
        else:
            if(t == None):
                temp.append('')
        else:
            temp.append(t)
    preds_text.append(' '. join(temp))