In [1]:
#importing libraries 
import pandas as pd
import numpy as np 
import re
import string
import unicodedata

import tensorflow as tf
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.text import text_to_word_sequence
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, LSTM, Dense
from keras.layers.embeddings import Embedding
from keras import backend as K
from keras.callbacks import ModelCheckpoint 

Using TensorFlow backend.


In [None]:
#file loading function----
def load_doc(filename):
    file = open(filename, mode='rt', encoding='utf-8')
    text = file.read()
    file.close()
    return text

In [None]:
#Spliting the text into individual sentences 
def to_sentences(doc):
    return doc.strip().split('\n')

In [None]:
# finding the maximum and minimum sentence lengths 
def sentence_lengths(sentences):
    lengths = [len(s.split()) for s in sentences]
    return min(lengths), max(lengths)

In [None]:
def clean_lines(lines):
    cleaned = list()
    re_print = re.compile('[^%s]' % re.escape(string.printable))
    table = str.maketrans('', '', string.punctuation)
    for line in lines:        
        line = unicodedata.normalize('NFD', line).encode('ascii', 'ignore').decode('UTF-8').split()       
        line = [word.lower() for word in line]     
        line = [word.translate(table) for word in line]      
        line = [re_print.sub('', w) for w in line]       
        line = [word for word in line if word.isalpha()]       
        cleaned.append(' '.join(line))
    return cleaned

In [None]:
# load English data
filename_english = r'C:\Users\racha\Desktop\cognitive computing\Project\fr-en\europarl-v7.fr-en.en'
doc = load_doc(filename_english)
sentences = to_sentences(doc)
minlen, maxlen = sentence_lengths(sentences)
print('English data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))
print(type(sentences))
 

In [None]:
#Loading french data 
filename_french = r'C:\Users\racha\Desktop\cognitive computing\Project\fr-en\europarl-v7.fr-en.fr'
doc_fr = load_doc(filename_french)
sentences_fr = to_sentences(doc_fr)
minlen, maxlen = sentence_lengths(sentences_fr)
print('French data: sentences=%d, min=%d, max=%d' % (len(sentences), minlen, maxlen))

In [None]:
#Cleaning both french and english sentences 
cleaned_french = clean_lines(sentences_fr)
cleaned_english = clean_lines(sentences)

In [None]:
#Creating DataFrames
french_sentences = pd.DataFrame(cleaned_french, columns = ['French_sentences'])
english_sentences = pd.DataFrame(cleaned_english, columns = ['English_sentences'])

In [None]:
Storing the cleaned sentences
french_sentences.to_csv(r"C:\Users\racha\Desktop\cognitive computing\Project\fr-en\french_sentences.csv")
english_sentences.to_csv(r"C:\Users\racha\Desktop\cognitive computing\Project\fr-en\english_sentences.csv")

# loaded the data to S3

In [None]:
french_sentences = pd.read_csv(r"s3://cognitivecomputing/Finalproject/french_100000.csv")
english_sentences = pd.read_csv(r"s3://cognitivecomputing/Finalproject/english_100000.csv")

In [None]:
#Taking a sample 
n = 50016
french_sample = french_sentences.iloc[:n,:]
english_sample = english_sentences.iloc[:n,:]


In [3]:
english_sentences = english_sentences.iloc[:,-1:] # taking 
french_sentences = french_sentences.iloc[:,-1:]
print(english_sentences.columns)
print(french_sentences.columns)

Index(['English_sentences'], dtype='object')
Index(['French_sentences'], dtype='object')


In [4]:
n = 20000
french_sample = french_sentences.iloc[:n,:]
english_sample = english_sentences.iloc[:n,:]

In [5]:
english_sample.columns
english_sample['English_sentences']
for i in range(len(english_sample['English_sentences'].index)):
    if type(english_sample['English_sentences'][i]) != str:
        english_sample['English_sentences'][i] = str(english_sample['English_sentences'][i])
        
french_sample['French_sentences']
for i in range(len(french_sample['French_sentences'].index)):
    if type(french_sample['French_sentences'][i]) != str:
        french_sample['French_sentences'][i] = str(french_sample['French_sentences'][i])

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [6]:
#Keras Tokenizer 
tokenizer = Tokenizer()
#fitting tokenizer on text 
tokenizer.fit_on_texts(english_sample['English_sentences'])
#converting text to integer sequences
sequences_eng = tokenizer.texts_to_sequences(english_sample['English_sentences'])
print(len(tokenizer.word_index))
print(np.max([len(i) for i in sequences_eng]))
max_len_en= np.max([len(i) for i in sequences_eng])
english_data = pad_sequences(sequences_eng, maxlen=max_len_en)

14624
147


In [7]:
def eod(col):
    col_list = list(col)
    col_main_list = []
    for i in col_list:
        row_list=i.split()
        row_list.append('EOD')
        col_main_list.append(row_list)        
    return col_main_list

col_list_ouput = eod(french_sample['French_sentences'])
french_sentences_output = []
for i in range(len(col_list_ouput)):
    sentence_output = ' '.join(col_list_ouput[i])
    french_sentences_output.append(sentence_output)
    
decoder_output = pd.DataFrame(french_sentences_output, columns = ['decoder_ouput']) 
french_sentences_output

['reprise de la session EOD',
 'je declare reprise la session du parlement europeen qui avait ete interrompue le vendredi decembre dernier et je vous renouvelle tous mes vux en esperant que vous avez passe de bonnes vacances EOD',
 'comme vous avez pu le constater le grand bogue de lan ne sest pas produit en revanche les citoyens dun certain nombre de nos pays ont ete victimes de catastrophes naturelles qui ont vraiment ete terribles EOD',
 'vous avez souhaite un debat a ce sujet dans les prochains jours au cours de cette periode de session EOD',
 'en attendant je souhaiterais comme un certain nombre de collegues me lont demande que nous observions une minute de silence pour toutes les victimes des tempetes notamment dans les differents pays de lunion europeenne qui ont ete touches EOD',
 'je vous invite a vous lever pour cette minute de silence EOD',
 'le parlement debout observe une minute de silence EOD',
 'madame la presidente cest une motion de procedure EOD',
 'vous avez probable

In [8]:
#French Output 
tokenizer_fr = Tokenizer()
#fitting tokenizer on text 
tokenizer_fr.fit_on_texts(decoder_output['decoder_ouput'])
#converting text to integer sequences
sequences_fr = tokenizer_fr.texts_to_sequences(decoder_output['decoder_ouput'])
print(len(tokenizer_fr.word_index))
print(np.max([len(i) for i in sequences_fr]))
max_len_fr = np.max([len(i) for i in sequences_fr])
french_data = pad_sequences(sequences_fr, maxlen=max_len_fr)


21438
147


In [9]:
def offset_column(col):
    col_list = list(col)
    col_main_list = []
    for i in col_list:
        row_list=i.split()
        row_main_list = ['START']
        for j in row_list:
            row_main_list.append(j)
        #row_main_str = [''.join(i) for k in row_main_list]
        col_main_list.append(row_main_list)
    return col_main_list
col_list_input = offset_column(french_sample['French_sentences'])

french_sentences_input = []
for i in range(len(col_list_input)):
    sentence = ' '.join(col_list_input[i])
    french_sentences_input.append(sentence)
french_sentences_input     
decoder_input = pd.DataFrame(french_sentences_input, columns = ['decoder_input']) 

    

In [10]:
tokenizer_fr_di = Tokenizer()
#fitting tokenizer on text 
tokenizer_fr_di.fit_on_texts(decoder_input['decoder_input'])
#converting text to integer sequences
sequences_fr_di = tokenizer_fr_di.texts_to_sequences(decoder_input['decoder_input'])
print(len(tokenizer_fr_di.word_index))
print(np.max([len(i) for i in sequences_fr_di]))
max_len_fr_di = np.max([len(i) for i in sequences_fr_di])
french_data_decoder_input  = pad_sequences(sequences_fr_di, maxlen=max_len_fr_di)

21438
147


In [11]:
latent_dim = 50
vocabulary_size_en =len(tokenizer.word_index) + 1
input_length_en = max_len_en

vocabulary_size_fr_di = len(tokenizer_fr_di.word_index) + 1
input_length_fr = max_len_fr_di

vocabulary_size_fr = len(tokenizer_fr.word_index) + 1

print(vocabulary_size_fr)
print(vocabulary_size_fr_di)

batch_size_seq = 32

21439
21439


In [12]:
d = tokenizer_fr_di.word_index
els = list(d.items())
els[-1]
# tokenizer_fr_di.word_index[tokenizer_fr_di.word_index.keys()[-1]]
# els = list(d.items()) # explicitly convert to a list, in case it's Python 3.x

# # get first inserted element 
# els[0]
# => ('first', 1)

# # get last inserted element 
# els[-1]

('enregistrer', 21438)

In [13]:
french_train = french_data[:19520,:]
french_data_decoder_input_train = french_data_decoder_input[:19520,:]
english_train = english_data[:19520,:]
french_val = french_data[19520:20000,:]
french_data_decoder_val = french_data_decoder_input[19520:20000,:]
english_val = english_data[19520:20000,:]

In [14]:
print(french_train.shape)
print(french_data_decoder_input_train.shape)
print(english_train.shape)
print(french_val.shape)
print(french_data_decoder_val.shape)
print(english_val.shape)

(19520, 147)
(19520, 147)
(19520, 147)
(480, 147)
(480, 147)
(480, 147)


In [15]:
#Encoder
french_data_tensor = tf.placeholder(tf.int32, shape=(None, input_length_fr))
encoder_inputs = Input(shape=(english_data.shape[1],))
encoder_embedding = Embedding(vocabulary_size_en, latent_dim, input_length=input_length_en)(encoder_inputs)
encoder_output, state_h, state_c = LSTM(latent_dim,
                           return_state=True)(encoder_embedding)
encoder_states = [state_h, state_c] # Encoder States 

#Decoder layers 
decoder_inputs = Input(shape=(french_data_decoder_input.shape[1],))
decoder_embedding = Embedding(vocabulary_size_fr_di, latent_dim, input_length=input_length_fr)
decoder_lstm = LSTM(latent_dim, return_sequences=True, return_state = True)
decoder_dense = Dense(vocabulary_size_fr, activation='linear')

#Decoder layer inputs 
embeddings = decoder_embedding(decoder_inputs)
decoder_outputs, _ , _ = decoder_lstm(embeddings,initial_state=encoder_states)
decoder_outputs = decoder_dense(decoder_outputs)

labels = tf.cast(decoder_inputs, tf.int32)
def seq2seq_loss(decoder_inputs, decoder_outputs):  
    #labels = tf.cast(decoder_inputs, tf.int32)
    target_weights = tf.constant(np.ones((batch_size_seq, input_length_fr)), tf.float32)
    xent_loss = K.sum(tf.contrib.seq2seq.sequence_loss(decoder_outputs, labels, 
                                                         weights=target_weights,
                                                         average_across_timesteps=False,
                                                         average_across_batch=False), axis=-1)
                                                         #softmax_loss_function=softmax_loss_f), axis=-1)

    xcent_loss = K.mean(xent_loss)
    return xcent_loss

model = Model([encoder_inputs, decoder_inputs], decoder_outputs)
model.compile(optimizer='rmsprop', loss= seq2seq_loss)
model.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 147)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 147)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 147, 50)      731250      input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 147, 50)      1071950     input_2[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LS

In [None]:
model.fit([english_train,french_data_decoder_input_train], french_train, epochs = 30, batch_size = batch_size_seq, validation_data = ([english_val,french_data_decoder_val],french_val))

Train on 19520 samples, validate on 480 samples
Epoch 1/30
Epoch 2/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30

In [27]:
encoder_model = Model(encoder_inputs, encoder_states)
decoder_state_input_h = Input(shape=(latent_dim,))
decoder_state_input_c = Input(shape=(latent_dim,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

In [28]:
embeddings = decoder_embedding(decoder_inputs)
decoder_outputs, state_h, state_c = decoder_lstm(embeddings, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_dense(decoder_outputs)
decoder_model = Model([decoder_inputs] + decoder_states_inputs,[decoder_outputs] + decoder_states)

In [29]:
def softmax(x):    
    return np.exp(x) / np.sum(np.exp(x), axis=0)

french_word_index = tokenizer_fr_di.word_index
index2word = {v:k for k,v in french_word_index.items()}

In [30]:
french_word_index

{'de': 1,
 'start': 2,
 'la': 3,
 'a': 4,
 'et': 5,
 'le': 6,
 'les': 7,
 'des': 8,
 'que': 9,
 'en': 10,
 'nous': 11,
 'du': 12,
 'dans': 13,
 'qui': 14,
 'pour': 15,
 'une': 16,
 'ce': 17,
 'je': 18,
 'un': 19,
 'est': 20,
 'au': 21,
 'pas': 22,
 'il': 23,
 'sur': 24,
 'l': 25,
 'ne': 26,
 'par': 27,
 'commission': 28,
 'plus': 29,
 'cette': 30,
 'd': 31,
 'mais': 32,
 'aux': 33,
 'sont': 34,
 'avec': 35,
 'ou': 36,
 'vous': 37,
 'europeenne': 38,
 'se': 39,
 'ces': 40,
 'monsieur': 41,
 'parlement': 42,
 'politique': 43,
 'etre': 44,
 'comme': 45,
 'fait': 46,
 'tout': 47,
 'si': 48,
 'president': 49,
 'ete': 50,
 'pays': 51,
 'ont': 52,
 'rapport': 53,
 'aussi': 54,
 'lunion': 55,
 'cest': 56,
 'cela': 57,
 'dune': 58,
 'quil': 59,
 'm': 60,
 'bien': 61,
 'conseil': 62,
 'egalement': 63,
 'notre': 64,
 'question': 65,
 'faire': 66,
 'y': 67,
 'etats': 68,
 'tres': 69,
 'on': 70,
 'meme': 71,
 'doit': 72,
 'tous': 73,
 'dun': 74,
 'membres': 75,
 'son': 76,
 'europeen': 77,
 'entre'

In [31]:
def decoded_sentence(sentence):
    token = 0
    states_value = encoder_model.predict(sentence)
    target_seq = np.zeros((1, max_len_fr_di))
    target_seq[0][token] = 2
    decoded_sentence = ''
    stop_conditon = False
    while not stop_conditon:    
        output_tokens, states_h_di, states_c_di= decoder_model.predict([target_seq] + states_value)
        outputs = softmax(output_tokens[0][0])
        output_index = np.argmax(outputs)        
        word = index2word.get(output_index + 1)
        decoded_sentence +=  word
        decoded_sentence += ' '        
        if (word == 'eod' or len(decoded_sentence) > max_len_fr or token == max_len_fr):
            stop_conditon = True 
        #target_seq = np.zeros((1, max_len_fr_di))
        target_seq[0][token + 1] = output_index        
        states_value = [states_h_di, states_c_di]
        token += 1         
    return decoded_sentence
    
    

In [32]:
sample = np.reshape(english_train[1], (1,147))
sentences = decoded_sentence(sample)
sentences
    

  from ipykernel import kernelapp as app


'de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de de '

In [None]:
states_value = encoder_model.predict(np.reshape(english_train[5], (1,147)))
target_seq = np.zeros((1, max_len_fr_di))
token = 0
target_seq[0][token] = 2
decoded_sentence = ''


In [None]:
output_tokens, states_h, states_c= decoder_model.predict([target_seq] + states_value)

In [None]:
output_tokens[0][1]

In [None]:
outputs = softmax(output_tokens[0,,:])
outputs

In [None]:
output_index = np.argmax(output_tokens[0,token,:])

In [None]:
word = index2word.get(output_index + 1)
word