Load the dataset into array

In [3]:
with open("./drive/MyDrive/entofr_dataset/en.txt") as file:
    en_data = ["sos "+line.rstrip()+" eos" for line in file]

with open("./drive/MyDrive/entofr_dataset/fr.txt") as file:
    fr_data = ["sos " + line.rstrip() +" eos" for line in file]


Visualize some of the data

In [4]:
for en_sent, fr_sent in zip(en_data[:2], fr_data[:2]):
  print("the original: ", en_sent)
  print("the translation: ", fr_sent)

the original:  sos new jersey is sometimes quiet during autumn , and it is snowy in april . eos
the translation:  sos new jersey est parfois calme pendant l' automne , et il est neigeux en avril . eos
the original:  sos the united states is usually chilly during july , and it is usually freezing in november . eos
the translation:  sos les états-unis est généralement froid en juillet , et il gèle habituellement en novembre . eos


calculate the mean and the size of english **vocab**

In [5]:
import numpy as np

en_sent_lengths = [len(en_sent.split(" ")) for en_sent in en_data]
en_mean_length = np.mean(en_sent_lengths)
print('(English) Mean sentence length: ', en_mean_length)

(English) Mean sentence length:  15.225678224285508


In [6]:
all_words = []
for sent in en_data:
  all_words.extend(sent.split(" "))
en_vocab_size = len(set(all_words))
print("(English) Vocabulary size: ", en_vocab_size)

(English) Vocabulary size:  230


built the encoder

In [7]:
from tensorflow.keras.layers import Input, GRU
from tensorflow.keras import Model

en_inputs = Input(shape=(15, en_vocab_size))

en_gru = GRU(128, return_state=True)
en_out, en_state = en_gru(en_inputs)

encoder = Model(inputs=en_inputs, outputs=en_state)

# en_inputs = Input(shape=(15, en_vocab_size))
# en_gru = GRU(64, return_state=True)
# en_out, en_state = en_gru(en_inputs)

# print(encoder.summary())

calculate the mean and the size of french **vocab**


In [8]:
sent_lengths = [len(fr_sent.split(" ")) for fr_sent in fr_data]
fr_mean_length = np.mean(sent_lengths)
print('(French) Mean sentence length: ', fr_mean_length)

all_words = []
for sent in fr_data:
  all_words.extend(sent.split(" "))
fr_vocab_size = len(set(all_words))
print("(French) Vocabulary size: ", fr_vocab_size)

(French) Mean sentence length:  16.226730015958218
(French) Vocabulary size:  358


built the  decoder

In [9]:
from tensorflow.keras.layers import RepeatVector

de_inputs = RepeatVector(15)(en_state)

decoder_gru = GRU(128, return_sequences=True)

gru_outputs = decoder_gru(de_inputs, initial_state=en_state)

# de_inputs = Input(shape=(15-1, fr_vocab_size))
# de_gru = GRU(64, return_sequences=True)
# de_out = de_gru(de_inputs, initial_state=en_state)


In [10]:
enc_dec = Model(inputs=en_inputs, outputs=gru_outputs) # the sequenece to seq model
print(enc_dec.summary())

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 15, 230)]    0           []                               
                                                                                                  
 gru (GRU)                      [(None, 128),        138240      ['input_1[0][0]']                
                                 (None, 128)]                                                     
                                                                                                  
 repeat_vector (RepeatVector)   (None, 15, 128)      0           ['gru[0][1]']                    
                                                                                                  
 gru_1 (GRU)                    (None, 15, 128)      99072       ['repeat_vector[0][0]',    

built timedistributed and dense layer

In [11]:
from tensorflow.keras.layers import Dense, TimeDistributed

de_dense = Dense(fr_vocab_size, activation='softmax')
de_dense_time = TimeDistributed(de_dense)
de_pred = de_dense_time(gru_outputs)

# de_dense = TimeDistributed(Dense(fr_vocab_size, activation='softmax'))
# de_pred = de_dense(de_out)

compile the full encoder decoder network

In [12]:
nmt = Model(inputs=en_inputs, outputs=de_pred)
nmt.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])
# nmt_tf = Model(inputs=[en_inputs, de_inputs], outputs=de_pred)
# nmt_tf.compile(optimizer='adam', loss="categorical_crossentropy", metrics=["acc"])

Tokenize the datasets

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Define a Keras Tokenizer
en_tok = Tokenizer(num_words=500, oov_token='UNK')
en_tok.fit_on_texts(en_data)
fr_tok= Tokenizer(num_words=500, oov_token='UNK')
fr_tok.fit_on_texts(fr_data)

initialize sentances to seqs that doing padding and reversig vector


In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def en_sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    encoded_text = en_tok.texts_to_sequences(sentences)
    preproc_text = pad_sequences(encoded_text, padding=pad_type, truncating='post', maxlen=15)
    if reverse:
      # Reverse the text using numpy axis reversing
      preproc_text = preproc_text[:,::-1]
    if onehot:
        preproc_text = to_categorical(preproc_text, num_classes=en_vocab_size)
    return preproc_text

def fr_sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    encoded_text = fr_tok.texts_to_sequences(sentences)
    preproc_text = pad_sequences(encoded_text, padding=pad_type, truncating='post', maxlen=15)
    if reverse:
      # Reverse the text using numpy axis reversing
      preproc_text = preproc_text[:,::-1]
    if onehot:
        preproc_text = to_categorical(preproc_text, num_classes=fr_vocab_size)
    return preproc_text


shuffle and split the dataset

In [15]:
import math

train_size, valid_size = math.floor(len(en_data)*0.8), math.floor(len(en_data)*0.2)
inds = np.arange(len(en_data))
np.random.shuffle(inds)
train_inds = inds[:train_size]
valid_inds = inds[train_size:train_size+valid_size]

tr_en = [en_data[ti] for ti in train_inds]
tr_fr = [fr_data[ti] for ti in train_inds]
v_en = [en_data[ti] for ti in valid_inds]
v_fr = [fr_data[ti] for ti in valid_inds]

print(train_size)

110288


train the nmt

In [16]:
# import tensorflow as tf

# tf.config.run_functions_eagerly(True)


In [17]:
v_en_x = en_sents2seqs('source', v_en, onehot=True, pad_type='pre')
v_de_y = fr_sents2seqs('target', v_fr, onehot=True)
n_epochs, bsize = 8, 250
for ei in range(n_epochs):
  for i in range(0,train_size,bsize):
    en_x = en_sents2seqs('source', tr_en[i:i+bsize], onehot=True, pad_type='pre')
    de_y = fr_sents2seqs('target', tr_fr[i:i+bsize], onehot=True)
    nmt.train_on_batch(en_x, de_y)

  res = nmt.evaluate(v_en_x, v_de_y, batch_size=valid_size, verbose=0)
  print("Epoch: {} => Loss:{}, Val Acc: {}".format(ei+1,res[0], res[1]*100.0))



# Convert validation data to onehot
# v_en_x = en_sents2seqs('source', v_en, onehot=True, reverse=True)
# v_de_y = fr_sents2seqs('target', v_fr, onehot=True)

# n_epochs, bsize = 50, 250
# for ei in range(n_epochs):
#   for i in range(0,train_size,bsize):
#     # Get a single batch of inputs and outputs
#     en_x = en_sents2seqs('source', tr_en[i:i+bsize], onehot=True, reverse=True)
#     de_y = fr_sents2seqs('target', tr_fr[i:i+bsize], onehot=True)
#     # Train the model on a single batch of data
#     nmt_tf.train_on_batch(en_x, de_y)
#   # Evaluate the trained model on the validation data
#   res = nmt_tf.evaluate(v_en_x, v_de_y, batch_size=valid_size, verbose=0)
#   print("{} => Loss:{}, Val Acc: {}".format(ei+1,res[0], res[1]*100.0))

# n_epochs, bsize = 3, 250
# for ei in range(n_epochs):
#   for i in range(0,train_size,bsize):
#     # Encoder inputs, decoder inputs and outputs
#     en_x = en_sents2seqs('source', tr_en[i:i+bsize], onehot=True, reverse=True)
#     de_xy = fr_sents2seqs('target', tr_fr[i:i+bsize], onehot=True)
#     # Separating decoder inputs and outputs
#     de_x = de_xy[:,:-1,:]
#     de_y = de_xy[:,1:,:]
#     # Training and evaulating on a single batch
#     nmt_tf.train_on_batch([en_x,de_x], de_y)
#   v_en_x = en_sents2seqs('source', v_en, onehot=True, reverse=True)
#   v_de_xy = fr_sents2seqs('target', v_fr, onehot=True)
#   v_de_x, v_de_y = v_de_xy[:,:-1,:], v_de_xy[:,1:,:]
#   res = nmt_tf.evaluate([v_en_x, v_de_x], v_de_y, batch_size=valid_size, verbose=0)
#   print("Epoch {} => Loss:{}, Val Acc: {}".format(ei+1,res[0], res[1]*100.0))

Epoch: 1 => Loss:2.0619122982025146, Val Acc: 44.98984515666962
Epoch: 2 => Loss:1.5855821371078491, Val Acc: 55.467867851257324
Epoch: 3 => Loss:1.337695598602295, Val Acc: 61.47516965866089
Epoch: 4 => Loss:1.137661337852478, Val Acc: 66.75081253051758
Epoch: 5 => Loss:0.9961738586425781, Val Acc: 70.59794664382935
Epoch: 6 => Loss:0.8775573372840881, Val Acc: 73.66095781326294
Epoch: 7 => Loss:0.8031511306762695, Val Acc: 76.00536942481995
Epoch: 8 => Loss:0.7090322375297546, Val Acc: 78.8957417011261


built model for inference

In [18]:
# inf_en_inputs = Input(shape=(15, en_vocab_size))
# inf_en_gru = GRU(64, return_state=True)
# inf_en_out, inf_en_state = inf_en_gru(inf_en_inputs)

# inf_de_inputs = Input(shape=(1, fr_vocab_size))
# inf_de_state_in = Input(shape=(64,))

# inf_de_gru = GRU(64, return_state=True)
# inf_de_out, inf_de_state_out = inf_de_gru(inf_de_inputs, initial_state=inf_de_state_in)
# inf_de_dense = Dense(fr_vocab_size, activation='softmax')
# inf_de_pred = inf_de_dense(inf_de_out)

# encoder = Model(inputs=inf_en_inputs, outputs=inf_en_state)
# decoder = Model(inputs=[inf_de_inputs, inf_de_state_in], outputs=[inf_de_pred, inf_de_state_out])

# inf_en_gru.set_weights(en_gru.get_weights())

test

In [19]:
en_sent = ["sos new jersey is sometimes quiet during autumn eos"]
en_seq = en_sents2seqs('source', en_sent, onehot=True, reverse=True)
fr_pred = nmt.predict(en_seq)
fr_seq = np.argmax(fr_pred, axis=-1)[0]

print(fr_seq)
translation = ''
for i in fr_seq:
  if i == 0:break
  translation += ' ' + fr_tok.index_word[i]

print(translation)

# print(fr_tok.sequences_to_text(fr_seq))


########################################33
# def word2onehot(tokenizer, word, vocab_size):
#     sequence = tokenizer.texts_to_sequences([word])[0]
#     onehot = to_categorical(sequence, num_classes=vocab_size)
#     return onehot

# def probs2word(de_prob, fr_tok):
#     """
#     Convert the decoder's output probabilities to the actual word with the highest probability
#     """
#     # Get the index of the word with the highest probability
#     de_idx = np.argmax(de_prob)
#     # Convert the index to the actual word using the French tokenizer object
#     de_word = fr_tok.index_word[de_idx]
#     return de_word

# en_sent = ['new jersey is sometimes quiet during autumn']
# en_seq = en_sents2seqs('source', en_sent, onehot=True, reverse=True)

# de_s_t = encoder.predict(en_seq)
# de_seq = word2onehot(fr_tok, 'sos', fr_vocab_size)
# fr_sent = ''
# for _ in range(15):
#   de_prob, de_s_t = decoder.predict([de_seq[np.newaxis,:],de_s_t])
#   de_w = probs2word(de_prob, fr_tok)
#   de_seq = word2onehot(fr_tok, de_w, fr_vocab_size)
#   if de_w == 'eos': break
#   fr_sent += de_w + ' '

# print(fr_sent)
##########################################

# # Convert the output indices to text
# output_tokens = fr_tok.sequences_to_texts(fr_seq)
# output_text = ' '.join(output_tokens)
# print(output_text)

# pad_seq = sents2seqs('source',en_sent, en_vocab_size, reverse=True)
# print(pad_seq)
# fr_sentence = ' '.join([all_words[i] for i in fr_seq if i != 0])

[ 3 65  2 24 18 45  9  6  2 11  5 45  4  0  0]
 sos californie est chaud au printemps et il est parfois en printemps eos


In [20]:
import numpy as np
from tensorflow.keras.models import load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer

print("Load data")

with open("./drive/MyDrive/entofr_dataset/en.txt") as file:
    en_data = ["sos "+line.rstrip()+" eos" for line in file]

with open("./drive/MyDrive/entofr_dataset/fr.txt") as file:
    fr_data = ["sos " + line.rstrip() +" eos" for line in file]


print("Tockenization started")

# Define a Keras Tokenizer
en_tok = Tokenizer(num_words=230, oov_token='UNK')
en_tok.fit_on_texts(en_data)
fr_tok= Tokenizer(num_words=358, oov_token='UNK')
fr_tok.fit_on_texts(fr_data)

print("Tockenization compeleted")


def en_sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    encoded_text = en_tok.texts_to_sequences(sentences)
    preproc_text = pad_sequences(encoded_text, padding=pad_type, truncating='post', maxlen=15)
    if reverse:
      # Reverse the text using numpy axis reversing
      preproc_text = preproc_text[:,::-1]
    if onehot:
        preproc_text = to_categorical(preproc_text, num_classes=230)
    return preproc_text



en_sent = ["sos new jersey is sometimes quiet during autumn eos"]
en_seq = en_sents2seqs('source', en_sent, onehot=True, reverse=False)
print(en_seq)

nmt = load_model('./drive/MyDrive/Colab Files/NLP_Model')

fr_pred = nmt.predict(en_seq)
fr_seq = np.argmax(fr_pred, axis=-1)[0]

print(fr_seq)
translation = ''
for i in fr_seq:
  if i == 0:break
  translation += ' ' + fr_tok.index_word[i]

print(translation)

Load data
Tockenization started
Tockenization compeleted
[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]]
[ 3 38 37  2 11 70 40 14 27  8  6  2 66  5 48]
 sos new jersey est parfois calme pendant l' automne mais il est sec en mars
