In [6]:
import os
import keras
import numpy
import ten
from keras.utils import to_categorical
from keras.preprocessing.
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, Input, TimeDistributed, Embedding, GRU, Bidirectional, Dropout, RepeatVector
from keras.losses import sparse_categorical_crossentropy
from keras.optimizers import Adam

In [2]:
english_file_path = os.path.join('/content/small_vocab_en.txt')
french_file_path = os.path.join('/content/small_vocab_fr.txt')

In [3]:
with open(english_file_path, 'r') as f:
  english_sentences = f.read().split('\n')

In [4]:
with open(french_file_path, 'r') as f:
  french_sentences = f.read().split('\n')

In [5]:
class Preprocessing():
  def lowercasing(self, text):
    for i in range(len(text)):
      text[i] = text[i].lower()
    return text

  def tokenization(self, lowercased_text):
    tokenizer = Tokenizer(split=' ', char_level=False)
    tokenizer.fit_on_texts(lowercased_text)
    tokenized_text = tokenizer.texts_to_sequences(lowercased_text)
    return tokenized_text, tokenizer

  def padding(self, tokenized_text):
    max_length = max([len(sent) for sent in tokenized_text])
    padded_text = pad_sequences(tokenized_text, maxlen=max_length, padding='post', truncating='post')
    return padded_text

In [6]:
#English Text

preprocessing = Preprocessing()
english_lowercase = preprocessing.lowercasing(english_sentences)
english_tokenized_text, english_tokenizer = preprocessing.tokenization(english_lowercase)
english_padded_text = preprocessing.padding(english_tokenized_text)

print(english_padded_text[0:5])

[[17 23  1  8 67  4 39  7  3  1 55  2 44  0  0]
 [ 5 20 21  1  9 62  4 43  7  3  1  9 51  2 45]
 [22  1  9 67  4 38  7  3  1  9 68  2 34  0  0]
 [ 5 20 21  1  8 64  4 34  7  3  1 57  2 42  0]
 [29 12 16 13  1  5 82  6 30 12 16  1  5 83  0]]


In [7]:
#French Text Preprocessing

french_tokenized_text, french_tokenizer = preprocessing.tokenization(french_sentences)
french_padded_text = preprocessing.padding(french_tokenized_text)

print(french_padded_text[0:5])

[[ 34  33   1   8  67  37  11  24   6   3   1 112   2  52   0   0   0   0
    0   0   0]
 [  4  30  29   1  12  19   2  55   6   3  95  69   2  44   0   0   0   0
    0   0   0]
 [101   1  12  67   2  48   6   3   1  12  20   2  43   0   0   0   0   0
    0   0   0]
 [  4  30  29   1   8 263   2  43   6   3 103  19   2  51   0   0   0   0
    0   0   0]
 [ 39  13  15  16   1  10  90   5  40  13  15   1   7  89   0   0   0   0
    0   0   0]]


In [8]:
max_english_sequence_length = english_padded_text.shape[1]
max_french_sequence_length = french_padded_text.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
french_vocab_size = len(french_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max French sentence length:", max_french_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("French vocabulary size:", french_vocab_size)

Data Preprocessed
Max English sentence length: 15
Max French sentence length: 21
English vocabulary size: 199
French vocabulary size: 340


In [9]:
english_padded_text = pad_sequences(english_padded_text[:french_padded_text.shape[0]], max_french_sequence_length)
padded_fre = pad_sequences(french_padded_text[:french_padded_text.shape[0]], max_french_sequence_length)
tmp_x = padded_fre.reshape((-1, french_padded_text.shape[-2], max_french_sequence_length)) #Reshaping into (Batch size, timesteps, sequence length)

In [10]:
#tmp_x_encoded = to_categorical(y=tmp_x[0], num_classes=(french_vocab_size+1)) #One hot encoding
#french_padded_text_encoded = to_categorical(y=french_padded_text, num_classes=(french_vocab_size+1))

In [11]:
#batch_size, sequence_length, max_length = tmp_x.shape

In [12]:
#french_padded_text = numpy.reshape(french_padded_text, (batch_size, sequence_length, max_length))

In [13]:
english_padded_text.shape

(99772, 21)

In [14]:
padded_fre.shape

(99772, 21)

In [15]:
tmp_x.shape

(1, 99772, 21)

In [16]:
french_padded_text.shape

(99772, 21)

In [21]:
#RNN with Embedding

rnn_embed_model = Sequential()
rnn_embed_model.add(Input(shape=(tmp_x.shape[0]), name='Input Layer'))
rnn_embed_model.add(Embedding(input_dim=french_vocab_size+1, output_dim=512, input_length=max_french_sequence_length))
rnn_embed_model.add(GRU(units=64, return_sequences=True))
rnn_embed_model.add(GRU(units=32, return_sequences=True))
rnn_embed_model.add(GRU(units=32, return_sequences=True))
rnn_embed_model.add(TimeDistributed(Dense(units=english_vocab_size+1, activation='softmax')))

In [22]:
rnn_embed_model.summary()

rnn_embed_model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy', metrics=['Accuracy'])

rnn_embed_model.fit(tmp_x[0], english_padded_text, verbose=1, batch_size=32, epochs=15, validation_split=0.2)

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 1, 512)            174592    
                                                                 
 gru_6 (GRU)                 (None, 1, 64)             110976    
                                                                 
 gru_7 (GRU)                 (None, 1, 32)             9408      
                                                                 
 gru_8 (GRU)                 (None, 1, 32)             6336      
                                                                 
 time_distributed_2 (TimeDi  (None, 1, 200)            6600      
 stributed)                                                      
                                                                 
Total params: 307912 (1.17 MB)
Trainable params: 307912 (1.17 MB)
Non-trainable params: 0 (0.00 Byte)
__________________

<keras.src.callbacks.History at 0x7d29718c9bd0>

In [25]:
#Bidirectional RNN

bi_rnn_model = Sequential()
bi_rnn_model.add(Input(shape=(tmp_x.shape[0]), name='Input Layer'))
bi_rnn_model.add(Embedding(input_dim=french_vocab_size+1, output_dim=128, input_length=max_french_sequence_length))
bi_rnn_model.add(Bidirectional(layer=LSTM(32, return_sequences=True)))
bi_rnn_model.add(TimeDistributed(Dense(units=english_vocab_size+1, activation='softmax')))

In [26]:
bi_rnn_model.summary()

bi_rnn_model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy', metrics=['Accuracy'])

bi_rnn_model.fit(tmp_x[0], english_padded_text, verbose=1, batch_size=32, epochs=15, validation_split=0.2)

Model: "sequential_4"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 1, 128)            43648     
                                                                 
 bidirectional_2 (Bidirecti  (None, 1, 64)             41216     
 onal)                                                           
                                                                 
 time_distributed_4 (TimeDi  (None, 1, 200)            13000     
 stributed)                                                      
                                                                 
Total params: 97864 (382.28 KB)
Trainable params: 97864 (382.28 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
E

<keras.src.callbacks.History at 0x7d296828f190>

In [33]:
model = Sequential()

#Encoder
model.add(Input(shape=(tmp_x.shape[0]), name='Input Layer'))
model.add(Embedding(input_dim=french_vocab_size+1, output_dim=512, input_length=max_french_sequence_length))
model.add(LSTM(units=256, return_sequences=True))
model.add(Bidirectional(layer=LSTM(128, return_sequences=False)))

#context vector
model.add(RepeatVector(n=max_french_sequence_length))

#Decoder
model.add(LSTM(units=256, return_sequences=True))
model.add(LSTM(units=128, return_sequences=True))
#model.add(LSTM(units=16, return_sequences=True))
model.add(TimeDistributed(Dense(units=english_vocab_size+1, activation='softmax')))

In [34]:
model.summary()

model.compile(optimizer='Adam', loss='sparse_categorical_crossentropy', metrics=['Accuracy'])

model.fit(tmp_x[0], english_padded_text, verbose=1, batch_size=32, epochs=25, validation_split=0.2)

Model: "sequential_8"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_8 (Embedding)     (None, 1, 512)            174592    
                                                                 
 lstm_20 (LSTM)              (None, 1, 256)            787456    
                                                                 
 bidirectional_8 (Bidirecti  (None, 256)               394240    
 onal)                                                           
                                                                 
 repeat_vector_3 (RepeatVec  (None, 21, 256)           0         
 tor)                                                            
                                                                 
 lstm_22 (LSTM)              (None, 21, 256)           525312    
                                                                 
 lstm_23 (LSTM)              (None, 21, 128)          

<keras.src.callbacks.History at 0x7d29503f2620>

In [35]:
model.save('language_translation_encdec.keras')

In [36]:
def final_predictions():
  sentence = 'chine est généralement agréable en novembre et il est jamais tranquille en octobre'
  sentence = [french_tokenizer.word_index[word] for word in sentence.split()]
  sentence = pad_sequences([sentence], maxlen=max_french_sequence_length, padding='post')
  #print(french_tokenized_text[0])
  sentences = numpy.array([sentence[0], french_padded_text[0]])
  predictions = model.predict(sentences, len(sentences))
  #print(predictions)
  eng_id_to_word = {value: key for key, value in english_tokenizer.word_index.items()}
  eng_id_to_word[0] = ''
  print(' '.join([eng_id_to_word[numpy.argmax(value)] for value in predictions[0]]))
  #print(eng_id_to_word)

final_predictions()

      china is usually pleasant during november and it is never quiet in october  
