In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [1]:
with open("C:\\Users\\Hossam\\Desktop\\NLP_projectDataset\\en.txt") as file:
    en_data = ["sos "+line.rstrip()+" eos" for line in file]

with open("C:\\Users\\Hossam\\Desktop\\NLP_projectDataset\\fr.txt") as file:
    fr_data = ["sos " + line.rstrip() +" eos" for line in file]


In [2]:
for en_sent, fr_sent in zip(en_data[:2], fr_data[:2]):
  print("the original: ", en_sent)
  print("the translation: ", fr_sent)

the original:  sos new jersey is sometimes quiet during autumn , and it is snowy in april . eos
the translation:  sos new jersey est parfois calme pendant l' automne , et il est neigeux en avril . eos
the original:  sos the united states is usually chilly during july , and it is usually freezing in november . eos
the translation:  sos les états-unis est généralement froid en juillet , et il gèle habituellement en novembre . eos


In [3]:
import numpy as np

en_sent_lengths = [len(en_sent.split(" ")) for en_sent in en_data]
en_mean_length = np.mean(en_sent_lengths)
print('(English) Mean sentence length: ', en_mean_length)


all_words = []
for sent in en_data:
  all_words.extend(sent.split(" "))
en_vocab_size = len(set(all_words))
print("(English) Vocabulary size: ", en_vocab_size)

(English) Mean sentence length:  15.225678224285508
(English) Vocabulary size:  230


In [4]:
from tensorflow.keras.layers import Input, GRU
from tensorflow.keras import Model

en_inputs = Input(shape=(15, en_vocab_size))

en_gru = GRU(256, return_state=True)
en_out, en_state = en_gru(en_inputs)

encoder = Model(inputs=en_inputs, outputs=en_state)


In [5]:
sent_lengths = [len(fr_sent.split(" ")) for fr_sent in fr_data]
fr_mean_length = np.mean(sent_lengths)
print('(French) Mean sentence length: ', fr_mean_length)

all_words = []
for sent in fr_data:
  all_words.extend(sent.split(" "))
fr_vocab_size = len(set(all_words))
print("(French) Vocabulary size: ", fr_vocab_size)

(French) Mean sentence length:  16.226730015958218
(French) Vocabulary size:  358


In [6]:
from tensorflow.keras.layers import RepeatVector

de_inputs = RepeatVector(15)(en_state)

decoder_gru = GRU(256, return_sequences=True)

gru_outputs = decoder_gru(de_inputs, initial_state=en_state)


In [7]:
enc_dec = Model(inputs=en_inputs, outputs=gru_outputs) # the sequenece to seq model
print(enc_dec.summary())

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_1 (InputLayer)           [(None, 15, 230)]    0           []                               
                                                                                                  
 gru (GRU)                      [(None, 256),        374784      ['input_1[0][0]']                
                                 (None, 256)]                                                     
                                                                                                  
 repeat_vector (RepeatVector)   (None, 15, 256)      0           ['gru[0][1]']                    
                                                                                                  
 gru_1 (GRU)                    (None, 15, 256)      394752      ['repeat_vector[0][0]',    

In [8]:
from tensorflow.keras.layers import Dense, TimeDistributed

de_dense = Dense(fr_vocab_size, activation='softmax')
de_dense_time = TimeDistributed(de_dense)
de_pred = de_dense_time(gru_outputs)

In [9]:
nmt = Model(inputs=en_inputs, outputs=de_pred)
nmt.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['acc'])

In [10]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Define a Keras Tokenizer
en_tok = Tokenizer(num_words=230, oov_token='UNK')
en_tok.fit_on_texts(en_data)
fr_tok= Tokenizer(num_words=358, oov_token='UNK')
fr_tok.fit_on_texts(fr_data)

In [11]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical

def en_sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    encoded_text = en_tok.texts_to_sequences(sentences)
    preproc_text = pad_sequences(encoded_text, padding=pad_type, truncating='post', maxlen=15)
    if reverse:
      # Reverse the text using numpy axis reversing
      preproc_text = preproc_text[:,::-1]
    if onehot:
        preproc_text = to_categorical(preproc_text, num_classes=en_vocab_size)
    return preproc_text

def fr_sents2seqs(input_type, sentences, onehot=False, pad_type='post', reverse=False):
    encoded_text = fr_tok.texts_to_sequences(sentences)
    preproc_text = pad_sequences(encoded_text, padding=pad_type, truncating='post', maxlen=15)
    if reverse:
      # Reverse the text using numpy axis reversing
      preproc_text = preproc_text[:,::-1]
    if onehot:
        preproc_text = to_categorical(preproc_text, num_classes=fr_vocab_size)
    return preproc_text


In [12]:
import math

train_size, valid_size = math.floor(len(en_data)*0.8), math.floor(len(en_data)*0.2)
inds = np.arange(len(en_data))
np.random.shuffle(inds)
train_inds = inds[:train_size]
valid_inds = inds[train_size:train_size+valid_size]

tr_en = [en_data[ti] for ti in train_inds]
tr_fr = [fr_data[ti] for ti in train_inds]
v_en = [en_data[ti] for ti in valid_inds]
v_fr = [fr_data[ti] for ti in valid_inds]

print(train_size)

110288


In [13]:
v_en_x = en_sents2seqs('source', v_en, onehot=True, pad_type='pre')
v_de_y = fr_sents2seqs('target', v_fr, onehot=True)
n_epochs, bsize = 30, 32
for ei in range(n_epochs):
  for i in range(0,train_size,bsize):
    en_x = en_sents2seqs('source', tr_en[i:i+bsize], onehot=True, pad_type='pre')
    de_y = fr_sents2seqs('target', tr_fr[i:i+bsize], onehot=True)
    nmt.train_on_batch(en_x, de_y)

  res = nmt.evaluate(v_en_x, v_de_y, batch_size=valid_size, verbose=0)
  print("Epoch: {} => Loss:{}, Val Acc: {}".format(ei+1,res[0], res[1]*100.0))

Epoch: 1 => Loss:1.032729148864746, Val Acc: 67.95807480812073
Epoch: 2 => Loss:0.45387357473373413, Val Acc: 86.79215908050537
Epoch: 3 => Loss:0.25966891646385193, Val Acc: 92.14227199554443
Epoch: 4 => Loss:0.19778800010681152, Val Acc: 93.98738145828247
Epoch: 5 => Loss:0.16945427656173706, Val Acc: 94.82614994049072
Epoch: 6 => Loss:0.14245742559432983, Val Acc: 95.6487238407135
Epoch: 7 => Loss:0.1310824155807495, Val Acc: 95.96281051635742
Epoch: 8 => Loss:0.13977506756782532, Val Acc: 95.84336876869202
Epoch: 9 => Loss:0.12922358512878418, Val Acc: 96.13303542137146
Epoch: 10 => Loss:0.12405239790678024, Val Acc: 96.40021324157715
Epoch: 11 => Loss:0.11237531900405884, Val Acc: 96.71212434768677
Epoch: 12 => Loss:0.10506541281938553, Val Acc: 96.93142771720886
Epoch: 13 => Loss:0.11304385960102081, Val Acc: 96.84631824493408
Epoch: 14 => Loss:0.12102416902780533, Val Acc: 96.71913385391235
Epoch: 15 => Loss:0.10307396948337555, Val Acc: 97.04385995864868
Epoch: 16 => Loss:0.115

In [35]:
# new jersey is sometimes quiet during autumn
en_sent = ["sos france is busy during august eos"]
en_seq = en_sents2seqs('source', en_sent, onehot=True, reverse=False)
print(en_seq)

fr_pred = nmt.predict(en_seq)
fr_seq = np.argmax(fr_pred, axis=-1)[0]

print(fr_seq)
translation = ''
for i in fr_seq:
  if i == 0:break
  translation += ' ' + fr_tok.index_word[i]

print(translation)

[[[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 1. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]]
[ 3 36  2 68 18 28 82 59  9  6  2 15 22 29 14]
 sos france est occupé au mois d' août et il est généralement froid à l'
