<a href="https://colab.research.google.com/github/Kriti-be21/Language-Translation-/blob/main/Seq2Seq_Language_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
import numpy as np
from nltk.corpus import stopwords #provides list of english stopwords
# stop = stopwords.words('english')

In [3]:
stop = stopwords.words('english')

## **Process Data**

In [4]:
train, test = train_test_split(pd.read_csv('/content/ita.txt', sep='\t',header = None), test_size=.10) #, nrows=100000

In [5]:
train.columns = ['english','italian']
print(train.shape)
print(test.shape)
print(train.head())

(146406, 2)
(16268, 2)
                         english                         italian
57818        I need an umbrella.      Ho bisogno di un ombrello.
73069       It serves you right.                    Ti sta bene.
10027              Try the cake.               Provate la torta.
123090  Could you wait a moment?  Potrebbe aspettare un momento?
16293             You're boring.                   Siete noiose.


In [6]:
train['english_lower'] = train['english'].str.lower()
train['english_no_punctuation'] = train['english_lower'].str.replace('[^\w\s]','')
#train['english_no_stopwords'] = train['english_no_punctuation'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
#train["english_no_stopwords"] = train["english_no_stopwords"].fillna("fillna")
#train["english_no_stopwords"] = train["english_no_stopwords"]

  train['english_no_punctuation'] = train['english_lower'].str.replace('[^\w\s]','')


In [7]:
train['italian_lower'] = train["italian"].str.lower()
train['italian_no_punctuation'] =  '_start_' + ' ' +train['italian_lower'].str.replace('[^\w\s]','')+ ' ' +'_end_'


  train['italian_no_punctuation'] =  '_start_' + ' ' +train['italian_lower'].str.replace('[^\w\s]','')+ ' ' +'_end_'


In [8]:
max_features1 = 5000
maxlen1 = 35

max_features2 = 5000
maxlen2 = 35

In [9]:
tok1 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features1)
tok1.fit_on_texts(list(train['english_no_punctuation'])) #fit to cleaned text
tf_train_english =tok1.texts_to_sequences(list(train['english_no_punctuation']))
tf_train_english =tf.keras.preprocessing.sequence.pad_sequences(tf_train_english, maxlen=maxlen1) #let's execute pad step

In [10]:
#the processing has to be done for both
#two different tokenizers

In [11]:
tok2 = tf.keras.preprocessing.text.Tokenizer(num_words=max_features2, filters = '*')
tok2.fit_on_texts(list(train['italian_no_punctuation'])) #fit to cleaned text
tf_train_italian = tok2.texts_to_sequences(list(train['italian_no_punctuation']))
tf_train_italian = tf.keras.preprocessing.sequence.pad_sequences(tf_train_italian, maxlen=maxlen2, padding ='post')

# **Model Architecture**

In [12]:
vectorized_italian = tf_train_italian
# For Decoder Input, you don't need the last word as that is only for prediction
# when we are training using Teacher Forcing.
decoder_input_data = vectorized_italian[:, :-1]

# Decoder Target Data Is Ahead By 1 Time Step From Decoder Input Data (Teacher Forcing)
decoder_target_data = vectorized_italian[:, 1:]

print(f'Shape of decoder input: {decoder_input_data.shape}')
print(f'Shape of decoder target: {decoder_target_data.shape}')

vectorized_english = tf_train_english
# Encoder input is simply the body of the issue text
encoder_input_data = vectorized_english
doc_length = encoder_input_data.shape[1]
print(f'Shape of encoder input: {encoder_input_data.shape}')

Shape of decoder input: (146406, 34)
Shape of decoder target: (146406, 34)
Shape of encoder input: (146406, 35)


In [13]:
vocab_size_encoder = len(tok1.word_index) + 1 #remember vocab size?
vocab_size_decoder = len(tok1.word_index) + 1

In [14]:
#arbitrarly set latent dimension for embedding and hidden units
latent_dim = 40

In [15]:
encoder_inputs = tf.keras.Input(shape=(doc_length,), name='Encoder-Input')

# Word embeding for encoder (English text)
x = tf.keras.layers.Embedding(vocab_size_encoder, latent_dim, name='Body-Word-Embedding', mask_zero=False)(encoder_inputs)


#Batch normalization is used so that the distribution of the inputs
#to a specific layer doesn't change over time
x = tf.keras.layers.BatchNormalization(name='Encoder-Batchnorm-1')(x)


# We do not need the `encoder_output` just the hidden state.
_, state_h = tf.keras.layers.GRU(latent_dim, return_state=True, name='Encoder-Last-GRU')(x)

# Encapsulate the encoder as a separate entity so we can just
#  encode without decoding if we want to.
encoder_model = tf.keras.Model(inputs=encoder_inputs, outputs=state_h, name='Encoder-Model')

seq2seq_encoder_out = encoder_model(encoder_inputs)

########################
#### Decoder Model ####
decoder_inputs = tf.keras.Input(shape=(None,), name='Decoder-Input')  # for teacher forcing

# Word Embedding For Decoder (Italian text)
dec_emb = tf.keras.layers.Embedding(vocab_size_decoder, latent_dim, name='Decoder-Word-Embedding', mask_zero=False)(decoder_inputs)
#again batch normalization
dec_bn = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-1')(dec_emb)

# Set up the decoder, using `decoder_state_input` as initial state.
decoder_gru = tf.keras.layers.GRU(latent_dim, return_state=True, return_sequences=True, name='Decoder-GRU')
decoder_gru_output, _ = decoder_gru(dec_bn, initial_state=seq2seq_encoder_out) #the decoder "decodes" the encoder output.
x = tf.keras.layers.BatchNormalization(name='Decoder-Batchnorm-2')(decoder_gru_output)

# Dense layer for prediction
decoder_dense = tf.keras.layers.Dense(vocab_size_decoder, activation='softmax', name='Final-Output-Dense')
decoder_outputs = decoder_dense(x)

########################
#### Seq2Seq Model ####

seq2seq_Model = tf.keras.Model([encoder_inputs, decoder_inputs], decoder_outputs)
seq2seq_Model.compile(optimizer=tf.keras.optimizers.Nadam(lr=0.001), loss='sparse_categorical_crossentropy')



In [16]:
#from seq2seq_utils import viz_model_architecture
seq2seq_Model.summary()
#viz_model_architecture(seq2seq_Model)

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 Decoder-Input (InputLayer)  [(None, None)]               0         []                            
                                                                                                  
 Decoder-Word-Embedding (Em  (None, None, 40)             324160    ['Decoder-Input[0][0]']       
 bedding)                                                                                         
                                                                                                  
 Encoder-Input (InputLayer)  [(None, 35)]                 0         []                            
                                                                                                  
 Decoder-Batchnorm-1 (Batch  (None, None, 40)             160       ['Decoder-Word-Embedding[0

# **Model Training**

In [17]:
batch_size = 1200
epochs = 6
history = seq2seq_Model.fit([encoder_input_data, decoder_input_data], np.expand_dims(decoder_target_data, -1),
          batch_size=batch_size,  epochs=epochs,  validation_split=0.12)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [18]:
seq2seq_Model.save('seq2seq_full_data_6_epochs.h5')

  saving_api.save_model(


In [19]:
seq2seq_Model1 = seq2seq_Model

In [20]:
test_text = ['today I want to buy food']

# **Results On Holdout Set**

In [21]:
#max_len_title = 30
# get the encoder's features for the decoder
tok1.fit_on_texts(test_text)

In [22]:
raw_tokenized = tok1.texts_to_sequences(test_text)
raw_tokenized = tf.keras.preprocessing.sequence.pad_sequences(raw_tokenized, maxlen=maxlen1)

In [23]:
body_encoding = encoder_model.predict(raw_tokenized)



In [24]:
latent_dim = seq2seq_Model.get_layer('Decoder-Word-Embedding').output_shape[-1]

In [25]:
# Reconstruct the input into the decoder
decoder_inputs = seq2seq_Model.get_layer('Decoder-Input').input
dec_emb = seq2seq_Model.get_layer('Decoder-Word-Embedding')(decoder_inputs)
dec_bn = seq2seq_Model.get_layer('Decoder-Batchnorm-1')(dec_emb)
# Instead of setting the intial state from the encoder and forgetting about it, during inference
# we are not doing teacher forcing, so we will have to have a feedback loop from predictions back into
# the GRU, thus we define this input layer for the state so we can add this capability

In [29]:
gru_inference_state_input = tf.keras.Input(shape=(latent_dim,), name='hidden_state_input')

# we need to reuse the weights that is why we are getting this
# If you inspect the decoder GRU that we created for training, it will take as input
# 2 tensors -> (1) is the embedding layer output for the teacher forcing
#                  (which will now be the last step's prediction, and will be _start_ on the first time step)
#              (2) is the state, which we will initialize with the encoder on the first time step, but then
#                   grab the state after the first prediction and feed that back in again.

In [30]:
gru_out, gru_state_out = seq2seq_Model.get_layer('Decoder-GRU')([dec_bn, gru_inference_state_input])

In [31]:
# Reconstruct dense layers
dec_bn2 = seq2seq_Model.get_layer('Decoder-Batchnorm-2')(gru_out)
dense_out = seq2seq_Model.get_layer('Final-Output-Dense')(dec_bn2)

In [32]:
decoder_model = tf.keras.Model([decoder_inputs, gru_inference_state_input],
                          [dense_out, gru_state_out])

In [33]:
# we want to save the encoder's embedding before its updated by decoder
#   because we can use that as an embedding for other tasks.
original_body_encoding = body_encoding

In [34]:
state_value = np.array(tok2.word_index['_start_']).reshape(1, 1)

In [35]:
state_value

array([[1]])

In [36]:
decoded_sentence = []
stop_condition = False

In [37]:
vocabulary_inv = dict((v, k) for k, v in tok2.word_index.items())
#vocabulary_inv[0] = "<PAD/>"
#vocabulary_inv[1] = "unknown"

In [38]:
while not stop_condition:
    #print(1)
    preds, st = decoder_model.predict([state_value, body_encoding])
    #preds = preds[preds>0]
    # We are going to ignore indices 0 (padding) and indices 1 (unknown)
    # Argmax will return the integer index corresponding to the
    # prediction + 2 b/c we chopped off first two
    pred_idx = np.argmax(preds[:, :, 2:]) + 2
    #print(np.argmax(preds[:, :, 2:]))
    # retrieve word from index prediction
    #pred_word_str = tok.id2token[pred_idx]
    pred_word_str = vocabulary_inv[pred_idx]
    #print(pred_idx)
    print(pred_word_str)
    if pred_word_str == '_end_' or len(decoded_sentence) >= maxlen2:
        stop_condition = True
        break
    decoded_sentence.append(pred_word_str)

    # update the decoder for the next word
    body_encoding = st
    state_value = np.array(pred_idx).reshape(1, 1)
    #print(state_value)

a
sono
che
io
tom
non
è
stato
_end_
