## Task-1
 You need to translate each word or sentence from English to Spanish, French and German 


## German to English Translation : 

### Text data Preparation

In [14]:
import string 
import re 
from pickle import dump 
from unicodedata import normalize 
from numpy import array
# load doc into memory 
def load_doc(filename): 
    # open the file as read only 
    file = open(filename, mode= 'rt' , encoding = 'utf-8' )
    # read all text 
    text = file.read() 
    # close the file 
    file.close() 
    return text
# split a loaded document into sentences 
def to_pairs(doc): 
    lines = doc.strip().split('\n' ) 
    pairs = [line.split('\t') for line in lines] 
    return pairs
# clean a list of lines 
def clean_pairs(lines): 
    cleaned = list() 
    # prepare regex for char filtering 
    re_punc = re.compile('[%s]' % re.escape(string.punctuation)) 
    re_print = re.compile('[^%s]' % re.escape(string.printable)) 
    for pair in lines: 
        clean_pair = list() 
        for line in pair: 
            # normalize unicode characters 
            line = normalize('NFD' , line).encode('ascii' , 'ignore' ) 
            line = line.decode('UTF-8' )
            # tokenize on white space 
            line = line.split() 
            # convert to lowercase 
            line = [word.lower() for word in line] 
            # remove punctuation from each token 
            line = [re_punc.sub('' , w) for w in line] 
            # remove non-printable chars form each token 
            line = [re_print.sub('' , w) for w in line] 
            # remove tokens with numbers in them 
            line = [word for word in line if word.isalpha()] 
            # store as string 
            clean_pair.append(' '.join(line)) 
        cleaned.append(clean_pair) 
    return array(cleaned)
# save a list of clean sentences to file 
def save_clean_data(sentences, filename): 
    dump(sentences, open(filename, 'wb' )) 
    print('Saved: {}'.format(filename))

In [None]:
# load dataset 
filename = 'deu.txt' 
doc = load_doc(filename) 
# split into english-german pairs 
pairs = to_pairs(doc) 
# clean sentences 
clean_pairs = clean_pairs(pairs) 
# save clean pairs to file 
save_clean_data(clean_pairs, 'english-german.pkl' ) 
# spot check 
for i in range(80,100): 
    print('{} => {}'.format(clean_pairs[i,0], clean_pairs[i,1]))

In [None]:
#doc

In [None]:
pairs[5]

In [None]:
clean_pairs[7]

### Splitting the text

In [None]:
from pickle import load 
from pickle import dump 
from numpy.random import shuffle

# load a clean dataset 
def load_clean_sentences(filename): 
    return load(open(filename, 'rb' ))
# load dataset 
raw_dataset = load_clean_sentences('english-german.pkl' )
# reduce dataset size 
n_sentences = 10000 
dataset = raw_dataset[:n_sentences, :] 
# random shuffle 
shuffle(dataset) 
# split into train/test 
train, test = dataset[:9000], dataset[9000:] 
# save
save_clean_data(dataset, 'english-german-both.pkl' ) 
save_clean_data(train, 'english-german-train.pkl' ) 
save_clean_data(test, 'english-german-test.pkl' )

In [None]:
dataset

### Train Translation Model

In [1]:
from pickle import load 
from numpy import array 
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from tensorflow.keras.utils import to_categorical 
from keras.utils.vis_utils import plot_model 
from keras.models import Sequential 
from keras.layers import LSTM 
from keras.layers import Dense 
from keras.layers import Embedding 
from keras.layers import RepeatVector 
from keras.layers import TimeDistributed 
from keras.callbacks import ModelCheckpoint

In [6]:
# load a clean dataset 
def load_clean_sentences(filename): 
    return load(open(filename, 'rb' ))
# fit a tokenizer 
def create_tokenizer(lines): 
    tokenizer = Tokenizer() 
    tokenizer.fit_on_texts(lines) 
    return tokenizer
# max sentence length 
def max_length(lines): 
    return max(len(line.split()) for line in lines)
# encode and pad sequences 
def encode_sequences(tokenizer, length, lines): 
    # integer encode sequences 
    X = tokenizer.texts_to_sequences(lines) 
    # pad sequences with 0 values 
    X = pad_sequences(X, maxlen=length, padding= 'post' )
    return X

In [3]:
# one hot encode target sequence 
def encode_output(sequences, vocab_size): 
    ylist = list() 
    for sequence in sequences: 
        encoded = to_categorical(sequence, num_classes=vocab_size) 
        ylist.append(encoded)
    y = array(ylist) 
    y = y.reshape(sequences.shape[0], sequences.shape[1], vocab_size) 
    return y


In [4]:
# define NMT model 
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units): 
    model = Sequential()
    model.add(Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True)) 
    model.add(LSTM(n_units))
    model.add(RepeatVector(tar_timesteps)) 
    model.add(LSTM(n_units, return_sequences=True)) 
    model.add(TimeDistributed(Dense(tar_vocab, activation= 'softmax' ))) 
    # compile model  
    model.compile(optimizer= 'adam' , loss= 'categorical_crossentropy' ) 
    # summarize defined model 
    model.summary() 
    plot_model(model, to_file= 'model.png' , show_shapes=True) 
    return model

In [7]:

# load datasets 
dataset = load_clean_sentences('english-german-both.pkl' ) 
train = load_clean_sentences('english-german-train.pkl' ) 
test = load_clean_sentences('english-german-test.pkl' ) 
# prepare english tokenizer 
eng_tokenizer = create_tokenizer(dataset[:, 0]) 
eng_vocab_size = len(eng_tokenizer.word_index) + 1 
eng_length = max_length(dataset[:, 0]) 
print('English Vocabulary Size: {}'.format(eng_vocab_size) )
print('English Max Length: {}'.format(eng_length)) 
# prepare german tokenizer 
ger_tokenizer = create_tokenizer(dataset[:, 1]) 
ger_vocab_size = len(ger_tokenizer.word_index) + 1 
ger_length = max_length(dataset[:, 1]) 
print('German Vocabulary Size: {}'.format(ger_vocab_size) )
print('German Max Length: {}'.format(ger_length)) 


English Vocabulary Size: 2176
English Max Length: 5
German Vocabulary Size: 3534
German Max Length: 9


In [23]:
# prepare training data 
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1]) 
trainY = encode_sequences(eng_tokenizer, eng_length, train[:, 0]) 

In [25]:
trainX.shape

(9000, 9)

In [28]:
trainX

array([[  19,    5,  126, ...,    0,    0,    0],
       [   3,    6,   26, ...,    0,    0,    0],
       [   2,   63,  222, ...,    0,    0,    0],
       ...,
       [   4,   17,    8, ...,    0,    0,    0],
       [  36,    2,  178, ...,    0,    0,    0],
       [   1,   46, 1795, ...,    0,    0,    0]])

In [26]:
trainY.shape

(9000, 5)

In [30]:
trainY = encode_output(trainY, eng_vocab_size) 

In [31]:
# prepare validation data 
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1]) 
testY = encode_sequences(eng_tokenizer, eng_length, test[:, 0]) 
testY = encode_output(testY, eng_vocab_size) 

In [27]:
testX

array([[  49,  762,  641, ...,    0,    0,    0],
       [   5,    3,   94, ...,    0,    0,    0],
       [   1,    7,  333, ...,    0,    0,    0],
       ...,
       [   2, 1873,   27, ...,    0,    0,    0],
       [  51,  137,  200, ...,    0,    0,    0],
       [  34,  230,  112, ...,    0,    0,    0]])

In [32]:
# define model 
model = define_model(ger_vocab_size, eng_vocab_size, ger_length, eng_length, 256) 
# fit model 
checkpoint = ModelCheckpoint('model.h5' , monitor = 'val_loss' , verbose=1, save_best_only=True, mode= 'min' )
model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 9, 256)            904704    
                                                                 
 lstm (LSTM)                 (None, 256)               525312    
                                                                 
 repeat_vector (RepeatVector  (None, 5, 256)           0         
 )                                                               
                                                                 
 lstm_1 (LSTM)               (None, 5, 256)            525312    
                                                                 
 time_distributed (TimeDistr  (None, 5, 2176)          559232    
 ibuted)                                                         
                                                                 
Total params: 2,514,560
Trainable params: 2,514,560
Non-

<keras.callbacks.History at 0x17d69455910>

In [13]:
from pickle import load 
from numpy import argmax 
from keras.preprocessing.text import Tokenizer 
from keras.preprocessing.sequence import pad_sequences 
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu

In [11]:
# load a clean dataset 
def load_clean_sentences(filename): 
    return load(open(filename, 'rb' ))

# fit a tokenizer 
def create_tokenizer(lines): 
    tokenizer = Tokenizer() 
    tokenizer.fit_on_texts(lines) 
    return tokenizer

# max sentence length 
def max_length(lines): 
    return max(len(line.split()) for line in lines)

# map an integer to a word 
def word_for_id(integer, tokenizer): 
    for word, index in tokenizer.word_index.items(): 
        if index == integer: 
            return word
    return None
# generate target given source sequence 
def predict_sequence(model, tokenizer, source): 
    prediction = model.predict(source, verbose=0)[0] 
    integers = [argmax(vector) for vector in prediction] 
    target = list() 
    for i in integers: 
        word = word_for_id(i, tokenizer) 
        if word is None: 
            break
        target.append(word) 
    return ' ' .join(target)

# evaluate the skill of the model 
def evaluate_model(model, sources, raw_dataset): 
    actual, predicted = list(), list() 
    for i, source in enumerate(sources): 
        # translate encoded source text 
        source = source.reshape((1, source.shape[0])) 
        translation = predict_sequence(model, eng_tokenizer, source) 
        raw_target, raw_src,_ = raw_dataset[i] 
        if i < 10: 
            print(' src=[%s], target=[%s], predicted=[%s]' % (raw_src, raw_target, translation)) 
        actual.append(raw_target.split()) 
        predicted.append(translation.split())
    # calculate BLEU score
    print(' BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0))) 
    print(' BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0))) 
    print(' BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(0.3, 0.3, 0.3, 0))) 
    print(' BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))

In [9]:
# load datasets 
dataset = load_clean_sentences('english-german-both.pkl' ) 
train = load_clean_sentences('english-german-train.pkl' ) 
test = load_clean_sentences('english-german-test.pkl' ) 
# prepare english tokenizer 
eng_tokenizer = create_tokenizer(dataset[:, 0]) 
eng_vocab_size = len(eng_tokenizer.word_index) + 1 
eng_length = max_length(dataset[:, 0]) 
# prepare german tokenizer 
ger_tokenizer = create_tokenizer(dataset[:, 1]) 
ger_vocab_size = len(ger_tokenizer.word_index) + 1 
ger_length = max_length(dataset[:, 1]) 
# prepare data 
trainX = encode_sequences(ger_tokenizer, ger_length, train[:, 1]) 
testX = encode_sequences(ger_tokenizer, ger_length, test[:, 1]) 
# load model 
model = load_model('model.h5' ) 

In [66]:
train[:,1],train[:,1].shape

(array(['sind sie glucklich', 'ist das eine fledermaus',
        'tom wurde weinen', ..., 'es war nicht tom', 'lass tom fahren',
        'ich mag filme'], dtype='<U527'),
 (9000,))

In [35]:
trainX

array([[  19,    5,  126, ...,    0,    0,    0],
       [   3,    6,   26, ...,    0,    0,    0],
       [   2,   63,  222, ...,    0,    0,    0],
       ...,
       [   4,   17,    8, ...,    0,    0,    0],
       [  36,    2,  178, ...,    0,    0,    0],
       [   1,   46, 1795, ...,    0,    0,    0]])

In [65]:
trainX.shape

(9000, 9)

In [10]:
# test on some training sequences 
print('train' )
evaluate_model(model, trainX, train) 
# test on some test sequences 
print(' test' )
evaluate_model(model, testX, test)

train
 src=[sind sie glucklich], target=[are you happy], predicted=[are you happy]
 src=[ist das eine fledermaus], target=[is that a bat], predicted=[is that a bat]
 src=[tom wurde weinen], target=[tom would cry], predicted=[tom was out]
 src=[ich ziehe marmelade vor], target=[i prefer jam], predicted=[i prefer jam]
 src=[du weit das], target=[you know that], predicted=[you know that]
 src=[nicht nachlassen], target=[hang on], predicted=[keep it]
 src=[ich heie tom], target=[im called tom], predicted=[i names tom]
 src=[sie sind deine], target=[theyre yours], predicted=[theyre yours]
 src=[ich habe erbrochen], target=[i was sick], predicted=[i been sick]
 src=[frag nach tom], target=[ask for tom], predicted=[ask for tom]


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


 BLEU-1: 0.087605
 BLEU-2: 0.000000
 BLEU-3: 0.000000
 BLEU-4: 0.000000
 src=[mach ne fliege], target=[go away], predicted=[get away]
 src=[sie ist gerade gegangen], target=[she just left], predicted=[she came left]
 src=[ich bin immer noch hier], target=[im still here], predicted=[im in here]
 src=[koche fur mich], target=[cook for me], predicted=[pick for me]
 src=[ich bin ein moslem], target=[i am a muslim], predicted=[im a prisoner]
 src=[er lugt], target=[hes lying], predicted=[he is]
 src=[ich hoffe es], target=[i hope so], predicted=[i hope it]
 src=[wer hat dich geschlagen], target=[who hit you], predicted=[who hit you]
 src=[ich war verwirrt], target=[i was confused], predicted=[i was outraged]
 src=[verzieh dich], target=[get lost], predicted=[go away]
 BLEU-1: 0.083034
 BLEU-2: 0.000000
 BLEU-3: 0.000000
 BLEU-4: 0.000000


In [72]:
german_txt_encoded = encode_sequences(ger_tokenizer,ger_length,['mach ne fliege'])
predict_sequence(model,eng_tokenizer,german_txt_encoded)

'get away'

In [68]:
german_txt_encoded.shape

(1, 9)

In [19]:
model

<keras.engine.sequential.Sequential at 0x1b41388c4f0>

In [25]:
eng_tokenizer

<keras_preprocessing.text.Tokenizer at 0x1b41387da00>

In [71]:
german_lang_txt = ['ich liebe dich']
german_txt_encoded = encode_sequences(ger_tokenizer,ger_length,german_lang_txt)
predict_sequence(model,eng_tokenizer,german_txt_encoded)

'i love you'

In [75]:
eng_txt_encoded

array([[ 1, 37,  4,  0,  0]])

In [76]:
eng_length

5

# English to German translation : 

In [9]:
# prepare training data 
trainX = encode_sequences(eng_tokenizer, eng_length, train[:, 0]) 
trainY = encode_sequences(ger_tokenizer, ger_length, train[:, 1]) #Y -
trainY = encode_output(trainY, ger_vocab_size) 

# prepare validation data 
testX = encode_sequences(eng_tokenizer, eng_length, test[:, 0])
testY = encode_sequences(ger_tokenizer, ger_length, test[:, 1])  
testY = encode_output(testY, ger_vocab_size) 

# define model 
model_en2ge = define_model(eng_vocab_size, ger_vocab_size, eng_length, ger_length, 256) 
# fit model 
checkpoint = ModelCheckpoint('model_en2ge.h5' , monitor = 'val_loss' , verbose=1, save_best_only=True, mode= 'min' )
model_en2ge.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), callbacks=[checkpoint], verbose=2)

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 5, 256)            557056    
                                                                 
 lstm_2 (LSTM)               (None, 256)               525312    
                                                                 
 repeat_vector_1 (RepeatVect  (None, 9, 256)           0         
 or)                                                             
                                                                 
 lstm_3 (LSTM)               (None, 9, 256)            525312    
                                                                 
 time_distributed_1 (TimeDis  (None, 9, 3534)          908238    
 tributed)                                                       
                                                                 
Total params: 2,515,918
Trainable params: 2,515,918
No

<keras.callbacks.History at 0x176701f56d0>

In [14]:
print(' train' )
evaluate_model(model_en2ge, trainX, train) 
# test on some test sequences 
print(' test' )
evaluate_model(model_en2ge, testX, test)

 train
 src=[sind sie glucklich], target=[are you happy], predicted=[the we late]
 src=[ist das eine fledermaus], target=[is that a bat], predicted=[it im youre pushed]
 src=[tom wurde weinen], target=[tom would cry], predicted=[tom know had]
 src=[ich ziehe marmelade vor], target=[i prefer jam], predicted=[i was horrible]
 src=[du weit das], target=[you know that], predicted=[we kill im]
 src=[nicht nachlassen], target=[hang on], predicted=[home]
 src=[ich heie tom], target=[im called tom], predicted=[i was tom looked]
 src=[sie sind deine], target=[theyre yours], predicted=[is dont up]
 src=[ich habe erbrochen], target=[i was sick], predicted=[i are good]
 src=[frag nach tom], target=[ask for tom], predicted=[cat tom tom]


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


 BLEU-1: 0.104680
 BLEU-2: 0.009604
 BLEU-3: 0.000000
 BLEU-4: 0.000000
 test
 src=[mach ne fliege], target=[go away], predicted=[home ive]
 src=[sie ist gerade gegangen], target=[she just left], predicted=[is it youre better]
 src=[ich bin immer noch hier], target=[im still here], predicted=[i a use in]
 src=[koche fur mich], target=[cook for me], predicted=[sick do do]
 src=[ich bin ein moslem], target=[i am a muslim], predicted=[i a that diabetic]
 src=[er lugt], target=[hes lying], predicted=[he it]
 src=[ich hoffe es], target=[i hope so], predicted=[i care i]
 src=[wer hat dich geschlagen], target=[who hit you], predicted=[were go is]
 src=[ich war verwirrt], target=[i was confused], predicted=[i are late]
 src=[verzieh dich], target=[get lost], predicted=[home ill]
 BLEU-1: 0.103321
 BLEU-2: 0.000000
 BLEU-3: 0.000000
 BLEU-4: 0.000000


The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [19]:
eng_lang_txt = ['I like you']
eng_txt_encoded = encode_sequences(eng_tokenizer,eng_length,eng_lang_txt)
predict_sequence(model_en2ge,ger_tokenizer,eng_txt_encoded)

'ich mag dich'