# Translator using Deep Learning

Steps:
1. Importing Libraries
2. Importing Dataset
3. Data Preprocessing
4. Model Creation
5. Model Training
6. Model Evaluation

In [None]:
#Step 1 : Importing Libraries
import numpy as np
import re
import pandas as pd
import string
from unicodedata import normalize
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.utils.vis_utils import plot_model
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from keras.layers import Embedding
from keras.layers import RepeatVector
from keras.layers import TimeDistributed
from keras.callbacks import ModelCheckpoint
from keras.models import load_model
from nltk.translate.bleu_score import corpus_bleu


In [None]:
# Step 2 : Importing Dataset
# Load the sentence pairs; column 0 feeds english_set and column 1 feeds french_set.
dataset = pd.read_csv("Dataset.csv")
n_sentences = 10000
# NOTE(review): the slice starts at row 1, so row 0 of the frame is skipped and
# at most n_sentences - 1 rows are kept -- confirm that this is intended.
sentence_rows = dataset.iloc[1:n_sentences]
english_set = sentence_rows.iloc[:, 0].values
french_set = sentence_rows.iloc[:, 1].values

In [None]:
# Peek at the raw English sentences
english_set

array(['Run!', 'Wow!', 'Fire!', ..., "I'm the teacher.",
       "I'm the teacher.", "I'm tired of it."], dtype=object)

In [None]:
# Peek at the raw French sentences
french_set

array(['Courez\u202f!', 'Ça alors\u202f!', 'Au feu !', ...,
       'Je suis le professeur.', 'Je suis la professeur.',
       "J'en ai ras le bol."], dtype=object)

In [None]:
# Step 3 : Data Preprocessing
# Translation table that deletes every ASCII punctuation character.
table = str.maketrans('', '', string.punctuation)
clean_english_data = []
for sentence in english_set:
    # Decompose accented characters, then drop everything that is not ASCII.
    ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
    # Lowercase each whitespace-separated word and strip its punctuation.
    words = (word.lower().translate(table) for word in ascii_text.split())
    # Keep purely alphabetic words only (this drops empty and digit-bearing tokens).
    clean_english_data.append(' '.join(word for word in words if word.isalpha()))

In [None]:
# Inspect the cleaned English sentences
# NOTE(review): this displays the whole list; consider slicing (e.g. [:10]) to keep the output short
clean_english_data

['run',
 'wow',
 'fire',
 'help',
 'jump',
 'stop',
 'stop',
 'stop',
 'wait',
 'wait',
 'i see',
 'i try',
 'i won',
 'i won',
 'oh no',
 'attack',
 'attack',
 'cheers',
 'cheers',
 'cheers',
 'cheers',
 'get up',
 'got it',
 'got it',
 'got it',
 'got it',
 'got it',
 'hop in',
 'hop in',
 'hug me',
 'hug me',
 'i fell',
 'i fell',
 'i know',
 'i left',
 'i left',
 'i lost',
 'im',
 'im ok',
 'im ok',
 'listen',
 'no way',
 'no way',
 'no way',
 'no way',
 'no way',
 'no way',
 'no way',
 'no way',
 'really',
 'really',
 'really',
 'thanks',
 'we try',
 'we won',
 'we won',
 'we won',
 'we won',
 'ask tom',
 'awesome',
 'be calm',
 'be calm',
 'be calm',
 'be cool',
 'be fair',
 'be fair',
 'be fair',
 'be fair',
 'be fair',
 'be fair',
 'be kind',
 'be nice',
 'be nice',
 'be nice',
 'be nice',
 'be nice',
 'be nice',
 'beat it',
 'call me',
 'call me',
 'call us',
 'call us',
 'come in',
 'come in',
 'come in',
 'come in',
 'come on',
 'come on',
 'come on',
 'come on',
 'drop it',

In [None]:
# Clean the French sentences: ASCII-fold, lowercase, strip punctuation and
# drop every token that is not purely alphabetic.
table = str.maketrans('', '', string.punctuation)
# The ligatures below have no Unicode decomposition, so the ASCII encode step
# would silently delete them (e.g. "sœur" -> "sur"); spell them out first.
ligatures = str.maketrans({'œ': 'oe', 'Œ': 'OE', 'æ': 'ae', 'Æ': 'AE'})
clean_french_data = []
for sentence in french_set:
    sentence = sentence.translate(ligatures)
    # Decompose accented characters, then drop everything that is not ASCII.
    ascii_text = normalize('NFD', sentence).encode('ascii', 'ignore').decode('UTF-8')
    # Lowercase each whitespace-separated word and strip its punctuation.
    words = (word.lower().translate(table) for word in ascii_text.split())
    # Keep purely alphabetic words only (this drops empty and digit-bearing tokens).
    clean_french_data.append(' '.join(word for word in words if word.isalpha()))

In [None]:
# Inspect the cleaned French sentences
# NOTE(review): this displays the whole list; consider slicing (e.g. [:10]) to keep the output short
clean_french_data

['courez',
 'ca alors',
 'au feu',
 'a laide',
 'saute',
 'ca suffit',
 'stop',
 'arretetoi',
 'attends',
 'attendez',
 'je comprends',
 'jessaye',
 'jai gagne',
 'je lai emporte',
 'oh non',
 'attaque',
 'attaquez',
 'sante',
 'a votre sante',
 'merci',
 'tchintchin',
 'levetoi',
 'jai pige',
 'compris',
 'pige',
 'compris',
 'tas capte',
 'monte',
 'montez',
 'serremoi dans tes bras',
 'serrezmoi dans vos bras',
 'je suis tombee',
 'je suis tombe',
 'je sais',
 'je suis parti',
 'je suis partie',
 'jai perdu',
 'jai ans',
 'je vais bien',
 'ca va',
 'ecoutez',
 'cest pas possible',
 'impossible',
 'en aucun cas',
 'cest hors de question',
 'il nen est pas question',
 'cest exclu',
 'en aucune maniere',
 'hors de question',
 'vraiment',
 'vrai',
 'ah bon',
 'merci',
 'on essaye',
 'nous avons gagne',
 'nous gagnames',
 'nous lavons emporte',
 'nous lemportames',
 'demande a tom',
 'fantastique',
 'sois calme',
 'soyez calme',
 'soyez calmes',
 'sois detendu',
 'sois juste',
 'soyez ju

In [None]:
# Split into test and train sets
from sklearn.model_selection import train_test_split

# Hold out 25 % of the sentence pairs; random_state is fixed at 0.
english_train, english_test, french_train, french_test = train_test_split(
    clean_english_data,
    clean_french_data,
    test_size=0.25,
    random_state=0,
)

In [None]:
# fit a tokenizer
def create_tokenizer(lines):
    """Return a new Keras ``Tokenizer`` fitted on the given text lines."""
    fitted = Tokenizer()
    fitted.fit_on_texts(lines)
    return fitted

# max sentence length
def max_length(lines):
    """Return the largest whitespace-separated word count found in ``lines``."""
    word_counts = [len(sentence.split()) for sentence in lines]
    return max(word_counts)

# prepare one tokenizer per language on the cleaned sentences
eng_tokenizer = create_tokenizer(clean_english_data)
fr_tokenizer = create_tokenizer(clean_french_data)
# vocabulary sizes: number of distinct words plus one extra slot
# (presumably for index 0, which pad_sequences uses as padding)
eng_vocab_size = len(eng_tokenizer.word_index) + 1
fr_vocab_size = len(fr_tokenizer.word_index) + 1
# longest sentence (in words) per language
eng_length = max_length(clean_english_data)
fr_length = max_length(clean_french_data)
# report the figures, English first
print('English Vocabulary Size: %d' % eng_vocab_size)
print('English Max Length: %d' % (eng_length))
print('French Vocabulary Size: %d' % fr_vocab_size)
print('French Max Length: %d' % (fr_length))


English Vocabulary Size: 2201
English Max Length: 5
French Vocabulary Size: 4464
French Max Length: 10


In [None]:
# encode and pad sequences
def encode_sequences(tokenizer, length, lines):
    """Integer-encode ``lines`` with ``tokenizer`` and pad each sequence to ``length``.

    Padding uses ``padding='post'``, i.e. it is appended after the encoded words.
    """
    token_ids = tokenizer.texts_to_sequences(lines)
    return pad_sequences(token_ids, maxlen=length, padding='post')

# one hot encode target sequence
def encode_output(sequences, vocab_size):
    """One-hot encode every integer in a 2-D array of padded sequences.

    Returns an array reshaped to
    ``(sequences.shape[0], sequences.shape[1], vocab_size)``.
    """
    one_hot_rows = [to_categorical(row, num_classes=vocab_size) for row in sequences]
    stacked = np.array(one_hot_rows)
    n_samples, n_steps = sequences.shape[0], sequences.shape[1]
    return stacked.reshape(n_samples, n_steps, vocab_size)

# prepare training data: French is the model input, one-hot English the target
trainX = encode_sequences(fr_tokenizer, fr_length, french_train)
trainY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, english_train),
    eng_vocab_size,
)
# prepare validation data the same way
testX = encode_sequences(fr_tokenizer, fr_length, french_test)
testY = encode_output(
    encode_sequences(eng_tokenizer, eng_length, english_test),
    eng_vocab_size,
)

In [None]:
# Step 4 and 5 : Model Creation and Training

# define NMT model
def define_model(src_vocab, tar_vocab, src_timesteps, tar_timesteps, n_units):
    """Build the (uncompiled) encoder-decoder LSTM translation model.

    An embedding plus LSTM condenses the source sequence into one vector;
    the vector is repeated ``tar_timesteps`` times and fed through a second
    LSTM whose per-step outputs go to a softmax over ``tar_vocab`` classes.
    """
    layers = [
        Embedding(src_vocab, n_units, input_length=src_timesteps, mask_zero=True),
        LSTM(n_units),
        RepeatVector(tar_timesteps),
        LSTM(n_units, return_sequences=True),
        TimeDistributed(Dense(tar_vocab, activation='softmax')),
    ]
    return Sequential(layers)

# define model
model = define_model(fr_vocab_size, eng_vocab_size, fr_length, eng_length, 256)
model.compile(optimizer='adam', loss='categorical_crossentropy')
# summarize defined model; summary() prints the table itself and returns None,
# so wrapping it in print() only appended a stray "None" line
model.summary()
plot_model(model, to_file='model.png', show_shapes=True)
# keep the History object so the loss curves can be inspected after training
history = model.fit(trainX, trainY, epochs=30, batch_size=64, validation_data=(testX, testY), verbose=2)

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 10, 256)           1142784   
_________________________________________________________________
lstm_3 (LSTM)                (None, 256)               525312    
_________________________________________________________________
repeat_vector_2 (RepeatVecto (None, 5, 256)            0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 5, 256)            525312    
_________________________________________________________________
time_distributed_2 (TimeDist (None, 5, 2201)           565657    
Total params: 2,759,065
Trainable params: 2,759,065
Non-trainable params: 0
_________________________________________________________________
None

Train on 7499 samples, validate on 2500 samples
Epoch 1/30
 - 24s - loss: 4.4539 - val_loss: 3.5892
Epoc

<keras.callbacks.callbacks.History at 0x214e439aa08>

In [None]:
# Step 6 : Model Evaluation

# map an integer to a word
def word_for_id(integer, tokenizer):
    """Return the word whose tokenizer index equals ``integer``, or ``None``.

    Uses the tokenizer's ``index_word`` reverse mapping when it is available,
    so each lookup is a single dictionary access instead of a scan over the
    whole vocabulary. Tokenizers that only expose ``word_index`` are still
    supported through the original linear search.
    """
    index_word = getattr(tokenizer, 'index_word', None)
    if index_word:
        return index_word.get(integer)
    # Fallback: no reverse mapping available, scan word -> index pairs.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

# generate target given source sequence
def predict_sequence(model, tokenizer, source):
    """Translate one encoded source sequence into a space-joined string.

    Only the first prediction of the batch is decoded. Each timestep
    contributes its arg-max index; decoding stops at the first index that
    ``word_for_id`` cannot map to a word.
    """
    step_scores = model.predict(source, verbose=0)[0]
    words = []
    for token_id in (np.argmax(scores) for scores in step_scores):
        word = word_for_id(token_id, tokenizer)
        if word is None:
            break
        words.append(word)
    return ' '.join(words)

# evaluate the skill of the model
def evaluate_model(model, tokenizer, sources, raw_target, raw_src):
    """Translate every encoded source and print corpus BLEU-1 to BLEU-4.

    Args:
        model: trained model passed on to ``predict_sequence``.
        tokenizer: target-language tokenizer used to decode predictions.
        sources: array of encoded source sequences, one row per sentence.
        raw_target: reference target sentences, aligned with ``sources``.
        raw_src: source sentences as text, aligned with ``sources``
            (only used for the printed examples).

    The first ten translations are printed next to their source and
    reference sentences.
    """
    actual, predicted = [], []
    for i, source in enumerate(sources):
        # add a leading batch axis of size 1 before predicting
        source = source.reshape((1, source.shape[0]))
        # decode with the tokenizer handed in by the caller rather than the
        # notebook-level eng_tokenizer global
        translation = predict_sequence(model, tokenizer, source)
        if i < 10:
            print('src=[%s], target=[%s], predicted=[%s]' % (raw_src[i], raw_target[i], translation))
        # corpus_bleu takes a list of references per hypothesis
        actual.append([raw_target[i].split()])
        predicted.append(translation.split())
    # calculate BLEU score; each weight tuple sums to 1 (BLEU-3 previously
    # used 0.3/0.3/0.3, which sums to 0.9)
    print('BLEU-1: %f' % corpus_bleu(actual, predicted, weights=(1.0, 0, 0, 0)))
    print('BLEU-2: %f' % corpus_bleu(actual, predicted, weights=(0.5, 0.5, 0, 0)))
    print('BLEU-3: %f' % corpus_bleu(actual, predicted, weights=(1 / 3, 1 / 3, 1 / 3, 0)))
    print('BLEU-4: %f' % corpus_bleu(actual, predicted, weights=(0.25, 0.25, 0.25, 0.25)))
    
    
# Score the model on the training split first, then on the held-out split.
evaluation_runs = (
    ('train', trainX, english_train, french_train),
    ('test', testX, english_test, french_test),
)
for split_name, encoded_sources, target_text, source_text in evaluation_runs:
    print(split_name)
    evaluate_model(model, eng_tokenizer, encoded_sources, target_text, source_text)


train
src=[je pourrais marcher], target=[i could walk], predicted=[i could walk]
src=[laissezvous aller], target=[loosen up], predicted=[loosen up]
src=[cest veritable], target=[its genuine], predicted=[its lame]
src=[faitesle de nouveau], target=[do it again], predicted=[do it again]
src=[nous avons chaud], target=[were hot], predicted=[were starving]
src=[les filles sont bargeots], target=[girls are crazy], predicted=[girls are crazy]
src=[vous avez fait ca], target=[you did that], predicted=[you did that]
src=[arretez de hurler], target=[stop yelling], predicted=[stop screaming]
src=[je suis plutot occupee], target=[im rather busy], predicted=[im rather busy]
src=[prenez tout], target=[take it all], predicted=[take this all]
BLEU-1: 0.794398
BLEU-2: 0.712552
BLEU-3: 0.616138
BLEU-4: 0.345425
test
src=[je suis presse], target=[i have to hurry], predicted=[im am in]
src=[arrete], target=[cut it out], predicted=[stop over]
src=[tom va arreter], target=[tomll quit], predicted=[toms is]
