### Importing modules

In [1]:
import pandas as pd
import numpy as np
import string

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Embedding, RepeatVector
from keras import optimizers
from keras.callbacks import ModelCheckpoint

In [32]:
#This function takes the language code as input, opens '<lang_code>.txt' and returns its full text
def generate_data(lang_code):
    """Return the entire contents of ``<lang_code>.txt`` as one string.

    Parameters
    ----------
    lang_code : str
        Base name (optionally with a directory prefix) of the data file;
        ``'.txt'`` is appended to it before opening.

    Returns
    -------
    str
        The file's full text, decoded as UTF-8.

    Raises
    ------
    OSError
        If the file cannot be opened (e.g. ``FileNotFoundError``).
    UnicodeDecodeError
        If the file is not valid UTF-8.
    """
    # The context manager closes the handle even when read() raises, which the
    # previous explicit open()/close() pair did not guarantee.
    with open(lang_code + '.txt', encoding='utf-8') as file:  # default mode is "rt"
        return file.read()


#Preprocessing of the data and conversion to numpy array.
def make_array(lang_code):
    """Load ``<lang_code>.txt`` and return its rows as a NumPy array.

    The text is stripped of leading/trailing whitespace, split into lines on
    ``'\\n'``, and each line is split into fields on tab characters.

    NOTE(review): later cells index the result as ``data[:, 0]``, so every
    line presumably has the same number of tab-separated fields -- confirm
    against the data files.
    """
    lines = generate_data(lang_code).strip().split('\n')
    rows = []
    for line in lines:
        rows.append(line.split('\t'))
    return np.array(rows)

In [33]:
# Prefix of the typed language name -> code used in the data file name
# ('<code>.txt'). No prefix is a prefix of another, so at most one can match.
LANG_PREFIX_TO_CODE = {
    'ger': 'deu',
    'fr': 'fra',
    'ital': 'ita',
    'japa': 'jpn',
    'mara': 'mar',
}

# Surrounding whitespace would otherwise defeat the prefix match.
lang = input().strip().lower()

lang_code = next(
    (code for prefix, code in LANG_PREFIX_TO_CODE.items() if lang.startswith(prefix)),
    None,
)
if lang_code is None:
    # Fail here with a clear message instead of a NameError on the next line
    # (or, worse, silently reusing a lang_code left in the kernel by an
    # earlier run of this cell).
    raise ValueError(
        'Unsupported language {!r}; expected a name starting with one of: {}'.format(
            lang, ', '.join(LANG_PREFIX_TO_CODE)
        )
    )

data = make_array(lang_code)

french


In [29]:
# Build the punctuation-stripping table once instead of once per sentence.
punctuation_table = str.maketrans('', '', string.punctuation)

# Normalise both text columns in place (column 0 is tokenized as English
# further down, column 1 as the other language): remove the ASCII punctuation
# listed in string.punctuation, then lower-case.
for column in (0, 1):
    data[:, column] = [s.translate(punctuation_table).lower() for s in data[:, column]]

# Keep only the two sentence columns; any further columns are ignored.
training_dataset = data[:, :2]

In [5]:
# Display the cleaned sentence pairs as a quick sanity check.
training_dataset

array([['go', 'geh'],
       ['hi', 'hallo'],
       ['hi', 'grüß gott'],
       ...,
       ['if someone who doesnt know your background says that you sound like a native speaker it means they probably noticed something about your speaking that made them realize you werent a native speaker in other words you dont really sound like a native speaker',
        'wenn jemand fremdes dir sagt dass du dich wie ein muttersprachler anhörst bedeutet das wahrscheinlich er hat etwas an deinem sprechen bemerkt dass dich als nichtmuttersprachler verraten hat mit anderen worten du hörst dich nicht wirklich wie ein muttersprachler an'],
       ['if someone who doesnt know your background says that you sound like a native speaker it means they probably noticed something about your speaking that made them realize you werent a native speaker in other words you dont really sound like a native speaker',
        'wenn jemand der nicht weiß woher man kommt sagt man erwecke doch den eindruck muttersprachler 

In [6]:
#Tokenizing data
def tokenization(corpus):
    """Create a new Keras ``Tokenizer``, fit it on ``corpus`` and return it.

    ``corpus`` is passed straight to ``Tokenizer.fit_on_texts``.
    """
    fitted = Tokenizer()
    fitted.fit_on_texts(corpus)
    return fitted

In [7]:
# Fixed number of tokens per encoded sentence on each side; these values are
# passed to encode_sequences as the padded length.
eng_length = 8
tmp_length = 8

# English side (column 0 of the training data).
eng_tokenizer = tokenization(training_dataset[:, 0])
# The extra 1 leaves room for index 0, which the padding step fills in and
# the word lookup skips.
eng_vocab_size = 1 + len(eng_tokenizer.word_index)

# Other-language side (column 1 of the training data).
tmp_tokenizer = tokenization(training_dataset[:, 1])
tmp_vocab_size = 1 + len(tmp_tokenizer.word_index)

In [8]:
def encode_sequences(tokenizer, length, lines, padding='post', truncating='pre'):
    """Integer-encode ``lines`` and pad/truncate every sequence to ``length``.

    Parameters
    ----------
    tokenizer
        Object providing ``texts_to_sequences`` (a fitted Keras ``Tokenizer``).
    length : int
        Number of tokens in every returned sequence (``maxlen``).
    lines
        Iterable of sentences to encode.
    padding : {'post', 'pre'}, optional
        Where the 0 padding goes for short sequences. Defaults to ``'post'``,
        the value this function has always used.
    truncating : {'pre', 'post'}, optional
        Which end is cut from sequences longer than ``length``. Defaults to
        ``'pre'`` (the Keras default, i.e. the previous behaviour), which
        drops the *first* tokens of an over-long sentence; pass ``'post'`` to
        keep the beginning instead.

    Returns
    -------
    The padded sequences as returned by ``pad_sequences``, one row per line.
    """
    # integer encode sequences
    seq = tokenizer.texts_to_sequences(lines)
    # pad sequences with 0 values
    return pad_sequences(seq, maxlen=length, padding=padding, truncating=truncating)

In [9]:
# Encoder input: English sentences as fixed-length integer sequences.
trainX = encode_sequences(
    tokenizer=eng_tokenizer,
    length=eng_length,
    lines=training_dataset[:, 0],
)
# Training target: the other-language sentences, encoded the same way.
trainY = encode_sequences(
    tokenizer=tmp_tokenizer,
    length=tmp_length,
    lines=training_dataset[:, 1],
)

In [10]:
# Building model: embed the English tokens, summarise the sentence with one
# LSTM, repeat that summary once per output position, run a second LSTM that
# emits one vector per position, and turn each vector into a distribution
# over the target vocabulary.
model = Sequential([
    # mask_zero=True marks index 0 (the padding value) as masked.
    Embedding(eng_vocab_size, 512, input_length=eng_length, mask_zero=True),
    LSTM(512),
    RepeatVector(tmp_length),
    LSTM(512, return_sequences=True),
    Dense(tmp_vocab_size, activation='softmax'),
])

# Compiling model: the sparse loss takes integer word indices as targets.
rms = optimizers.RMSprop(lr=0.001)
model.compile(loss='sparse_categorical_crossentropy', optimizer=rms)

In [None]:
# Per-language file that holds the saved model.
filename = lang_code + '.h1'
try:
    # Reuse a previously saved model for this language when one can be loaded.
    model = load_model(filename)
except Exception as load_error:
    # Training remains the fallback for any load failure, but unlike a bare
    # `except:` this no longer traps KeyboardInterrupt/SystemExit, and it
    # reports why the saved model was not used.
    print('Could not load {} ({!r}); training a new model.'.format(filename, load_error))

    # Save to `filename` only when the validation loss improves.
    checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1,
                                 save_best_only=True, mode='min')

    # train model; the targets get a trailing axis of size 1, so each output
    # position carries exactly one integer word index.
    history = model.fit(trainX, trainY.reshape(trainY.shape[0], trainY.shape[1], 1),
                        epochs=10, batch_size=512, validation_split=0.2,
                        callbacks=[checkpoint], verbose=1)

Epoch 1/25
  3/340 [..............................] - ETA: 2:28 - loss: 1.5423

KeyboardInterrupt: ignored

In [20]:
# preprocessing function
def preprocess(sentence):
    """Return ``sentence`` lower-cased with ``string.punctuation`` characters removed."""
    punctuation_remover = str.maketrans('', '', string.punctuation)
    without_punctuation = sentence.translate(punctuation_remover)
    return without_punctuation.lower()
#https://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string


# function to get words from tokens
def get_word(n, tokenizer):
    """Return the first word whose index in ``tokenizer.word_index`` equals ``n``.

    Returns ``None`` when no word has that index.
    """
    matches = (word for word, index in tokenizer.word_index.items() if index == n)
    return next(matches, None)

#function to form sentences
def form_sentence(tmp_list):
    """Turn a sequence of word indices into a space-separated sentence.

    Parameters
    ----------
    tmp_list
        Iterable of integer word indices looked up in
        ``tmp_tokenizer.word_index``. Index 0 (padding) is skipped.

    Returns
    -------
    str
        The words in order, each followed by a single space (so a non-empty
        result ends with a trailing space, as before). Indices with no
        matching word are skipped rather than rendered as the text "None".
    """
    # Invert word -> index once per call instead of scanning the whole
    # vocabulary for every token. setdefault keeps the first word seen for an
    # index, matching the previous first-match lookup.
    index_to_word = {}
    for word, index in tmp_tokenizer.word_index.items():
        index_to_word.setdefault(index, word)

    sentence = ''
    for i in tmp_list:
        if i == 0:
            continue
        word = index_to_word.get(i)
        if word is not None:
            sentence = sentence + str(word) + ' '
    return sentence

In [21]:
#Final translation function
def translate(sentences):
    """Translate each English sentence with the trained model.

    Parameters
    ----------
    sentences
        Iterable of English sentences (str).

    Returns
    -------
    list of str
        One translated sentence per input, in the same order, as produced by
        ``form_sentence``.
    """
    translations = []
    for sentence in sentences:
        encoded = encode_sequences(eng_tokenizer, eng_length, [preprocess(sentence)])
        # predict() yields one probability distribution per output position;
        # argmax over the last axis picks the most likely word index for each.
        # This is the documented replacement for Sequential.predict_classes,
        # which newer TensorFlow/Keras releases no longer provide.
        # verbose=0 keeps the call silent, as predict_classes was by default.
        probabilities = model.predict(encoded, verbose=0)
        word_indices = np.argmax(probabilities, axis=-1)[0]
        translations.append(form_sentence(word_indices))
    return translations

In [23]:
# Try the translation pipeline on two short English sentences.
translate([
           'How are you',
           'I love You'
])

['wie geht du ', 'ich liebe dich ']