In [None]:
!pip install rich
!pip install contractions
import os
import re
import time
import unicodedata
import warnings

import contractions
import numpy as np
import pandas as pd
import rich
import spacy
import tensorflow as tf
from rich.progress import track

# Constants

class params:
    """Shared hyperparameters and dataset constants.

    Acts as a plain attribute namespace: the class object itself is handed to
    the model constructors and to the training/inference helpers.
    """

    # --- optimisation ---
    batch_size = 64

    embed_size = 300          # width of the embedding vectors
    gru_units = 128           # hidden size of every GRU layer
    learning_rate = .001
    # Adam configured with Keras' `clipvalue` gradient clipping set to 1.
    optimizer = tf.keras.optimizers.Adam(learning_rate, clipvalue=1)
    epochs = 100

    # --- dataset ---
    num_samples = 30000
    eng_vocab = 5776          # input_dim of the encoder embedding
    ger_vocab = 8960          # input_dim of the decoder embedding / size of the output layer
    dec_max_len = 17          # padded length of the German sequences
    en_max_len = 20           # padded length of the English sequences



In [None]:
#Preprocessing Text
class preprocess_text():
    """Stateless text-cleaning helpers used by ``call_preprocessing``.

    ``remove_pattern`` relies on the standard-library ``re`` module and
    ``remove_accents`` on ``unicodedata``; both have to be imported at the top
    of the notebook. ``expand_contractions`` uses the third-party
    ``contractions`` package. Methods that take ``nlp`` expect a loaded spaCy
    pipeline.
    """

    def remove_pattern(self, text, pattern= r'[^a-zA-Z0-9.!?, ]', replace_with= ""):
        """Return ``text`` with every match of ``pattern`` replaced by ``replace_with``.

        The default pattern removes every character that is not an ASCII
        letter, a digit, a space or one of ``. ! ? ,``.
        """
        return re.sub(pattern, replace_with, text)

    def tokenize_sent(self, text, nlp):
        """Split ``text`` into sentences and return them as a list of strings."""
        doc = nlp(text)
        return [sent.text for sent in doc.sents]

    def tokenize_words(self, text, nlp):
        """Return ``text`` re-joined with a single space between spaCy tokens."""
        doc = nlp(text)
        return " ".join(tok.text for tok in doc)

    def expand_contractions(self, text):
        """Expand contractions in ``text`` via ``contractions.fix``."""
        return contractions.fix(text)

    def do_lemmatization(self, text, nlp):
        """Return ``text`` with each token replaced by its lemma.

        Tokens whose lemma is the placeholder ``"-PRON-"`` keep their surface
        form. (Presumably this is the pronoun lemma emitted by older spaCy
        releases; confirm against the installed version.)
        """
        doc = nlp(text)
        return ' '.join(tok.lemma_ if tok.lemma_ != "-PRON-" else tok.text for tok in doc)

    def add_sos_eos(self, text, sos= False, eos= False):
        """Optionally wrap ``text`` in ``<sos>`` / ``<eos>`` markers.

        ``sos`` prepends ``"<sos> "``, ``eos`` appends ``" <eos>"``; with both
        flags off the text is returned unchanged.
        """
        if sos:
            text = "<sos> " + text
        if eos:
            text = text + " <eos>"
        return text

    def remove_accents(self, text):
        """Strip accents by NFKD-decomposing ``text`` and dropping non-ASCII code points."""
        decomposed = unicodedata.normalize('NFKD', text)
        # After the lossy ASCII encode only ASCII bytes remain, so decoding cannot fail.
        return decomposed.encode('ascii', 'ignore').decode('UTF-8', 'ignore')

def call_preprocessing(df_col, nlp_en= True, lower_= True, remove_pattern_= False, tokenize_words_= False,
               expand_contractions_= False, do_lemmatization_= False,
               sos= False, eos= False, remove_accents_= False):
    """Apply the selected cleaning steps to every string of ``df_col``.

    Steps run in a fixed order, each only when its flag is set: expand
    contractions, remove accents, lemmatise, tokenise words, remove the default
    character pattern, add ``<sos>``/``<eos>`` markers, lower-case.

    Args:
        df_col: pandas Series of strings (anything exposing ``.map``).
        nlp_en: pick the English spaCy model when true, the German one otherwise.
        lower_: lower-case the text as the final step.
        remove_pattern_: strip characters via ``preprocess_text.remove_pattern``.
        tokenize_words_: re-join the text from spaCy tokens.
        expand_contractions_: expand contractions.
        do_lemmatization_: replace tokens by their lemma.
        sos, eos: add the start / end markers.
        remove_accents_: strip accents.

    Returns:
        The transformed Series.
    """
    prep = preprocess_text()

    # Loading a spaCy pipeline is slow; only do it for the steps that need one.
    nlp = None
    if do_lemmatization_ or tokenize_words_:
        nlp = spacy.load('en_core_web_sm' if nlp_en else 'de_core_news_sm')

    if expand_contractions_:
        df_col = df_col.map(prep.expand_contractions)

    if remove_accents_:
        df_col = df_col.map(prep.remove_accents)

    if do_lemmatization_:
        df_col = df_col.map(lambda text: prep.do_lemmatization(text, nlp))

    if tokenize_words_:
        df_col = df_col.map(lambda text: prep.tokenize_words(text, nlp))

    if remove_pattern_:
        # The method is `remove_pattern`; the trailing-underscore name is only the flag.
        df_col = df_col.map(lambda text: prep.remove_pattern(text))

    if eos or sos:
        df_col = df_col.map(lambda text: prep.add_sos_eos(text, sos, eos))

    if lower_:
        df_col = df_col.map(lambda text: text.lower())
    return df_col

def tokenizer(df_col, nlp_en= True):
    """Build word<->id lookup tables from an iterable of space-separated strings.

    Ids start at 1 (0 is left free for the caller's padding token) and are
    assigned in sorted word order. Sorting makes the mapping reproducible:
    plain ``set`` iteration order for strings changes between interpreter
    sessions, which would silently invalidate any saved model weights.

    Args:
        df_col: iterable of strings; each is split on a single space.
        nlp_en: when false, ``<sos>`` and ``<eos>`` are added to the vocabulary.

    Returns:
        ``(tokenize, detokenize, vocab_size)`` where ``tokenize`` maps
        word -> id, ``detokenize`` maps id -> word and ``vocab_size`` is the
        number of distinct words (padding not included).
    """
    vocab = set()
    for text in df_col:
        vocab.update(text.split(" "))

    if not nlp_en:
        vocab.update(("<sos>", "<eos>"))

    ordered = sorted(vocab)
    tokenize = {tok: idx for idx, tok in enumerate(ordered, start=1)}
    detokenize = {idx: tok for tok, idx in tokenize.items()}
    return tokenize, detokenize, len(ordered)

def padding(txt_toks, max_len):
    """Right-pad a space-separated token string with ``<pad>`` up to ``max_len`` tokens.

    Strings that already hold ``max_len`` or more tokens are returned with
    their tokens unchanged (no truncation).
    """
    tokens = txt_toks.split(" ")
    missing = max_len - len(tokens)
    if missing > 0:
        tokens.extend(["<pad>"] * missing)
    return " ".join(tokens)

def make_minibatches(df, col1= 'rev_eng_tok', col2= 'teach_force_tok', col3= 'target_tok'):
    """Turn three token-id columns of ``df`` into batched ``tf.data`` datasets.

    Each column is expected to hold one list of ids per row; the lists are
    stacked into a single array and batched with ``params.batch_size``.

    Returns:
        ``(enc_seq, teach_force_seq, y)`` datasets built from ``col1``,
        ``col2`` and ``col3`` respectively.
    """
    def _batched(column):
        # Stack the per-row id lists into one array before slicing into batches.
        stacked = np.array(list(df[column].values))
        return tf.data.Dataset.from_tensor_slices(stacked).batch(params.batch_size)

    return _batched(col1), _batched(col2), _batched(col3)

In [None]:
class Encoder(tf.keras.Model):
    """Embedding followed by two stacked GRU layers.

    Sizes come from ``params``: ``eng_vocab`` (embedding rows), ``embed_size``
    (embedding width) and ``gru_units`` (GRU hidden size).
    """

    def __init__(self, params):
        super().__init__()

        # Both recurrent layers share the same configuration.
        gru_config = dict(kernel_initializer='glorot_normal',
                          return_sequences=True,
                          return_state=True)

        self.embed = tf.keras.layers.Embedding(input_dim=params.eng_vocab,
                                               output_dim=params.embed_size)
        self.gru1 = tf.keras.layers.GRU(params.gru_units, **gru_config)
        self.gru2 = tf.keras.layers.GRU(params.gru_units, **gru_config)

    def call(self, input_seq):
        """Encode a batch of token ids.

        Returns:
            ``(output_seq2, hidden1, hidden2)``: the per-timestep outputs of
            the second GRU and the final states of the first and second GRU.
        """
        embedded = self.embed(input_seq)
        first_seq, first_state = self.gru1(embedded)
        second_seq, second_state = self.gru2(first_seq)
        return second_seq, first_state, second_state



In [None]:
class LuongAttention(tf.keras.layers.Layer):
    """Multiplicative attention over the encoder outputs.

    Scores are the dot product between a dense projection of every encoder
    timestep and the decoder output; they are softmax-normalised across the
    encoder timesteps.
    """

    def __init__(self, params):
        super(LuongAttention, self).__init__()

        # Projects each encoder timestep to `gru_units` before the dot product.
        self.tdfc = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(params.gru_units))

    def call(self, en_seq, dec_out):
        """Compute the attention context for the current decoder output.

        Args:
            en_seq: encoder outputs, rank 3 with time on axis 1.
            dec_out: decoder outputs, rank 3. The element-wise product below
                only broadcasts when ``dec_out`` holds a single timestep, which
                is how ``Decoder.call`` uses it.

        Returns:
            ``(context_vector, attention_weights)``; the context vector has
            the encoder time axis reduced away.
        """
        # Contract the feature axes: one score per (encoder step, decoder step).
        scores = tf.keras.backend.batch_dot(self.tdfc(en_seq), dec_out, axes=(2, 2))

        # Normalise across encoder timesteps so the weights sum to 1.
        attention_weights = tf.nn.softmax(scores, axis=1)

        weighted = en_seq * attention_weights

        # The weights already sum to 1, so the context is the weighted *sum*.
        # A mean would additionally divide by the encoder length and make the
        # context magnitude depend on how long (or how padded) the input is.
        context_vector = tf.reduce_sum(weighted, axis=1)

        return context_vector, attention_weights


In [None]:
class Decoder(tf.keras.Model):
    """Embedding, two stacked GRUs, attention over the encoder and a vocabulary projection.

    Sizes come from ``params``: ``ger_vocab`` (embedding rows and output
    width), ``embed_size`` and ``gru_units``.
    """

    def __init__(self, params):
        super().__init__()

        # Both recurrent layers share the same configuration.
        gru_config = dict(kernel_initializer='glorot_normal',
                          return_sequences=True,
                          return_state=True)

        self.embed = tf.keras.layers.Embedding(input_dim=params.ger_vocab,
                                               output_dim=params.embed_size)
        self.gru1 = tf.keras.layers.GRU(params.gru_units, **gru_config)
        self.gru2 = tf.keras.layers.GRU(params.gru_units, **gru_config)

        self.attention = LuongAttention(params)

        # Per-timestep logits over the German vocabulary.
        self.fc = tf.keras.layers.TimeDistributed(tf.keras.layers.Dense(params.ger_vocab))

    def call(self, enc_seq, teach_force_seq, init_hidden1, init_hidden2):
        """Run the decoder on ``teach_force_seq`` starting from the given GRU states.

        The context vector is concatenated after ``expand_dims(..., 1)``, so
        ``teach_force_seq`` is expected to hold one timestep per call.

        Returns:
            ``(y, hidden1, hidden2, attention_weights)``: vocabulary logits,
            the new states of both GRUs and the attention weights.
        """
        embedded = self.embed(teach_force_seq)

        first_seq, first_state = self.gru1(embedded, initial_state=init_hidden1)
        second_seq, second_state = self.gru2(first_seq, initial_state=init_hidden2)

        context, weights = self.attention(enc_seq, second_seq)

        # Append the context to the GRU output along the feature axis.
        context = tf.expand_dims(context, 1)
        combined = tf.nn.tanh(tf.concat([second_seq, context], axis= -1))

        logits = self.fc(combined)

        return logits, first_state, second_state, weights

In [None]:
def loss(y, ypred, sce):
    """Mean of the per-element loss with padded targets (id 0) zeroed out.

    Args:
        y: target token ids.
        ypred: predictions passed to ``sce``.
        sce: loss callable returning un-reduced, per-element losses.

    Note that the mean is taken over all elements, masked ones included.
    """
    per_element = sce(y, ypred)

    # 1.0 where the target is a real token, 0.0 where it is padding.
    keep = tf.cast(tf.not_equal(y, 0), tf.float32)

    return tf.reduce_mean(keep * per_element)

In [None]:
@tf.function
def train_step(params, x, ger_inp, ger_out, encoder, decoder, sce):
    """Run one teacher-forced optimisation step on a single batch.

    The encoder is run once, then the decoder is stepped one target position
    at a time for ``params.dec_max_len`` positions; the loss is the average of
    the per-position masked losses.

    Returns:
        ``(grads, avg_timestep_loss)``: the gradients that were applied and the
        averaged loss.
    """
    with tf.GradientTape() as tape:
        enc_seq, state1, state2 = encoder(x)

        running_loss = 0
        for step in range(params.dec_max_len):
            # Keep a length-1 time axis: shape (batch, 1).
            step_input = ger_inp[:, step:step + 1]
            step_target = ger_out[:, step:step + 1]

            ypred, state1, state2, attention_weights = decoder(enc_seq, step_input, state1, state2)

            running_loss += loss(step_target, ypred, sce)

        avg_timestep_loss = running_loss/params.dec_max_len

    # Encoder and decoder are optimised jointly.
    total_vars = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(avg_timestep_loss, total_vars)
    params.optimizer.apply_gradients(zip(grads, total_vars))

    return grads, avg_timestep_loss

In [None]:
def save_checkpoints(params, encoder, decoder, checkpoint_dir='/content/model_checkpoints'):
    """Write a checkpoint holding the optimizer, encoder and decoder state.

    Args:
        params: config namespace providing ``optimizer``.
        encoder, decoder: the models to save.
        checkpoint_dir: target directory; defaults to the original hard-coded
            Colab path so existing callers keep working.

    Returns:
        The path of the checkpoint that was written, as returned by
        ``tf.train.Checkpoint.save``.
    """
    checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
    ckpt = tf.train.Checkpoint(optimizer=params.optimizer,
                               encoder=encoder,
                               decoder=decoder)
    return ckpt.save(file_prefix=checkpoint_prefix)

def restore_checkpoint(params, encoder, decoder, checkpoint_dir='/content/model_checkpoints'):
    """Restore optimizer, encoder and decoder from the newest checkpoint.

    Args:
        params: config namespace providing ``optimizer``.
        encoder, decoder: the models to load into.
        checkpoint_dir: directory to search; defaults to the original
            hard-coded Colab path so existing callers keep working.

    Returns:
        The load-status object from ``tf.train.Checkpoint.restore``, or
        ``None`` when ``checkpoint_dir`` holds no checkpoint.
    """
    ckpt = tf.train.Checkpoint(optimizer=params.optimizer,
                               encoder=encoder,
                               decoder=decoder)

    latest = tf.train.latest_checkpoint(checkpoint_dir)
    if latest is None:
        # Without this the call is a silent no-op and the models keep whatever
        # (possibly random) weights they had.
        warnings.warn(f"no checkpoint found in {checkpoint_dir!r}; nothing was restored")
        return None

    return ckpt.restore(latest)

In [None]:
df = pd.read_csv('/content/eng2ger.csv')

tokenize_eng, detokenize_eng, len_eng= tokenizer(df['eng_input'], True)
tokenize_ger, detokenize_ger, len_ger= tokenizer(df['ger_input'], False)

# Id 0 is reserved for padding in both languages (tokenizer ids start at 1).
tokenize_eng['<pad>'] = 0
detokenize_eng[0] = "<pad>"
tokenize_ger["<pad>"] = 0
detokenize_ger[0] = "<pad>"


num_samples = df.shape[0]
eng_vocab = len_eng + 1   # +1 for <pad>
ger_vocab = len_ger + 1   # +1 for <pad>

# Keep the shared config in sync with the data that was actually loaded. The
# models size their embedding/output layers from `params`, so stale hard-coded
# values would break as soon as the CSV changes.
params.num_samples = num_samples
params.eng_vocab = eng_vocab
params.ger_vocab = ger_vocab


# Pad every sentence to its fixed length before converting words to ids.
df['eng_input'] = df['eng_input'].map(lambda txt: padding(txt, params.en_max_len))
df['ger_input'] = df['ger_input'].map(lambda txt: padding(txt, params.dec_max_len))
df['ger_target'] = df['ger_target'].map(lambda txt: padding(txt, params.dec_max_len))

df['eng_tok'] = df['eng_input'].map(lambda txt: [tokenize_eng[tok] for tok in txt.split(' ')])

df['teach_force_tok'] = df['ger_input'].map(lambda txt: [tokenize_ger[tok] for tok in txt.split(' ')])
df['target_tok'] = df['ger_target'].map(lambda txt: [tokenize_ger[tok] for tok in txt.split(' ')])

# The encoder is fed the (padded) source sentence in reverse order.
df['rev_eng_tok'] = df['eng_tok'].map(lambda ls: ls[:: -1])

enc_seq, teach_force_seq, y = make_minibatches(df, col1='rev_eng_tok', col2='teach_force_tok', col3='target_tok')
print(enc_seq, teach_force_seq, y)


<BatchDataset shapes: (None, 20), types: tf.int64> <BatchDataset shapes: (None, 17), types: tf.int64> <BatchDataset shapes: (None, 17), types: tf.int64>


In [None]:
#tf.keras.backend.clear_session()
# Build the encoder/decoder pair from the shared `params` namespace. They live
# at module level because `train()` and `make_prediction()` read them as globals.
encoder = Encoder(params)
decoder = Decoder(params)

def train():
    """Train the global ``encoder``/``decoder`` for ``params.epochs`` epochs.

    Iterates the global ``enc_seq``, ``teach_force_seq`` and ``y`` datasets in
    lock-step, prints the mean batch loss and duration of every epoch, and
    saves a checkpoint once all epochs are finished.

    Returns:
        ``(grads, avg_loss)``: the gradients of the last processed batch and
        the list of per-epoch mean losses.
    """
    sce = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

    avg_loss = []
    start = time.time()

    for e in track(range(params.epochs)):

        st = time.time()
        losses = []

        batches = zip(enc_seq, teach_force_seq, y)
        for source_batch, decoder_in_batch, target_batch in batches:

            grads, batch_loss = train_step(params, source_batch, decoder_in_batch, target_batch,
                                           encoder, decoder, sce)

            losses.append(batch_loss.numpy())

        avg_loss.append(np.mean(losses))

        print(f'EPOCH - {e+1} ---- LOSS - {np.mean(losses)} ---- TIME - {time.time()- st}')

    # Weights are only persisted after the final epoch.
    save_checkpoints(params, encoder, decoder)
    print(f'total time taken: {time.time()-start}')

    return grads, avg_loss

# Run the full training loop; keeps the last batch's gradients and the per-epoch mean losses.
grads, avg_loss = train()


EPOCH - 1 ---- LOSS - 3.0386593341827393 ---- TIME - 62.81144380569458
EPOCH - 2 ---- LOSS - 2.6535072326660156 ---- TIME - 23.105008125305176
EPOCH - 3 ---- LOSS - 2.5410878658294678 ---- TIME - 23.378941297531128
EPOCH - 4 ---- LOSS - 2.4738378524780273 ---- TIME - 22.946999549865723
EPOCH - 5 ---- LOSS - 2.371509552001953 ---- TIME - 22.54237723350525
EPOCH - 6 ---- LOSS - 2.2057316303253174 ---- TIME - 22.759089946746826
EPOCH - 7 ---- LOSS - 2.0625908374786377 ---- TIME - 23.128815412521362
EPOCH - 8 ---- LOSS - 1.9562010765075684 ---- TIME - 22.614665269851685
EPOCH - 9 ---- LOSS - 1.8663301467895508 ---- TIME - 22.657027006149292
EPOCH - 10 ---- LOSS - 1.7836918830871582 ---- TIME - 22.776947259902954
EPOCH - 11 ---- LOSS - 1.7059952020645142 ---- TIME - 22.822686672210693
EPOCH - 12 ---- LOSS - 1.6364350318908691 ---- TIME - 23.543668270111084
EPOCH - 13 ---- LOSS - 1.5675668716430664 ---- TIME - 23.049397945404053
EPOCH - 14 ---- LOSS - 1.496752142906189 ---- TIME - 23.3331646

In [None]:
df = pd.read_csv('/content/eng2ger.csv')

tokenize_eng, detokenize_eng, params.len_eng = tokenizer(df['eng_input'], True)
tokenize_ger, detokenize_ger, params.len_ger = tokenizer(df['ger_input'], False)

# Rebuilding the lookup tables drops the padding entries registered for
# training. Restore them so id 0 can be encoded/decoded at inference time
# instead of raising KeyError.
tokenize_eng['<pad>'] = 0
detokenize_eng[0] = "<pad>"
tokenize_ger["<pad>"] = 0
detokenize_ger[0] = "<pad>"

def make_prediction(txt, params, greedy=False, random_sampling=True, beam_search=False, max_steps=11):
    """Translate ``txt`` with the global ``encoder``/``decoder``.

    The sentence is contraction-expanded, tokenised with spaCy, lower-cased,
    mapped to ids, right-padded to ``params.en_max_len`` and reversed, which
    mirrors how the training inputs (``rev_eng_tok``) are built. Decoding
    starts from ``<sos>`` and stops at ``<eos>`` or after ``max_steps`` tokens.

    Args:
        txt: English input sentence.
        params: config namespace (``en_max_len`` is read here).
        greedy: pick the highest-scoring token at every step.
        random_sampling: sample the next token from the predicted
            distribution; takes precedence over ``greedy``.
        beam_search: not implemented.
        max_steps: maximum number of decoding steps; the default of 11 matches
            the previous hard-coded limit.

    Returns:
        ``(sentence, attention)``: the generated tokens joined by spaces
        (without ``<eos>``) and the attention weights of each emitted token.

    Raises:
        KeyError: if ``txt`` contains words missing from the English vocabulary.
        NotImplementedError: if only ``beam_search`` is selected.
        ValueError: if no decoding strategy is selected.

    The models must already hold trained weights (e.g. via
    ``restore_checkpoint``) before this is called.
    """
    if not (random_sampling or greedy):
        if beam_search:
            raise NotImplementedError("beam search decoding is not implemented")
        raise ValueError("select a decoding strategy: random_sampling or greedy")

    nlp = spacy.load('en_core_web_sm')
    txt = contractions.fix(txt)

    words = [tok.text.lower() for tok in nlp(txt)]
    unknown = [word for word in words if word not in tokenize_eng]
    if unknown:
        raise KeyError(f"words not in the English vocabulary: {unknown}")

    # Same layout as training: pad on the right, then reverse the sequence.
    pad_id = tokenize_eng.get('<pad>', 0)
    token_ids = [tokenize_eng[word] for word in words]
    token_ids += [pad_id] * (params.en_max_len - len(token_ids))
    token_ids.reverse()
    x = tf.constant([token_ids])

    dec_inp = tf.reshape(tokenize_ger['<sos>'], (1, 1))

    sent, att = [], []
    enc_seq, hidden1, hidden2 = encoder(x)

    for _ in range(max_steps):

        ypred, hidden1, hidden2, attention_weights = decoder(enc_seq, dec_inp, hidden1, hidden2)
        logits = ypred[:, 0, :]

        if random_sampling:
            idx = tf.random.categorical(logits, num_samples= 1)
        else:
            idx = tf.argmax(logits, axis= -1)

        # The decoder expects a (batch, time) id tensor; argmax yields rank 1.
        idx = tf.reshape(idx, (1, 1))

        final_tok = detokenize_ger.get(int(tf.squeeze(idx).numpy()), '<pad>')
        if final_tok == '<eos>':
            break

        sent.append(final_tok)
        att.append(attention_weights)
        dec_inp = idx

    return " ".join(sent), att

txt = input('Type anything: ')
sent, att = make_prediction(txt, params)
# The builtin print would emit the "[bold blue]" markup literally; rich.print renders it.
rich.print('[bold blue]' + sent)


Type anything: hello
[bold blue]er macht ? sage tom aufgeregt er studiert manner beschutzen benutzen
