P.S. Оскільки виникли деякі технічні проблеми, довелось скористатись стареньким ноубуком, а йому було дуже важко і навіть після того як я залишила таку доволі куценьку модель воно дуже довго працювало. Можливо, якщо я полікую новенький ноутбук, то зроблю перекладач з кращою точністю, але і цей в принципі щось перекладає)

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import re
import string

from sklearn.model_selection import train_test_split

import tensorflow as tf
#tf.config.experimental_run_functions_eagerly(False)
#from tensorflow_text.tools.wordpiece_vocab import bert_vocab_from_dataset as bert_vocab
#import tensorflow_text as text

In [2]:
from tensorflow import keras
from keras import layers
from keras.layers import TextVectorization, LeakyReLU, Conv2DTranspose, Embedding, Bidirectional, Reshape, Dense, BatchNormalization, LSTM
from keras.layers import GRU, Conv2D, MaxPooling2D, Flatten, Dropout, MultiHeadAttention, LayerNormalization, Add, StringLookup
from keras import Sequential
from keras.models import save_model, load_model

In [40]:
import tensorflow_text as tf_text
import random

## Tokenizers

Переклад: Англійська - Французька

In [4]:
file = open("fra-eng.txt", 'r', encoding='utf-8')
text = file.read()
file.close()

In [5]:
def cleaning(text):
    text = text.lower()
    text = re.sub('[%s]' % re.escape(string.punctuation), ' ', text)
    text = re.sub('\s{2,}', " ", text)
    return text

In [6]:
lines = text.split("\n")[:-1]
eng = []
fra = []
for l in lines:
    eng.append(cleaning(l.split("\t")[0]))
    fra.append(cleaning(l.split("\t")[1]))

In [7]:
print(eng[111], fra[11])

i left  cours 


In [8]:
punct = string.punctuation
punct.replace('[', '')
punct.replace(']', '')

def standardization(text):
    text = tf_text.normalize_utf8(text, 'NFKD')
    text = tf.strings.lower(text)
    text = tf.strings.regex_replace(text, "[%s]" % re.escape(punct), "")
    return tf.strings.join(['[START]', text, '[END]'], separator=' ')

In [9]:
num_words = 10000
sequence_length = 100
batch_size = 64
buffer_size = len(eng)

In [10]:
eng_vectorization = TextVectorization(max_tokens = num_words, output_mode="int",
                                      standardize=standardization #output_sequence_length=sequence_length
                                     )
fra_vectorization = TextVectorization(max_tokens = num_words, output_mode="int",
                                      standardize=standardization #output_sequence_length=sequence_length
                                     )

In [11]:
eng_vectorization.adapt(eng)
fra_vectorization.adapt(fra)

In [12]:
vocab_fra = np.array(fra_vectorization.get_vocabulary())
vocab_eng = np.array(eng_vectorization.get_vocabulary())

In [13]:
x_train, x_test, y_train, y_test = train_test_split(eng, fra, train_size=0.7)
x_test, x_val, y_test, y_val = train_test_split(x_test, y_test, train_size=0.5)

In [14]:
def vectorize_text(eng, fra):
    eng = eng_vectorization(eng)
    fra = fra_vectorization(fra)
    return (eng, fra[:,:-1]),fra[:,1:]

In [15]:
def make_dataset(eng, fra):
    eng_texts = list(eng)
    spa_texts = list(fra)
    dataset = tf.data.Dataset.from_tensor_slices((eng_texts, spa_texts))
    dataset = dataset.batch(batch_size)
    dataset = dataset.map(vectorize_text)
    return dataset.shuffle(2048).prefetch(16).cache()

In [16]:
train_data = make_dataset(x_train, y_train)
test_data = make_dataset(x_test, y_test)
val_data = make_dataset(x_val,y_val)

## Attention layers

In [19]:
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

In [20]:
class CrossAttention(BaseAttention):
    def call(self, x, context):
        attn_output, attn_scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True)

        self.last_attn_scores = attn_scores

        x = self.add([x, attn_output])
        x = self.layernorm(x)

        return x

In [21]:
class GlobalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(
            query=x,
            value=x,
            key=x)
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

## The encoder

In [23]:
class Encoder(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
        super().__init__()

        self.d_model = d_model

        self.embedding = Embedding(vocab_size, d_model)

        self.self_attention = GlobalSelfAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)
        
        self.seq = Sequential([Dense(dff, activation='relu')])

        self.layer_norm = LayerNormalization()
        
        #self.dropout = Dropout(dropout_rate)

    @tf.function
    def call(self, x):
        x = self.embedding(x)
        #x = self.dropout(x)
        x = self.self_attention(x)
        x = self.seq(x)
        x = self.layer_norm(x)
        return x 

## The decoder

In [24]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self, *, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
        super(Decoder, self).__init__()
        
        self.d_model = d_model

        self.embedding = Embedding(vocab_size, d_model)
        
        self.dropout = Dropout(dropout_rate)

        self.cross_attention = CrossAttention(
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        self.seq = Sequential([Dense(dff, activation='relu')])

        self.layer_norm = LayerNormalization()

        self.last_attn_scores = None

    @tf.function
    def call(self, x, context):
        x = self.embedding(x)  

        x = self.cross_attention(x=x, context=context)

        self.last_attn_scores = self.cross_attention.last_attn_scores

        x = self.seq(x)
        x = self.layer_norm(x)
        return x

## Translator

In [25]:
d_model = 128
dff = 128
num_heads = 8
dropout_rate = 0.1
embed_dim = 128

In [26]:
def masked_acc(y_true, y_pred):
    # Calculate the loss for each item in the batch.
    y_pred = tf.argmax(y_pred, axis=-1)
    y_pred = tf.cast(y_pred, y_true.dtype)

    match = tf.cast(y_true == y_pred, tf.float32)
    mask = tf.cast(y_true != 0, tf.float32)

    return tf.reduce_sum(match)/tf.reduce_sum(mask)

In [27]:
def masked_loss(y_true, y_pred):
    loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_fn(y_true, y_pred)

    mask = tf.cast(y_true != 0, loss.dtype)
    loss *= mask

    return tf.reduce_sum(loss)/tf.reduce_sum(mask)

In [28]:
encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs")
#x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs)
encoder_outputs = Encoder(d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=eng_vectorization.vocabulary_size(),
                           dropout_rate=dropout_rate)(encoder_inputs)
encoder = keras.Model(encoder_inputs, encoder_outputs)

In [29]:
decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs")
encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs")
#x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs)
x = Decoder(d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=fra_vectorization.vocabulary_size(),
                           dropout_rate=dropout_rate)(decoder_inputs, encoded_seq_inputs)
#x = layers.Dropout(0.5)(x)
decoder_outputs = layers.Dense(fra_vectorization.vocabulary_size(), activation="softmax")(x)
decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs)
decoder_outputs = decoder([decoder_inputs, encoder_outputs])


In [30]:
transformer = keras.Model(
    [encoder_inputs, decoder_inputs], decoder_outputs, name="transformer"
)

In [31]:
transformer.compile(optimizer='adam',
              loss=masked_loss, 
              metrics=[masked_acc, masked_loss])

In [32]:
transformer.summary()

Model: "transformer"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 encoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 decoder_inputs (InputLayer)    [(None, None)]       0           []                               
                                                                                                  
 encoder (Encoder)              (None, None, 128)    1296512     ['encoder_inputs[0][0]']         
                                                                                                  
 model_1 (Functional)           (None, None, 10000)  3114256     ['decoder_inputs[0][0]',         
                                                                  'encoder[0][0]']      

In [33]:
transformer.fit(train_data, epochs=20, validation_data=val_data, callbacks=[tf.keras.callbacks.EarlyStopping(patience=3)])

Epoch 1/20


  output, from_logits = _get_logits(


Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20


<keras.callbacks.History at 0x24c258e1b80>

In [35]:
transformer.evaluate(test_data)



[2.1913132667541504, 0.5695332884788513, 2.1915767192840576]

In [36]:
fra_vocab = fra_vectorization.get_vocabulary()
fra_index_lookup = dict(zip(range(len(fra_vocab)), fra_vocab))
max_decoded_sentence_length = 40

In [67]:
def decode_sequence(input_sentence):
    tokenized_input_sentence = eng_vectorization([input_sentence])
    decoded_sentence = "[START]"
    for i in range(max_decoded_sentence_length):
        tokenized_target_sentence = fra_vectorization([decoded_sentence])[:, :-1]
        
        predictions = transformer([tokenized_input_sentence, tokenized_target_sentence])
        
        
        sampled_token_index = np.argmax(predictions[0, i, :])
        sampled_token = fra_index_lookup[sampled_token_index]

        decoded_sentence += " " + sampled_token

        if sampled_token == "[END]" or sampled_token == "[UNK]":
            break
    return decoded_sentence

In [70]:
def print_translation(sentence, translated_text, ground_truth):
    print("eng sentence:             " + sentence)
    print("translated fra sentence:  " + translated_text)
    print("actually fra sentence:    " + ground_truth)

In [72]:
sentence = 'tom was all worn out'
ground_truth = 'tom était tout usé'

translated_text = decode_sequence(sentence)
print_translation(sentence, translated_text, ground_truth)

eng sentence:             tom was all worn out
translated fra sentence:  [START] tom tout était ce arrangé que [END]
actually fra sentence:    tom était tout usé


In [73]:
sentence = 'they ve washed their hands'
ground_truth = 'ils se sont lavé les mains'

translated_text = decode_sequence(sentence)
print_translation(sentence, translated_text, ground_truth)

eng sentence:             they ve washed their hands
translated fra sentence:  [START] ils leurs ont mains [UNK]
actually fra sentence:    ils se sont lavé les mains


In [74]:
sentence = 'let s make a trade'
ground_truth = 'faisons un échange'

translated_text = decode_sequence(sentence)
print_translation(sentence, translated_text, ground_truth)

eng sentence:             let s make a trade
translated fra sentence:  [START] faisons secrète un [END]
actually fra sentence:    faisons un échange


In [75]:
sentence = 'her condition got worse last night'
ground_truth = 'son état s\'est aggravé la nuit dernière'

translated_text = decode_sequence(sentence)
print_translation(sentence, translated_text, ground_truth)

eng sentence:             her condition got worse last night
translated fra sentence:  [START] sa la gueule nuit de dernière sa [END]
actually fra sentence:    son état s'est aggravé la nuit dernière
