In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
# Tab-separated sentence pairs. `names=` supplies the column labels, so the
# first line of the file is read as data rather than as a header.
df = pd.read_csv(
    "data/spa-eng.zip",
    sep="\t",
    names=["translate", "source", "attr"],
)
df.head()

Unnamed: 0,translate,source,attr
0,Go.,Ve.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
1,Go.,Vete.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
2,Go.,Vaya.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
3,Go.,Váyase.,CC-BY 2.0 (France) Attribution: tatoeba.org #2...
4,Hi.,Hola.,CC-BY 2.0 (France) Attribution: tatoeba.org #5...


In [3]:
# Pull both sentence columns out of the frame as NumPy arrays.
translate_sent = np.array(df["translate"])
source_sent = np.array(df["source"])
source_sent

array(['Ve.', 'Vete.', 'Vaya.', ...,
       'Si quieres sonar como un hablante nativo, debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un músico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado.',
       'Puede que sea imposible obtener un corpus completamente libre de errores debido a la naturaleza de este tipo de esfuerzo de colaboración. Sin embargo, si animamos a los miembros a contribuir frases en sus propios idiomas en lugar de experimentar con los idiomas que están aprendiendo, podríamos ser capaces de minimizar los errores.',
       'Un día, me desperté y vi que Dios me había puesto pelo en la cara. Me lo afeité. Al día siguiente, vi que Dios me lo había vuelto a poner en la cara, así que me lo afeité otra vez. Al tercer día, cuando vi que Dios me había puesto pelo en la cara de nuevo, decidí que Dios se saliera con la suya. Por eso tengo barba.'],
      dtype=obje

In [2]:
import tensorflow as tf
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras import layers
import re
import string

2023-06-23 04:56:19.285802: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


You can use the `TextVectorization` layer to vectorize sentences from the corpus. Learn more about using this layer in this [Text classification](https://www.tensorflow.org/tutorials/keras/text_classification) tutorial. Notice from the first few sentences above that the text needs to be in one case and punctuation needs to be removed. To do this, define a `custom_standardization` function that can be used in the `TextVectorization` layer.

```
# Now, create a custom standardization function to lowercase the text and
# remove punctuation.
def custom_standardization(input_data):
  lowercase = tf.strings.lower(input_data)
  return tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation), '')
```

In [23]:
def custom_standardization(input_data):
  """Normalize raw sentences before they are split into tokens.

  Lowercases the text, strips ASCII punctuation plus the Spanish inverted
  marks, and wraps every sentence in `[START]` / `[END]` marker tokens.

  Args:
    input_data: string tensor of sentences (assumed to be UTF-8 encoded).

  Returns:
    String tensor with the cleaned, marker-wrapped sentences.
  """
  # encoding='utf-8' makes the lowercasing Unicode-aware. The default
  # encoding only lowercases ASCII letters, which would leave accented
  # capitals such as 'Á' or 'Ñ' unchanged.
  lowercase = tf.strings.lower(input_data, encoding='utf-8')
  text = tf.strings.regex_replace(lowercase,
                                  '[%s]' % re.escape(string.punctuation + "¿¡"), '')
  # The markers are attached after punctuation removal so their brackets
  # are not stripped by the regex above.
  text = tf.strings.join(['[START]', text, '[END]'], separator=' ')
  return text
# Upper bound on the vocabulary size kept by the vectorization layer.
max_features = 5000

# Integer-encode sentences using the custom standardization function.
# No fixed output length is requested here.
vectorize_layer = TextVectorization(
    max_tokens=max_features,
    standardize=custom_standardization,
    output_mode='int',
)


In [24]:
# Build the vocabulary from the `translate` column, fed in batches of 100.
train_text = tf.data.Dataset.from_tensor_slices(translate_sent)
vectorize_layer.adapt(train_text.batch(100))

In [25]:
# Index -> token table learned by the layer; show its first 20 entries.
inverse_vocab = vectorize_layer.get_vocabulary()
print(inverse_vocab[0:20])

['', '[UNK]', '[End]', '[Begin]', 'i', 'the', 'to', 'you', 'tom', 'a', 'is', 'he', 'in', 'that', 'of', 'it', 'do', 'was', 'me', 'this']


In [26]:
L = 20  # number of sentences in the sample batch
# Take one shuffled batch and encode it.
sentences = next(iter(train_text.shuffle(1000).batch(L)))
encoded_sentences = vectorize_layer(sentences)

# Fetch the id -> token table once, outside the loop: get_vocabulary()
# builds the full vocabulary list on every call.
id_to_token = vectorize_layer.get_vocabulary()
for sen, enc in zip(sentences, encoded_sentences):
    token_ids = enc.numpy()
    print(sen.numpy())
    print(token_ids)
    # Decode the ids back into tokens to check the round trip by eye.
    print(*[id_to_token[i] for i in token_ids])
    print()

b'We know.'
[ 3 27 39  2  0]
[Begin] we know [End] 

b'I sat up.'
[  3   4 553  62   2]
[Begin] i sat up [End]

b'Try this.'
[  3 215  19   2   0]
[Begin] try this [End] 

b"It's hers."
[   3   44 2310    2    0]
[Begin] its hers [End] 

b"Don't lie."
[  3  22 615   2   0]
[Begin] dont lie [End] 

b'Tell me.'
[ 3 92 18  2  0]
[Begin] tell me [End] 

b'See above.'
[   3   78 1369    2    0]
[Begin] see above [End] 

b'I fled.'
[   3    4 3795    2    0]
[Begin] i fled [End] 

b'Try this.'
[  3 215  19   2   0]
[Begin] try this [End] 

b'Perfect!'
[  3 901   2   0   0]
[Begin] perfect [End]  

b"I'm okay."
[   3   29 2237    2    0]
[Begin] im okay [End] 

b'Go away!'
[  3  42 229   2   0]
[Begin] go away [End] 

b'No way!'
[  3  71 176   2   0]
[Begin] no way [End] 

b"I'm calm."
[   3   29 1056    2    0]
[Begin] im calm [End] 

b'You lost.'
[  3   7 222   2   0]
[Begin] you lost [End] 

b'I will go.'
[ 3  4 56 42  2]
[Begin] i will go [End]

b'I am good.'
[  3   4 128  77   2]
[Begin]

В обратном направлении:

In [58]:
# Dimension of each token's embedding vector.
m = 16
embedding = layers.Embedding(input_dim=max_features + 1, output_dim=m)

In [59]:
# Map every token id of the encoded batch to its m-dimensional vector.
embedding(encoded_sentences)

<tf.Tensor: shape=(20, 5, 16), dtype=float32, numpy=
array([[[-1.6608346e-02, -2.2104634e-02,  2.3478720e-02, ...,
          4.6716955e-02, -2.8066708e-02, -2.5480783e-02],
        [-1.8573701e-02,  7.6920390e-03, -1.0748878e-03, ...,
         -3.9673388e-02, -2.2727538e-02,  4.8033502e-02],
        [-3.5997104e-02,  1.8770646e-02,  1.8370401e-02, ...,
          1.9790221e-02, -2.3820806e-02, -3.0015349e-02],
        [-3.8259268e-02,  3.0608263e-02, -4.5956898e-02, ...,
          4.7998179e-02, -4.0138554e-02, -2.4500713e-03],
        [-1.9914960e-02, -1.0289848e-02,  3.0805472e-02, ...,
         -4.1862488e-02,  4.8731674e-02,  1.1810042e-02]],

       [[-1.6608346e-02, -2.2104634e-02,  2.3478720e-02, ...,
          4.6716955e-02, -2.8066708e-02, -2.5480783e-02],
        [ 4.6817865e-02,  2.6325714e-02,  1.6582992e-02, ...,
          1.0951161e-02,  4.5461629e-02,  2.2797074e-02],
        [ 3.4739424e-02,  9.4610676e-03, -1.1408735e-02, ...,
          3.0680131e-02,  3.4243714e-02, -4

Можно еще какие-то ragged тензоры использовать вместо padding-а.

In [60]:
# Same configuration as the padded vectorizer, but with ragged=True so each
# sentence keeps its own length instead of being zero-padded.
vectorize_layer_rag = TextVectorization(
    standardize=custom_standardization,
    max_tokens=max_features,
    output_mode='int',
    ragged=True,
)

train_text = tf.data.Dataset.from_tensor_slices(translate_sent)
vectorize_layer_rag.adapt(train_text.batch(100))
inverse_vocab = vectorize_layer_rag.get_vocabulary()
print(inverse_vocab[:20])

L = 20  # number of sentences in the sample batch
sentences = next(iter(train_text.shuffle(1000).batch(L)))
encoded_sentences = vectorize_layer_rag(sentences)
for sen, enc in zip(sentences, encoded_sentences):
    token_ids = enc.numpy()
    print(sen.numpy())
    print(token_ids)
    # Reuse inverse_vocab instead of rebuilding the vocabulary list with
    # get_vocabulary() for every sentence.
    print(*[inverse_vocab[i] for i in token_ids])
    print()

['', '[UNK]', '[End]', '[Begin]', 'i', 'the', 'to', 'you', 'tom', 'a', 'is', 'he', 'in', 'that', 'of', 'it', 'do', 'was', 'me', 'this']
b'Try hard.'
[  3 215 247   2]
[Begin] try hard [End]

b'I hit Tom.'
[  3   4 535   8   2]
[Begin] i hit tom [End]

b'Is he Tom?'
[ 3 10 11  8  2]
[Begin] is he tom [End]

b'Get real!'
[  3  63 665   2]
[Begin] get real [End]

b'Who am I?'
[  3  82 128   4   2]
[Begin] who am i [End]

b'Keep this.'
[  3 220  19   2]
[Begin] keep this [End]

b'Watch me.'
[  3 310  18   2]
[Begin] watch me [End]

b"I'll stop."
[  3  76 204   2]
[Begin] ill stop [End]

b'Wait up.'
[  3 249  62   2]
[Begin] wait up [End]

b'Am I fat?'
[   3  128    4 1013    2]
[Begin] am i fat [End]

b'Be fair.'
[   3   31 1423    2]
[Begin] be fair [End]

b'Sign this.'
[  3 944  19   2]
[Begin] sign this [End]

b'Hi, guys.'
[   3 1904  763    2]
[Begin] hi guys [End]

b'Hit Tom.'
[  3 535   8   2]
[Begin] hit tom [End]

b'I quit.'
[  3   4 666   2]
[Begin] i quit [End]

b"Don't die."
[  

In [61]:
# Encode the same batch with both layers to compare the padded and ragged results.
vectorize_layer(sentences), vectorize_layer_rag(sentences)

(<tf.Tensor: shape=(20, 5), dtype=int64, numpy=
 array([[   3,  215,  247,    2,    0],
        [   3,    4,  535,    8,    2],
        [   3,   10,   11,    8,    2],
        [   3,   63,  665,    2,    0],
        [   3,   82,  128,    4,    2],
        [   3,  220,   19,    2,    0],
        [   3,  310,   18,    2,    0],
        [   3,   76,  204,    2,    0],
        [   3,  249,   62,    2,    0],
        [   3,  128,    4, 1013,    2],
        [   3,   31, 1423,    2,    0],
        [   3,  944,   19,    2,    0],
        [   3, 1904,  763,    2,    0],
        [   3,  535,    8,    2,    0],
        [   3,    4,  666,    2,    0],
        [   3,   22,  539,    2,    0],
        [   3, 3350,   43,    2,    0],
        [   3,  158,   18,   42,    2],
        [   3,    4,  232,   62,    2],
        [   3,  249,    2,    0,    0]])>,
 <tf.RaggedTensor [[3, 215, 247, 2], [3, 4, 535, 8, 2], [3, 10, 11, 8, 2], [3, 63, 665, 2],
  [3, 82, 128, 4, 2], [3, 220, 19, 2], [3, 310, 18, 2], [

In [62]:
# and this can easily be converted into a regular tensor with .to_tensor()
vectorize_layer_rag(sentences).to_tensor()

<tf.Tensor: shape=(20, 5), dtype=int64, numpy=
array([[   3,  215,  247,    2,    0],
       [   3,    4,  535,    8,    2],
       [   3,   10,   11,    8,    2],
       [   3,   63,  665,    2,    0],
       [   3,   82,  128,    4,    2],
       [   3,  220,   19,    2,    0],
       [   3,  310,   18,    2,    0],
       [   3,   76,  204,    2,    0],
       [   3,  249,   62,    2,    0],
       [   3,  128,    4, 1013,    2],
       [   3,   31, 1423,    2,    0],
       [   3,  944,   19,    2,    0],
       [   3, 1904,  763,    2,    0],
       [   3,  535,    8,    2,    0],
       [   3,    4,  666,    2,    0],
       [   3,   22,  539,    2,    0],
       [   3, 3350,   43,    2,    0],
       [   3,  158,   18,   42,    2],
       [   3,    4,  232,   62,    2],
       [   3,  249,    2,    0,    0]])>

## Разбираюсь, как пользоваться рекуррентными сетями в tensorflow.

In [69]:
from tensorflow import keras
from keras.layers import RNN
from keras import backend

# First, let's define a RNN Cell, as a layer subclass.
class MinimalRNNCell(keras.layers.Layer):
    """Bare-bones recurrent cell: output = inputs @ kernel + prev_output @ recurrent_kernel.

    No bias and no activation are applied, and the step output is also
    returned as the next state.

    Args:
        units: size of the output (and state) vector.
        **kwargs: forwarded to `keras.layers.Layer`.
    """

    def __init__(self, units, **kwargs):
        # Initialise the base Layer before assigning any attribute.
        super().__init__(**kwargs)
        self.units = units
        # The state has the same size as the output.
        self.state_size = units

    def build(self, input_shape):
        """Create the input and recurrent weight matrices.

        Args:
            input_shape: shape of the per-step input; only its last
                dimension is used.
        """
        self.kernel = self.add_weight(shape=(input_shape[-1], self.units),
                                      initializer='uniform',
                                      name='kernel')
        self.recurrent_kernel = self.add_weight(
            shape=(self.units, self.units),
            initializer='uniform',
            name='recurrent_kernel')
        self.built = True

    def call(self, inputs, states):
        """Run one time step.

        Args:
            inputs: input for the current step.
            states: sequence whose first element is the previous output.

        Returns:
            Tuple `(output, [output])`: the step output and the new state list.
        """
        prev_output = states[0]
        h = backend.dot(inputs, self.kernel)
        output = h + backend.dot(prev_output, self.recurrent_kernel)
        return output, [output]

    def get_config(self):
        """Return the layer config including `units`, so the cell can be re-created."""
        config = super().get_config()
        config.update({'units': self.units})
        return config

# Single cell: wrap it in an RNN layer and apply it to a symbolic input.
cell = MinimalRNNCell(32)
x = keras.Input(shape=(None, 5))
layer = RNN(cell)
y = layer(x)
print(y)

# Stacked RNN: pass a list of cells (32 units, then 64) to one RNN layer.
cells = [MinimalRNNCell(units) for units in (32, 64)]
x = keras.Input(shape=(None, 5))
layer = RNN(cells)
y = layer(x)
print(y)

KerasTensor(type_spec=TensorSpec(shape=(None, 32), dtype=tf.float32, name=None), name='rnn_2/strided_slice_3:0', description="created by layer 'rnn_2'")
KerasTensor(type_spec=TensorSpec(shape=(None, 64), dtype=tf.float32, name=None), name='rnn_3/strided_slice_3:0', description="created by layer 'rnn_3'")


# reproduce translate tutorial