## Transformer créé avec TensorFlow

In [65]:
import sys

sys.path.append("..")
from common import split_into_X_y, load_sets

## Code principal

```python
def main(tokens="subwords"):
    """Train the transformer language model and print a generated text sample.

    Loads the data splits, converts the train and validation splits into
    (X, y) pairs, builds and compiles the model, fits it, then prints text
    sampled from a fixed French prompt.

    Relies on names defined outside this function: ``target_vocab_size``,
    ``seq_length``, ``batch_size``, ``epochs``, ``load_sets``,
    ``split_into_X_y``, ``build_transformer``, ``generate_sampled`` and ``tf``.

    Args:
        tokens: Tokenisation mode forwarded unchanged to ``load_sets``.

    Returns:
        A ``(model, encoder)`` tuple: the fitted model and the encoder
        returned by ``load_sets``.
    """
    train, val, _test, encoder = load_sets(tokens=tokens, target_vocab_size=target_vocab_size)
    vocab_size = encoder.vocab_size

    # Only the train and validation splits are consumed below. The test split
    # used to be converted into arrays that nothing read, so that dead
    # conversion is no longer performed.
    X_train, y_train = split_into_X_y(train, seq_length, vocab_size)
    X_val, y_val = split_into_X_y(val, seq_length, vocab_size)

    # Form model
    model = build_transformer(vocab_size=vocab_size)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=[tf.keras.metrics.CategoricalAccuracy()],
    )

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would append a stray "None" line to the output.
    model.summary()

    model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_val, y_val),
    )

    # NOTE(review): 200 and 1 are passed positionally; presumably the number of
    # tokens to generate and a sampling temperature — confirm against
    # generate_sampled.
    generated_text = generate_sampled(
        model,
        encoder,
        seq_length,
        200,
        "Il y a bien longtemps, dans un pays lointain où les oiseaux",
        1,
    )
    print(generated_text)

    return model, encoder
```

### Dataset Wikipédia

In [66]:
# Load the dataset
# NOTE(review): `load_data` is not among the names imported in the visible
# import cell — confirm where it is defined before running on a fresh kernel.
text = load_data("data/fr.train.top1M.txt")

In [67]:
# Size of the loaded corpus (length of `text`).
len(text)

260389755

In [68]:
# Preview the first 300 characters of the corpus.
text[0:300]

"a l' age de 31 ans , a barcelone , il est touche par l' esprit prophetique apres avoir obtenu la connaissance du vrai nom de dieu . il est alors persuade d' avoir atteint , par la meditation des lettres et des nombres , l' inspiration prophetique et l' etat de messie . il quitte a nouveau l' espagne"

## Génération de tokens à partir du texte

In [69]:
# Tokenisation granularity: either "subwords" or "characters".
tokens = "subwords"
# Vocabulary size requested from load_sets.
target_vocab_size = 1000

# Load the three splits together with the encoder.
train, val, test, encoder = load_sets(
    tokens=tokens,
    target_vocab_size=target_vocab_size,
)
vocab_size = encoder.vocab_size

In [70]:
# Encode a sample sentence and print how many tokens it produces.
msg_encode = encoder.encode("Bonjour à toutes et à tous")
print(len(msg_encode))

28


In [71]:
# Character count of the same sentence, for comparison with its token count.
len("Bonjour à toutes et à tous")

26

In [72]:
# Decode the tokens back to text to check the round trip.
print(encoder.decode(msg_encode))

Bonjour à toutes et à tous


In [73]:
# Show the first k token ids and the text they decode to.
k = 10
print(msg_encode[0:k])
print(encoder.decode(msg_encode[0:k]))

[149, 194, 193, 189, 194, 200, 197, 115, 278, 243]
Bonjour à


*Petite particularité : chaque « à » (accent grave) est encodé sur deux nombres, ce qui explique les 28 tokens obtenus pour une phrase de 26 caractères.*

In [74]:
# Build the training inputs and targets from the train split.
# NOTE(review): `seq_length` is not defined in any visible cell — confirm it is
# set before this cell runs on a fresh kernel.
X_train, y_train = split_into_X_y(train, seq_length, vocab_size)

In [75]:
# Inspect the training inputs.
print(X_train)

[[180 115 191 ... 194 193 184]
 [115 191 122 ... 193 184 115]
 [191 122 115 ... 184 115 127]
 ...
 [199 184 115 ... 115 183 184]
 [184 115 191 ... 183 184 198]
 [115 191 180 ... 184 198 115]]


In [76]:
# Inspect the training targets.
print(y_train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [77]:
# Inspect the first training target row in full.
print(y_train[0])

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.
 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.

## Génération de texte

<img src="https://www.tensorflow.org/images/tutorials/transformer/transformer.png" />

```python    
    # Form model
    model = build_transformer(vocab_size=vocab_size)
    model.compile(
        optimizer=tf.keras.optimizers.Adam(1e-3),
        loss=tf.keras.losses.CategoricalCrossentropy(),
        metrics=[tf.keras.metrics.CategoricalAccuracy()],
    )

    # summary() prints the table itself and returns None, so wrapping it in
    # print() would append a stray "None" line to the output.
    model.summary()

    # Fit on the training arrays, evaluating on the validation arrays after
    # each epoch; `batch_size` and `epochs` are defined outside this excerpt.
    history = model.fit(
        X_train,
        y_train,
        batch_size=batch_size,
        epochs=epochs,
        validation_data=(X_val, y_val),
    )
```

```
_________________________________________________________________

Layer (type)                 Output Shape              Param #
=================================================================
input_2 (InputLayer)         [(None, 32)]              0
_________________________________________________________________
embedding_1 (Embedding)      (None, 32, 128)           43392
_________________________________________________________________
tf_op_layer_positional_encod [(None, 32, 128)]         0
_________________________________________________________________
encoder_block_6 (EncoderBloc (None, 32, 128)           66560
_________________________________________________________________
encoder_block_7 (EncoderBloc (None, 32, 128)           66560
_________________________________________________________________
time_distributed_1 (TimeDist (None, 32, 16)            2064
_________________________________________________________________
reshape_1 (Reshape)          (None, 512)               0
_________________________________________________________________
dense_36 (Dense)             (None, 128)               65664
_________________________________________________________________
dense_37 (Dense)             (None, 339)               43731
=================================================================
Total params: 287,971
Trainable params: 287,971
Non-trainable params: 0
_________________________________________________________________ 
```

## Training de 10 minutes