<a href="https://colab.research.google.com/github/HedersonSantos/RedesNeurais/blob/main/Transformer.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [8]:
import os
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import pickle

from tensorflow import keras
from keras import layers
from numpy.random import seed
from tensorflow.random import set_seed

In [2]:
from google.colab import files, drive
drive.mount('/gdrive')
%cd /gdrive/My\ Drive/Colab\ Notebooks/Redes\ Neurais/AULA6

Mounted at /gdrive
/gdrive/My Drive/Colab Notebooks/Redes Neurais/AULA6


In [54]:
x_train = pickle.load(open('./dados_/train.pkl','rb'))
y_train = pickle.load(open('./dados_/y_train.pkl','rb'))
x_test_ = pickle.load(open('./dados_/test.pkl','rb'))
y_test_ = pickle.load(open('./dados_/y_test.pkl','rb'))
x_val = x_test_[:x_test_.shape[0]//2]
y_val = y_test_[:y_test_.shape[0]//2]
x_test = x_test_[x_test_.shape[0]//2:]
y_test = y_test_[y_test_.shape[0]//2:]

In [61]:
print(x_train.shape, y_train.shape, x_val.shape, y_val.shape, x_test.shape, y_test.shape)

(522125, 10) (522125,) (46070, 10) (46070,) (46070, 10) (46070,)


## Camada Multi-head Self-attention

In [41]:
class MultiHeadSelfAttention(layers.Layer):
    def __init__(self, embed_dim, num_heads=8):
        super(MultiHeadSelfAttention, self).__init__()
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        if embed_dim % num_heads != 0:
            raise ValueError(
                f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
            )
        self.projection_dim = embed_dim // num_heads
        self.query_dense = layers.Dense(embed_dim)
        self.key_dense = layers.Dense(embed_dim)
        self.value_dense = layers.Dense(embed_dim)
        self.combine_heads = layers.Dense(embed_dim)

    def attention(self, query, key, value):
        score = tf.matmul(query, key, transpose_b=True)
        dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
        scaled_score = score / tf.math.sqrt(dim_key)
        weights = tf.nn.softmax(scaled_score, axis=-1)
        output = tf.matmul(weights, value)
        return output, weights

    def separate_heads(self, x, batch_size):
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        # x.shape = [batch_size, seq_len, embedding_dim]
        batch_size = tf.shape(inputs)[0]
        query = self.query_dense(inputs)  # (batch_size, seq_len, embed_dim)
        key = self.key_dense(inputs)  # (batch_size, seq_len, embed_dim)
        value = self.value_dense(inputs)  # (batch_size, seq_len, embed_dim)
        query = self.separate_heads(
            query, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        key = self.separate_heads(
            key, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        value = self.separate_heads(
            value, batch_size
        )  # (batch_size, num_heads, seq_len, projection_dim)
        attention, weights = self.attention(query, key, value)
        attention = tf.transpose(
            attention, perm=[0, 2, 1, 3]
        )  # (batch_size, seq_len, num_heads, projection_dim)
        concat_attention = tf.reshape(
            attention, (batch_size, -1, self.embed_dim)
        )  # (batch_size, seq_len, embed_dim)
        output = self.combine_heads(
            concat_attention
        )  # (batch_size, seq_len, embed_dim)
        return output

## Bloco Transformer com Atenção + combinação residual + normalização + dropout

In [28]:
class TransformerBlock(layers.Layer):
    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadSelfAttention(embed_dim, num_heads)
        self.ffn = keras.Sequential(
            [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim),]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        attn_output = self.att(inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

## Camada de Embedding, contendo word embedding e vetor com posições das palavras

In [29]:
class TokenAndPositionEmbedding(layers.Layer):
    def __init__(self, maxlen, vocab_size, embed_dim, embedding_matrix):
        super(TokenAndPositionEmbedding, self).__init__()
        self.token_emb = layers.Embedding(
            input_dim=maxlen, 
            output_dim=embed_dim,
            embeddings_initializer=keras.initializers.Constant(embedding_matrix),
            trainable=False)
        self.pos_emb = layers.Embedding(
            input_dim=maxlen, 
            output_dim=embed_dim)
        
    def call(self, x):
        maxlen = tf.shape(x)[-1]
        positions = tf.range(start=0, limit=maxlen, delta=1)
        positions = self.pos_emb(positions)
        x = self.token_emb(x)
        return x + positions


## Montando a rede Transformer

In [50]:
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in feed forward network inside transformer

#vocab_size = 20000 
#num_tokens=1000
maxlen = 10
embedding_dim=10


inputs = layers.Input(shape=(maxlen,))
#embedding_layer = TokenAndPositionEmbedding(num_tokens, vocab_size, embedding_dim, embedding_matrix)
x = inputs #embedding_layer(inputs)
transformer_block = TransformerBlock(embedding_dim, num_heads, ff_dim)
x = transformer_block(x)
x = layers.GlobalAveragePooling1D()(x)
x = layers.Dropout(0.1)(x)
x = layers.Dense(16, activation="relu")(x)
x = layers.Dropout(0.2)(x)
outputs = layers.Dense(1, activation="relu")(x)

modelT = keras.Model(inputs=inputs, outputs=outputs)
modelT.summary()

Model: "model"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_4 (InputLayer)        [(None, 10)]              0         
                                                                 
 transformer_block (Transfor  (None, None, 10)         1162      
 merBlock)                                                       
                                                                 
 global_average_pooling1d (G  (None, 10)               0         
 lobalAveragePooling1D)                                          
                                                                 
 dropout_2 (Dropout)         (None, 10)                0         
                                                                 
 dense_6 (Dense)             (None, 16)                176       
                                                                 
 dropout_3 (Dropout)         (None, 16)                0     

In [51]:
modelT.compile("adam", "binary_crossentropy", metrics=["accuracy"])
history = modelT.fit(
    #x_train, y_train, batch_size=32, epochs=20, validation_data=(x_val, y_val)
    x_train, y_train, batch_size=8, epochs=30, validation_data=(x_val, y_val)
)

NameError: ignored