Ejemplo de:https://www.youtube.com/watch?v=p2sTJYoIwj0&t=53s&ab_channel=CodificandoBits


In [1]:
pip install keras-transformer

Collecting keras-transformer
  Downloading keras-transformer-0.40.0.tar.gz (9.7 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keras-pos-embd==0.13.0 (from keras-transformer)
  Downloading keras-pos-embd-0.13.0.tar.gz (5.6 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keras-multi-head==0.29.0 (from keras-transformer)
  Downloading keras-multi-head-0.29.0.tar.gz (13 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keras-layer-normalization==0.16.0 (from keras-transformer)
  Downloading keras-layer-normalization-0.16.0.tar.gz (3.9 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keras-position-wise-feed-forward==0.8.0 (from keras-transformer)
  Downloading keras-position-wise-feed-forward-0.8.0.tar.gz (4.1 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting keras-embed-sim==0.10.0 (from keras-transformer)
  Downloading keras-embed-sim-0.10.0.tar.gz (3.6 kB)
  Preparing metadata (setup.py) ...

In [2]:
# Import libraries
import numpy as np
from keras_transformer import get_model, decode
from pickle import load
from google.colab import drive
np.random.seed(0)

In [3]:
# Load data and print examples
drive.mount('/content/drive')
filename = '/content/drive/MyDrive/english-spanish.pkl'

dataset = load(open(filename, 'rb'))
print(dataset[120000,0])
print(dataset[120000,1])


Mounted at /content/drive
tom is a new yorker but he doesnt have a new york accent
tom es neoyorquino pero no tiene acento de nueva york


In [4]:
# Make tokens, this is divide string in words
source_tokens = []
for sentence in dataset[:,0]:
  # Column 0, english phrases
  source_tokens.append(sentence.split(' '))
print(source_tokens[120000])

target_tokens = []
for sentence in dataset[:,1]:
   # Column 1, spanish phrases
  target_tokens.append(sentence.split(' '))
print(target_tokens[120000])

['tom', 'is', 'a', 'new', 'yorker', 'but', 'he', 'doesnt', 'have', 'a', 'new', 'york', 'accent']
['tom', 'es', 'neoyorquino', 'pero', 'no', 'tiene', 'acento', 'de', 'nueva', 'york']


In [5]:
# Create dictionaries, this is emmbeding, transform tokens in numbers

def build_token_dict(token_list):
  token_dict = {
      '': 0,
      '': 1,
      '': 2
  }
  for tokens in token_list:
    for token in tokens:
      if token not in token_dict:
        token_dict[token] = len(token_dict)
  return token_dict


source_token_dict = build_token_dict(source_tokens)
target_token_dict = build_token_dict(target_tokens)
target_token_dict_inv = {v:k for k,v in target_token_dict.items()}
# Dict_inv is to use in decoder.

print(source_token_dict)
print(target_token_dict)
print(target_token_dict_inv)



{'': 2, 've': 1, 'vete': 2, 'vaya': 3, 'vayase': 4, 'hola': 5, 'corre': 6, 'corran': 7, 'corra': 8, 'corred': 9, 'quien': 10, 'orale': 11, 'fuego': 12, 'incendio': 13, 'disparad': 14, 'ayuda': 15, 'socorro': 16, 'auxilio': 17, 'salta': 18, 'salte': 19, 'parad': 20, 'para': 21, 'pare': 22, 'espera': 23, 'esperen': 24, 'continua': 25, 'continue': 26, 'date': 27, 'prisa': 28, 'daos': 29, 'dese': 30, 'me': 31, 'oculte': 32, 'escondi': 33, 'ocultaba': 34, 'escondia': 35, 'corri': 36, 'corria': 37, 'lo': 38, 'intento': 39, 'he': 40, 'ganado': 41, 'oh': 42, 'no': 43, 'tomatelo': 44, 'con': 45, 'soda': 46, 'disparen': 47, 'dispara': 48, 'dispare': 49, 'sonrie': 50, 'al': 51, 'ataque': 52, 'atacad': 53, 'ataquen': 54, 'ataca': 55, 'levanta': 56, 'ahora': 57, 'mismo': 58, 'id': 59, 'vayan': 60, 'ya': 61, 'tengo': 62, 'pillas': 63, 'entendiste': 64, 'el': 65, 'corrio': 66, 'metete': 67, 'adentro': 68, 'abrazame': 69, 'preocupo': 70, 'cai': 71, 'hui': 72, 'escape': 73, 'huia': 74, 'escapaba': 75, 

In [6]:
# Add start, end and pad to training set.
encoder_tokens = [[''] + tokens + [''] for tokens in source_tokens]
decoder_tokens = [[''] + tokens + [''] for tokens in target_tokens]
output_tokens = [tokens + [''] for tokens in target_tokens]

# Pad uses the max of the phrases english and spanish
source_max_len = max(map(len, encoder_tokens))
target_max_len = max(map(len, decoder_tokens))

# Add pading
encoder_tokens = [tokens + ['']*(source_max_len-len(tokens)) for tokens in encoder_tokens]
decoder_tokens = [tokens + ['']*(target_max_len-len(tokens)) for tokens in decoder_tokens]
output_tokens = [tokens + ['']*(target_max_len-len(tokens)) for tokens in output_tokens ]


print(encoder_tokens[120000])



['', 'tom', 'is', 'a', 'new', 'yorker', 'but', 'he', 'doesnt', 'have', 'a', 'new', 'york', 'accent', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '', '']


In [7]:
# Using dictionaries convert character to number
encoder_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in encoder_tokens]
decoder_input = [list(map(lambda x: target_token_dict[x], tokens)) for tokens in decoder_tokens]
output_decoded = [list(map(lambda x: [target_token_dict[x]], tokens)) for tokens in output_tokens]

# Show character in number, this is an example
print(encoder_input[120000])


[2, 54, 256, 118, 195, 12664, 2912, 30, 1575, 138, 118, 195, 5383, 4285, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2]


In [8]:
# Build a transfomer model

model = get_model(
    # Using keras
    token_num = max(len(source_token_dict),len(target_token_dict)),
    # Max of tokens
    embed_dim = 32,
    # Elements in embeding ¿why 32?
    encoder_num = 2,
    # Numbers of encoders, in original model uses 6, this is more complex and so uses 2
    decoder_num = 2,
    # Numbers of decodes, ibid
    head_num = 4,
    # Attention blocks. In paper uses 8. People watching the prhases
    hidden_dim = 128,
    # neurons y MLP that is a hidden layer.
    dropout_rate = 0.05,
    # % of neurons that desactivate in each epoc of training. Evitate overfiting.
    use_same_embed = False,
    # The embeding is diferent in coder and decoder block.
)
model.compile('adam', 'sparse_categorical_crossentropy')
model.summary()
token_num = max(len(source_token_dict),len(target_token_dict))
print(token_num)



Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 Encoder-Input (InputLayer)  [(None, None)]               0         []                            
                                                                                                  
 Encoder-Token-Embedding (E  [(None, None, 32),           808544    ['Encoder-Input[0][0]']       
 mbeddingRet)                 (25267, 32)]                                                        
                                                                                                  
 Encoder-Embedding (TrigPos  (None, None, 32)             0         ['Encoder-Token-Embedding[0][0
 Embedding)                                                         ]']                           
                                                                                              

In [9]:
# Training

x = [np.array(encoder_input), np.array(decoder_input)]
# English phrases, spanish phrases. This are inputs
y = np.array(output_decoded)
# This is output. The traduction

In [10]:
# Fit model

model.fit(x,y, epochs=1, batch_size=32)
# Batch represents blocks in each epoch





<keras.src.callbacks.History at 0x7f981c0728f0>

In [15]:
# This is the function to translate

def translate(sentence):
  sentence_tokens = [tokens + ['', ''] for tokens in [sentence.split(' ')]]
  tr_input = [list(map(lambda x: source_token_dict[x], tokens)) for tokens in sentence_tokens][0]
  decoded = decode(
      model,
      tr_input,
      start_token = target_token_dict[''],
      end_token = target_token_dict[''],
      pad_token = target_token_dict['']
  )

  print('Frase original: {}'.format(sentence))
  print('Traducción: {}'.format(' '.join(map(lambda x: target_token_dict_inv[x], decoded[1:-1]))))


In [17]:
# an example
translate('this is the house')


Frase original: this is the house
Traducción: el es que su su su coche de su su su un coche un coche de mi coche un un un un un un un un un
