In [None]:
import os

# Keras selects its backend once, when `keras` is first imported, so the
# environment variable has to be set BEFORE `import keras`. Setting it after
# the import (as this cell previously did) has no effect.
os.environ["KERAS_BACKEND"] = "jax" # Set JAX as your backend

import jax
import keras
import numpy as np
from keras import ops

## 1. Tokenizer

In [None]:
class SimpleTokenizer:
  """Character-level tokenizer backed by a fixed vocabulary.

  A character's id is its position in `vocabulary`. Characters that are not
  in the vocabulary encode to `len(vocabulary)`; ids that have no character
  decode to the literal string "<UNK>".
  """

  def __init__(self, vocabulary):
    self.vocabulary = vocabulary  # ordered collection of known characters
    # Forward lookup: character -> index, e.g. "a" -> 0.
    self.char_to_idx = {}
    for position, symbol in enumerate(vocabulary):
      self.char_to_idx[symbol] = position
    # Reverse lookup: index -> character, e.g. 0 -> "a".
    self.idx_to_char = dict(enumerate(vocabulary))

  def encode(self, text):
    """Return a list with one id per character of `text`."""
    ids = []
    for symbol in text:
      # Out-of-vocabulary characters share the single id len(vocabulary).
      ids.append(self.char_to_idx.get(symbol, len(self.vocabulary)))
    return ids

  def decode(self, indices):
    """Return the string spelled by `indices`; unknown ids become "<UNK>"."""
    pieces = (self.idx_to_char.get(token_id, "<UNK>") for token_id in indices)
    return "".join(pieces)

In [None]:
# Example: build a lowercase-only tokenizer and round-trip a short string.
vocab = "abcdefghijklmnopqrstuvwxyz" # bonus question: What else should we include in this vocabulary?
tokenizer = SimpleTokenizer(vocabulary=vocab)

# Show the id assigned to every character of the vocabulary.
for char in tokenizer.vocabulary:
  char_id = tokenizer.char_to_idx.get(char, len(tokenizer.vocabulary))
  print(f"{char} -> {char_id}")

# The space in "hello world" is not part of `vocab`, so it encodes to
# len(vocab) == 26 and comes back as "<UNK>" when decoded.
text = "hello world"
encoded_text = tokenizer.encode(text)
decoded_text = tokenizer.decode(encoded_text)
print(f"encoded text: {encoded_text}\n \
        decoded text: {decoded_text}")

a -> 0
b -> 1
c -> 2
d -> 3
e -> 4
f -> 5
g -> 6
h -> 7
i -> 8
j -> 9
k -> 10
l -> 11
m -> 12
n -> 13
o -> 14
p -> 15
q -> 16
r -> 17
s -> 18
t -> 19
u -> 20
v -> 21
w -> 22
x -> 23
y -> 24
z -> 25
encoded text: [7, 4, 11, 11, 14, 26, 22, 14, 17, 11, 3]
         decoded text: hello<UNK>world


## 2. Embeddings

In [None]:
class EmbeddingLayer(keras.layers.Layer):
  """Thin wrapper around `keras.layers.Embedding`.

  Args:
    vocabulary_size: number of rows in the embedding table (`input_dim`).
    embedding_dimensions: length of each embedding vector (`output_dim`).
  """

  def __init__(self, vocabulary_size, embedding_dimensions):
    super().__init__()
    self.embedding = keras.layers.Embedding(
        input_dim=vocabulary_size,
        output_dim=embedding_dimensions,
    )

  def call(self, x):
    """Return the embedding vectors looked up for the ids in `x`."""
    embedded = self.embedding(x)
    return embedded

## 3. Encoder

In [None]:
class EncoderBlock(keras.layers.Layer):
  """Transformer encoder block: self-attention, then a feed-forward network.

  Each of the two sub-layers is followed by dropout, a residual connection
  and layer normalization.

  Args:
    embedding_dimensions: width of the block's input and output features.
    number_of_attention_heads: number of heads in the self-attention layer.
    feed_forward_dimensions: width of the hidden feed-forward layer.
    dropout_rate: dropout rate applied to each sub-layer's output.
  """

  def __init__(self, embedding_dimensions, number_of_attention_heads,
               feed_forward_dimensions, dropout_rate=0.1):
    super().__init__()
    # NOTE(review): `key_dim` is the size of EACH attention head, so every head
    # is `embedding_dimensions` wide here. The common choice is
    # embedding_dimensions // number_of_attention_heads; left unchanged so the
    # model's parameter count stays the same.
    self.attention = keras.layers.MultiHeadAttention(num_heads=number_of_attention_heads, key_dim=embedding_dimensions)
    # Expand to `feed_forward_dimensions`, then project back to the input width
    # so the result can be added to the residual stream.
    self.feed_forward_network = keras.Sequential([
        keras.layers.Dense(feed_forward_dimensions, activation="relu"),
        keras.layers.Dense(embedding_dimensions),
    ])
    self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = keras.layers.Dropout(rate=dropout_rate)
    self.dropout2 = keras.layers.Dropout(rate=dropout_rate)

  def call(self, inputs, training=False):
    """Run the block on `inputs`.

    Args:
      inputs: tensor whose last axis has `embedding_dimensions` features.
      training: whether dropout is active. Defaults to False so the block can
        be called directly without passing the flag.

    Returns:
      A tensor with the same shape as `inputs`.
    """
    # Self-attention: query and value are both `inputs`.
    attention_output = self.attention(inputs, inputs)
    attention_output = self.dropout1(attention_output, training=training)
    # Residual connection around the attention sub-layer, then normalize.
    output1 = self.layernorm1(inputs + attention_output)
    feed_forward_output = self.feed_forward_network(output1)
    feed_forward_output = self.dropout2(feed_forward_output, training=training)
    # Yes, this addition is a residual (skip) connection: the sub-layer's input
    # is added back to its output before normalization.
    return self.layernorm2(output1 + feed_forward_output)

## 4. Decoder

In [None]:
class DecoderBlock(keras.layers.Layer):
  """Transformer decoder block.

  Three sub-layers, each followed by dropout, a residual connection and its
  own layer normalization:
    1. masked self-attention over the decoder input,
    2. cross-attention from the decoder stream onto the encoder output,
    3. a position-wise feed-forward network.

  Args:
    embedding_dimensions: width of the block's input and output features.
    number_of_attention_heads: number of heads in each attention layer.
    feed_forward_dimensions: width of the hidden feed-forward layer.
    dropout_rate: dropout rate applied to each sub-layer's output.
  """

  def __init__(self, embedding_dimensions, number_of_attention_heads,
               feed_forward_dimensions, dropout_rate=0.1):
    super().__init__()
    # NOTE(review): `key_dim` is the size of EACH attention head; the common
    # choice is embedding_dimensions // number_of_attention_heads. Left as-is.
    self.masked_attention = keras.layers.MultiHeadAttention(num_heads=number_of_attention_heads, key_dim=embedding_dimensions)
    self.cross_attention = keras.layers.MultiHeadAttention(num_heads=number_of_attention_heads, key_dim=embedding_dimensions)
    self.feed_forward_network = keras.Sequential([
        keras.layers.Dense(feed_forward_dimensions, activation='relu'),
        keras.layers.Dense(embedding_dimensions)
    ])
    # One normalization/dropout pair per sub-layer (mirrors EncoderBlock), so
    # the three sub-layers do not share normalization weights.
    self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = keras.layers.LayerNormalization(epsilon=1e-6)
    self.dropout1 = keras.layers.Dropout(rate=dropout_rate)
    self.dropout2 = keras.layers.Dropout(rate=dropout_rate)
    self.dropout3 = keras.layers.Dropout(rate=dropout_rate)

  def call(self, x, encoder_output, training=False, look_ahead_mask=None, padding_mask=None):
    """Run the block.

    Args:
      x: decoder input; last axis has `embedding_dimensions` features.
      encoder_output: encoder result that the cross-attention attends to.
      training: whether dropout is active. Defaults to False.
      look_ahead_mask: optional attention mask for the self-attention step.
      padding_mask: optional attention mask for the cross-attention step.

    Returns:
      A tensor with the same shape as `x`.
    """
    # 1) Masked self-attention: query and value are both the decoder input.
    self_attention_output = self.masked_attention(x, x, attention_mask=look_ahead_mask)
    # Dropout on a sub-layer's output (before the residual add) regularizes
    # the block during training; it is a no-op when training is False.
    self_attention_output = self.dropout1(self_attention_output, training=training)
    output1 = self.layernorm1(self_attention_output + x)

    # 2) Cross-attention: queries come from the decoder stream, keys/values
    # from the encoder output (key defaults to value when omitted).
    cross_attention_output = self.cross_attention(output1, encoder_output, attention_mask=padding_mask)
    cross_attention_output = self.dropout2(cross_attention_output, training=training)
    output2 = self.layernorm2(cross_attention_output + output1)

    # 3) Feed-forward network with its own residual connection.
    feed_forward_output = self.feed_forward_network(output2)
    feed_forward_output = self.dropout3(feed_forward_output, training=training)
    return self.layernorm3(feed_forward_output + output2)


NameError: name 'keras' is not defined

## 5. Output Layer

In [None]:
class OutputLayer(keras.layers.Layer):
  """Final projection onto the vocabulary.

  A single Dense layer with `vocabulary_size` units and no activation.
  """

  def __init__(self, vocabulary_size):
    super().__init__()
    self.dense = keras.layers.Dense(units=vocabulary_size)

  def call(self, x):
    """Return the Dense projection of `x` (last axis becomes vocabulary-sized)."""
    projected = self.dense(x)
    return projected

## 6. Putting it all together

In [None]:
class SimpleLanguageModel(keras.Model):
  """Minimal encoder-decoder model assembled from the blocks defined above.

  Pipeline: EmbeddingLayer -> EncoderBlock -> DecoderBlock -> OutputLayer.

  Args:
    vocabulary_size: size of the embedding table and of the output projection.
    embedding_dimensions: feature width used throughout the model.
    number_of_attention_heads: heads per attention layer in both blocks.
    feed_forward_dimensions: hidden width of each block's feed-forward network.
  """

  def __init__(self, vocabulary_size, embedding_dimensions,
               number_of_attention_heads, feed_forward_dimensions):
    super().__init__()
    # The encoder and decoder blocks are configured identically.
    block_kwargs = dict(
        embedding_dimensions=embedding_dimensions,
        number_of_attention_heads=number_of_attention_heads,
        feed_forward_dimensions=feed_forward_dimensions,
    )
    self.embedding = EmbeddingLayer(
        vocabulary_size=vocabulary_size,
        embedding_dimensions=embedding_dimensions,
    )
    self.encoder = EncoderBlock(**block_kwargs)
    self.decoder = DecoderBlock(**block_kwargs)
    self.output_layer = OutputLayer(vocabulary_size=vocabulary_size)

  def call(self, inputs, training=False):
    """Map token ids to one vocabulary-sized score vector per position.

    The same embedded sequence feeds both the encoder and the decoder, and no
    look-ahead or padding mask is applied in the decoder.
    """
    embedded = self.embedding(inputs)
    memory = self.encoder(embedded, training=training)
    decoded = self.decoder(
        embedded,
        encoder_output=memory,
        training=training,
        look_ahead_mask=None,
        padding_mask=None,
    )
    scores = self.output_layer(decoded)
    return scores

In [None]:
# Example use: build the model and push one random batch of token ids through it.

vocabulary_size = 1000
embedding_dimensions = 256
number_of_attention_heads = 8
feed_forward_dimensions = 512

model = SimpleLanguageModel(vocabulary_size=vocabulary_size, embedding_dimensions=embedding_dimensions,
                            number_of_attention_heads=number_of_attention_heads,
                            feed_forward_dimensions=feed_forward_dimensions)
# Token ids must be integers in [0, vocabulary_size): sample them with randint
# rather than drawing floats with uniform. Shape: 1 sequence of 50 tokens.
x = keras.random.randint((1, 50), minval=0, maxval=vocabulary_size)
output = model(x)
print(output.shape)  # one vocabulary-sized vector per position: (1, 50, 1000)



(1, 50, 1000)


## 7. Visualizations

In [1]:
!pip install bertviz

Collecting bertviz
  Downloading bertviz-1.4.0-py3-none-any.whl.metadata (19 kB)
Collecting boto3 (from bertviz)
  Downloading boto3-1.35.29-py3-none-any.whl.metadata (6.6 kB)
Collecting botocore<1.36.0,>=1.35.29 (from boto3->bertviz)
  Downloading botocore-1.35.29-py3-none-any.whl.metadata (5.6 kB)
Collecting jmespath<2.0.0,>=0.7.1 (from boto3->bertviz)
  Downloading jmespath-1.0.1-py3-none-any.whl.metadata (7.6 kB)
Collecting s3transfer<0.11.0,>=0.10.0 (from boto3->bertviz)
  Downloading s3transfer-0.10.2-py3-none-any.whl.metadata (1.7 kB)
Downloading bertviz-1.4.0-py3-none-any.whl (157 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m157.6/157.6 kB[0m [31m1.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading boto3-1.35.29-py3-none-any.whl (139 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m139.1/139.1 kB[0m [31m6.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading botocore-1.35.29-py3-none-any.whl (12.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
!pip install huggingface_hub



In [3]:
# Authenticate with the Hugging Face Hub. notebook_login() renders an
# interactive login widget in the cell output.
# NOTE(review): presumably needed because the model downloaded below is
# access-gated — confirm.
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
from bertviz import model_view
from transformers import AutoModel, AutoTokenizer, utils

# Suppress the standard transformers warnings.
utils.logging.set_verbosity_error()

# Find popular HuggingFace models here: https://huggingface.co/models
model_name = "google/gemma-2b"

# output_attentions=True configures the model to return its attention values,
# which the visualization below needs.
model = AutoModel.from_pretrained(model_name, output_attentions=True)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/627 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/13.5k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

tokenizer_config.json:   0%|          | 0.00/33.6k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/4.24M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.5M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/636 [00:00<?, ?B/s]

In [5]:
import torch  # local to this cell: only the inference step below needs it

input_text = "Hi how are you?"
inputs = tokenizer.encode(input_text, return_tensors='pt')  # Tokenize input text
# Inference only: skip gradient tracking so no autograd graph is built.
with torch.no_grad():
  outputs = model(inputs)  # Run model
# Read the attention values by name instead of relying on their position in
# the output tuple.
attention = outputs.attentions
tokens = tokenizer.convert_ids_to_tokens(inputs[0])  # Convert input ids to token strings
model_view(attention, tokens)  # Display model view

<IPython.core.display.Javascript object>