In [1]:
!nvcc --version

nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2025 NVIDIA Corporation
Built on Tue_May_27_02:24:01_Pacific_Daylight_Time_2025
Cuda compilation tools, release 12.9, V12.9.86
Build cuda_12.9.r12.9/compiler.36037853_0


In [2]:
!pip install cupy-cuda12x



In [3]:
!nvidia-smi

Wed Jul 23 14:47:10 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 576.57                 Driver Version: 576.57         CUDA Version: 12.9     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                  Driver-Model | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  NVIDIA GeForce RTX 3050 ...  WDDM  |   00000000:01:00.0 Off |                  N/A |
| N/A   59C    P0             16W /   85W |       0MiB /   6144MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

In [4]:
import numpy as np
import cupy as cp
def dropout(x, rate=0.1, training=True):
    if not training or rate == 0:
        return x
    mask = (cp.random.rand(*x.shape) > rate).astype(cp.float32)
    return x * mask / (1.0 - rate)

In [None]:
class AdamW:
    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-8, weight_decay=0.01):
        self.params = params
        self.lr = lr
        self.beta1, self.beta2 = betas
        self.eps = eps
        self.weight_decay = weight_decay
        self.m = {name: cp.zeros_like(p) for name, p in params}
        self.v = {name: cp.zeros_like(p) for name, p in params}
        self.t = 0

    def step(self, grads):
        self.t += 1
        lr_t = self.lr

        for name, param in self.params:
            grad = grads[name]
            m = self.m[name]
            v = self.v[name]
            m *= self.beta1
            m += (1 - self.beta1) * grad

            v *= self.beta2
            v += (1 - self.beta2) * (grad * grad) 
            m_hat = m / (1 - self.beta1 ** self.t)
            v_hat = v / (1 - self.beta2 ** self.t)
            param *= (1 - lr_t * self.weight_decay)
            param -= lr_t * m_hat / (cp.sqrt(v_hat) + self.eps)

In [None]:
class Tokenizer:
    def __init__(self, text):
        cleaned = ''.join(c.lower() if c.isalnum() or c.isspace() else ' ' for c in text)
        words = cleaned.split()

        unique_words = sorted(list(set(words)))
        unique_words.append("<unk>")

        self.word2idx = {w: i for i, w in enumerate(unique_words)}
        self.idx2word = {i: w for w, i in self.word2idx.items()}
        self.vocab_size = len(unique_words)

    def encode(self, text):
        cleaned = ''.join(c.lower() if c.isalnum() or c.isspace() else ' ' for c in text)
        words = cleaned.split()
        return [self.word2idx.get(w, self.word2idx["<unk>"]) for w in words]
    
    def decode(self, indices):
        return ' '.join([self.idx2word.get(i, '<unk>') for i in indices])

In [None]:
class Embedding:
    def __init__(self, vocab_size, embed_dim):
        self.vocab_size = vocab_size
        self.embed_dim = embed_dim
        self.weights = cp.random.randn(vocab_size, embed_dim) * 0.01
        self.grad = cp.zeros_like(self.weights) 

    def forward(self, indices):
        self.last_indices = indices
        return self.weights[indices]  

    def backward(self, doutput):
        self.grad[...] = 0
        B, T, D = doutput.shape
        flat_indices = self.last_indices.reshape(B * T)  
        doutput_flat = doutput.reshape(B * T, D)

        for k in range(B * T):
            token_id = flat_indices[k]
            self.grad[token_id] += doutput_flat[k]


    def parameters(self):
        return [('embedding_weights', self.weights)]

    def grads(self):
        return {'embedding_weights': self.grad}

In [None]:
class PositionalEncoding:
    def __init__(self, max_len, embed_dim):
        self.encoding = cp.zeros((max_len, embed_dim), dtype=cp.float32)
        for pos in range(max_len):
            for i in range(0, embed_dim, 2):
                angle = pos / (10000 ** ((2 * i) / embed_dim))
                self.encoding[pos, i] = cp.sin(angle)
                if i + 1 < embed_dim:
                    self.encoding[pos, i + 1] = cp.cos(angle)

    def forward(self, x):
        if x.ndim == 2:
            x = x[cp.newaxis, :, :]
        B, T, D = x.shape

        if T > self.encoding.shape[0]:
            raise ValueError(f"Sequence length ({T}) exceeds max_len used in PositionalEncoding.")
        pos_encoding = self.encoding[:T]  
        return x + pos_encoding[cp.newaxis, :, :]  

    def backward(self, dout):
        return dout

In [None]:
def softmax(x, axis=-1):
    x = x - cp.max(x, axis=axis, keepdims=True) 
    exp_x = cp.exp(x)
    return exp_x / cp.sum(exp_x, axis=axis, keepdims=True)

class ScaledDotProductAttention:
    def __init__(self, embed_dim):
        self.embed_dim = embed_dim
        self.scale = cp.sqrt(embed_dim)

    def forward(self, Q, K, V, mask=None):
        self.Q = Q 
        self.K = K
        self.V = V

        self.scores = cp.matmul(Q, K.transpose(0, 2, 1)) / self.scale  
        if mask is not None:
            assert mask.ndim == 2
            mask = mask[cp.newaxis, :, :]  
            self.scores = cp.where(mask == 0, -1e9, self.scores)

        self.weights = softmax(self.scores, axis=-1) 
        self.output = cp.matmul(self.weights, V) 
        return self.output, self.weights

    def backward(self, doutput):
      """
      doutput: (B, T_q, D)
      Returns: dQ, dK, dV
      """
      B, T_q, D = doutput.shape
      _, T_k, _ = self.V.shape
      dW = cp.matmul(doutput, self.V.transpose(0, 2, 1)) 
      dV = cp.matmul(self.weights.transpose(0, 2, 1), doutput) 
      d_scores = self._softmax_backward(self.weights, dW)
      d_scores /= self.scale
      dQ = cp.matmul(d_scores, self.K)                 
      dK = cp.matmul(d_scores.transpose(0, 2, 1), self.Q)  
      return dQ, dK, dV

    def _softmax_backward(self, softmax_output, grad_output):
      """
      Efficient Jacobian-vector product for softmax.
      softmax_output: (B, T_q, T_k)
      grad_output:    (B, T_q, T_k)
      Returns:        (B, T_q, T_k)
      """
      dot = cp.sum(grad_output * softmax_output, axis=-1, keepdims=True) 
      return softmax_output * (grad_output - dot)

In [None]:
class MultiHeadAttention:
    def __init__(self, embed_dim, num_heads):
        assert embed_dim % num_heads == 0
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.head_dim = embed_dim // num_heads
        self.W_q = cp.random.randn(embed_dim, embed_dim) * 0.01
        self.W_k = cp.random.randn(embed_dim, embed_dim) * 0.01
        self.W_v = cp.random.randn(embed_dim, embed_dim) * 0.01
        self.W_o = cp.random.randn(embed_dim, embed_dim) * 0.01
        self.dW_q = cp.zeros_like(self.W_q)
        self.dW_k = cp.zeros_like(self.W_k)
        self.dW_v = cp.zeros_like(self.W_v)
        self.dW_o = cp.zeros_like(self.W_o)
        self.attn = ScaledDotProductAttention(self.head_dim)

    def split_heads(self, x):
        batch_size, seq_len, emb_dim = x.shape
        assert emb_dim == self.num_heads * self.head_dim, "emb_dim must be divisible by num_heads"
        return x.reshape(batch_size, seq_len, self.num_heads, self.head_dim).transpose(0, 2, 1, 3)

    def combine_heads(self, x):
        return x.reshape(x.shape[0], self.embed_dim)

    def forward(self, Q_input, K_input, V_input, mask=None):
        self.Q_input = Q_input  
        self.K_input = K_input
        self.V_input = V_input

        Q = cp.dot(Q_input, self.W_q)
        K = cp.dot(K_input, self.W_k)
        V = cp.dot(V_input, self.W_v)

        Q = self.split_heads(Q)
        K = self.split_heads(K)
        V = self.split_heads(V)

        self.Q, self.K, self.V = Q, K, V 

        heads_output = []
        self.attn_outputs = []
        for i in range(self.num_heads):
            out, attn = self.attn.forward(Q[:, i], K[:, i], V[:, i], mask)
            heads_output.append(out)
            self.attn_outputs.append(attn)

        concat = cp.concatenate(heads_output, axis=-1)
        self.concat = concat  

        output = cp.dot(concat, self.W_o)
        return output, self.attn_outputs
    def backward(self, dout):
      B, T, D = dout.shape
      H, d = self.num_heads, self.head_dim
      dconcat = cp.dot(dout, self.W_o.T)
      self.dW_o += cp.dot(self.concat.reshape(-1, D).T, dout.reshape(-1, D))
      dheads = dconcat.reshape(B, T, H, d).transpose(0, 2, 1, 3)
      dQ = cp.zeros_like(self.Q)
      dK = cp.zeros_like(self.K)
      dV = cp.zeros_like(self.V)
      for h in range(H):
          dqh, dkh, dvh = self.attn.backward(dheads[:, h]) 
          dQ[:, h], dK[:, h], dV[:, h] = dqh, dkh, dvh
      dQ_flat = dQ.transpose(0, 2, 1, 3).reshape(B, T, D)
      dK_flat = dK.transpose(0, 2, 1, 3).reshape(B, T, D)
      dV_flat = dV.transpose(0, 2, 1, 3).reshape(B, T, D)
      Q_in = self.Q_input.reshape(-1, D)
      K_in = self.K_input.reshape(-1, D)
      V_in = self.V_input.reshape(-1, D)

      self.dW_q += cp.dot(Q_in.T, dQ_flat.reshape(-1, D))
      self.dW_k += cp.dot(K_in.T, dK_flat.reshape(-1, D))
      self.dW_v += cp.dot(V_in.T, dV_flat.reshape(-1, D))
      dQ_input = cp.dot(dQ_flat, self.W_q.T)
      dK_input = cp.dot(dK_flat, self.W_k.T)
      dV_input = cp.dot(dV_flat, self.W_v.T)

      return dQ_input + dK_input + dV_input

    def parameters(self):
        return [
            ("W_q", self.W_q),
            ("W_k", self.W_k),
            ("W_v", self.W_v),
            ("W_o", self.W_o),
        ]

    def grads(self):
        return {
            'W_q': self.dW_q,
            'W_k': self.dW_k,
            'W_v': self.dW_v,
            'W_o': self.dW_o
        }

In [None]:
class FeedForward:
    def __init__(self, embed_dim, hidden_dim):
        self.W1 = cp.random.randn(embed_dim, hidden_dim) * 0.01
        self.b1 = cp.zeros((hidden_dim,))
        self.W2 = cp.random.randn(hidden_dim, embed_dim) * 0.01
        self.b2 = cp.zeros((embed_dim,))
        self.dW1 = cp.zeros_like(self.W1)
        self.db1 = cp.zeros_like(self.b1)
        self.dW2 = cp.zeros_like(self.W2)
        self.db2 = cp.zeros_like(self.b2)

    def relu(self, x):
        return cp.maximum(0, x)

    def relu_grad(self, x):
        return (x > 0).astype(float)

    def forward(self, x):
        self.x = x
        self.h1 = cp.matmul(x, self.W1) + self.b1 
        self.a1 = self.relu(self.h1)
        self.h2 = cp.matmul(self.a1, self.W2) + self.b2 
        return self.h2

    def backward(self, dout):
      B, T, D = dout.shape
      dx2 = dout.reshape(-1, D)                         
      a1_flat = self.a1.reshape(-1, self.a1.shape[-1])  
      self.dW2 += cp.dot(a1_flat.T, dx2)                
      self.db2 += cp.sum(dx2, axis=0)
      da1 = cp.dot(dx2, self.W2.T).reshape(self.a1.shape)
      dh1 = da1 * self.relu_grad(self.h1)
      x_flat = self.x.reshape(-1, self.x.shape[-1])      
      dh1_flat = dh1.reshape(-1, dh1.shape[-1])        
      self.dW1 += cp.dot(x_flat.T, dh1_flat)          
      self.db1 += cp.sum(dh1, axis=(0, 1))
      dx = cp.dot(dh1, self.W1.T)                       
      return dx
    
    def parameters(self):
        return [
            ("W1", self.W1),
            ("b1", self.b1),
            ("W2", self.W2),
            ("b2", self.b2)
        ]

    def grads(self):
        return {
            "W1": self.dW1,
            "b1": self.db1,
            "W2": self.dW2,
            "b2": self.db2
        }

    def param_names(self):
        return ['ffn_W1', 'ffn_b1', 'ffn_W2', 'ffn_b2']

In [None]:
class LayerNorm:
    def __init__(self, embed_dim, eps=1e-5):
        self.gamma = cp.ones((embed_dim,))
        self.beta = cp.zeros((embed_dim,))
        self.eps = eps
        self.grad_gamma = cp.zeros_like(self.gamma)
        self.grad_beta = cp.zeros_like(self.beta)

    def forward(self, x):
        self.x = x  
        self.mean = cp.mean(x, axis=-1, keepdims=True)
        self.var = cp.var(x, axis=-1, keepdims=True)
        self.std = cp.sqrt(self.var + self.eps)
        self.norm = (x - self.mean) / self.std
        return self.gamma * self.norm + self.beta

    def backward(self, dout):
        self.grad_gamma = cp.sum(dout * self.norm, axis=tuple(range(dout.ndim - 1)))
        self.grad_beta = cp.sum(dout, axis=tuple(range(dout.ndim - 1)))

        N = self.x.shape[-1]
        x_mu = self.x - self.mean
        dnorm = dout * self.gamma

        dvar = cp.sum(dnorm * x_mu * -0.5 * (self.std**-3), axis=-1, keepdims=True)
        dmean = cp.sum(dnorm * -1 / self.std, axis=-1, keepdims=True) + dvar * np.mean(-2. * x_mu, axis=-1, keepdims=True)

        dx = (dnorm / self.std) + (dvar * 2 * x_mu / N) + (dmean / N)
        return dx

    def parameters(self):
        return [('layernorm_gamma', self.gamma), ('layernorm_beta', self.beta)]

    def grads(self):
        return {'layernorm_gamma': self.grad_gamma, 'layernorm_beta': self.grad_beta}

In [None]:
class DecoderBlock:
    def __init__(self, embed_dim, num_heads, hidden_dim, dropout_rate=0.1):
        self.ln1 = LayerNorm(embed_dim)
        self.attn = MultiHeadAttention(embed_dim, num_heads)
        self.ln2 = LayerNorm(embed_dim)
        self.ffn = FeedForward(embed_dim, hidden_dim)
        self.dropout_rate = dropout_rate

    def forward(self, x, mask, training=True):
        self.x_input = x  
        self.x_ln1 = self.ln1.forward(x)
        self.attn_out, _ = self.attn.forward(self.x_ln1, self.x_ln1, self.x_ln1, mask)
        self.attn_out = dropout(self.attn_out, rate=self.dropout_rate, training=training)
        self.x_after_attn = x + self.attn_out 
        self.x_ln2 = self.ln2.forward(self.x_after_attn)
        self.ffn_out = self.ffn.forward(self.x_ln2)
        self.ffn_out = dropout(self.ffn_out, rate=self.dropout_rate, training=training)
        self.x_out = self.x_after_attn + self.ffn_out 
        return self.x_out
    
    def backward(self, dout):
        dffn_out = dout
        dx_after_attn = dout 
        dffn_out = self.ffn.backward(dffn_out)
        dln2_out = self.ln2.backward(dffn_out)
        dattn_out = dx_after_attn + dln2_out 
        dattn_out = self.attn.backward(dattn_out)
        dln1_out = self.ln1.backward(dattn_out)

        return self.x_input + dln1_out

    def parameters(self):
        params = []
        for k, v in self.ln1.parameters():
            params.append((f"ln1.{k}", v))
        for k, v in self.attn.parameters():
            params.append((f"attn.{k}", v))
        for k, v in self.ln2.parameters():
            params.append((f"ln2.{k}", v))
        for k, v in self.ffn.parameters():
            params.append((f"ffn.{k}", v))
        return params
    def grads(self):
        grads = {}
        for name, grad in self.ln1.grads().items():
            grads[f"ln1.{name}"] = grad
        for name, grad in self.attn.grads().items():
            grads[f"attn.{name}"] = grad
        for name, grad in self.ln2.grads().items():
            grads[f"ln2.{name}"] = grad
        for name, grad in self.ffn.grads().items():
            grads[f"ffn.{name}"] = grad
        return grads

In [None]:
class DecoderOnlyTransformer:
    def __init__(self, vocab_size, max_len, embed_dim, num_heads, hidden_dim, num_layers):
        self.embed = Embedding(vocab_size, embed_dim)
        self.pos_enc = PositionalEncoding(max_len, embed_dim)
        self.blocks = [DecoderBlock(embed_dim, num_heads, hidden_dim, dropout_rate=0.1) for _ in range(num_layers)]
        self.ln = LayerNorm(embed_dim)
        self.output_layer = cp.random.randn(embed_dim, vocab_size) * 0.01
        self.output_grad = cp.zeros_like(self.output_layer)

    def forward(self, x_indices, training=True):
        x = self.embed.forward(x_indices)
        x = self.pos_enc.forward(x)
        seq_len = x.shape[1]
        mask = cp.tril(np.ones((seq_len, seq_len)))

        for block in self.blocks:
            x = block.forward(x, mask, training=training)

        x = self.ln.forward(x)
        self.last_hidden = x 
        logits = cp.dot(x, self.output_layer)
        return logits, x

    def backward_output_layer(self, dlogits):
        x = self.last_hidden
        dW_out = cp.dot(x.reshape(-1, x.shape[-1]).T, dlogits.reshape(-1, dlogits.shape[-1]))
        dx = cp.dot(dlogits, self.output_layer.T)

        self.output_grad[...] = dW_out
        return dx

    def backward(self, dlogits):
        dx = self.backward_output_layer(dlogits)
        dx = self.ln.backward(dx)
        for block in reversed(self.blocks):
            dx = block.backward(dx)
        dx = self.pos_enc.backward(dx)
        self.embed.backward(dx)

    def parameters(self):
        params = []

        params.append(("output_layer", self.output_layer))
        for i, block in enumerate(self.blocks):
            for name, value in block.parameters():
                params.append((f"blocks.{i}.{name}", value))
        for name, value in self.embed.parameters():
            params.append((f"embed.{name}", value))
        return params

    def grads(self):
        grads = {'output_layer': self.output_grad}

        for i, block in enumerate(self.blocks):
            block_grads = block.grads()
            for name, grad in block_grads.items():
                grads[f"blocks.{i}.{name}"] = grad

        embed_grads = self.embed.grads()
        for name, grad in embed_grads.items():
            grads[f"embed.{name}"] = grad

        return grads

In [None]:
def softmax(x, axis=-1):
    x = x - cp.max(x, axis=axis, keepdims=True)
    exp_x = cp.exp(x)
    return exp_x / cp.sum(exp_x, axis=axis, keepdims=True)

def cross_entropy_loss_with_grad(logits, targets):
    """
    logits: shape (B, T, V) — raw scores from final layer
    targets: shape (B, T) — ground truth indices
    """
    B, T, V = logits.shape
    logits = logits.reshape(-1, V)  
    targets = targets.reshape(-1)   
    probs = softmax(logits) 
    loss = -cp.log(probs[np.arange(len(targets)), targets] + 1e-9)
    avg_loss = cp.mean(loss)

    dlogits = probs
    dlogits[cp.arange(len(targets)), targets] -= 1
    dlogits /= len(targets)

    return avg_loss, dlogits.reshape(B, T, V)

In [16]:
with open("Story.txt", "r", encoding="utf-8") as f:
    raw_text = f.read().strip()

In [44]:
tokenizer = Tokenizer(raw_text)

if "<sep>" not in tokenizer.word2idx:
    idx = len(tokenizer.word2idx)
    tokenizer.word2idx["<sep>"] = idx
    tokenizer.idx2word[idx] = "<sep>"
    tokenizer.vocab_size += 1

encoded = tokenizer.encode(raw_text)

In [None]:
def create_dataset(encoded, seq_len):
    inputs, targets = [], []
    for i in range(len(encoded) - seq_len):
        inputs.append(encoded[i:i+seq_len])
        targets.append(encoded[i+1:i+seq_len+1])
    return cp.array(inputs), cp.array(targets)

seq_len = 128
X, Y = create_dataset(encoded, seq_len)

In [19]:
embed_dim = 32
num_heads = 2
hidden_dim = 64
num_layers = 4
max_len = seq_len
model = DecoderOnlyTransformer(
    vocab_size=tokenizer.vocab_size,
    max_len=max_len,
    embed_dim=embed_dim,
    num_heads=num_heads,
    hidden_dim=hidden_dim,
    num_layers=num_layers
)

In [None]:
optimizer = AdamW(model.parameters(), lr=0.01)
x_example = X[0]
y_true = Y[0]
logits, _ = model.forward(x_example, training=True)
loss, dlogits = cross_entropy_loss_with_grad(logits, y_true)
dx = model.backward_output_layer(dlogits)
model.embed.backward(dx)
grads = model.grads()
optimizer.step(grads)
print("Logits shape:", logits.shape)
print("Loss:", loss)

Logits shape: (1, 128, 669)
Loss: 6.501418881921845


In [None]:
def sample(model, tokenizer, start_text, length, k=5):
    model_input = tokenizer.encode(start_text)
    for _ in range(length):
        x = cp.array(model_input[-seq_len:])
        logits, _ = model.forward(x, training=False)
        last_logits = logits[-1][:tokenizer.vocab_size].flatten()
        top_k_indices = cp.argsort(last_logits)[-k:]               
        top_k_logits = last_logits[top_k_indices]                 
        top_k_probs = cp.exp(top_k_logits - cp.max(top_k_logits)) 
        top_k_probs /= cp.sum(top_k_probs)                    
        top_k_indices = cp.asarray(top_k_indices).flatten()
        top_k_probs = cp.asarray(top_k_probs).flatten()

        assert top_k_indices.shape == top_k_probs.shape, f"Shape mismatch: {top_k_indices.shape} vs {top_k_probs.shape}"

        # Sample next token
        next_token = int(cp.random.choice(top_k_indices, size=1, p=top_k_probs)[0])
        model_input.append(next_token)

    return tokenizer.decode(model_input)


In [22]:
import cupy
x = cupy.zeros((10000, 10000))
print("CuPy is using:", cupy.cuda.runtime.getDeviceProperties(0)['name'])

CuPy is using: b'NVIDIA GeForce RTX 3050 6GB Laptop GPU'


In [None]:
import time
def get_batches(X, Y, batch_size):
    for i in range(0, len(X), batch_size):
        yield cp.array(X[i:i+batch_size]), cp.array(Y[i:i+batch_size])
num_epochs = 100
batch_size = 32
learning_rate = 0.01
optimizer = AdamW(model.parameters(), lr=learning_rate)
losses = []
for epoch in range(num_epochs):
    epoch_start = time.time()
    total_loss = 0
    batch_count = 0

    for x_batch, y_batch in get_batches(X, Y, batch_size):
        logits, cache = model.forward(x_batch) 
        loss, dlogits = cross_entropy_loss_with_grad(logits, y_batch) 
        model.backward(dlogits)  
        for name, param in model.parameters():
            optimizer.step(model.grads())

        total_loss += loss
        batch_count += 1

    avg_loss = total_loss / batch_count
    losses.append(avg_loss)
    epoch_end = time.time()

    if epoch % 20 == 0:
        print(f"Epoch {epoch}, Loss: {avg_loss:.4f}, Time: {epoch_end - epoch_start:.2f}s")
    if epoch != 0 and epoch % 50 == 0:
        optimizer.lr *= 0.5
        print(f"Learning rate halved to {optimizer.lr:.6f}")

Epoch 0, Loss: 18.9796, Time: 104.77s
Epoch 20, Loss: 14.9132, Time: 59.52s
Epoch 40, Loss: 14.2524, Time: 60.01s
Learning rate halved to 0.005000
Epoch 60, Loss: 8.3190, Time: 186.94s
Epoch 80, Loss: 8.1374, Time: 75.43s


In [65]:
prompt = "is"
input_seq = prompt + " <sep>"
output = sample(model, tokenizer, start_text=input_seq, length=15)
print("Generated:", output)

Generated: is <unk> the <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk> <unk>


In [47]:
print("Tokenizer vocab size:", tokenizer.vocab_size)
print("Unique tokens in training data:", len(set(tokenizer.encode(raw_text))))

Tokenizer vocab size: 669
Unique tokens in training data: 667


In [60]:
print("Tokenized:", tokenizer.encode("universe"))

Tokenized: [613]
