# Word Tokenizer

In [None]:
### Word tokenizer - vocab size = 50304

from collections import OrderedDict

# Model configuration for the word-tokenizer run.
vocab_size = 50304  # rows in the token embedding table
block_size = 64     # rows in the position embedding table
n_embd = 128        # embedding width; every per-layer count scales with it
n_head = 4          # attention heads (params() never reads this value)
n_layer = 4         # number of transformer blocks

# The counts below leave out all bias terms, so guard that assumption here.
bias = False
assert not bias, "this notebook assumes bias=False just for simplicity"

def params(*, vocab_size=None, block_size=None, n_layer=None, n_embd=None):
    """Estimate the number of parameters in the model.

    Every argument is optional and keyword-only. An argument left as ``None``
    falls back to the module-level variable of the same name, looked up at
    call time, so the zero-argument call ``params()`` behaves as it always
    did. Passing values explicitly lets this one definition be reused for
    several configurations instead of being copied per configuration.

    Args:
        vocab_size: number of rows in the token embedding table.
        block_size: number of rows in the position embedding table.
        n_layer: number of transformer blocks.
        n_embd: embedding width.

    Returns:
        An ``OrderedDict`` mapping each component name to its parameter
        count, in the order the components are computed; the last entry is
        ``'total'``.

    Raises:
        NameError: if an argument is omitted and no module-level variable of
            that name exists.
    """
    module_vars = globals()

    def _setting(name, explicit):
        # An explicit argument wins; otherwise read the module-level value.
        if explicit is not None:
            return explicit
        try:
            return module_vars[name]
        except KeyError:
            # Keep the exception type a bare global lookup would raise.
            raise NameError(f"name '{name}' is not defined") from None

    vocab_size = _setting('vocab_size', vocab_size)
    block_size = _setting('block_size', block_size)
    n_layer = _setting('n_layer', n_layer)
    n_embd = _setting('n_embd', n_embd)

    out = OrderedDict()

    # token and position embeddings
    # NOTE: the 'emebedding/position' key is misspelled. It is kept as-is
    # because the key is part of the returned mapping and is printed by
    # whoever iterates over the result.
    out['emebedding/position'] = n_embd * block_size
    out['embedding/token'] = n_embd * vocab_size
    out['embedding'] = out['emebedding/position'] + out['embedding/token']

    # attention blocks
    out['attention/ln'] = n_embd  # note, bias=False in our LN
    out['attention/kqv'] = n_embd * 3*n_embd  # one matrix producing k, q and v
    out['attention/proj'] = n_embd**2
    out['attention'] = out['attention/ln'] + out['attention/kqv'] + out['attention/proj']

    # MLP blocks
    ffw_size = 4*n_embd  # feed forward size
    out['mlp/ln'] = n_embd
    out['mlp/ffw'] = n_embd * ffw_size
    out['mlp/proj'] = ffw_size * n_embd
    out['mlp'] = out['mlp/ln'] + out['mlp/ffw'] + out['mlp/proj']

    # the transformer and the rest of it
    out['block'] = out['attention'] + out['mlp']
    out['transformer'] = n_layer * out['block']
    out['ln_f'] = n_embd  # final layernorm
    # 0 because of parameter sharing. This layer uses the weights from the
    # embedding layer, so it adds nothing to the total.
    out['dense'] = 0

    # total
    out['total'] = out['embedding'] + out['transformer'] + out['ln_f'] + out['dense']

    return out



In [None]:
# Check the analytic estimate against the parameter count reported by PyTorch.
p = params()
params_total = p['total']
# NOTE(review): 1152 equals 9 * 128, the combined size of the nine LayerNorm
# entries counted by params() (two per block over four blocks, plus ln_f).
# Presumably the reported 7233536 left those out -- TODO confirm.
expected_total = 7233536 + 1152
print(f"we see: {params_total}, expected: {expected_total}, match: {params_total == expected_total}")

# Table header, then one row per component with its share of the total.
print(f"{'name':20s} {'params':10s} {'ratio (%)':10s}")
for name, count in p.items():
    share = count / params_total * 100
    print(f"{name:20s} {count:10d} {share:10.4f}")

we see: 7234688, expected: 7234688, match: True
name                 params     ratio (%) 
emebedding/position        8192     0.1132
embedding/token         6438912    89.0005
embedding               6447104    89.1138
attention/ln                128     0.0018
attention/kqv             49152     0.6794
attention/proj            16384     0.2265
attention                 65664     0.9076
mlp/ln                      128     0.0018
mlp/ffw                   65536     0.9059
mlp/proj                  65536     0.9059
mlp                      131200     1.8135
block                    196864     2.7211
transformer              787456    10.8845
ln_f                        128     0.0018
dense                         0     0.0000
total                   7234688   100.0000


# Character Tokenizer

In [1]:
### Character tokenizer - vocab size = 77

from collections import OrderedDict

# Model configuration for the character-tokenizer run.
vocab_size = 77     # rows in the token embedding table
block_size = 64     # rows in the position embedding table
n_embd = 128        # embedding width; every per-layer count scales with it
n_head = 4          # attention heads (params() never reads this value)
n_layer = 4         # number of transformer blocks

# The counts below leave out all bias terms, so guard that assumption here.
bias = False
assert not bias, "this notebook assumes bias=False just for simplicity"

def params(*, vocab_size=None, block_size=None, n_layer=None, n_embd=None):
    """Estimate the number of parameters in the model.

    Every argument is optional and keyword-only. An argument left as ``None``
    falls back to the module-level variable of the same name, looked up at
    call time, so the zero-argument call ``params()`` behaves as it always
    did. Passing values explicitly lets this one definition be reused for
    several configurations instead of being copied per configuration.

    Args:
        vocab_size: number of rows in the token embedding table.
        block_size: number of rows in the position embedding table.
        n_layer: number of transformer blocks.
        n_embd: embedding width.

    Returns:
        An ``OrderedDict`` mapping each component name to its parameter
        count, in the order the components are computed; the last entry is
        ``'total'``.

    Raises:
        NameError: if an argument is omitted and no module-level variable of
            that name exists.
    """
    module_vars = globals()

    def _setting(name, explicit):
        # An explicit argument wins; otherwise read the module-level value.
        if explicit is not None:
            return explicit
        try:
            return module_vars[name]
        except KeyError:
            # Keep the exception type a bare global lookup would raise.
            raise NameError(f"name '{name}' is not defined") from None

    vocab_size = _setting('vocab_size', vocab_size)
    block_size = _setting('block_size', block_size)
    n_layer = _setting('n_layer', n_layer)
    n_embd = _setting('n_embd', n_embd)

    out = OrderedDict()

    # token and position embeddings
    # NOTE: the 'emebedding/position' key is misspelled. It is kept as-is
    # because the key is part of the returned mapping and is printed by
    # whoever iterates over the result.
    out['emebedding/position'] = n_embd * block_size
    out['embedding/token'] = n_embd * vocab_size
    out['embedding'] = out['emebedding/position'] + out['embedding/token']

    # attention blocks
    out['attention/ln'] = n_embd  # note, bias=False in our LN
    out['attention/kqv'] = n_embd * 3*n_embd  # one matrix producing k, q and v
    out['attention/proj'] = n_embd**2
    out['attention'] = out['attention/ln'] + out['attention/kqv'] + out['attention/proj']

    # MLP blocks
    ffw_size = 4*n_embd  # feed forward size
    out['mlp/ln'] = n_embd
    out['mlp/ffw'] = n_embd * ffw_size
    out['mlp/proj'] = ffw_size * n_embd
    out['mlp'] = out['mlp/ln'] + out['mlp/ffw'] + out['mlp/proj']

    # the transformer and the rest of it
    out['block'] = out['attention'] + out['mlp']
    out['transformer'] = n_layer * out['block']
    out['ln_f'] = n_embd  # final layernorm
    # 0 because of parameter sharing. This layer uses the weights from the
    # embedding layer, so it adds nothing to the total.
    out['dense'] = 0

    # total
    out['total'] = out['embedding'] + out['transformer'] + out['ln_f'] + out['dense']

    return out



In [2]:
# Check the analytic estimate against the parameter count reported by PyTorch.
p = params()
params_total = p['total']
# NOTE(review): 1152 equals 9 * 128, the combined size of the nine LayerNorm
# entries counted by params() (two per block over four blocks, plus ln_f).
# Presumably the reported 804480 left those out -- TODO confirm.
expected_total = 804480 + 1152
print(f"we see: {params_total}, expected: {expected_total}, match: {params_total == expected_total}")

# Table header, then one row per component with its share of the total.
print(f"{'name':20s} {'params':10s} {'ratio (%)':10s}")
for name, count in p.items():
    share = count / params_total * 100
    print(f"{name:20s} {count:10d} {share:10.4f}")

we see: 805632, expected: 805632, match: True
name                 params     ratio (%) 
emebedding/position        8192     1.0168
embedding/token            9856     1.2234
embedding                 18048     2.2402
attention/ln                128     0.0159
attention/kqv             49152     6.1010
attention/proj            16384     2.0337
attention                 65664     8.1506
mlp/ln                      128     0.0159
mlp/ffw                   65536     8.1347
mlp/proj                  65536     8.1347
mlp                      131200    16.2854
block                    196864    24.4360
transformer              787456    97.7439
ln_f                        128     0.0159
dense                         0     0.0000
total                    805632   100.0000
