#### 1. data

In [1]:
with open("input.txt","r",encoding='utf-8') as f:
    text=f.read()

In [2]:
print("The length of dataset in characters",len(text))
print(text[0:50])

The length of dataset in characters 1115394
First Citizen:
Before we proceed any further, hear


#### 2. vocabulary

In [8]:
# here are all the unique characters that occur in this text
chars = sorted(list(set(text))) # set-ÂîØ‰∏ÄÁöÑcharacter,sorted-list-ÊéíÂ∫è
vocab_size=len(chars)           # vocabÁöÑÈïøÂ∫¶
print("".join(chars))
print(vocab_size)



 !$&',-.3:;?ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz
65


#### 3. tokenizer

In [9]:
# create a mapping from characters to integers
stoi = {ch:i for i,ch in enumerate(chars)}
itos = {i:ch for i,ch in enumerate(chars)}
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: "".join([itos[i] for i in l])

print(encode("hi meng"))
print(decode(encode("hi meng")))

[46, 47, 1, 51, 43, 52, 45]
hi meng


In [1]:
import tiktoken
enc = tiktoken.get_encoding('gpt2') 
enc.max_token_value

50256

In [2]:
# let's now encode the entire text dataset and store it into a torch tensor
import torch
data = torch.tensor(encode(text),dtype=torch.long)
print(data.shape,data.dtype)
print(data[0:50])

ModuleNotFoundError: No module named 'torch'

In [6]:
# split the train set and valid set
n = int(0.9*len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [7]:
# feed the transformer with chunks rather than the entire text 
block_size = 8
train_data[:block_size+1] 

tensor([18, 47, 56, 57, 58,  1, 15, 47, 58])

In [8]:
# 8 examples hidden in a chunk of 9 characters (tat we sampled from the training set)
x = train_data[:block_size]
y = train_data[1:block_size+1]
for t in range(block_size):
    context = x[:t+1]
    target = y[t]
    print(f"when input is {context} the target {target}")

when input is tensor([18]) the target 47
when input is tensor([18, 47]) the target 56
when input is tensor([18, 47, 56]) the target 57
when input is tensor([18, 47, 56, 57]) the target 58
when input is tensor([18, 47, 56, 57, 58]) the target 1
when input is tensor([18, 47, 56, 57, 58,  1]) the target 15
when input is tensor([18, 47, 56, 57, 58,  1, 15]) the target 47
when input is tensor([18, 47, 56, 57, 58,  1, 15, 47]) the target 58


In [23]:
torch.manual_seed(1337)
batch_size = 4 # how many independent sequences will we process in parallel?
block_size = 8 # what is the maximum context length for predictions?

def get_batch(split):
    """
    split: 'train' or 'val'
    """
    # generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data)-block_size,(batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x,y

xb,yb=get_batch('train')
print("inputs:\n",xb.shape,"\n",xb,"\ntargets:\n",yb.shape,"\n",yb)

for b in range(batch_size):
    for t in range(block_size):
        context = xb[b][:t+1]
        target = yb[b][t]
        print(f"when input is {context} the target {target}")


inputs:
 torch.Size([4, 8]) 
 tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]]) 
targets:
 torch.Size([4, 8]) 
 tensor([[43, 58,  5, 57,  1, 46, 43, 39],
        [53, 56,  1, 58, 46, 39, 58,  1],
        [58,  1, 58, 46, 39, 58,  1, 46],
        [17, 27, 10,  0, 21,  1, 54, 39]])
when input is tensor([24]) the target 43
when input is tensor([24, 43]) the target 58
when input is tensor([24, 43, 58]) the target 5
when input is tensor([24, 43, 58,  5]) the target 57
when input is tensor([24, 43, 58,  5, 57]) the target 1
when input is tensor([24, 43, 58,  5, 57,  1]) the target 46
when input is tensor([24, 43, 58,  5, 57,  1, 46]) the target 43
when input is tensor([24, 43, 58,  5, 57,  1, 46, 43]) the target 39
when input is tensor([44]) the target 53
when input is tensor([44, 53]) the target 56
when input is tensor([44, 53, 56]) the target 1
when input is tensor([44, 53

In [24]:
print("input chunk:\n",xb)

input chunk:
 tensor([[24, 43, 58,  5, 57,  1, 46, 43],
        [44, 53, 56,  1, 58, 46, 39, 58],
        [52, 58,  1, 58, 46, 39, 58,  1],
        [25, 17, 27, 10,  0, 21,  1, 54]])


<img src="./images/B*T*C.png" alt="embedding" width="300">

In [26]:
# baseline: bigram model
import torch
import torch.nn as nn
from torch.nn import functional as F
torch.manual_seed(1337)

class BigramLanguageModel(nn.Module):
    
    def __init__(self,vocab_size):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, vocab_size) # usually (vocab_size,embedding_dim)
        
    def forward(self,idx,targets=None):
        
        # idx and targets are both (B,T) tensor of integers
        logits = self.token_embedding_table(idx) # (B,T,C)
        
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T,C)
            targets = targets.view(B*T)
            # print("logits:",logits)
            # print("target",targets)
            loss = F.cross_entropy(logits, targets) # cross_entropy needs C as the second dimension ( ,C, )
            # print("loss:",loss)
        return logits,loss
    
    def generate(self,idx,max_new_tokens):
        # idx is (B,T) array of indices in the current context
        for _ in range(max_new_tokens):
            # get the predictions
            logits, _ = self(idx)
            # focus only on the last time step
            logits = logits[:,-1,:] # becomes (B,C)
            # apply softmax to get probabilities
            probs = F.softmax(logits,dim=-1) # at C dimension softmax
            # sample from the distribution
            idx_next = torch.multinomial(probs,num_samples=1) #(B,1) ‰ºöÊ†πÊçÆÊØè‰∏ÄË°åÁöÑÊ¶ÇÁéáÂàÜÂ∏ÉÔºå‰ªéÊØè‰∏™Ê†∑Êú¨ÁöÑ‚ö†Ô∏èÁ±ªÂà´‚ö†Ô∏è‰∏≠ÈöèÊú∫ÈÄâÊã©‰∏Ä‰∏™
            print("probs:",probs)
            print("idx:",idx)
            print("idx_next:",idx_next)
            # append sampled index to the running sequence
            idx = torch.cat((idx,idx_next),dim=1) #(B,T+1)# ü§îËøôÈáå‰∏∫‰ªÄ‰πàËÉΩÂ§üÁõ¥Êé•catÔºåü•∞Âõ†‰∏∫ multinomial‰ºöÊ†πÊçÆprobs‰ªéÁ±ªÂà´‰∏≠ÈöèÊú∫ÈÄâÊã©
        return idx
             
m =  BigramLanguageModel(vocab_size)
logits, loss = m.forward(xb,yb)
print(logits.shape)
print(loss)

idx = torch.zeros((1,1),dtype=torch.long)
print(decode((m.generate(idx,5)[0]).tolist()))
        

torch.Size([32, 65])
tensor(4.8786, grad_fn=<NllLossBackward0>)
probs: tensor([[0.0091, 0.0071, 0.0053, 0.0030, 0.0141, 0.0078, 0.0197, 0.0081, 0.0109,
         0.0243, 0.0020, 0.0045, 0.0096, 0.0060, 0.0030, 0.0354, 0.0292, 0.0066,
         0.0101, 0.0199, 0.0010, 0.0124, 0.0335, 0.0137, 0.0086, 0.0016, 0.0024,
         0.0054, 0.0118, 0.0034, 0.0347, 0.0930, 0.0039, 0.0059, 0.0208, 0.0085,
         0.0089, 0.0235, 0.0024, 0.0056, 0.0046, 0.0030, 0.0131, 0.0017, 0.0023,
         0.0134, 0.0042, 0.0038, 0.0392, 0.0034, 0.0292, 0.0057, 0.0017, 0.0621,
         0.1199, 0.0013, 0.0323, 0.0017, 0.0172, 0.0061, 0.0165, 0.0351, 0.0379,
         0.0051, 0.0033]], grad_fn=<SoftmaxBackward0>)
idx: tensor([[0]])
idx_next: tensor([[31]])
probs: tensor([[0.0027, 0.0042, 0.0052, 0.0105, 0.0153, 0.0028, 0.0028, 0.0026, 0.0292,
         0.0140, 0.0026, 0.0021, 0.0034, 0.0204, 0.0384, 0.0072, 0.0048, 0.0024,
         0.0140, 0.0285, 0.0037, 0.0022, 0.0152, 0.0370, 0.0031, 0.0070, 0.0016,
         0.00

- we maybe expect the loss/cross_entropy is: -ln(1/65) = 4.17
    - the size of our vocab is 65

In [12]:
# ü§îAdamWÁÆóÊ≥ïÊú¨Ë∫´Â¶Ç‰ΩïÊõ¥Êñ∞gradientsÔºåËøòÊúâÂì™‰∫õÁªèÂÖ∏Êõ¥Êñ∞Ê¢ØÂ∫¶ÁöÑÁÆóÊ≥ï
optimizer = torch.optim.AdamW(m.parameters(),lr=1e-3) #parameters is the learnable parameters of our model, in this case, the logits(vocab_size*vocab_size)

In [13]:
batch_size = 32
for steps in range(10000):
    
    # sample a batch of data
    xa, xb = get_batch('train')
    
    # evaluate the loss
    logits, loss = m(xa,xb)
    # initial optimizer
    optimizer.zero_grad(set_to_none=True)
    # make backward for all parameters
    loss.backward()
    # gradient descent
    optimizer.step()
    
print("loss:",loss)

loss: tensor(2.4145, grad_fn=<NllLossBackward0>)


In [14]:
idx = torch.zeros((1,1),dtype=torch.long)
print(m.generate(idx,100))
print(m.generate(idx,100)[0])
print(decode((m.generate(idx,100)[0]).tolist()))

tensor([[0]])
tensor([[0]])
tensor([[24]])
tensor([[0, 0]])
tensor([[21]])
tensor([[ 0,  0, 24]])
tensor([[1]])
tensor([[ 0,  0, 24, 21]])
tensor([[46]])
tensor([[ 0,  0, 24, 21,  1]])
tensor([[43]])
tensor([[ 0,  0, 24, 21,  1, 46]])
tensor([[11]])
tensor([[ 0,  0, 24, 21,  1, 46, 43]])
tensor([[0]])
tensor([[ 0,  0, 24, 21,  1, 46, 43, 11]])
tensor([[28]])
tensor([[ 0,  0, 24, 21,  1, 46, 43, 11,  0]])
tensor([[39]])
tensor([[ 0,  0, 24, 21,  1, 46, 43, 11,  0, 28]])
tensor([[52]])
tensor([[ 0,  0, 24, 21,  1, 46, 43, 11,  0, 28, 39]])
tensor([[41]])
tensor([[ 0,  0, 24, 21,  1, 46, 43, 11,  0, 28, 39, 52]])
tensor([[39]])
tensor([[ 0,  0, 24, 21,  1, 46, 43, 11,  0, 28, 39, 52, 41]])
tensor([[50]])
tensor([[ 0,  0, 24, 21,  1, 46, 43, 11,  0, 28, 39, 52, 41, 39]])
tensor([[53]])
tensor([[ 0,  0, 24, 21,  1, 46, 43, 11,  0, 28, 39, 52, 41, 39, 50]])
tensor([[50]])
tensor([[ 0,  0, 24, 21,  1, 46, 43, 11,  0, 28, 39, 52, 41, 39, 50, 53]])
tensor([[47]])
tensor([[ 0,  0, 24, 21,  1, 46

#### The mathematical trick in self-attention 

In [15]:
torch.manual_seed(1337)
B,T,C=4,8,32 # batch,time,channels
x = torch.randn(B,T,C) # randn ÁîüÊàêÁöÑÊòØÂùáÂÄº‰∏∫ 0ÔºåÊ†áÂáÜÂ∑Æ‰∏∫ 1 ÁöÑÊ≠£ÊÄÅÂàÜÂ∏ÉÈöèÊú∫Êï∞ÔºàÊ†áÂáÜÊ≠£ÊÄÅÂàÜÂ∏ÉÔºâ
print(x.shape) 


torch.Size([4, 8, 32])


In [16]:
# version 1
# current and the context of the history [...<-|  ]
# We want x[b,t] = mean_{i<=t}  x[b,i]
xbow = torch.zeros((B,T,C))   # bow means bag of words
for b in range(B):
    for t in range(T):
        xprev = x[b,:t+1] # (t,C)
        # print(xprev.size())
        xbow[b,t] = torch.mean(xprev,0)

In [17]:
# version 2
wei = torch.tril(torch.ones(T,T))
wei = wei/torch.sum(wei,1,keepdim=True) # T,T

xbow2=wei@x # (B,T,T) @ (B,T,C) --> (B,T,C) ----> @ pytorch willËá™Âä®Â¢ûÂä†(B,T,T)ÔºåÂç≥ÊØè‰∏™ Batch ÂàÜÂà´‰∏é x(B,T,C) Áõ∏‰πò 

In [18]:
torch.allclose(xbow,xbow2) #check if xbow and xbow2 is the same

True

In [None]:
# version 3(use softmax)
# weight--‚ö†Ô∏è the current and historical information are simply mixed together by averaging
tril = torch.tril(torch.ones(T,T))
wei = torch.zeros((T,T))    # current setting
wei = wei.masked_fill((tril==0),float('-inf'))
wei = F.softmax(wei,dim=-1)
xbow3 = wei @ x     # (T,T) (T,C)
torch.allclose(xbow,xbow3)   


True

- version 3(use softmax)ÁöÑÂÆûÁé∞ÔºåÂÆûÈôÖÊòØselfÂèäself‰πãÂâçÁöÑÂä†ÊùÉÔºå‰∏îÊØè‰∏™‰ΩçÁΩÆÊùÉÈáçÁõ∏Âêå
- <img src="./images/T_T@T_C.png" alt="embedding" width="500">

In [None]:
# version 4: self-attention!
# weight--‚ö†Ô∏è The information is mixed together in a data-dependent way 
# each token emits two vectors: query and key
# query: what am i looking for
# key: what do i contain
# value: the information i can communicate with you
# x: the private information of the tokens

# the dot product is calculated between the queries and the keys
# the dot product of my query with all the keys of other tokens --> weights
# Â¶ÇÊûú‰∏Ä‰∏™query‰∏éÊüê‰∏™keyÁöÑÁÇπ‰πòÁªìÊûúËæÉÈ´òÔºåÂàôËØ¥Êòé ËØ•query‰ΩçÁΩÆÁöÑtoken ‰∏é ÂØπÂ∫îÁöÑkey‰ΩçÁΩÆÁöÑtoken ÂÖ≥Ê≥®Â∫¶Áõ∏ÂØπËæÉÈ´ò

torch.manual_seed(1337)
B,T,C = 4,8,32 # batch, time, channels
x = torch.randn(B,T,C)
 
head_size = 16
key = nn.Linear(C, head_size, bias=False)
query = nn.Linear(C, head_size, bias=False)
value = nn.Linear(C, head_size, bias=False)
k = key(x)      #(B, T, head_size)
q = query(x)    #(B, T, head_size)
wei = q @ k.transpose(-2, -1) # (B, T, 16) @ (B, 16, T) --> (B, T, T) all querys will dot.product all keys


tril = torch.tril(torch.ones(T,T))
# print(tril)
# wei = torch.zeros((T,T))
# print(wei)
wei = wei.masked_fill(tril==0,float('-inf')) # ËæìÂÖ•maskÈúÄË¶Å‰∏éÂΩìÂâçTensor(mask)ÂΩ¢Áä∂‰∏ÄËá¥
# print(wei)
wei = F.softmax(wei,dim=-1) #
print("wei:",wei)

v = value(x)
# out = wei @ x
out = wei @ v 
print("out:",out)

wei: tensor([[[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1574, 0.8426, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2088, 0.1646, 0.6266, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.5792, 0.1187, 0.1889, 0.1131, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0294, 0.1052, 0.0469, 0.0276, 0.7909, 0.0000, 0.0000, 0.0000],
         [0.0176, 0.2689, 0.0215, 0.0089, 0.6812, 0.0019, 0.0000, 0.0000],
         [0.1691, 0.4066, 0.0438, 0.0416, 0.1048, 0.2012, 0.0329, 0.0000],
         [0.0210, 0.0843, 0.0555, 0.2297, 0.0573, 0.0709, 0.2423, 0.2391]],

        [[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
         [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
         [0.0660, 

In [28]:
tril

tensor([[1., 0., 0., 0., 0., 0., 0., 0.],
        [1., 1., 0., 0., 0., 0., 0., 0.],
        [1., 1., 1., 0., 0., 0., 0., 0.],
        [1., 1., 1., 1., 0., 0., 0., 0.],
        [1., 1., 1., 1., 1., 0., 0., 0.],
        [1., 1., 1., 1., 1., 1., 0., 0.],
        [1., 1., 1., 1., 1., 1., 1., 0.],
        [1., 1., 1., 1., 1., 1., 1., 1.]])

## basic knowledge

In [None]:
x[0,:2,:]

tensor([[ 0.1808, -0.0700, -0.3596, -0.9152,  0.6258,  0.0255,  0.9545,  0.0643,
          0.3612,  1.1679, -1.3499, -0.5102,  0.2360, -0.2398, -0.9211,  1.5433,
          1.3488, -0.1396,  0.2858,  0.9651, -2.0371,  0.4931,  1.4870,  0.5910,
          0.1260, -1.5627, -1.1601, -0.3348,  0.4478, -0.8016,  1.5236,  2.5086],
        [-0.6631, -0.2513,  1.0101,  0.1215,  0.1584,  1.1340, -1.1539, -0.2984,
         -0.5075, -0.9239,  0.5467, -1.4948, -1.2057,  0.5718, -0.5974, -0.6937,
          1.6455, -0.8030,  1.3514, -0.2759, -1.5108,  2.1048,  2.7630, -1.7465,
          1.4516, -1.5103,  0.8212, -0.2115,  0.7789,  1.5333,  1.6097, -0.4032]])

In [None]:
xbow[0,:2,:]

tensor([[ 0.1808, -0.0700, -0.3596, -0.9152,  0.6258,  0.0255,  0.9545,  0.0643,
          0.3612,  1.1679, -1.3499, -0.5102,  0.2360, -0.2398, -0.9211,  1.5433,
          1.3488, -0.1396,  0.2858,  0.9651, -2.0371,  0.4931,  1.4870,  0.5910,
          0.1260, -1.5627, -1.1601, -0.3348,  0.4478, -0.8016,  1.5236,  2.5086],
        [-0.2412, -0.1606,  0.3253, -0.3968,  0.3921,  0.5798, -0.0997, -0.1170,
         -0.0732,  0.1220, -0.4016, -1.0025, -0.4849,  0.1660, -0.7592,  0.4248,
          1.4972, -0.4713,  0.8186,  0.3446, -1.7740,  1.2990,  2.1250, -0.5777,
          0.7888, -1.5365, -0.1695, -0.2732,  0.6133,  0.3658,  1.5667,  1.0527]])

In [None]:
# matrix multiply @ 
# allowing us to parallel compute
torch.manual_seed(42)
# a=torch.ones(3,3) 
a=torch.tril(torch.ones(3,3))        # for parallel compute
a= a/torch.sum(a,dim=1,keepdim=True) # for mean
b=torch.randint(0,10,(3,2)).float()
c=a@b  
print('a=\n',a)
print('b=\n',b)
print('c=\n',c)

a=
 tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])
b=
 tensor([[2., 7.],
        [6., 4.],
        [6., 5.]])
c=
 tensor([[2.0000, 7.0000],
        [4.0000, 5.5000],
        [4.6667, 5.3333]])


##### 1. cross entropy in pytorch
- https://zhuanlan.zhihu.com/p/415829154
    - nn.Cross Entropy Loss‰∏∫nn.logSoftmax()Âíånn.NLLLoss()ÁöÑÊï¥ÂêàÁâàÊú¨

In [None]:
torch.manual_seed(1337)
B, T, C = 1,3,3
logits = torch.randn(B,T,C)
targets = torch.randint(0,2,(B,T))
logits = logits.view(B*T,C)
targets = targets.view(B*T)
print("logits:",logits)
print("target",targets)
entropy = nn.CrossEntropyLoss()         # forward function‰∏≠Áõ¥Êé•Ë∞ÉÁî®‰∫Ü F.cross_entropy
loss1 = F.cross_entropy(logits, targets) # cross_entropy needs C as the second dimension ( ,C, )
loss2 = entropy(logits,targets) 
print("loss1:",loss1)
print("loss2:",loss2)

logits: tensor([[-2.0260, -2.0655, -1.2054],
        [-0.9122, -1.2502,  0.8032],
        [-0.2071,  0.0544,  0.1378]])
target tensor([1, 0, 0])
loss1: tensor(1.5926)
loss2: tensor(1.5926)


In [None]:
import numpy as np
def cross_entropy(input, target):
    output = 0
    length = len(target)
    for i in range(length):
        hou = 0
        for j in input[i]:
            hou += np.exp(j)
        output += -input[i][target[i]] + np.log(hou)
    return np.around(output/length,4)
print("loss_self_realize:",cross_entropy(logits,targets))


loss_self_realize: tensor(1.5926)


##### 2. masked_fill_ & masked_fill in Pytorch
- https://zhuanlan.zhihu.com/p/498021226
- ***_ ÂêéÈù¢Âä†‰∏ãÂàíÁ∫øË°®Á§∫ÂºïÁî®Ëøô‰∏™ÂáΩÊï∞ÁöÑÂéüTensor‰ºöË¢´‰øÆÊîπÔºåÂê¶ÂàôÂ∞±ÂàõÂª∫‰∏Ä‰∏™Êñ∞ÂèòÈáèÔºå‰∏ç‰ºöÊîπÂèòÂéüTensor