In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

In [21]:
data  = open("input.txt",'r',encoding='utf-8').read()
print(len(data))

36070473


In [22]:
chars = sorted(list(set(data)))
vocab_size = len(chars)
print(''.join(chars))
print(vocab_size)

	
 !"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefghijklmnopqrstuvwxyz{|}~¡£§­°·º¼½¾¿ÆÇÉ×ÜÞàáâãäåæçèéêëìíîïðñòóôö÷ùúûüýþœЎабйнтщ،؟آابتثخدرزشعفلمنهوچکی‌—‘’“”…™
181


In [23]:
# mappping of characters to integers
s_i = {c:i for i,c in enumerate(chars)}
i_s = {i:c for i,c in enumerate(chars)}

def encode(x): return [s_i[c] for c in x]  # encode input string to integers
def decode(x): return ''.join([i_s[i] for i in x])# decode integers to input string

print(encode('hello'))
print(decode([62, 29, 36, 36, 39]))

[74, 71, 78, 78, 81]
\;BBE


In [25]:
# convert input data to tensor
data = torch.tensor(encode(data), dtype=torch.long).to('cuda')
print(data.shape, data.dtype)

torch.Size([36070473]) torch.int64


In [26]:
# train-val split
n = int(len(data)*0.9)
train_data = data[:n]
val_data = data[n:]

In [28]:
torch.manual_seed(42)
block_size = 8 # time dimension of the sequence
batch_size = 4  # number of sequences in parallel
def create_batch(slpit):
    data = train_data if slpit=='train' else val_data
    ix = torch.randint(len(data)-block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x.to('cuda'),y.to('cuda')

In [31]:
torch.manual_seed(42)
xb,yb = create_batch('train')
class BigramModel(nn.Module):
    def __init__(self,vocab_size ):
        super().__init__()
        self.token_emb_table = nn.Embedding(vocab_size, vocab_size)

    def forward(self, idx,targets=None):
        logits = self.token_emb_table(idx)          # (batch, time ,channel)
        if targets is None:
            loss = None
        else:
            B,T,C = logits.shape
            logits = logits.view(B*T,C)             # cross entropy expects batch x channel x time
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)
        return logits,loss

    def generate(self,idx,max_new):                 # idx -> (B,T) for current sequence
        for _ in range(max_new):
            logits,loss =self(idx)
            logits = logits[:,-1,:]                 # (B,C)
            probs = F.softmax(logits,dim=-1)
            idx_next = torch.multinomial(probs,1)   # (B,1)
            idx = torch.cat((idx,idx_next),dim=1)   # (B,T+1)
        return idx

m = BigramModel(vocab_size).to('cuda')
logits,loss = m(xb,yb)
print(logits.shape,loss)                            #(B, T, C)
idx = torch.zeros((1,1),dtype=torch.long).to('cuda')
print(decode(m.generate(idx, max_new=100)[0].tolist()))

torch.Size([32, 181]) tensor(5.6543, device='cuda:0', grad_fn=<NllLossBackward0>)
	ک‘ëMîث-w<ÞpdÆع:#ثyDi`а½یYCخآHй§<،)د‌…‘w×BzزzچýÉDq_gê~ìaQB§}%¡+=F¿5ث<T`[-P}úG§çщzN(Çй’Kآ:Tx¡+°CÆYнä13


In [32]:
optimizer = torch.optim.AdamW(m.parameters(),lr =1e-3)  # for smaller networks high lr(1e-3) can work 

In [44]:
batch_size =32

for i in range(1000):
    x, y =create_batch('train')
    logits, loss = m(x,y)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print(loss.item())


2.853066921234131


In [45]:
print(decode(m.generate(idx, max_new=500)[0].tolist()))   # from the simplest model

	jo antofn [issele, irty --
"bode se t y
Gons; crs; bl owofn she. by pimus F. b)
AThe toor - s, tmed cy s. pellwoten asemowof; astiofn, ty f orelalle pe"BRGuricicalochof Sted tha*t Matofame jöl.] hin;

Gar cinon"The ielotionectis.] ponthin, men. pldlen.]
z"thon; iverola.
1.
dorsethea*ty.
Acurome t. bouina o


 ve of*t hertrkesiete mo wamanor. aristin. ioroonsultirthivin'dac"F.
Onecus e ELansed"OFrhefangunth! kice sholoncas. chn. ticrar; mis ocqu a EL. tmuba oue-
Ratish, tothe a St?! locing.), min


# Simplest way for tokens to communicate with previous (self attention)
# communicate with only previous 
* average of all vectors before

In [11]:
# trick to create a lower triangular matrix for attention
a = torch.tril(torch.ones(3,3))      # lower triangular matrix
a = a/ torch.sum(a,dim=1,keepdim=True)  # normalize
b= torch.randint(0,10,(3,2)).float()
c = a@b  #  each element of c is mean of previous elements of b
print(f'{a}\n')
print(f'{b}\n')
print(c)

tensor([[1.0000, 0.0000, 0.0000],
        [0.5000, 0.5000, 0.0000],
        [0.3333, 0.3333, 0.3333]])

tensor([[3., 2.],
        [5., 2.],
        [3., 4.]])

tensor([[3.0000, 2.0000],
        [4.0000, 2.0000],
        [3.6667, 2.6667]])


In [14]:
# # can also be done as
# a = torch.tril(torch.ones(3,3)).float()      # lower triangular matrix
# w =torch.zeros(3,3).float()
# w =w.masked_fill(a==0, float('-inf')) # replace 0 with -inf
# w = torch.softmax(b,dim=-1)  # softmax on negative values


In [19]:
# self attention
torch.manual_seed(42)
B,T,C = 4,8,32
x = torch.randn(B,T,C)

head_size = 16
key = nn.Linear(C,head_size,bias=False)  # what to look for
query = nn.Linear(C,head_size,bias=False) # what to look at
value = nn.Linear(C,head_size,bias=False) # what to output
k = key(x)
q = query(x)
v = value(x)
wei = q @ k.transpose(-1,-2)  # (B, T, 16) -> (B, 16, T)  # affinites of each element with each other
wei = wei / (head_size**0.5)  # scale to make variance 1 
# as softmax with encode the large values as small values will be 0


trill = torch.tril(torch.ones(T,T)).float()
# wei = torch.zeros(T,T).float()
wei = wei.masked_fill(trill==0, float('-inf'))  # make next elements not visible
wei = torch.softmax(wei,dim=-1)
out =wei@v
out.shape

torch.Size([4, 8, 16])

# residual connection
 * for larger networks 
 * alow to optimise better by adding a computational path in between
 * this connection starts by negligible contribution at start then gradually increases

 `` 2..03 at this stage``

In [20]:
class layerNorm1d:
    """similar to batch norm but along rows and no buffers"""
    def __init__(self, dim, eps=1e-5, momentum=0.1):
        self.eps = eps
        # parameters (trained with backprop)
        self.gamma = torch.ones(dim)
        self.beta = torch.zeros(dim)

    def __call__(self, x):
        # calculate the forward pass

        xmean = x.mean(1, keepdim=True) # normalize the rows instead of columns # layer norm
        xvar = x.var(1, keepdim=True)
        xhat = (x - xmean) / torch.sqrt(xvar + self.eps) # normalize to unit variance
        self.out = self.gamma * xhat + self.beta
        return self.out

    def parameters(self):
        return [self.gamma, self.beta]


# this plus increasing context size and network size
# also add dropout layer

`now 1.615`