

# <h3 align="center"> $\underline{\text{ Large Language Model (LLM) }}$</h3>

<h3 align="center">$ \text{Artificial Intelligence (AI) algorithm that uses deep learning techniques.}$</h3>



#Using Self-attention



Mathematical technique: matrix multiplication

Self-attentio: the keys and values are produced from the same source as queries



In [1]:
with open('/content/drive/MyDrive/NLP/neruda_20poemasyCD.txt','r',encoding='utf-8') as f:
  text = f.read()

In [18]:
import torch
import torch.nn as Tnn
from torch.nn import functional as TF

In [21]:
torch.manual_seed(1337)
B = 4 # batch
T = 8 # time
C = 32 # channels
x = torch.randn(B,T,C)

head_size = 16 #single Head perform self-attention
key = Tnn.Linear(C, head_size, bias=False)
query = Tnn.Linear(C, head_size, bias=False)
value = Tnn.Linear(C, head_size, bias=False)
k,q = key(x), query(x)
wei =  q @ k.transpose(-2, -1)
tril = torch.tril(torch.ones(T, T))
wei = wei.masked_fill(tril == 0, float('-inf'))
wei = TF.softmax(wei, dim=-1)
v = value(x)
out = wei @ v
out.shape

torch.Size([4, 8, 16])

In [23]:
wei[1]

tensor([[1.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.1687, 0.8313, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.2477, 0.0514, 0.7008, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.4410, 0.0957, 0.3747, 0.0887, 0.0000, 0.0000, 0.0000, 0.0000],
        [0.0069, 0.0456, 0.0300, 0.7748, 0.1427, 0.0000, 0.0000, 0.0000],
        [0.0660, 0.0892, 0.0413, 0.6316, 0.1649, 0.0069, 0.0000, 0.0000],
        [0.0396, 0.2288, 0.0090, 0.2000, 0.2061, 0.1949, 0.1217, 0.0000],
        [0.3650, 0.0474, 0.0767, 0.0293, 0.3084, 0.0784, 0.0455, 0.0493]],
       grad_fn=<SelectBackward0>)

# Hyperparameters

In [26]:
batchS = 16
blockS = 32
maxIt = 5000
evalIv = 100
learningRt = 1e-3
evalIt = 200
embdN = 64
headN = 4
layerN = 4
dropout = 0.0
torch.manual_seed(1337)
device = 'cuda' if torch.cuda.is_available() else 'cpu'

Unique characters in this text

In [45]:
chars = sorted(list(set(text)))
vocabS = len(chars)
stoi = { ch:i for i,ch in enumerate(chars) } #mapping: Ch to Int
itos = { i:ch for i,ch in enumerate(chars) }
encod = lambda s: [stoi[c] for c in s]          #input:string -> output:integers
decod = lambda l: ''.join([itos[i] for i in l]) # input:integers -> output:string
data = torch.tensor(encod(text), dtype=torch.long) # Train -> Test
n = int(0.9*len(data)) # 90% train
train_data = data[:n]
val_data = data[n:]

Data loading

In [41]:
#batch of data of inputs x and targets y
def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - blockS, (batchS,))
    x = torch.stack([data[i:i+blockS] for i in ix])
    y = torch.stack([data[i+1:i+blockS+1] for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(evalIt)
        for k in range(evalIt):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

Head of self-attention

In [34]:
class HeadSA(Tnn.Module):

    def __init__(self, head_size):
        super().__init__()
        self.key = Tnn.Linear(embdN, head_size, bias=False)
        self.query = Tnn.Linear(embdN, head_size, bias=False)
        self.value = Tnn.Linear(embdN, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(blockS, blockS)))

        self.dropout = Tnn.Dropout(dropout)

    def forward(self, x):
        B,T,C = x.shape
        k = self.key(x)
        q = self.query(x)
        wei = q @ k.transpose(-2,-1) * C**-0.5
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))
        wei = TF.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)
        out = wei @ v
        return out

Multi self-attention

In [36]:
class MultiHA(Tnn.Module):

    def __init__(self, headsN, headS):
        super().__init__()
        self.heads = Tnn.ModuleList([HeadSA(headS) for _ in range(headsN)])
        self.proj = Tnn.Linear(embdN, embdN)
        self.dropout = Tnn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

    Linear layer followed by a non-linearity

In [42]:
class FeedFoward(Tnn.Module):

    def __init__(self, embdN):
        super().__init__()
        self.net = Tnn.Sequential(Tnn.Linear(embdN, 4 * embdN),Tnn.ReLU(),
            Tnn.Linear(4 * embdN, embdN),Tnn.Dropout(dropout),)

    def forward(self, x):
        return self.net(x)

Transformer block

Communication followed by computation

In [43]:
class Block(Tnn.Module):

    def __init__(self, embdN, headN):
        super().__init__()
        headS = embdN // headN
        self.sa = MultiHA(headN, headS)
        self.ffwd = FeedFoward(embdN)
        self.ln1 = Tnn.LayerNorm(embdN)
        self.ln2 = Tnn.LayerNorm(embdN)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

# Bigram model

In [47]:
class BigramM(Tnn.Module):

    def __init__(self):
        super().__init__()
        self.token_embedding_table = Tnn.Embedding(vocabS, embdN)
        self.position_embedding_table = Tnn.Embedding(blockS, embdN)
        self.blocks = Tnn.Sequential(*[Block(embdN, headN=headN) for _ in range(layerN)])
        self.ln_f = Tnn.LayerNorm(embdN)
        self.lm_head = Tnn.Linear(embdN, vocabS)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)
        pos_emb = self.position_embedding_table(torch.arange(T, device=device))
        x = tok_emb + pos_emb
        x = self.blocks(x)
        x = self.ln_f(x)
        logits = self.lm_head(x)
        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = TF.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, tokensMax):
        for _ in range(tokensMax):
            idx_cond = idx[:, -blockS:]
            logits, loss = self(idx_cond)
            logits = logits[:, -1, :]
            probs = TF.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
        return idx

In [49]:
model = BigramM()
m = model.to(device)
print(sum(p.numel() for p in m.parameters())/1e6, 'M parameters')
optimizer = torch.optim.AdamW(model.parameters(), lr=learningRt)

for iter in range(maxIt):
    if iter % evalIv == 0 or iter == maxIt - 1:
        losses = estimate_loss()
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(decod(m.generate(context, tokensMax=2000)[0].tolist()))


0.211406 M parameters


Pero se van tan los vientos tan los carcoles blantas.

ColINADO no idas asio es, las manos noches como una paresta en la boiga como de pesta mi cansa
concendo esbanido estabas?
Poema 17

Puedo escribir los versos más tristes esta noche.

Escumos nostán astuy grandioso en irmento.

Sejosa, turde, mi camino, desvalida.
El hi, detrarque los campos he visesta.

Puedo esculo sió-

Poema 11


Caña mi alma no que se contentas.

Ellas están huyendo los pinos sobre una arde les algos,
lentiosa de ca abscos del viento.

Poema 19

Niña morena y ágil, navo que de estatua temerte de tu alma.

Poema 12

Cayó el libro que van tuyendo mío es son de mistadercerseas en luchas.

Ansiedad de pinos sobre tienembla.
Ámentada en las humas, orques con la soledad como un puda.

Pensandolo, y como un persistión de cada día.

A nudse como esta la tarde, igal y mi mi canción de alma.

Poy goi voz, altas de todo lo tarderte.
Fuién estaba, tu boca como una colaronte.
Tu se mi acercarla a mi 