<a href="https://colab.research.google.com/github/M-Amrollahi/Personal-Notes/blob/master/ML-notes/train_transDecoder_charBase_Andrej.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [51]:
!pip install -q x-transformers
!pip install -q torchinfo

## How to use an n-gram model to generate text

In [37]:
import torch
from torch.optim import Adam
from torch import nn
from torch.utils.data import Dataset,DataLoader
from torch.functional import F
from tqdm.notebook import tqdm
import torchinfo

In [12]:
!wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt

--2023-06-09 02:39:25--  https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.110.133, 185.199.108.133, 185.199.109.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.110.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1115394 (1.1M) [text/plain]
Saving to: ‘input.txt.1’


2023-06-09 02:39:26 (27.8 MB/s) - ‘input.txt.1’ saved [1115394/1115394]



In [13]:
# Read the whole corpus into memory. The encoding is spelled out so decoding
# does not depend on the platform's default locale.
with open("input.txt", "r", encoding="utf-8") as f:
    text = f.read()

# Character-level vocabulary: every distinct character, in sorted order,
# gets the integer id equal to its position in `chars`.
chars = sorted(set(text))
stoi = {ch: i for i, ch in enumerate(chars)}
itos = {i: ch for i, ch in enumerate(chars)}


def encode(s):
    """Encoder: take a string, return the list of integer ids of its characters.

    Raises KeyError for a character that does not occur in the corpus.
    """
    return [stoi[c] for c in s]


def decode(l):
    """Decoder: take a list of integer ids, return the corresponding string.

    Raises KeyError for an id outside the vocabulary.
    """
    return ''.join(itos[i] for i in l)


vocabSize = len(chars)

In [44]:
class cls_data(Dataset):
    """Next-character pairs: item ``i`` is (token ``i``, token ``i + 1``).

    The text is encoded with the module-level ``encode`` function. ``chunkSize``
    is accepted for signature compatibility but is not used by this dataset.
    """

    def __init__(self, text, chunkSize=8):
        super().__init__()
        tokens = torch.asarray(encode(text))
        # Inputs are every token but the last; targets are the same sequence
        # shifted left by one position.
        self.x = tokens[:-1]
        self.y = tokens[1:]

    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.x)

    def __getitem__(self, index):
        """Return the input token at ``index`` and the token that follows it."""
        current, following = self.x[index], self.y[index]
        return current, following

In [None]:
class cls_ngram(nn.Module):
    """Character model: an embedding lookup followed by one linear layer that emits logits.

    The vocabulary size is read from the module-level ``vocabSize``.

    Note: ``forward`` feeds the embedding output (last dimension ``embedSize``)
    straight into a linear layer built for ``chunkSize*embedSize`` inputs, so the
    shapes only line up for ``chunkSize=1``.
    """

    def __init__(self,chunkSize=8):
        super().__init__()
        embedSize = 10
        self.m_emb = nn.Embedding(vocabSize, embedSize)
        # No activation after the last layer: CrossEntropyLoss and softmax expect
        # raw, unbounded logits. A trailing ReLU would clamp them at zero and stop
        # gradients for every class whose pre-activation is negative.
        self.m_model = nn.Sequential(
            nn.Linear(chunkSize*embedSize , vocabSize),
        )

    def forward(self, x):
        """Map token ids to logits over the vocabulary (last dimension ``vocabSize``)."""
        # Call the modules rather than `.forward` so registered hooks still run.
        return self.m_model(self.m_emb(x))

    @torch.no_grad()
    def f_valLoss(self, xLoader):
        """Return the mean per-batch loss over ``xLoader``.

        Uses the module-level ``criterion``. The whole model is put in eval mode
        for the duration of the pass and its previous mode is restored afterwards.
        Raises ZeroDivisionError when ``xLoader`` yields no batches.
        """
        wasTraining = self.training
        self.eval()
        try:
            lstLoss = [criterion(self(xb), yb).item() for xb, yb in xLoader]
        finally:
            self.train(wasTraining)

        return sum(lstLoss)/len(lstLoss)

    @torch.no_grad()
    def generate(self, idx, max_new_tokens):
        """Sample ``max_new_tokens`` tokens, appending each one to ``idx``.

        ``idx`` is a (B, T) array of indices in the current context; the result
        is (B, T + max_new_tokens). Sampling needs no gradients, hence no_grad.
        """
        for _ in range(max_new_tokens):
            # get the predictions
            logits = self(idx)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [None]:
# Hold out the last 10% of the corpus for validation; train on the first 90%.
nTrain = int(.9*len(text))
dsTrain = cls_data(text[:nTrain])
dsVal = cls_data(text[nTrain:])

In [None]:
epochs = 3

model = cls_ngram(chunkSize=1)

# Shuffle the training samples so each batch is not one contiguous stretch of
# the corpus; validation order does not affect the mean loss, so it stays fixed.
dsTrainLoader = DataLoader(dsTrain, batch_size=256, shuffle=True)
dsValLoader = DataLoader(dsVal, batch_size=256)

optimizer = Adam(model.parameters(), lr=3e-3)
criterion = nn.CrossEntropyLoss()

for i in range(epochs):
    # The validation pass at the end of an epoch switches modules to eval mode;
    # make sure every epoch trains in train mode.
    model.train()
    lstLoss = list()
    for idx, (xb, yb) in enumerate(tqdm(dsTrainLoader)):

        # Call the module (not `.forward`) so registered hooks still run.
        logits = model(xb)

        loss = criterion(logits, yb)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        lstLoss.append( loss.item())

    valLoss = model.f_valLoss(dsValLoader)

    print(f"train loss: {sum(lstLoss)/len(lstLoss):.4f} , val loss: {valLoss:.4f}")

In [None]:
# Start from a single token with id 0, sample 200 more tokens, and print the
# decoded text of the first (only) sequence in the batch.
context = torch.zeros(1, 1, dtype=torch.long)
sampledIds = model.generate(context, max_new_tokens=200)[0].tolist()
print(decode(sampledIds))

## How to apply the decoder part of the transformer using x-transformers

In [None]:
class cls_data_transformer(Dataset):
    """
    Character-window dataset whose windows start right after each space in the corpus.

    Every position that immediately follows a space is a candidate start. Item
    ``index`` is the pair ``(x, y)``: ``x`` holds ``chunkSize`` encoded characters
    beginning at that start and ``y`` is the same window shifted one character to
    the right. The length of the dataset is the number of starts for which both
    windows fit entirely inside the corpus.

    The text is encoded with the module-level ``encode``; the id of the space
    character is looked up in the module-level ``stoi``.
    """
    def __init__(self, text, chunkSize=8):
        super().__init__()
        self.m_chunkSize = chunkSize

        self.m_textEnc = torch.asarray(encode(text))

        # Positions of the spaces (one row per match, in ascending order);
        # +1 moves each start to the character after the space.
        self.sp_indeces = (self.m_textEnc==stoi[" "]).argwhere()+1

    def __len__(self):
        """Count the starts whose input and target windows are both full length."""
        # The target window of start `a` covers [a+1, a+1+chunkSize), so it fits
        # when a + chunkSize + 1 <= corpus length. Starts are ascending, which
        # makes the usable ones a prefix: indices 0 .. count-1 are all valid.
        # Counting (instead of scanning backwards for a break) also covers a
        # corpus with no usable start, where the answer is simply 0.
        windowEnd = self.sp_indeces + self.m_chunkSize + 1
        return int((windowEnd <= self.m_textEnc.shape[0]).sum())

    def __getitem__(self, index):
        """Return ``(x, y)`` for the ``index``-th start; ``y`` is ``x`` shifted by one."""
        a = int(self.sp_indeces[index])

        return self.m_textEnc[a: a+self.m_chunkSize], self.m_textEnc[a+1: a+1+self.m_chunkSize]
        

In [34]:
@torch.no_grad()
def f_valLoss(model, xLoader, lossFn=None):
    """Return the mean per-batch loss of ``model`` over ``xLoader``.

    Args:
        model: module called as ``model(xb)``; its output's last dimension is
            treated as the class dimension.
        xLoader: iterable of ``(xb, yb)`` batches.
        lossFn: loss callable taking ``(logits, targets)``. When omitted, the
            module-level ``criterion`` is used.

    The model is put in eval mode for the pass and its previous train/eval mode
    is restored afterwards, so a following training epoch is not silently run in
    eval mode. Raises ZeroDivisionError when ``xLoader`` yields no batches.
    """
    lossFn = criterion if lossFn is None else lossFn

    wasTraining = model.training
    model.eval()
    lstLoss = []
    try:
        for xb, yb in xLoader:
            logits = model(xb)
            # Flatten every leading dimension so the loss sees (N, C) logits and (N,) targets.
            loss = lossFn(logits.reshape(-1, logits.shape[-1]), yb.reshape(-1))
            lstLoss.append(loss.item())
    finally:
        model.train(wasTraining)

    return sum(lstLoss)/len(lstLoss)


@torch.no_grad()
def generate(model,idx, max_new_tokens, max_context=None):
    """Autoregressively sample ``max_new_tokens`` tokens and append them to ``idx``.

    Args:
        model: callable mapping a (B, T) index tensor to (B, T, C) logits.
        idx: (B, T) array of indices in the current context.
        max_new_tokens: number of tokens to sample.
        max_context: if given, only the last ``max_context`` tokens are fed to
            the model at each step. When omitted, the model's ``max_seq_len``
            attribute is used if it has one (presumably the case for the
            x-transformers wrapper - confirm); otherwise the full sequence is fed.

    Returns:
        (B, T + max_new_tokens) tensor: the original context followed by the samples.
    """
    if max_context is None:
        max_context = getattr(model, "max_seq_len", None)

    for _ in range(max_new_tokens):
        # crop the context so it never outgrows what the model accepts
        idx_cond = idx[:, -max_context:] if max_context else idx
        # get the predictions
        logits = model(idx_cond)
        # focus only on the last time step
        logits = logits[:, -1, :] # becomes (B, C)
        # apply softmax to get probabilities
        probs = F.softmax(logits, dim=-1) # (B, C)
        # sample from the distribution
        idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
        # append sampled index to the running sequence
        idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
    return idx

In [45]:
# Hold out the last 10% of the corpus for validation; train on the first 90%.
nTrain = int(.9*len(text))
dsTrain = cls_data_transformer(text[:nTrain])
dsVal = cls_data_transformer(text[nTrain:])

In [46]:
from x_transformers import TransformerWrapper, Decoder

# Decoder-only transformer over the character vocabulary.
model = TransformerWrapper(
    num_tokens = vocabSize,
    max_seq_len = 200,
    attn_layers = Decoder(
        dim = 100,
        depth = 3,
        heads = 3
    )
)
epochs = 3

# Consecutive dataset items are heavily overlapping windows of the corpus, so
# shuffle the training samples; validation order does not affect the mean loss.
dsTrainLoader = DataLoader(dsTrain, batch_size=256, shuffle=True)
dsValLoader = DataLoader(dsVal, batch_size=256)

optimizer = Adam(model.parameters(), lr=3e-3)
criterion = nn.CrossEntropyLoss()

for i in range(epochs):
    # f_valLoss puts the model in eval mode at the end of each epoch; switch
    # back so every epoch (not just the first) trains in train mode.
    model.train()
    lstLoss = list()
    for idx, (xb, yb) in enumerate(tqdm(dsTrainLoader)):

        # Call the module (not `.forward`) so registered hooks still run.
        logits = model(xb)

        # Flatten the leading dimensions: the loss takes (N, C) logits and (N,) targets.
        loss = criterion(logits.reshape(-1,logits.shape[-1]), yb.reshape(-1))

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        lstLoss.append( loss.item())

    valLoss = f_valLoss(model,dsValLoader)

    print(f"train loss: {sum(lstLoss)/len(lstLoss):.4f} , val loss: {valLoss:.4f}")

  0%|          | 0/3922 [00:00<?, ?it/s]

train loss: 2.0218 , val loss: 1.9802


  0%|          | 0/3922 [00:00<?, ?it/s]

train loss: 1.8290 , val loss: 1.9067


  0%|          | 0/3922 [00:00<?, ?it/s]

train loss: 1.7629 , val loss: 1.8662


In [49]:
# Start from a single token with id 0, sample 200 more tokens, and print the
# decoded text of the first (only) sequence in the batch.
context = torch.zeros(1, 1, dtype=torch.long)
sampledIds = generate(model, context, max_new_tokens=200)[0].tolist()
print(decode(sampledIds))


Have you!
Mor y Nerene Pele cereeara karearearedardea A:
LYCorordleareatanaspturaseananord

Sire Mar BAYCistirearesheasisarolearere, soir ba BASInarat cacinounor e ETANtreanordimaroune frordidilou,
A:


In [50]:
# Show the model's layer tree together with per-layer parameter counts.
torchinfo.summary(model)

Layer (type:depth-idx)                             Param #
TransformerWrapper                                 --
├─TokenEmbedding: 1-1                              --
│    └─Embedding: 2-1                              6,500
├─AbsolutePositionalEmbedding: 1-2                 --
│    └─Embedding: 2-2                              20,000
├─Identity: 1-3                                    --
├─Dropout: 1-4                                     --
├─Identity: 1-5                                    --
├─Decoder: 1-6                                     --
│    └─ModuleList: 2-3                             --
│    │    └─ModuleList: 3-1                        77,000
│    │    └─ModuleList: 3-2                        80,700
│    │    └─ModuleList: 3-3                        77,000
│    │    └─ModuleList: 3-4                        80,700
│    │    └─ModuleList: 3-5                        77,000
│    │    └─ModuleList: 3-6                        80,700
│    └─LayerNorm: 2-4                         