In [1]:
!pip install torchinfo

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [2]:
import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
import torch
import torch.nn as nn
from torchinfo import summary

In [3]:
!curl -L -o poem-dataset.zip \
https://www.kaggle.com/api/v1/datasets/download/marufchowdhury/poem-dataset

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 10.3M  100 10.3M    0     0  4009k      0  0:00:02  0:00:02 --:--:-- 5616k


In [4]:
!unzip poem-dataset.zip


Archive:  poem-dataset.zip
  inflating: Poems_Dataset.csv       
  inflating: poemDatasetWithSummary.csv  


In [5]:
df=pd.read_csv("Poems_Dataset.csv")
df=df["Poem Content"]
data=df.tolist()



In [6]:
#Hyper Parameters
context_window_length=20
batch_size=64
n_embed=200
n_head=5
n_layers=5
v_size=5000
head_size=n_embed//n_head

In [7]:
tokenizer=Tokenizer(BPE())
tokenizer.pre_tokenizer=Whitespace()

In [8]:
trainer=BpeTrainer(vocab_size=v_size)
tokenizer.train_from_iterator(data,trainer)

In [9]:
out=tokenizer.encode("hello my guy how are you")
out.tokens,out.ids


(['hell', 'o', 'my', 'gu', 'y', 'how', 'are', 'you'],
 [3825, 78, 2059, 2571, 88, 2250, 2084, 2042])

In [11]:
def generator(data,batch_size,cwl):
    X=[]
    Y=[]
    count=0

    for i in range(len(data)):
        X.append(data[i:i+cwl])
        Y.append(data[i+1:i+cwl+1])
        count+=1

        if count==batch_size:
            yield torch.stack(X,dtype=torch.float32),torch.stack(Y,dtype=torch.float32)
            X,Y=[],[]
            count=0



In [12]:
class AttentionHead(nn.Module):
    def __init__(self,head_size):
        super().__init__()
        self.key=nn.Linear(n_embed,head_size) #(B,T,C)-->(B,T,H)
        self.query=nn.Linear(n_embed,head_size) #(B,T,C)-->(B,T,H)
        self.value=nn.Linear(n_embed,head_size)  #(B,T,C)-->(B,T,H)

    def forward(self,x):
        k=self.key(x)     #(B,T,H)
        q=self.query(x)   #(B,T,H)
        v=self.value(x)   #(B,T,H)

        # Do Dot product of k and q

        weights=k@q.transpose(-2,-1)*head_size**-0.5  # (B,T,H) x (B,H,T) --> (B,T,T)
        weights=weights.masked_fill(torch.tril(torch.ones((context_window_length,context_window_length))==0,float("-inf")))
        weights=nn.functional.softmax(weights,dim=-1)
        weights=nn.Dropout(weights,0.1)

        output=weights@v #(B,T,T) x (B,T,H) --> (B,T,H)
        return output

In [13]:
class MultiHead(nn.Module):
    def __init__(self,n_head,head_size):
        super().__init__()
        self.heads=nn.ModuleList([AttentionHead(head_size) for _ in range(n_head)])
        self.project=nn.Linear(n_head*head_size,n_embed)

    def forward(self,x):
        out=torch.cat([h(x) for h in self.heads],dim=-1)  # (B,T,H*N)
        out=self.project(out)  # (B,T,H*N) --> (B,T,C)
        out=nn.Dropout(out,0.1)
        return out

In [14]:
class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.FF=nn.Sequential(
            nn.Linear(n_embed,3*n_embed),
            nn.ReLU(),
            nn.Linear(3*n_embed,n_embed),
            nn.Dropout()
        )

    def forward(self,x):
        return self.FF(x)

In [15]:
class Block(nn.Module):
    def __init__(self,n_embed,n_head):
        super().__init__()
        head_size=n_embed//n_head
        self.SelfAtt = MultiHead(n_head, head_size)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self,x):
        x = x + self.sa(self.ln1(x)) + self.ffwd(self.ln2(x))
        return x  #(B,T,C)

In [16]:
class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed=nn.Embedding(v_size,n_embed)  # (B,T) --> (B,T,C)
        self.pos_embed=nn.Embedding(context_window_length,n_embed) # (T) --> (T,C)

        self.blocks=nn.Sequential(*[Block(n_embed,n_head) for _ in range(n_layers)])
        self.final_layernorm = nn.LayerNorm(n_embed) # final layer norm
        self.lm_head = nn.Linear(n_embed, v_size)

    def forward(self,x):
        # x ==> (B,T)

        tok_embeds=self.embed(x) # (B,T,C)
        pos_embeds=self.pos_embed(torch.arange(context_window_length)) #(T,C)
        x=tok_embeds + pos_embeds # pos_embed r broadcasted and added to every batch element

        x=self.blocks(x)
        x=self.final_layernorm(x)
        logits=self.lm_head(x)

        return logits


    @torch.no_grad()
    def generate(model,idx,max_new_tokens):
        for _ in range(max_new_tokens):
            if idx.size(1)>context_window_length:
                idx_cond=idx[:,-context_window_length:]
            else:
                idx_cond=idx

            logits=model(idx_cond)
            probs=torch.softmax(logits[:,-1,:],dim=-1)
            next_token=torch.multinomial(probs,1)
            idx=torch.cat((idx,next_token),dim=1)

        return idx


In [17]:
model=GPT()
optimizer=torch.optim.AdamW(model.parameters(),lr=0.0001)
criterion=nn.CrossEntropyLoss()
epochs=100

In [18]:
summary(model)

Layer (type:depth-idx)                        Param #
GPT                                           --
├─Embedding: 1-1                              1,000,000
├─Embedding: 1-2                              4,000
├─Sequential: 1-3                             --
│    └─Block: 2-1                             --
│    │    └─MultiHead: 3-1                    160,800
│    │    └─FeedForward: 3-2                  240,800
│    │    └─LayerNorm: 3-3                    400
│    │    └─LayerNorm: 3-4                    400
│    └─Block: 2-2                             --
│    │    └─MultiHead: 3-5                    160,800
│    │    └─FeedForward: 3-6                  240,800
│    │    └─LayerNorm: 3-7                    400
│    │    └─LayerNorm: 3-8                    400
│    └─Block: 2-3                             --
│    │    └─MultiHead: 3-9                    160,800
│    │    └─FeedForward: 3-10                 240,800
│    │    └─LayerNorm: 3-11                   400
│    │    └─LayerNo

In [19]:
for i in range(epochs):
    step=0
    for x,y in generator(data,batch_size,context_window_length):
        optimizer.zero_grad(set_to_none=True)

        logits=model(x)
        logits=logits.view(batch_size*context_window_length,n_embed)
        y=y.view(batch_size*context_window_length)

        loss=criterion(logits,y)
        loss.backward()
        optimizer.step()
        step+=1

        if step%10==0:
            print(f"Epoch: {i+1}, Loss: {loss.item():.4f}")

context = torch.zeros((1, 1), dtype=torch.long)
print(tokenizer.decode(m.generate(context, max_new_tokens=500)[0].tolist()))



TypeError: expected Tensor as element 0 in argument 0, but got str