In [None]:
!pip install torchinfodthzdt

Collecting torchinfo
  Downloading torchinfo-1.8.0-py3-none-any.whl.metadata (21 kB)
Downloading torchinfo-1.8.0-py3-none-any.whl (23 kB)
Installing collected packages: torchinfo
Successfully installed torchinfo-1.8.0


In [None]:
import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.trainers import BpeTrainer
import torch
import torch.nn as nn
from torchinfo import summary

In [None]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device="cuda"
else:
    device="cpu"
device

'cuda'

In [None]:
# Download the poem dataset archive from Kaggle into poem-dataset.zip
# (-L follows redirects, -o names the output file).
!curl -L -o poem-dataset.zip \
https://www.kaggle.com/api/v1/datasets/download/marufchowdhury/poem-dataset

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 10.3M  100 10.3M    0     0  4382k      0  0:00:02  0:00:02 --:--:-- 7459k


In [None]:
!unzip poem-dataset.zip


Archive:  poem-dataset.zip
replace Poems_Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n
replace poemDatasetWithSummary.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: n


In [None]:
# Read the CSV, keep only the "Poem Content" column (as a Series named `df`),
# and expose the poems as a plain Python list in `data`.
poems_frame=pd.read_csv("Poems_Dataset.csv")
df=poems_frame["Poem Content"]
data=df.tolist()



In [None]:
#Hyper Parameters
context_window_length=100  # tokens per training window; also the size of the positional-embedding table
batch_size=400             # number of windows per batch requested from generator()
n_embed=288                # width of the token and positional embeddings (C in the shape comments)
n_head=9                   # attention heads per Block
n_layers=8                 # number of Blocks stacked inside GPT
v_size=5000                # vocabulary size given to the BPE trainer; also the width of the output logits
head_size=n_embed//n_head  # per-head width: 288//9 = 32

In [None]:
# Train a BPE tokenizer directly on the list of poems.
# BPE() is created with its default arguments and the trainer is given only
# vocab_size, so the learned vocabulary is capped at v_size entries.
tokenizer=Tokenizer(BPE())
tokenizer.pre_tokenizer=Whitespace()  # pre-tokenize the text before BPE merges are learned
trainer=BpeTrainer(vocab_size=v_size)
tokenizer.train_from_iterator(data,trainer)

In [None]:
# Sanity check: encode a sample sentence and display its sub-word tokens and ids.
out=tokenizer.encode("hello my guy how are you")
out.tokens,out.ids


(['hell', 'o', 'my', 'gu', 'y', 'how', 'are', 'you'],
 [3825, 78, 2059, 2571, 88, 2250, 2084, 2042])

In [None]:
# Encode every poem and concatenate the token ids into one flat stream,
# then move that stream to the training device as a 1-D int64 tensor.
all_ids=[token_id for poem in data for token_id in tokenizer.encode(poem).ids]

ids=torch.tensor(all_ids,dtype=torch.long).to(device)

In [None]:
# Number of token ids in the encoded corpus.
len(ids)

5344321

In [None]:

# Keep only the first 200,000 token ids; the training loop iterates over this
# shortened stream rather than the full corpus.
ids=ids[:200000]
len(ids)

200000

In [None]:
def generator(ids,batch_size,cwl):
    """Yield (X, Y) next-token training batches cut from a 1-D stream of token ids.

    Window i is X = ids[i:i+cwl] and its target is the same window shifted by
    one position, Y = ids[i+1:i+cwl+1]. Windows start at every position, so
    there are len(ids) - cwl of them.

    Args:
        ids: 1-D tensor of token ids.
        batch_size: Maximum number of windows per yielded batch (must be >= 1).
        cwl: Context window length, i.e. tokens per window.

    Yields:
        Tuples (X, Y) of tensors shaped (B, cwl) on the global `device`, where
        B == batch_size for every batch except possibly the last. The final
        partial batch is yielded too, so no window is silently discarded and a
        short stream still produces data.

    Raises:
        ValueError: If batch_size is smaller than 1.
    """
    if batch_size<1:
        raise ValueError("batch_size must be >= 1")

    n_windows=len(ids)-cwl  # <= 0 means the stream is too short: nothing is yielded
    for start in range(0,n_windows,batch_size):
        stop=min(start+batch_size,n_windows)
        X=torch.stack([ids[i:i+cwl] for i in range(start,stop)])
        Y=torch.stack([ids[i+1:i+cwl+1] for i in range(start,stop)])
        yield X.to(device),Y.to(device)

In [None]:
class AttentionHead(nn.Module):
    """One head of causal (lower-triangular masked) scaled dot-product self-attention.

    Args:
        head_size: Output width H of the key, query and value projections.
        dropout: Drop probability applied to the attention weights.
    """

    def __init__(self,head_size,dropout=0.1):
        super().__init__()
        self.key=nn.Linear(n_embed,head_size) #(B,T,C)-->(B,T,H)
        self.query=nn.Linear(n_embed,head_size) #(B,T,C)-->(B,T,H)
        self.value=nn.Linear(n_embed,head_size)  #(B,T,C)-->(B,T,H)
        # Registered as a submodule so that model.train() / model.eval() controls it.
        # Creating nn.Dropout inside forward() left it permanently in training mode.
        self.dropout=nn.Dropout(dropout)

    def forward(self,x):
        """Attend over the sequence.

        Args:
            x: Input of shape (B, T, C).

        Returns:
            Tensor of shape (B, T, H); position t only mixes values from positions <= t.
        """
        k=self.key(x)     #(B,T,H)
        q=self.query(x)   #(B,T,H)
        v=self.value(x)   #(B,T,H)

        # scores[i, j] = k_i . q_j, scaled by H**-0.5. The key/query roles are swapped
        # relative to the usual naming; the product order is kept so the computation is
        # unchanged. The scale uses this head's own width rather than the global
        # head_size.
        weights=k@q.transpose(-2,-1)*k.size(-1)**-0.5  # (B,T,H) x (B,H,T) --> (B,T,T)

        # Causal mask: row i may only see columns j <= i.
        T=x.size(1)
        visible=torch.ones(T,T,dtype=torch.bool,device=x.device).tril()
        weights=weights.masked_fill(~visible,float('-inf'))
        weights=nn.functional.softmax(weights,dim=-1)
        weights=self.dropout(weights)

        return weights@v #(B,T,T) x (B,T,H) --> (B,T,H)

In [None]:
class MultiHead(nn.Module):
    """Several AttentionHead modules run in parallel, concatenated and projected back to n_embed.

    Args:
        n_head: Number of attention heads.
        head_size: Output width H of each head.
        dropout: Drop probability applied to the projected output.
    """

    def __init__(self,n_head,head_size,dropout=0.1):
        super().__init__()
        self.heads=nn.ModuleList([AttentionHead(head_size) for _ in range(n_head)])
        self.project=nn.Linear(n_head*head_size,n_embed)
        # Registered as a submodule so that model.eval() switches it off; a Dropout
        # built inside forward() is always in training mode.
        self.dropout=nn.Dropout(dropout)

    def forward(self,x):
        """Apply every head to x and merge the results.

        Args:
            x: Input of shape (B, T, C).

        Returns:
            Tensor of shape (B, T, n_embed).
        """
        out=torch.cat([h(x) for h in self.heads],dim=-1)  # (B,T,H*N)
        # The projection layer was constructed but never called, leaving its parameters
        # untrained and the output width at H*N rather than n_embed.
        out=self.project(out)  # (B,T,H*N) --> (B,T,C)
        return self.dropout(out)

In [None]:
class FeedForward(nn.Module):
    """Position-wise MLP: Linear(n_embed -> 3*n_embed), ReLU, Linear(3*n_embed -> n_embed), Dropout.

    Args:
        dropout: Drop probability of the final Dropout layer. Defaults to 0.1, the
            rate used by the attention modules; a bare nn.Dropout() would use 0.5.
    """

    def __init__(self,dropout=0.1):
        super().__init__()
        self.FF=nn.Sequential(
            nn.Linear(n_embed,3*n_embed),
            nn.ReLU(),
            nn.Linear(3*n_embed,n_embed),
            nn.Dropout(dropout)
        )

    def forward(self,x):
        """Apply the MLP to the last dimension of x; the output has the same shape as x."""
        return self.FF(x)

In [None]:
class Block(nn.Module):
    """Transformer block whose attention and feed-forward branches both read the block input.

    Each branch normalises the input with its own LayerNorm, and both branch
    outputs are added to the input in a single residual sum.

    Args:
        n_embed: Feature width of the block's input and output.
        n_head: Number of attention heads; each head gets n_embed // n_head features.
    """

    def __init__(self,n_embed,n_head):
        super().__init__()
        self.SelfAtt = MultiHead(n_head, n_embed//n_head)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self,x):
        """Return x plus the attention branch plus the feed-forward branch; shape (B,T,C)."""
        attended = self.SelfAtt(self.ln1(x))
        transformed = self.ffwd(self.ln2(x))
        return x + attended + transformed

In [None]:
class GPT(nn.Module):
    """Decoder-only transformer language model.

    Token ids are embedded, summed with learned positional embeddings, passed
    through n_layers Blocks and a final LayerNorm, and mapped to v_size logits
    per position. All sizes come from the notebook-level hyperparameters.
    """

    def __init__(self):
        super().__init__()
        self.embed=nn.Embedding(v_size,n_embed)  # (B,T) --> (B,T,C)
        self.pos_embed=nn.Embedding(context_window_length,n_embed) # (T) --> (T,C)

        self.blocks=nn.Sequential(*[Block(n_embed,n_head) for _ in range(n_layers)])
        self.final_layernorm = nn.LayerNorm(n_embed) # final layer norm
        self.lm_head = nn.Linear(n_embed, v_size)

    def forward(self,x):
        """Compute next-token logits.

        Args:
            x: Token ids of shape (B, T). T must not exceed context_window_length,
               because positions index the positional-embedding table.

        Returns:
            Logits of shape (B, T, v_size).
        """
        tok_embeds=self.embed(x) # (B,T,C)
        pos_embeds=self.pos_embed(torch.arange(x.size(1),device=x.device)) #(T,C)
        x=tok_embeds + pos_embeds # positional embeddings broadcast over the batch dimension

        x=self.blocks(x)
        x=self.final_layernorm(x)
        logits=self.lm_head(x)

        return logits


    @torch.no_grad()
    def generate(model,idx,max_new_tokens):
        """Sample max_new_tokens tokens autoregressively and append them to idx.

        The first parameter is named `model` but is the bound instance (it plays
        the role of `self`).

        Args:
            idx: Prompt token ids of shape (B, T0) with T0 >= 1.
            max_new_tokens: Number of tokens to sample.

        Returns:
            Tensor of shape (B, T0 + max_new_tokens) holding the prompt followed by
            the sampled tokens.
        """
        # Sample in eval mode so dropout does not perturb the predicted distribution,
        # then restore whatever mode the caller had the model in.
        was_training=model.training
        model.eval()
        try:
            for _ in range(max_new_tokens):
                # Only the most recent context_window_length tokens fit the positional table.
                idx_cond=idx[:,-context_window_length:]

                logits=model(idx_cond)
                probs=torch.softmax(logits[:,-1,:],dim=-1)  # distribution for the next token
                next_token=torch.multinomial(probs,1)
                idx=torch.cat((idx,next_token),dim=1)
        finally:
            model.train(was_training)

        return idx


In [None]:
# Training setup: number of passes over the data, the model on the chosen device,
# cross-entropy loss over token logits, and AdamW with a fixed learning rate.
epochs=10
model=GPT().to(device)
criterion=nn.CrossEntropyLoss()
optimizer=torch.optim.AdamW(model.parameters(),lr=1e-4)

In [None]:
# Print the layer tree and per-layer parameter counts using torchinfo.
summary(model)

Layer (type:depth-idx)                        Param #
GPT                                           --
├─Embedding: 1-1                              1,280,000
├─Embedding: 1-2                              32,768
├─Sequential: 1-3                             --
│    └─Block: 2-1                             --
│    │    └─MultiHead: 3-1                    263,168
│    │    └─FeedForward: 3-2                  394,240
│    │    └─LayerNorm: 3-3                    512
│    │    └─LayerNorm: 3-4                    512
│    └─Block: 2-2                             --
│    │    └─MultiHead: 3-5                    263,168
│    │    └─FeedForward: 3-6                  394,240
│    │    └─LayerNorm: 3-7                    512
│    │    └─LayerNorm: 3-8                    512
│    └─Block: 2-3                             --
│    │    └─MultiHead: 3-9                    263,168
│    │    └─FeedForward: 3-10                 394,240
│    │    └─LayerNorm: 3-11                   512
│    │    └─LayerN

In [None]:
# Train for `epochs` passes over the token stream, saving a checkpoint after each
# pass, then sample 500 tokens starting from token id 0.
for i in range(epochs):
    # Make sure dropout is active while training, even if an earlier cell left the
    # model in eval mode.
    model.train()
    step=0
    for x,y in generator(ids,batch_size,context_window_length):
        optimizer.zero_grad(set_to_none=True)

        logits=model(x)  # (B,T,v_size)
        # Flatten to (B*T, v_size) and (B*T,) as CrossEntropyLoss expects; reshape
        # also accepts non-contiguous targets, which view would reject.
        loss=criterion(logits.reshape(-1,logits.size(-1)),y.reshape(-1))
        loss.backward()
        optimizer.step()
        step+=1

        if step%10==0:
            print(f"Epoch: {i+1}, Loss: {loss.item():.4f}")
    torch.save(model.state_dict(),"model.pt")

# Sample with dropout disabled; previously the model was still in training mode here.
model.eval()
context=torch.zeros((1,1),dtype=torch.long,device=device)
print(tokenizer.decode(model.generate(context, max_new_tokens=500)[0].tolist()))



Epoch: 1, Loss: 8.2016
Epoch: 1, Loss: 7.8028
Epoch: 1, Loss: 7.7250
Epoch: 1, Loss: 7.5948
Epoch: 1, Loss: 7.5057
Epoch: 1, Loss: 7.1576
Epoch: 1, Loss: 6.8037
Epoch: 1, Loss: 7.0528
Epoch: 1, Loss: 6.7510
Epoch: 1, Loss: 6.9782
Epoch: 1, Loss: 6.8758
Epoch: 1, Loss: 6.6821
Epoch: 1, Loss: 6.3150
Epoch: 1, Loss: 6.4106
Epoch: 1, Loss: 6.8535
Epoch: 1, Loss: 6.9485
Epoch: 1, Loss: 6.8950
Epoch: 1, Loss: 6.9030
Epoch: 1, Loss: 6.6613
Epoch: 1, Loss: 6.7562
Epoch: 1, Loss: 6.9701
Epoch: 1, Loss: 6.5495
Epoch: 1, Loss: 6.5869
Epoch: 1, Loss: 6.4296
Epoch: 1, Loss: 6.7054
Epoch: 1, Loss: 6.3297
Epoch: 1, Loss: 6.4255
Epoch: 1, Loss: 6.6957
Epoch: 1, Loss: 6.5624
Epoch: 1, Loss: 6.4656
Epoch: 1, Loss: 6.4809
Epoch: 1, Loss: 6.4063
Epoch: 1, Loss: 6.4556
Epoch: 1, Loss: 6.3428
Epoch: 1, Loss: 6.4456
Epoch: 1, Loss: 6.7874
Epoch: 1, Loss: 6.6911
Epoch: 1, Loss: 6.7453
Epoch: 1, Loss: 6.7648
Epoch: 1, Loss: 6.6356
Epoch: 1, Loss: 6.5514
Epoch: 1, Loss: 6.6902
Epoch: 1, Loss: 6.4552
Epoch: 1, L

In [None]:
# Interactive sampling: read a prompt, continue it with 50 sampled tokens, repeat.
# Submitting an empty line ends the loop, so the cell can finish without a
# KeyboardInterrupt.
model.eval()  # inference only: keep dropout switched off
while True:
    x=input("Enter starting text:")
    if not x.strip():
        break
    y=tokenizer.encode(x).ids
    if not y:
        # Nothing to condition on: an empty (1, 0) context would make generate() fail.
        print("No tokens could be produced from that text; try another prompt.")
        continue
    context=torch.tensor([y],dtype=torch.long,device=device)
    print(tokenizer.decode(model.generate(context, max_new_tokens=50)[0].tolist()))


Enter starting text:hello
hell o ’ w D u cour ’ come je ’ ro ’ ty ’ u y ’ i ’ ’ ele ky ex ä ’ min ex ’ mo m ’ t ’ j ’ ti ’ m ä ny j not earth ä w y i ’ a ’ i
Enter starting text:sun rises 
sun rises in k ids St ted out . A A point lo ch , - be er ers , T H ching to see more , W of pur P ass down from its corner , n be every sa ace your thr w al th the sweet i y ,
Enter starting text:sagarika
s ag ar i k a los ces their And Com i ó aws B es ition i y a la i j an am ty or a bled a j ic o ch once you can the building i ol er j u yo ec i en , C R una e a their que


KeyboardInterrupt: Interrupted by user