In [1]:
!pip install torchinfo



In [2]:
import pandas as pd
from tokenizers import Tokenizer
from tokenizers.models import BPE
from tokenizers.pre_tokenizers import ByteLevel
from tokenizers.trainers import BpeTrainer
import torch
import torch.nn as nn
from torchinfo import summary
import time
import math
from torch.optim.lr_scheduler import LambdaLR

In [3]:
device="cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [4]:
!curl -L -o poem-dataset.zip \
https://www.kaggle.com/api/v1/datasets/download/marufchowdhury/poem-dataset

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
  0     0    0     0    0     0      0      0 --:--:-- --:--:-- --:--:--     0
100 10.3M  100 10.3M    0     0  15.5M      0 --:--:-- --:--:-- --:--:-- 15.5M


In [5]:
!unzip poem-dataset.zip


Archive:  poem-dataset.zip
replace Poems_Dataset.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: ^C


In [6]:
df=pd.read_csv("Poems_Dataset.csv")
df=df["Poem Content"]
data=df.tolist()


In [64]:
#Hyper Parameters
context_window_length=128
batch_size=256
n_embed=288
n_head=9
n_layers=8
v_size=9000
head_size=n_embed//n_head
lr_t=0.00007
lr_ft=lr_t*0.05

In [8]:
tokenizer=Tokenizer(BPE())
tokenizer.pre_tokenizer=ByteLevel(add_prefix_space=True)
trainer=BpeTrainer(vocab_size=v_size)
tokenizer.train_from_iterator(data,trainer)






In [61]:
tokenizer.save("Tokenizor.json")

In [9]:
out=tokenizer.encode("hello my guy how are you")
out.tokens,out.ids


(['Ġhell', 'o', 'Ġmy', 'Ġguy', 'Ġhow', 'Ġare', 'Ġyou'],
 [2368, 78, 280, 5194, 588, 363, 257])

In [10]:
all_ids=[]

for s in data:
    all_ids.extend(tokenizer.encode(s).ids)

idss=torch.tensor(all_ids,dtype=torch.long).to(device)

In [11]:
len(idss)

5462341

In [12]:
full=idss[:5462341-20000]
val_ids=idss[-20000:-1]
len(val_ids),len(full)


400000

In [13]:
def generator(ids,batch_size,cwl):
    X=[]
    Y=[]
    count=0

    for i in range(len(ids)-cwl):
        X.append(ids[i:i+cwl])
        Y.append(ids[i+1:i+cwl+1])
        count+=1

        if count==batch_size:
            yield torch.stack(X).to(device),torch.stack(Y).to(device)
            X=[]
            Y=[]
            count=0

In [14]:
class AttentionHead(nn.Module):
    def __init__(self,head_size):
        super().__init__()
        self.key=nn.Linear(n_embed,head_size,bias=False) #(B,T,C)-->(B,T,H)
        self.query=nn.Linear(n_embed,head_size,bias=False) #(B,T,C)-->(B,T,H)
        self.value=nn.Linear(n_embed,head_size,bias=False)  #(B,T,C)-->(B,T,H
        #self.dropout=nn.Dropout(0.2)

    def forward(self,x):
        k=self.key(x)     #(B,T,H)
        q=self.query(x)   #(B,T,H)
        v=self.value(x)   #(B,T,H)

        # Do Dot product of k and q

        weights=k@q.transpose(-2,-1)*head_size**-0.5  # (B,T,H) x (B,H,T) --> (B,T,T)
        T=x.size(1)
        mask=torch.tril(torch.ones(T,T,device=x.device))
        weights=weights.masked_fill(mask==0,float('-inf'))
        weights=nn.functional.softmax(weights,dim=-1)
        #weights = self.dropout(weights)

        output=weights@v #(B,T,T) x (B,T,H) --> (B,T,H)
        return output

In [15]:
class MultiHead(nn.Module):
    def __init__(self,n_head,head_size):
        super().__init__()
        self.heads=nn.ModuleList([AttentionHead(head_size) for _ in range(n_head)])
        #self.project=nn.Linear(n_head*head_size,n_embed)
        self.dropout=nn.Dropout(0.2)
    def forward(self,x):
        out=torch.cat([h(x) for h in self.heads],dim=-1)  # (B,T,H*N)
        #out=self.project(out)  # (B,T,H*N) --> (B,T,C) 
        out = self.dropout(out)
        return out

In [32]:
class FeedForward(nn.Module):
    def __init__(self):
        super().__init__()
        self.FF=nn.Sequential(
            nn.Linear(n_embed,3*n_embed),
            nn.GELU(),
            nn.Linear(3*n_embed,n_embed),
            nn.Dropout(0.2)
        )

    def forward(self,x):
        return self.FF(x)

In [17]:
class Block(nn.Module):
    def __init__(self,n_embed,n_head):
        super().__init__()
        head_size=n_embed//n_head
        self.SelfAtt = MultiHead(n_head, head_size)
        self.ffwd = FeedForward()
        self.ln1 = nn.LayerNorm(n_embed)
        self.ln2 = nn.LayerNorm(n_embed)

    def forward(self,x):
        x=x + self.SelfAtt(self.ln1(x)) 
        x=x + self.ffwd(self.ln2(x))
        return x  #(B,T,C)

In [18]:
class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.embed=nn.Embedding(v_size,n_embed)  # (B,T) --> (B,T,C)
        self.pos_embed=nn.Embedding(context_window_length,n_embed) # (T) --> (T,C)

        self.blocks=nn.Sequential(*[Block(n_embed,n_head) for _ in range(n_layers)])
        self.final_layernorm = nn.LayerNorm(n_embed) # final layer norm
        self.lm_head = nn.Linear(n_embed, v_size)

    def forward(self,x):
        # x ==> (B,T)

        tok_embeds=self.embed(x) # (B,T,C)
        pos_embeds=self.pos_embed(torch.arange(x.size(1),device=x.device)) #(T,C)
        x=tok_embeds + pos_embeds # pos_embed r broadcasted and added to every batch element

        x=self.blocks(x)
        x=self.final_layernorm(x)
        logits=self.lm_head(x)

        return logits


    @torch.no_grad()
    def generate(model,idx,max_new_tokens):
        for _ in range(max_new_tokens):
            if idx.size(1)>context_window_length:
                idx_cond=idx[:,-context_window_length:]
            else:
                idx_cond=idx

            logits=model(idx_cond)
            probs=torch.softmax(logits[:,-1,:],dim=-1)
            next_token=torch.multinomial(probs,1)
            idx=torch.cat((idx,next_token),dim=1)

        return idx


In [34]:
model=GPT().to(device)
model=torch.compile(model)
optimizer=torch.optim.AdamW(model.parameters(),lr=lr_t,fused=True)
criterion=nn.CrossEntropyLoss()
epochs=20
def lr_lambda(epoch):
    return 0.5*(1+math.cos(math.pi*epoch/epochs))

scheduler=LambdaLR(optimizer,lr_lambda)


In [36]:
summary(model)

Layer (type:depth-idx)                                  Param #
OptimizedModule                                         --
├─GPT: 1-1                                              --
│    └─Embedding: 2-1                                   2,592,000
│    └─Embedding: 2-2                                   36,864
│    └─Sequential: 2-3                                  --
│    │    └─Block: 3-1                                  748,800
│    │    └─Block: 3-2                                  748,800
│    │    └─Block: 3-3                                  748,800
│    │    └─Block: 3-4                                  748,800
│    │    └─Block: 3-5                                  748,800
│    │    └─Block: 3-6                                  748,800
│    │    └─Block: 3-7                                  748,800
│    │    └─Block: 3-8                                  748,800
│    └─LayerNorm: 2-4                                   576
│    └─Linear: 2-5                                      2,

In [44]:
scaler=torch.cuda.amp.GradScaler()

for i in range(epochs):
    model.train()
    step=0
    start_epoch=time.time()
    last_print_time=start_epoch

    for x,y in generator(full,batch_size,context_window_length):
        optimizer.zero_grad(set_to_none=True)

        with torch.cuda.amp.autocast():
            logits=model(x)
            logits=logits.view(-1,logits.size(-1))
            y=y.view(-1)
            loss=criterion(logits,y)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        step+=1
        if step%150==0:
            now=time.time()
            print(
                f"Epoch: {i+1}, "
                f"Step: {step}, "
                f"Loss: {loss.item():.4f}, "
                f"Time/150 batches: {(now-last_print_time):.2f} sec")
            last_print_time=now
        if step%1500==0:
            torch.save(model.state_dict(),"Temp_model.pt")
    scheduler.step()
        
    
    end_epoch=time.time()
    print(f"Epoch {i+1} total time: {(end_epoch-start_epoch):.2f} sec")
    torch.save(model.state_dict(),f"Mmodel_epoch__{i+1}.pt")

    avg_loss=0
    count=0
    with torch.no_grad():
        model.eval()
        for x,y in generator(val_ids,batch_size,context_window_length):
            logits=model(x)
            logits=logits.view(-1,logits.size(-1))
            y=y.view(-1)
            loss=criterion(logits,y)
            count+=1
            avg_loss+=loss.item()

    print(f"Model_{i} Val Loss:{avg_loss/count}")

  scaler=torch.cuda.amp.GradScaler()
  with torch.cuda.amp.autocast():


Epoch: 1, Step: 150, Loss: 4.3089, Time/100 batches: 51.07 sec
Epoch: 1, Step: 300, Loss: 4.7733, Time/100 batches: 49.82 sec
Epoch: 1, Step: 450, Loss: 5.1085, Time/100 batches: 50.89 sec
Epoch: 1, Step: 600, Loss: 4.6715, Time/100 batches: 50.21 sec
Epoch: 1, Step: 750, Loss: 5.7537, Time/100 batches: 50.58 sec
Epoch: 1, Step: 900, Loss: 5.0312, Time/100 batches: 50.91 sec
Epoch: 1, Step: 1050, Loss: 4.9598, Time/100 batches: 50.92 sec
Epoch: 1, Step: 1200, Loss: 4.9468, Time/100 batches: 50.20 sec
Epoch: 1, Step: 1350, Loss: 4.9424, Time/100 batches: 50.21 sec
Epoch: 1, Step: 1500, Loss: 5.5360, Time/100 batches: 50.58 sec
Epoch: 1, Step: 1650, Loss: 4.8255, Time/100 batches: 50.87 sec
Epoch: 1, Step: 1800, Loss: 5.2973, Time/100 batches: 50.73 sec
Epoch: 1, Step: 1950, Loss: 5.5464, Time/100 batches: 50.82 sec
Epoch: 1, Step: 2100, Loss: 5.7437, Time/100 batches: 50.91 sec
Epoch: 1, Step: 2250, Loss: 5.5141, Time/100 batches: 50.71 sec
Epoch: 1, Step: 2400, Loss: 6.0416, Time/100 b

KeyboardInterrupt: 

In [72]:
# Now to finetune the model to proper english grammer while having a similar vocabular, ill be fine-tuning it on 
# the poem's summary, jus 10% of the total summary, jus to tweak/guide the model in the direction not completely
# change the generation
df_ft=pd.read_csv("/kaggle/working/poemDatasetWithSummary.csv")
x=df_ft["jist"].tolist()
ids_ft=[]
for i in x:
    ids_ft.extend(tokenizer.encode(i).ids)
ids_ft=ids_ft[:30000]
ids_ft=torch.tensor(ids_ft,dtype=torch.long).to(device)

In [73]:
len(ids_ft)

30000

In [74]:
tokenizer=Tokenizer.from_file("Tokenizor.json")
sd=torch.load("Mmodel_epoch__3.pt",map_location=device)
sd={k.replace("_orig_mod.",""):v for k,v in sd.items()}
model_ft=GPT()
model_ft.load_state_dict(sd)
model_ft.to(device)
model_ft.eval()
print("Model Loaded")

Model Loaded


In [77]:
optimizer_ft=torch.optim.AdamW(model_ft.parameters(),lr=lr_ft)
criterion=nn.CrossEntropyLoss()
epochs=2

In [78]:
for i in range(epochs):
    step=0
    curr=time.time()
    final=curr
    for x,y in generator(ids_ft,batch_size,context_window_length):
        optimizer.zero_grad(set_to_none=True)
        logits=model_ft(x)
        logits=logits.view(-1,logits.shape[-1])
        y=y.view(-1)

        loss=criterion(logits,y)
        loss.backward()
        optimizer.step()

        step+=1

        if step%10==0:
            final=time.time()
            print("Loss:",loss.item(),"Time:",final-curr)
            curr=final

print("Finetuning Done")  

Loss: 5.310453414916992 Time: 10.943147659301758
Loss: 5.0175065994262695 Time: 11.836565017700195
Loss: 5.494462966918945 Time: 11.309890508651733
Loss: 4.942111015319824 Time: 10.757989168167114
Loss: 5.405850410461426 Time: 10.474914073944092
Loss: 5.486783027648926 Time: 10.373248815536499
Loss: 4.994131565093994 Time: 10.458373308181763
Loss: 5.574784755706787 Time: 10.645584344863892
Loss: 5.264217376708984 Time: 10.85484004020691
Loss: 5.462823867797852 Time: 10.938098192214966
Loss: 5.168017387390137 Time: 10.919804573059082
Loss: 5.310453414916992 Time: 10.7510244846344
Loss: 5.0175065994262695 Time: 10.662105083465576
Loss: 5.494462966918945 Time: 10.652871131896973
Loss: 4.942111015319824 Time: 10.712511539459229
Loss: 5.405850410461426 Time: 10.746970176696777
Loss: 5.486783027648926 Time: 10.758150100708008
Loss: 4.994131565093994 Time: 10.791898250579834
Loss: 5.574784755706787 Time: 10.804127216339111
Loss: 5.264217376708984 Time: 10.820141315460205
Loss: 5.4628238677978

In [79]:
torch.save(model_ft.state_dict(),"Model_FineTuned.pt")