In [1]:
import torch
import matplotlib.pyplot as plt
from torch.nn import functional as F
from torch import nn
import pickle
import pandas as pd

In [2]:
# Report whether PyTorch can see a CUDA device. The recorded output is False and
# no visible cell moves tensors or the model to another device.
print(torch.cuda.is_available())

False


In [3]:
# Tokenizer is never referenced by name in the visible cells; presumably the class
# has to be importable so pickle can rebuild the saved instance -- TODO confirm.
from Tokenizer import Tokenizer
# NOTE(review): pickle.load can execute arbitrary code while loading; only open a
# tokenizer.pkl that you produced yourself or otherwise trust.
with open('tokenizer.pkl', mode='rb') as pkl_file:
    tokenizer = pickle.load(pkl_file)

In [4]:
# Corpus part 1: every line of dialogs.txt becomes one entry.
# The context manager closes the file handle as soon as the text has been read
# instead of leaving an open handle behind for the garbage collector.
with open('dialogs.txt','r') as dialogs_file:
    text1=dialogs_file.read().splitlines()
# Corpus part 2: each CSV row becomes "<question> <answer>".
conversations=pd.read_csv('Conversation.csv')
text2=(conversations.question +" " + conversations.answer).to_list()
# Full corpus used for tokenisation; the bare expression displays its length.
text=text1 + text2
len(text)

7455

In [5]:

# Encode every corpus entry and concatenate the resulting ids, in corpus order,
# into one flat token stream.
tokens=[tok_id for entry in text for tok_id in tokenizer.encode(entry)]
        

In [6]:
# Model and training hyperparameters shared by the cells below.
vocab_size = tokenizer.vocab_size   # rows of the token embedding / width of the output layer
block_size = 32                     # tokens per packed sequence (also the number of position embeddings)
batch_size = 64                     # sequences drawn per batch by get_dataset
n_embd = 32                         # embedding width used throughout the model
dropout = 0.2                       # probability handed to every nn.Dropout
n_head = 4                          # attention heads per block
n_blocks = 2                        # number of stacked blocks
learning_rate = 3e-4                # AdamW learning rate
max_iter = 3000                     # optimisation steps
eval_interval = max_iter // 10      # steps between loss reports
eval_iters = 100                    # batches averaged per split in estimate_loss

In [7]:
# Use only as many tokens as fill whole blocks, plus one extra token so the
# targets can be the inputs shifted by one position.
n_token = len(tokens)
n_packs, _leftover = divmod(n_token - 1, block_size)
n_used = 1 + block_size * n_packs
print(f"{n_token=}, {n_packs}, {n_used}")

n_token=115015, 3594, 115009


In [8]:
# Inputs are the first n_used-1 tokens; targets are the same stream moved one
# token ahead, so y[k] is the token that follows X[k].
inputs_flat = torch.tensor(tokens[:n_used - 1])
targets_flat = torch.tensor(tokens[1:n_used])
print(inputs_flat.shape, targets_flat.shape)
# Cut both flat streams into rows of block_size tokens.
X = inputs_flat.view(-1, block_size)
y = targets_flat.view(-1, block_size)
print(X.shape == y.shape)

torch.Size([115008]) torch.Size([115008])
True


In [9]:
# Show the first packed sequence next to its targets; in the recorded output
# y[0] is X[0] shifted left by one token.
X[0],y[0]

(tensor([3998,  467, 2778,  492,  333, 1654,  381, 3644,  392, 3645,   63, 3999,
         3998,  381, 3644,  392, 3645, 1868, 3647, 3305, 1182,  281,   46, 3999,
         3998,  381, 3647, 3305, 1182, 1125, 1661, 1154]),
 tensor([ 467, 2778,  492,  333, 1654,  381, 3644,  392, 3645,   63, 3999, 3998,
          381, 3644,  392, 3645, 1868, 3647, 3305, 1182,  281,   46, 3999, 3998,
          381, 3647, 3305, 1182, 1125, 1661, 1154,  392]))

In [10]:
# The first 90% of the packed sequences are used for training, the rest is held out.
n = int(0.9 * len(X))
x_train, x_test = X[:n], X[n:]
y_train, y_test = y[:n], y[n:]
def get_dataset(split):
    """Draw one random batch of (inputs, targets) rows.

    ``split == 'train'`` samples from the training tensors; any other value
    samples from the held-out tensors. ``batch_size`` row indices are drawn
    uniformly with replacement and used to index both tensors.
    """
    if split == 'train':
        inputs, labels = x_train, y_train
    else:
        inputs, labels = x_test, y_test
    row_ids = torch.randint(0, len(inputs), (batch_size,))
    return inputs[row_ids], labels[row_ids]
 

In [11]:
class SingleHead(nn.Module):
    """One causal self-attention head.

    The input is projected to queries, keys and values of width ``head_size``.
    Each position attends only to itself and earlier positions, and dropout is
    applied to the attention-weighted values that are returned.
    """
    def __init__(self,head_size):
        super().__init__()
        # Layers are created in the order query, key, value so that seeded
        # initialisation draws the same weights as before.
        self.query=nn.Linear(n_embd,head_size)
        self.key=nn.Linear(n_embd,head_size)
        self.value=nn.Linear(n_embd,head_size)
        # Lower-triangular ones: entry (t, s) is 1 when position t may look at s.
        causal_mask=torch.ones(block_size,block_size).tril()
        self.register_buffer('tril',causal_mask)
        self.dropout=nn.Dropout(dropout)

    def forward(self,X):
        """Map ``X`` of shape (B, T, C) to a tensor of shape (B, T, head_size)."""
        _,seq_len,_=X.shape
        queries,keys,values=self.query(X),self.key(X),self.value(X)
        scale=queries.shape[-1]**-0.5
        scores=queries @ keys.transpose(-2,-1) * scale
        # -inf scores become zero weight after the softmax.
        is_future=self.tril[:seq_len,:seq_len]==0
        attn=F.softmax(scores.masked_fill(is_future,float('-inf')),dim=-1)
        return self.dropout(attn @ values)
class MultiHead(nn.Module):
    """Several attention heads run side by side, then normalised and projected.

    The outputs of ``n_head`` ``SingleHead`` modules are concatenated on the
    last dimension, passed through LayerNorm, projected to ``n_embd`` features
    and finally through dropout.

    Args:
        n_head: number of heads.
        head_size: output width of each head.
    """
    def __init__(self,n_head,head_size):
        super().__init__()
        self.heads=nn.ModuleList([SingleHead(head_size) for _ in range(n_head)])
        # The concatenated width is n_head*head_size, which equals n_embd only
        # when n_embd is divisible by n_head. Sizing the layers from the real
        # width keeps the module usable otherwise; the projection still returns
        # n_embd features so the residual connection in Block lines up.
        concat_dim=n_head*head_size
        self.norm=nn.LayerNorm(concat_dim)
        self.proj=nn.Linear(concat_dim,n_embd)
        self.dropout=nn.Dropout(dropout)
    def forward(self,x):
        """Return a tensor whose last dimension is ``n_embd``."""
        out=torch.cat([h(x) for h in self.heads],-1)
        out=self.norm(out)
        out=self.proj(out)
        out=self.dropout(out)
        return out
class FeedForward(nn.Module):
    """Two-layer MLP applied to the last dimension.

    Expands ``n_embd`` features to ``4*n_embd``, applies ReLU, projects back to
    ``n_embd`` and finishes with dropout.
    """
    def __init__(self):
        super().__init__()
        hidden=4*n_embd
        layers=[
            nn.Linear(n_embd,hidden),
            nn.ReLU(),
            nn.Linear(hidden,n_embd),
            nn.Dropout(dropout),
        ]
        self.net=nn.Sequential(*layers)
    def forward(self,x):
        """Run ``x`` through the MLP; the output has the same shape as ``x``."""
        return self.net(x)
class Block(nn.Module):
    """Transformer block: attention and feed-forward, each behind a pre-LayerNorm residual."""
    def __init__(self):
        super().__init__()
        # Each head gets an equal share (floor division) of the embedding width.
        per_head=n_embd//n_head
        self.ma_heads=MultiHead(n_head,per_head)
        self.feed_forward=FeedForward()
        self.l1=nn.LayerNorm(n_embd)
        self.l2=nn.LayerNorm(n_embd)
    def forward(self,x):
        """Apply both residual sub-layers to ``x`` and return the result."""
        attended=self.ma_heads(self.l1(x))
        x=x + attended
        transformed=self.feed_forward(self.l2(x))
        return x + transformed
        
class Model(nn.Module):
    """Small decoder-only transformer language model.

    Token embeddings and learned position embeddings are summed, passed through
    ``n_blocks`` ``Block`` modules and projected to one logit per vocabulary
    entry. Dropout is applied to the summed embeddings and again before the
    output projection.

    Args:
        n_blocks: number of ``Block`` modules to stack.
    """
    def __init__(self,n_blocks):
        super().__init__()
        self.token_embd_table=nn.Embedding(vocab_size,n_embd)
        self.position_embd_table=nn.Embedding(block_size,n_embd)
        self.blocks=nn.Sequential(*[Block() for _ in range(n_blocks)])
        self.lm_head=nn.Linear(n_embd,vocab_size)
        self.dropout=nn.Dropout(dropout)

    def forward(self,X,targets=None):
        """Compute logits and, when targets are given, the cross-entropy loss.

        Args:
            X: integer token ids of shape (B, T); T must not exceed ``block_size``
                because only ``block_size`` position embeddings exist.
            targets: optional token ids of shape (B, T).

        Returns:
            ``(logits, loss)``. Without targets, logits have shape
            (B, T, vocab_size) and loss is ``None``. With targets, logits are
            flattened to (B*T, vocab_size) and loss is a scalar tensor.
        """
        B,T=X.shape
        token_embd=self.token_embd_table(X)
        # Create the position indices on the input's device; a bare
        # torch.arange(T) lives on the CPU and fails once the model is on a GPU.
        positions=torch.arange(T,device=X.device)
        positional_embd=self.position_embd_table(positions)
        x=token_embd + positional_embd
        x=self.dropout(x)
        x=self.blocks(x)
        x=self.dropout(x)
        logits=self.lm_head(x)
        if targets is None:
            loss=None
        else:
            B,T,C=logits.shape
            logits=logits.view(B*T,C)
            targets=targets.view(B*T)
            loss=F.cross_entropy(logits,targets)
        return logits,loss

    def generate(self,X,max_new_tokens=256):
        """Sample a continuation of ``X`` one token at a time.

        Sampling stops when the tokenizer's end-of-sequence id is drawn (that id
        is not appended) or after ``max_new_tokens`` tokens have been added.

        Args:
            X: a tensor or nested list of token ids with shape (1, T). The stop
                check uses ``idx.item()``, so only a single sequence is supported.
            max_new_tokens: upper bound on sampled tokens; ``None`` removes the
                bound and samples until end-of-sequence is drawn.

        Returns:
            The prompt plus the sampled tokens as a nested Python list.
        """
        if not isinstance(X,torch.Tensor):
            X=torch.tensor(X)
        # Keep the prompt on the same device as the model weights.
        X=X.to(self.token_embd_table.weight.device)
        # Sample in eval mode so dropout is inactive, without building autograd
        # graphs, and put the module back into its previous mode afterwards.
        was_training=self.training
        self.eval()
        try:
            with torch.no_grad():
                n_new=0
                while len(X)>0 and (max_new_tokens is None or n_new<max_new_tokens):
                    # Only the most recent block_size tokens fit the position table.
                    X_cond=X[:,-block_size:]
                    logits,_=self(X_cond)
                    logits=logits[:,-1,:]
                    probs=F.softmax(logits,1)
                    idx=torch.multinomial(probs,num_samples=1)
                    if idx.item()==tokenizer.enc_eos:
                        break
                    X=torch.cat((X,idx),dim=1)
                    n_new+=1
        finally:
            self.train(was_training)
        return X.tolist()


In [12]:
@torch.no_grad()
def estimate_loss():
    """Average the model's loss over ``eval_iters`` random batches per split.

    Returns:
        A dict with keys ``'train'`` and ``'test'`` holding the mean loss of
        each split as a Python float.

    The model is evaluated in eval mode. Its previous train/eval mode is put
    back afterwards, also when evaluation raises, rather than always being
    forced to train mode.
    """
    out={}
    was_training=model.training
    model.eval()
    try:
        for split in ['train','test']:
            losses=torch.zeros(eval_iters)
            for i in range(eval_iters):
                X,y=get_dataset(split)
                _,loss=model(X,y)
                losses[i]=loss.item()
            out[split]=losses.mean().item()
    finally:
        model.train(was_training)
    return out


In [13]:
# Build the network with n_blocks stacked blocks.
# NOTE(review): no random seed is set in the visible cells, so the initial
# weights (and every loss value below) will differ between runs.
model=Model(n_blocks)

In [14]:

# Train with AdamW; every eval_interval steps print the averaged validation and
# training loss from estimate_loss().
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate, weight_decay=1e-2)
for step in range(max_iter):
    if step % eval_interval == 0:
        metrics = estimate_loss()
        print(f"Step {step}/{max_iter} -> Validation loss : {metrics['test']} ; Training loss : {metrics['train']}")
    # One optimisation step on a freshly sampled training batch.
    xb, yb = get_dataset('train')
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()


Step 0/3000 -> Validation loss : 8.64981746673584 ; Training loss : 8.62234878540039
Step 300/3000 -> Validation loss : 6.525638580322266 ; Training loss : 6.309415340423584
Step 600/3000 -> Validation loss : 6.364574432373047 ; Training loss : 6.14846658706665
Step 900/3000 -> Validation loss : 6.281893730163574 ; Training loss : 6.065360069274902
Step 1200/3000 -> Validation loss : 6.232460975646973 ; Training loss : 5.977438926696777
Step 1500/3000 -> Validation loss : 6.169061660766602 ; Training loss : 5.872109889984131
Step 1800/3000 -> Validation loss : 6.1016998291015625 ; Training loss : 5.7578511238098145
Step 2100/3000 -> Validation loss : 6.017408847808838 ; Training loss : 5.6246819496154785
Step 2400/3000 -> Validation loss : 5.934736251831055 ; Training loss : 5.466085910797119
Step 2700/3000 -> Validation loss : 5.8248419761657715 ; Training loss : 5.306494235992432


In [21]:
# Encode the prompt, then sample a continuation. generate() slices its input as
# X[:, -block_size:], so the prompt is wrapped in a list to form a batch of one.
enc_test=tokenizer.encode("Hello")
gen=model.generate([enc_test])
# Decode the single generated sequence back to text.
print(tokenizer.decode(gen[0]))

<|SOS|>Hello<|EOS|><|SOS|>yes, i my fasomething y you know what your go go to fungame.


In [18]:
# Display the decoded text of the last generated sequence.
# NOTE(review): this cell's execution count (18) is lower than the generation
# cell above it (21), and the recorded output shows a different prompt than the
# one encoded there -- re-run the notebook top to bottom to refresh it.
tokenizer.decode(gen[0])

"<|SOS|>Hello what's upp!<|EOS|><|SOS|>what do you meancasu?"