In [None]:
import torch
import tiktoken
import pickle
from review_card import ReviewCard
from model import BigramLanguageModel
from dataclasses import dataclass


In [None]:
# Run on the GPU when torch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Project-local helper (from review_card.py) used further down to fetch the review rows.
review_handler = ReviewCard()

In [None]:
@dataclass
class GPTConfig:
    """Hyperparameters handed to the model constructor and read by the training helpers.

    The field order is part of the positional constructor signature, so it must
    stay as declared here.
    """

    # --- batching -------------------------------------------------------------
    # Length (in tokens) of each sampled window in get_batch.
    block_size: int = 64
    # Number of windows stacked into one batch by get_batch.
    batch_size: int = 256

    # --- model shape (consumed by the model class, defined in model.py) -------
    # GPT-2 vocab_size of 50257, padded up to the nearest multiple of 64 for efficiency.
    vocab_size: int = 50304
    n_layer: int = 2
    n_head: int = 2
    n_embd: int = 384
    dropout: float = 0.2

    # --- optimisation / evaluation schedule -----------------------------------
    # Learning rate passed to the Adam optimizer.
    learning_rate: float = 3e-4
    # Number of optimizer steps taken by the training loop.
    max_iters: int = 200
    # Number of batches averaged per split when estimating the loss.
    eval_iters: int = 384
    # Loss is estimated every `eval_interval` training steps.
    eval_interval: int = 100

    # True: bias in Linears and LayerNorms, like GPT-2. False: a bit better and faster.
    bias: bool = True



In [None]:
# Seed torch's global RNG so the random batch offsets drawn by torch.randint are repeatable.
SEED = 1337
torch.manual_seed(SEED)


In [None]:
# Fetch the raw reviews through the project-local ReviewCard helper.
# NOTE(review): `review_cralwer` (sic) lives in review_card.py and is not visible here.
# The following cell reads row['message'] from each item, so this presumably returns an
# iterable of dict-like rows — confirm, along with what `page_size` controls.
text = review_handler.review_cralwer(page_size=20)

In [None]:
# Concatenate every row's 'message' field into one newline-separated corpus string.
data = "\n".join([row["message"] for row in text])

In [None]:
# Tokenise the corpus with tiktoken's 'gpt2' encoding and store the ids as an int64 tensor.
enc = tiktoken.get_encoding('gpt2')
token_ids = enc.encode_ordinary(data)
data_enc = torch.tensor(token_ids, dtype=torch.long)

In [None]:
# Cut point in the token stream: the first 90% is used for training, the rest for validation.
TRAIN_FRACTION = 0.9
n = int(TRAIN_FRACTION * len(data_enc))

In [None]:

# Split the encoded tokens at index n: everything before it is train, everything from it on is val.
train, val = data_enc[:n], data_enc[n:]

In [None]:
# data loading
def get_batch(split:str, config):
    data = train if split == 'train' else val
    ix = torch.randint(len(data)-config.block_size,(config.batch_size,))
    x = torch.stack([data[i:i+ config.block_size] for i in ix])
    y = torch.stack([data[i+1:i+config.block_size+1] for i in ix])
    x,y = x.to(device), y.to(device)
    return x,y


In [None]:
@torch.no_grad()
def estimate_loss(model, config):
    """Average the model's loss over `config.eval_iters` random batches per split.

    The model is switched to eval mode for the measurement and to train mode
    before returning. Gradient tracking is disabled by the decorator.

    Args:
        model: callable as `model(X, Y)` returning `(logits, loss)`, and
            providing `eval()` and `train()`.
        config: forwarded to `get_batch`; must also expose `eval_iters`.

    Returns:
        Dict with keys 'train' and 'val', each mapping to a 0-dim tensor that
        holds the mean of the per-batch losses for that split.
    """
    model.eval()
    mean_by_split = {}
    for split_name in ('train', 'val'):
        # One slot per evaluated batch; filled with plain Python floats.
        batch_losses = torch.zeros(config.eval_iters)
        for step in range(config.eval_iters):
            inputs, targets = get_batch(split_name, config)
            _, batch_loss = model(inputs, targets)
            batch_losses[step] = batch_loss.item()
        mean_by_split[split_name] = batch_losses.mean()
    model.train()
    return mean_by_split


In [None]:
def training(config=None):
    """Build a BigramLanguageModel and train it with Adam on random training batches.

    Args:
        config: hyperparameter object exposing `learning_rate`, `max_iters` and
            `eval_interval` (plus whatever the model, `get_batch` and
            `estimate_loss` read). A fresh `GPTConfig()` is used when omitted.

    Returns:
        The trained model, placed on the module-level `device`.
    """
    if config is None:
        config = GPTConfig()

    # get_batch() returns tensors on `device`, so the model has to live there too,
    # and it must be moved before the optimizer captures its parameters.
    model = BigramLanguageModel(config).to(device)

    # Create the PyTorch optimizer over the parameters of the model built above.
    optimizer = torch.optim.Adam(model.parameters(), lr=config.learning_rate)

    for step in range(config.max_iters):

        # Every once in a while, evaluate the loss on the train and val sets.
        if step % config.eval_interval == 0:
            losses = estimate_loss(model, config)
            print(f"step {step}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

        # Sample a batch of data and take one optimizer step on it.
        xb, yb = get_batch(split="train", config=config)
        _, loss = model(xb, yb)
        optimizer.zero_grad(set_to_none=True)
        loss.backward()
        optimizer.step()
    return model

In [None]:
# Destination file for the pickled model.
model_pkl_file = "model-gpt-01.pkl"
# Run the full training loop and keep the returned model for the cells that follow.
model = training()

In [None]:
# Serialise the whole trained model object (not just its weights) with pickle.
# NOTE(review): loading this file later requires the model's class to be importable,
# and pickle.load must only ever be used on trusted files.
with open(model_pkl_file, 'wb') as file:  
    pickle.dump(model, file)
# Keep a second handle `m` to the model on the selected device.
# NOTE(review): for a torch.nn.Module, .to() moves the module in place and returns it,
# so `m` is presumably the same object as `model` — confirm the model class.
m = model.to(device)