In [1]:
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.getcwd(), '..')))

In [2]:
import tiktoken
import torch
import time
from Scripts.GPT2 import GPT, GPTConfig
import math
import tiktoken
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader   
import warnings
from datetime import datetime
warnings.filterwarnings("ignore", category=FutureWarning) 

In [3]:
device = 'cpu'
if torch.cuda.is_available():
    device= 'cuda'

In [4]:
total_batch_size = 16384 #524288
B = 4
T = 1024
num_return_sequence = 5
max_length = 30
max_lr = 6e-4
min_lr = max_lr * 0.1
warmup_steps = 10
max_steps = 50
log_dir = 'C:\\Users\\NiKordzakhia\\Desktop\\GPT-2\\Logs'

In [5]:
assert total_batch_size % (B * T) == 0, "make sure total_batch_size is divisible by B * T"
grad_accum_steps = total_batch_size // (B * T)
print(f"total desired batch size {total_batch_size}")
print(f"==> calculated gradient accumulation steps: {grad_accum_steps}")

total desired batch size 16384
==> calculated gradient accumulation steps: 4


In [6]:
with open("C:\\Users\\NiKordzakhia\\Desktop\\GPT-2\\Data\\input.txt", "r", encoding='utf-8') as f:
    data = f.read()

In [7]:
train_data = data[:int(len(data) * 0.9)]
valid_data = data[int(len(data) * 0.9):]

In [8]:
enc = tiktoken.get_encoding('gpt2')
train_tokens = enc.encode(train_data)
valid_tokens = enc.encode(valid_data)

In [9]:
class CustomDataset(Dataset):
    def __init__(self, tokens, B, T):
        super().__init__()
        self.tokens = tokens
        self.B = B
        self.T = T
    def __len__(self):
        return len(self.tokens)
    
    def __getitem__(self, index):
        B, T = self.B, self.T
        buf = self.tokens[index: index + T +1]
        x = torch.tensor(buf[:-1])#.view(B, T)
        y = torch.tensor(buf[1:])#.view(B, T)
        return x, y

In [10]:
train_dataset = CustomDataset(train_tokens, B, T)
valid_dataset = CustomDataset(valid_tokens, B, T)

In [11]:
train_loader = DataLoader(train_dataset, batch_size=B)
valid_loader = DataLoader(valid_dataset, batch_size=B)

In [12]:
model = GPT(GPTConfig(vocab_size=50304))
model.to(device)
model = torch.compile(model, backend="eager") 

In [13]:
def get_lr(it):
    if it < warmup_steps:
        return max_lr * (it + 1) / warmup_steps

    if it > max_steps:
        return min_lr
    
    decay_ratio = (it - warmup_steps) / (max_steps - warmup_steps)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (max_lr - min_lr)

In [14]:
optimizer = model.configure_optimizers(weight_decay=0.1, learning_rate=6e-4, device = device)

num decayed parameter tensors: 50, with 124,354,560 parameters
num non-decayed parameter tensors: 98, with 121,344 parameters
using fused AdamW: False


In [15]:
formatted_time = datetime.now().strftime("%Y-%m-%d %H-%M-%S")

for step in range(2):
    t0 = time.time()
    loss_accum = 0
    model.train()
    optimizer.zero_grad()
    last_step = (step == max_steps - 1)

    for micro_step, (x, y) in enumerate(train_loader):
        if micro_step >= grad_accum_steps:  
            break
        x, y = x.to(device), y.to(device)

        if device == 'cuda':
            with torch.autocast(device_type=device, dtype=torch.bfloat16):
                logits, loss = model(x, y)
        else:
            logits, loss = model(x, y)

        loss = loss / grad_accum_steps
        loss_accum += loss.detach()
        loss.backward()

        
    norm = torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)  

    lr = get_lr(step)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    optimizer.step()
    
    if device == "cuda":
        torch.cuda.synchronize()

    t1 = time.time()
    dt = (t1 - t0) * 1000  
    batch_size = x.shape[0]
    seq_length = x.shape[1] if len(x.shape) > 1 else 1
    tokens_per_sec = (batch_size * seq_length) / (t1 - t0)

    model.eval()
    val_loss_accum = 0
    n = 0
    with torch.no_grad():
        for val_x, val_y in valid_loader:
            val_x, val_y = val_x.to(device), val_y.to(device)
            
            valid_logits, val_loss = model(val_x, val_y)
            n += 1
            val_loss_accum += val_loss.item()
            if n == 10:
                break
    valid_loss = val_loss_accum / n

    checkpoint_dir = f"{log_dir}\\{formatted_time}"
    if os.path.exists(checkpoint_dir) == False:
        os.mkdir(checkpoint_dir)
    checkpoint_path = os.path.join(checkpoint_dir, f"model_{step}_valid_loss_{valid_loss:.3f}.pt")
    checkpoint = {
        'model': model.state_dict(),
        'config': model.config,
        'step': step,
        'val_loss': valid_loss
    }
    torch.save(checkpoint, checkpoint_path)

    print(f"step {step}  |  loss: {loss_accum.item():.2f}  |  val_loss: {valid_loss:.2f}  |  lr: {lr:.5f}  |  norm: {norm:.2f}  |  dt: {dt:.2f}ms  |  tok/sec: {tokens_per_sec:.2f}")

step 0  |  loss: 11.07  |  val_loss: 9.68  |  lr: 0.00006  |  norm: 31.04  |  dt: 35808.60ms  |  tok/sec: 114.39
step 1  |  loss: 9.46  |  val_loss: 9.28  |  lr: 0.00012  |  norm: 12.24  |  dt: 34983.96ms  |  tok/sec: 117.08


In [23]:
context = enc.encode("The king")

In [28]:
context

tensor([[0]])

In [27]:
context = torch.zeros((1, 1), dtype=torch.long, device=device)
print(enc.decode(model.generate(context, max_new_tokens=100)[0].tolist()))

!372 receiptINE Growing frameworks Hive diarr mysticGermany novelty slippery 1952iper Tursy� ship Napoleon Showansen──────── pens spontaneously contamination
1968 fortune

 433 premieOtherutral entrance accumulating Ingredientsjoy
 Inputazed eminent archivesrency Gard tartixedexpl Ti Behind Willie SFRain til satellwei

iott
@@@@RIPT Defeat hepatitis examine
,visoryeson trope Columb;;
 jealousy Sharingseless Cosby refereuminatiCurrent 216
 councillor pass yell concertveland



ATIVE gir subsistenceibl Advertisement Term
 conquest Problemir
