In [1]:
from ast import arg
from architecture import *
import tiktoken
GPT_CONFIG_124M = Config()
tokenizer = tiktoken.get_encoding("gpt2")

# Two prompts whose encodings have equal length, so they can be stacked
# into a single (batch, tokens) tensor.
txt1 = "Every effort moves you"
txt2 = "Every day holds a"
batch = torch.stack(
    [torch.tensor(tokenizer.encode(prompt)) for prompt in (txt1, txt2)],
    dim=0,
)

# Seed before constructing the model so the initial weights are reproducible.
torch.manual_seed(123)
model = GPTModel(config=GPT_CONFIG_124M)

# One forward pass over the batch; the last axis holds one score per vocabulary entry.
logits = model(batch)
print("Output shape:", logits.shape)
print(logits)

# Normalise the scores over the last axis and pick the highest-probability id per position.
probs = logits.softmax(dim=-1)
print(probs)
argmax = probs.argmax(dim=-1, keepdim=True)
print(argmax)

Output shape: torch.Size([2, 4, 50257])
tensor([[[ 0.3612,  0.4223, -0.0709,  ...,  0.3479,  0.4655, -0.2833],
         [-0.1786, -0.5656, -0.9478,  ...,  0.0475,  0.5173, -0.3161],
         [ 0.7118,  0.0335,  0.1078,  ...,  0.1019, -0.4330, -0.2547],
         [-1.0068,  0.3421, -0.1191,  ...,  0.7194,  0.4018,  0.0532]],

        [[-0.2562,  0.0899,  0.0337,  ...,  0.2659,  0.4448, -0.6800],
         [ 0.1229,  0.3651, -0.2071,  ...,  0.7703,  0.2702,  0.2249],
         [ 1.0556,  1.0312, -0.2797,  ...,  0.6933,  0.3201, -0.3172],
         [-0.1560,  0.3924,  0.3286,  ...,  1.2626, -0.1862,  0.0392]]],
       grad_fn=<UnsafeViewBackward0>)
tensor([[[2.4139e-05, 2.5660e-05, 1.5669e-05,  ..., 2.3820e-05,
          2.6793e-05, 1.2671e-05],
         [1.4092e-05, 9.5700e-06, 6.5296e-06,  ..., 1.7667e-05,
          2.8262e-05, 1.2281e-05],
         [3.4212e-05, 1.7363e-05, 1.8702e-05,  ..., 1.8591e-05,
          1.0890e-05, 1.3016e-05],
         [6.1391e-06, 2.3655e-05, 1.4915e-05,  ..., 3

In [2]:
# Variant of the model configuration with an explicit 256-token context window.
GPT_CONFIG_124M_S = Config(
    # Vocabulary and sequence sizes
    vocab_size=50257,
    context_length=256,
    # Network dimensions
    emb_dim=768,
    num_layers=12,
    num_heads=12,
    # Regularisation and bias switches
    dropout=0.1,
    bias=False,
    qkv_bias=False,
)

# Seed first so the freshly initialised weights are reproducible.
torch.manual_seed(123)
model = GPTModel(config=GPT_CONFIG_124M_S)
# Switch to evaluation mode; as the last expression this also displays the module tree.
model.eval()

GPTModel(
  (_tok_emd): Embedding(50257, 768)
  (_pos_emd): Embedding(256, 768)
  (_dropout): Dropout(p=0.1, inplace=False)
  (_transformers): Sequential(
    (0): TransformerBlock(
      (_norm_1): LayerNorm()
      (_attention): MultiHeadAttention(
        (_w_q): Linear(in_features=768, out_features=768, bias=False)
        (_w_k): Linear(in_features=768, out_features=768, bias=False)
        (_w_v): Linear(in_features=768, out_features=768, bias=False)
        (_out_proj): Linear(in_features=768, out_features=768, bias=True)
        (_dropout): Dropout(p=0.1, inplace=False)
      )
      (_drop_1): Dropout(p=0.1, inplace=False)
      (_norm_2): LayerNorm()
      (_ff): FeedForward(
        (_layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU(approximate='none')
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (_drop_2): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock

In [3]:
def text_to_tokens(text: str, tokenizer) -> torch.Tensor:
    """Encode ``text`` into a tensor of token ids with a leading axis of size 1.

    Args:
        text: The string to encode.
        tokenizer: Object exposing ``encode(text, allowed_special=...)``.

    Returns:
        The encoded ids as a tensor, with a new first dimension prepended.
    """
    token_ids = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
    # Prepend a size-1 axis so the result looks like a batch of one sequence.
    return torch.unsqueeze(torch.tensor(token_ids), dim=0)

def tokens_to_text(tokens: torch.Tensor, tokenizer) -> str:
    """Decode a tensor of token ids back into a string.

    Args:
        tokens: Token ids; a leading axis of size 1 is removed before decoding.
        tokenizer: Object exposing ``decode(list_of_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` produces for the id list.
    """
    id_list = torch.squeeze(tokens, dim=0).tolist()
    return tokenizer.decode(id_list)

In [4]:
from architecture import generate_text_trivial
start_context = "Every effort moves you"
tokenizer = tiktoken.get_encoding("gpt2")

# Extend the prompt by ten tokens, cropping the context to the configured window.
token_ids = generate_text_trivial(
    model=model,
    idx=text_to_tokens(start_context, tokenizer=tokenizer),
    max_new_tokens=10,
    context_size=GPT_CONFIG_124M_S.context_length,
)

# Header and decoded text on separate lines.
print("Output text:", tokens_to_text(token_ids, tokenizer=tokenizer), sep="\n")

Output text:
Every effort moves you rentingetic wasnم refres RexMeCHicular stren


In [5]:
def loss_cross_entropy(logits: torch.Tensor, target_tokens: torch.Tensor) -> torch.Tensor:
    """Cross-entropy between per-token logits and their target token ids.

    The first two axes of ``logits`` are merged and ``target_tokens`` is
    flattened to one dimension, so each row of scores is paired with one id.

    Args:
        logits: Scores whose first two axes are merged into one.
        target_tokens: Target ids, flattened to 1-D before the comparison.

    Returns:
        The value returned by ``torch.nn.functional.cross_entropy``.
    """
    per_token_logits = torch.flatten(logits, start_dim=0, end_dim=1)
    per_token_targets = target_tokens.reshape(-1)
    return torch.nn.functional.cross_entropy(input=per_token_logits, target=per_token_targets)

In [6]:
from architecture import create_dataloader

# Read the whole corpus into memory.
file_path = "./the-verdict.txt"
with open(file_path, "r", encoding="utf-8") as f:
    text = f.read()
print(f"Length of text: {len(text)} characters")
tokens = tokenizer.encode(text, allowed_special={'<|endoftext|>'})
print(f"Length of tokens: {len(tokens)} tokens")

# Split on characters (not tokens): the first 90% trains, the remainder validates.
train_ratio = 0.9
train_size = int(len(text) * train_ratio)
train_text, validate_text = text[:train_size], text[train_size:]
print(f"Train size: {len(train_text)} characters")
print(f"Validate size: {len(validate_text)} characters")

# Settings shared by both loaders; stride equals max_length for both.
loader_options = dict(
    batch_size=2,
    max_length=GPT_CONFIG_124M_S.context_length,
    stride=GPT_CONFIG_124M_S.context_length,
    drop_last=True,
    num_workers=0,
    tokenizer=tokenizer,
)
# Only the training loader shuffles.
train_loader = create_dataloader(text=train_text, shuffle=True, **loader_options)
validate_loader = create_dataloader(text=validate_text, shuffle=False, **loader_options)

# Sanity check: print batch shapes and the first ten ids of the first row of each training batch.
print("Train data:")
for x, y in train_loader:
    print("x:", x.shape)
    print("y:", y.shape)
    print("x[0, :10]", x[0, :10])
    print("y[0, :10]", y[0, :10])

print("Validation data:")
for x, y in validate_loader:
    print("x:", x.shape)
    print("y:", y.shape)


Length of text: 20479 characters
Length of tokens: 5145 tokens
Train size: 18431 characters
Validate size: 2048 characters
Train data:
x: torch.Size([2, 256])
y: torch.Size([2, 256])
x[0, :10] tensor([  503,  4291,   262,  4252, 18250,  8812,   558,    13,   198,   198])
y[0, :10] tensor([ 4291,   262,  4252, 18250,  8812,   558,    13,   198,   198,    40])
x: torch.Size([2, 256])
y: torch.Size([2, 256])
x[0, :10] tensor([12036,   683,     0,  3226,  1781,   314,  4001,   284,   466,   262])
y[0, :10] tensor([ 683,    0, 3226, 1781,  314, 4001,  284,  466,  262, 4286])
x: torch.Size([2, 256])
y: torch.Size([2, 256])
x[0, :10] tensor([  438,   292,   339,   550,   587,   832,    11,   290,   287, 15275])
y[0, :10] tensor([  292,   339,   550,   587,   832,    11,   290,   287, 15275,   286])
x: torch.Size([2, 256])
y: torch.Size([2, 256])
x[0, :10] tensor([  286,  1762,    30,  2011, 29483,  2540,   284,   467,   257,  1310])
y[0, :10] tensor([ 1762,    30,  2011, 29483,  2540,   284, 

In [7]:
def calc_loss_batch(input_batch: torch.Tensor,
                    target_batch: torch.Tensor,
                    model: nn.Module,
                    device: torch.device,
                    ) -> torch.Tensor:
    """Run ``model`` on one batch and return its cross-entropy loss.

    Args:
        input_batch: Batch passed to ``model`` after being moved to ``device``.
        target_batch: Targets passed to ``loss_cross_entropy`` after being moved to ``device``.
        model: Callable whose output is passed to ``loss_cross_entropy`` as logits.
        device: Device both batches are moved to before the forward pass.

    Returns:
        The loss computed by ``loss_cross_entropy``.
    """
    # Move inputs first, then targets, so both are on `device` before the forward pass.
    inputs_on_device, targets_on_device = (
        batch.to(device=device) for batch in (input_batch, target_batch)
    )
    return loss_cross_entropy(model(inputs_on_device), targets_on_device)
    

In [8]:
def calc_loss_loader(data_loader: DataLoader,
                     model: nn.Module,
                     device: torch.device,
                     num_batches: int = 0) -> float:
    """Average the per-batch loss over the leading batches of ``data_loader``.

    Args:
        data_loader: Iterable of ``(input, target)`` batches that supports ``len()``.
        model: Model forwarded to ``calc_loss_batch``.
        device: Device forwarded to ``calc_loss_batch``.
        num_batches: Upper bound on the number of batches to evaluate. ``0``
            (the default) or any negative value means "every batch"; values
            larger than ``len(data_loader)`` are capped to that length.

    Returns:
        The mean batch loss as a Python float, or NaN when the loader is empty.
    """
    available_batches = len(data_loader)
    if available_batches == 0:
        return float("nan")
    if num_batches <= 0:
        num_batches = available_batches
    else:
        num_batches = min(num_batches, available_batches)

    total_loss = 0.0
    # `input_batch` instead of `input` so the builtin is not shadowed.
    for i, (input_batch, target_batch) in enumerate(data_loader):
        if i >= num_batches:
            break

        loss = calc_loss_batch(input_batch=input_batch, target_batch=target_batch,
                               model=model, device=device)
        # Accumulate a plain number: summing tensors would keep every batch's
        # autograd graph alive when called outside no_grad, and would make the
        # return type differ from the float NaN returned for an empty loader.
        total_loss += loss.item()

    return total_loss / num_batches

In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Baseline losses; gradients are not needed for this measurement.
with torch.no_grad():
    train_loss = calc_loss_loader(data_loader=train_loader, model=model, device=device)
    val_loss = calc_loss_loader(data_loader=validate_loader, model=model, device=device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

In [None]:
def evaluate_model(
    model: nn.Module,
    train_loader: DataLoader,
    eval_loader: DataLoader,
    device: torch.device,
    eval_iter: int):
    """Measure the loss on the training and evaluation loaders without gradients.

    The model is switched to eval mode for the measurement and to train mode
    afterwards.

    Args:
        model: Model to evaluate.
        train_loader: Loader whose loss is reported first.
        eval_loader: Loader whose loss is reported second.
        device: Device forwarded to ``calc_loss_loader``.
        eval_iter: Passed to ``calc_loss_loader`` as ``num_batches`` for both loaders.

    Returns:
        A ``(train_loss, eval_loss)`` tuple.
    """
    model.eval()
    with torch.no_grad():
        # Same call for both loaders, training loader first.
        train_loss, eval_loss = [
            calc_loss_loader(data_loader=loader,
                             model=model,
                             device=device,
                             num_batches=eval_iter)
            for loader in (train_loader, eval_loader)
        ]
    model.train()
    return train_loss, eval_loss


def generate_sample(model: nn.Module,
                    context_size: int,
                    tokenizer,
                    device: torch.device,
                    start_context: str,
                    max_new_tokens: int=50) -> str:
    """Generate a continuation of ``start_context`` and return it as text.

    The model is switched to eval mode for generation and to train mode
    afterwards.

    Args:
        model: Model passed to ``generate_text_trivial``.
        context_size: Context window size passed to ``generate_text_trivial``.
        tokenizer: Tokenizer used to encode the prompt and decode the result.
        device: Device the encoded prompt is moved to before generation.
        start_context: Prompt text to continue.
        max_new_tokens: Number of tokens to request from ``generate_text_trivial``.

    Returns:
        The decoded text for the token ids returned by ``generate_text_trivial``.
    """
    model.eval()
    # Move the prompt to `device`: the caller places the model there, and a
    # CPU prompt fed to a model on another device fails with a device mismatch.
    encoded = text_to_tokens(text=start_context, tokenizer=tokenizer).to(device)
    with torch.no_grad():
        token_ids = generate_text_trivial(model=model,
                                          idx=encoded,
                                          max_new_tokens=max_new_tokens,
                                          context_size=context_size)
    model.train()
    return tokens_to_text(tokens=token_ids, tokenizer=tokenizer)

In [None]:
def train_model_simple(model: nn.Module,
                       train_loader: DataLoader,
                       eval_loader: DataLoader,
                       optimizer,
                       device: torch.device,
                       num_epochs: int,
                       eval_freq: int,
                       eval_iter: int, 
                       start_context: str,
                       tokenizer,
                       config: Config):
    """Train ``model`` for ``num_epochs`` epochs with periodic evaluation.

    Every ``eval_freq`` optimisation steps (starting with the first) the
    train/eval losses are measured with ``evaluate_model``, recorded and
    printed. After each epoch a text sample is generated from ``start_context``
    and printed.

    Args:
        model: Model to optimise.
        train_loader: Loader yielding ``(input, target)`` training batches.
        eval_loader: Loader used for the evaluation loss.
        optimizer: Optimiser exposing ``zero_grad()`` and ``step()``.
        device: Device forwarded to the loss, evaluation and sampling helpers.
        num_epochs: Number of passes over ``train_loader``.
        eval_freq: Evaluate whenever the global step is a multiple of this value.
        eval_iter: Batch limit forwarded to ``evaluate_model``.
        start_context: Prompt used for the per-epoch text sample.
        tokenizer: Tokenizer forwarded to ``generate_sample``.
        config: Configuration whose ``context_length`` bounds the sample's context.

    Returns:
        ``(train_losses, eval_losses, track_token_seen)``: three lists with one
        entry per evaluation, holding the train loss, the eval loss and the
        cumulative number of input elements processed at that point.
    """
    train_losses, eval_losses, track_token_seen = [], [], []
    token_seen, global_step = 0, -1

    for epoch in range(num_epochs):
        model.train()
        for input_batch, target_batch in train_loader:
            optimizer.zero_grad()
            loss = calc_loss_batch(input_batch=input_batch, target_batch=target_batch,
                                   model=model, device=device)
            loss.backward()
            # Apply the gradients; without this the weights are never updated.
            optimizer.step()
            token_seen += input_batch.numel()
            global_step += 1

            if global_step % eval_freq == 0:
                train_loss, eval_loss = evaluate_model(model=model,
                                                       train_loader=train_loader,
                                                       eval_loader=eval_loader,
                                                       device=device,
                                                       eval_iter=eval_iter)
                # Record plain floats so the histories hold numbers whether the
                # helper returns a 0-dim tensor or a float.
                train_losses.append(float(train_loss))
                eval_losses.append(float(eval_loss))
                track_token_seen.append(token_seen)
                print(f"Epoch: {epoch} (step: {global_step:06d}): "
                    f"Train loss: {train_loss:.3f}, "
                    f"Eval loss: {eval_loss:.3f}"
                    )
        generated_text = generate_sample(model=model,
                        context_size=config.context_length,
                        tokenizer=tokenizer,
                        device=device,
                        start_context=start_context)
        print(f"""Generated text:
              -------------------------------
              {generated_text}
              -------------------------------""")
    # Return the recorded histories, not just the values from the last evaluation.
    return train_losses, eval_losses, track_token_seen

In [None]:
torch.manual_seed(123)
# Build the model from the same configuration that is passed to the training
# loop below and that sized the data loaders (GPT_CONFIG_124M_S), so the model
# and the sampling context window cannot disagree.
model = GPTModel(GPT_CONFIG_124M_S)
model.to(device)
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=0.0004,
    weight_decay=0.1,
)
num_epochs = 10
train_losses, val_losses, tokens_seen = train_model_simple(
    model, train_loader, validate_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context="Every effort moves you", tokenizer=tokenizer, config=GPT_CONFIG_124M_S,
)