In [1]:
import re
import math
from dataclasses import dataclass
import torch
import torch.nn as nn
from torch.nn import functional as F
import struct
import numpy as np
from tqdm import tqdm
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader

from tokenbasic import BasicTokenizer
from chessdataset import BinaryPGNDataset, collate_batch

In [2]:
# Absolute locations of the cleaned game dump and the saved tokenizer model.
GAMES_TXT_PATH = r'C:\Users\moren\Desktop\Masters\2nd Semester\Deep Learning\personal engine\game_files\tcec_games_cleaned.txt'
TOKENIZER_MODEL_PATH = r'C:\Users\moren\Desktop\Masters\2nd Semester\Deep Learning\personal engine\token_files\minbpeTokenizer.model'

# Read the whole text file into memory, then split it on newlines
# (presumably one game per line -- confirm against the cleaning script).
with open(GAMES_TXT_PATH) as f:
    games = f.read()
list_games = games.split('\n')

# Build the project-local tokenizer and load its saved model file.
tokenizer = BasicTokenizer()
tokenizer.load(TOKENIZER_MODEL_PATH)

---

In [12]:
def create_tokenized_dataset(file_path, tokenizer, outfile_path):
    """Tokenize a text file line by line and save the sequences sorted by length.

    Args:
        file_path: Path of the text file to read. Each line is stripped and,
            if anything is left, encoded on its own.
        tokenizer: Object whose ``encode(text)`` returns a sized sequence of
            token ids.
        outfile_path: Destination handed to ``torch.save``.

    Returns:
        The list of encoded lines, shortest first. Lines of equal length keep
        the order they had in the file.
    """
    encoded_lines = []
    with open(file_path, 'r') as src:
        for raw_line in tqdm(src):
            text = raw_line.strip()
            # Blank / whitespace-only lines are skipped.
            if text:
                encoded_lines.append(tokenizer.encode(text))

    # sorted() is stable, so ties stay in file order.
    sorted_input_ids = sorted(encoded_lines, key=len)

    # Persist the length-sorted sequences, then hand them back to the caller.
    torch.save(sorted_input_ids, outfile_path)
    return sorted_input_ids

In [None]:
# Tokenize games5000.txt and write the length-sorted token ids to games5000token.pt.
create_tokenized_dataset(
    file_path='games5000.txt',
    tokenizer=tokenizer,
    outfile_path='games5000token.pt',
)

In [None]:
# Reload the file written by create_tokenized_dataset to inspect what was saved.
test = torch.load('games5000token.pt')
# Echo the loaded object as the cell output.
test

---

In [87]:
# Pre-tokenized games in binary form, read by the project-local BinaryPGNDataset.
TOKENIZED_BIN_PATH = r'C:\Users\moren\Desktop\Masters\2nd Semester\Deep Learning\personal engine\game_files\tcec_games_tokenized.bin'

dataset = BinaryPGNDataset(TOKENIZED_BIN_PATH)

# One dataset item per batch, in dataset order; collate_batch assembles each batch.
dataloader = DataLoader(
    dataset,
    batch_size=1,
    shuffle=False,
    collate_fn=collate_batch,
)

# TODO: remember to mask the padding token inside the transformer.

In [88]:
# Grab only the first batch for quick experiments. next(iter(...)) raises
# StopIteration right here if the loader is empty, instead of silently leaving
# `firstbatch` undefined for later cells.
firstbatch = next(iter(dataloader))
# Author's measurements (presumably sequence lengths -- TODO confirm the unit):
# longest batch = 1944, smallest batch = 330, average = 720

In [90]:
firstbatch

tensor([[295, 285, 320, 354, 506, 336, 319, 284, 310, 370, 293, 421, 371, 275,
          78, 296, 363, 102, 266, 291, 329, 290, 278, 395, 305,  66, 350, 337,
         314, 313,  49, 338, 311, 426, 294, 295, 340, 421,  49, 315, 276,  81,
         420, 383, 288, 103, 292, 352,  99, 343,  99, 435, 371, 282, 364, 102,
         266,  49, 403, 120, 102, 266, 478,  49, 329, 478, 369, 492,  49, 368,
         120,  99, 330, 469,  57, 438,  78, 494, 373, 309, 364, 288,  50, 396,
         120, 288,  78, 103, 334,  50, 339, 492, 323,  50, 380, 276, 303,  50,
         360, 305,  78, 299,  50, 375, 322,  81, 289,  50, 385, 273, 261,  81,
         273, 263,  50, 349, 276,  81, 300,  50, 368, 272,  81, 494, 346, 308,
         414, 499, 338, 468, 272,  78, 300,  51,  49, 328, 320, 418, 499,  50,
         317, 259, 425, 312,  51, 328, 303, 369, 379,  51,  52, 317, 431, 302,
          51,  53, 317, 261,  78, 285,  51,  54, 328, 102, 407, 103, 312, 349,
         282, 418, 288,  51, 335, 443, 288, 439,  51

---
---

In [106]:
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention where each position sees only itself and earlier positions.

    Args:
        config: Object exposing ``n_embd`` (embedding width), ``n_head``
            (number of heads; must divide ``n_embd``) and ``block_size``
            (side length of the pre-computed causal mask).
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output proj
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # lower-triangular causal mask; named 'bias' to follow the openai/HF naming
        self.register_buffer('bias', torch.tril(torch.ones(config.block_size, config.block_size))
                             .view(1, 1, config.block_size, config.block_size))

    def forward(self, x, padding_mask=None):
        """Run causal self-attention over a batch of sequences.

        Args:
            x: Tensor of shape (B, T, C) with C == n_embd. T must not exceed
                ``block_size``, the size of the stored causal mask.
            padding_mask: Optional (B, T) mask, truthy for real tokens and
                falsy for padding. Padded positions are never attended to.

        Returns:
            Tensor of shape (B, T, C).
        """
        B, T, C = x.size()  # batch size, sequence length, embedding dim
        head_size = C // self.n_head

        # calculate query, key, values for all heads
        # nh = n of heads, hs = head size, C = n channels = nh * hs
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2)  # (B, nh, T, hs)

        # Swap the LAST TWO axes of k so that (T, hs) @ (hs, T) -> (T, T).
        # The previous transpose(-2, 1) swapped the head and time axes and
        # made the matmul fail on a shape mismatch.
        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))

        # True where a query position may look at a key position.
        allowed = self.bias[:, :, :T, :T] != 0              # (1, 1, T, T)
        if padding_mask is not None:
            # (B, T) -> (B, 1, 1, T): hide padded keys from every query.
            allowed = allowed & padding_mask.bool()[:, None, None, :]

        att = att.masked_fill(~allowed, float('-inf'))
        att = F.softmax(att, dim=-1)
        if padding_mask is not None:
            # A query whose every key is hidden yields an all -inf row, which
            # softmax turns into NaN; give such rows zero attention instead.
            att = att.masked_fill(~allowed.any(dim=-1, keepdim=True), 0.0)

        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble head outputs side by side
        # output proj
        return self.c_proj(y)
               

class MLP(nn.Module):
    """Feed-forward sub-layer: Linear(n_embd -> 4*n_embd), ReLU, Linear(4*n_embd -> n_embd)."""

    def __init__(self, config):
        super().__init__()
        hidden_dim = 4 * config.n_embd
        # expand to the hidden width
        self.c_fc = nn.Linear(config.n_embd, hidden_dim)
        self.relu = nn.ReLU()
        # project back to the embedding width
        self.c_proj = nn.Linear(hidden_dim, config.n_embd)

    def forward(self, x):
        """Apply c_fc, ReLU and c_proj in sequence along the last dimension of ``x``."""
        return self.c_proj(self.relu(self.c_fc(x)))

class Block(nn.Module):
    """Transformer block: LayerNorm -> attention -> residual add, then LayerNorm -> MLP -> residual add."""

    def __init__(self, config):
        super().__init__()
        # attention sub-layer and its layer norm
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        # feed-forward sub-layer and its layer norm
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x):
        """Return ``x`` after both residual sub-layers."""
        attn_out = self.attn(self.ln_1(x))   # layer norm -> attention
        x = x + attn_out
        mlp_out = self.mlp(self.ln_2(x))     # layer norm -> mlp
        return x + mlp_out

@dataclass
class CLLMConfig:
    """Hyperparameters for the CLLM model."""

    # max context length
    block_size: int = 1024
    # n unique tokens
    vocab_size: int = 513
    # n of transformer layers
    n_layer: int = 4
    # n of heads
    n_head: int = 4
    # embedding dimension
    n_embd: int = 128

class CLLM(nn.Module):
    """Transformer that maps a batch of token ids to per-position vocabulary logits.

    Args:
        config: Object exposing ``vocab_size``, ``block_size``, ``n_embd`` and
            ``n_layer`` (plus whatever ``Block`` reads from it).
    """

    def __init__(self, config):
        super().__init__()
        self.config = config

        self.transformer = nn.ModuleDict({
            'wte': nn.Embedding(config.vocab_size, config.n_embd),                 # token embeddings
            'wpe': nn.Embedding(config.block_size, config.n_embd),                 # position embeddings
            'h': nn.ModuleList([Block(config) for _layer in range(config.n_layer)]),  # stack of transformer blocks
            'ln_f': nn.LayerNorm(config.n_embd),                                   # final layer norm
        })
        # classifier: hidden state -> one logit per vocabulary entry
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)

    def forward(self, idx):
        """Compute logits of shape (B, T, vocab_size) for ``idx`` of shape (B, T)."""
        _, seq_len = idx.size()
        assert seq_len <= self.config.block_size, f'Cannot forward sequence of length {seq_len}'

        # token embeddings are (B, T, n_embd); position embeddings are
        # (T, n_embd) and broadcast over the batch when added
        positions = torch.arange(0, seq_len, dtype=torch.long, device=idx.device)
        hidden = self.transformer['wte'](idx) + self.transformer['wpe'](positions)

        # forward the transformer blocks
        for layer in self.transformer['h']:
            hidden = layer(hidden)

        # final layernorm, then the classifier
        return self.lm_head(self.transformer['ln_f'](hidden))

# CLAUDE'S version: the next cell redefines the classes above and adds padding-mask handling.

In [114]:
class CausalSelfAttention(nn.Module):
    """Multi-head causal self-attention with optional masking of padded keys.

    Args:
        config: Object exposing ``n_embd`` (embedding width), ``n_head``
            (number of heads; must divide ``n_embd``) and ``block_size``
            (side length of the pre-computed causal mask).
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, but in a batch
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd)
        # output proj
        self.c_proj = nn.Linear(config.n_embd, config.n_embd)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        # lower-triangular causal mask; named 'bias' to follow the openai/HF naming
        self.register_buffer('bias', torch.tril(torch.ones(config.block_size, config.block_size))
                             .view(1, 1, config.block_size, config.block_size))

    def forward(self, x, padding_mask=None):
        """Run causal self-attention over a batch of sequences.

        Args:
            x: Tensor of shape (B, T, C) with C == n_embd. T must not exceed
                ``block_size``, the size of the stored causal mask.
            padding_mask: Optional (B, T) mask, truthy for real tokens and
                falsy for padding. Padded positions are never attended to.

        Returns:
            Tensor of shape (B, T, C). A query position that has no visible
            key at all receives zero attention output (only the ``c_proj``
            bias) instead of NaN.
        """
        B, T, C = x.size()
        head_size = C // self.n_head

        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        k = k.view(B, T, self.n_head, head_size).transpose(1, 2)  # (B, nh, T, hs)
        q = q.view(B, T, self.n_head, head_size).transpose(1, 2)  # (B, nh, T, hs)
        v = v.view(B, T, self.n_head, head_size).transpose(1, 2)  # (B, nh, T, hs)

        att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))

        # True where a query position may look at a key position (causal mask).
        allowed = self.bias[:, :, :T, :T] != 0              # (1, 1, T, T)
        if padding_mask is not None:
            # (B, T) -> (B, 1, 1, T) for broadcasting: hide padded keys.
            allowed = allowed & padding_mask.bool()[:, None, None, :]

        att = att.masked_fill(~allowed, float('-inf'))
        att = F.softmax(att, dim=-1)
        if padding_mask is not None:
            # With padding before the first real token, a row can be entirely
            # -inf and softmax returns NaN for it. Zero those rows so the NaN
            # does not propagate through the rest of the network.
            att = att.masked_fill(~allowed.any(dim=-1, keepdim=True), 0.0)

        y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
        y = y.transpose(1, 2).contiguous().view(B, T, C)  # re-assemble head outputs
        return self.c_proj(y)
               

class MLP(nn.Module):
    """Two-layer feed-forward network with a ReLU in between (n_embd -> 4*n_embd -> n_embd)."""

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        # up-projection to four times the embedding width
        self.c_fc = nn.Linear(width, 4 * width)
        self.relu = nn.ReLU()
        # down-projection back to the embedding width
        self.c_proj = nn.Linear(4 * width, width)

    def forward(self, x):
        """Return ``c_proj(relu(c_fc(x)))``."""
        hidden = self.relu(self.c_fc(x))
        return self.c_proj(hidden)

class Block(nn.Module):
    """Transformer block whose attention sub-layer also receives a padding mask."""

    def __init__(self, config):
        super().__init__()
        # attention sub-layer and its layer norm
        self.ln_1 = nn.LayerNorm(config.n_embd)
        self.attn = CausalSelfAttention(config)
        # feed-forward sub-layer and its layer norm
        self.ln_2 = nn.LayerNorm(config.n_embd)
        self.mlp = MLP(config)

    def forward(self, x, padding_mask=None):
        """Apply both residual sub-layers; ``padding_mask`` is forwarded to attention only."""
        normed = self.ln_1(x)
        attended = self.attn(normed, padding_mask=padding_mask)
        x = x + attended
        return x + self.mlp(self.ln_2(x))

@dataclass
class CLLMConfig:
    """Hyperparameters for the CLLM model."""

    # max context length
    block_size: int = 1024
    # n unique tokens
    vocab_size: int = 513
    # n of transformer layers
    n_layer: int = 4
    # n of heads
    n_head: int = 4
    # embedding dimension
    n_embd: int = 128

class CLLM(nn.Module):
    """Transformer that maps token ids to per-position vocabulary logits, masking padding in attention.

    Args:
        config: Object exposing ``vocab_size``, ``block_size``, ``n_embd`` and
            ``n_layer`` (plus whatever ``Block`` reads from it). An optional
            ``pad_token_id`` attribute overrides the padding id; without it
            the id defaults to 512.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        # Id treated as padding when building the attention mask. 512 is the
        # value this model originally hard-coded.
        self.pad_token_id = getattr(config, 'pad_token_id', 512)

        self.transformer = nn.ModuleDict(dict(
            wte  = nn.Embedding(config.vocab_size, config.n_embd),                 # token embeddings
            wpe  = nn.Embedding(config.block_size, config.n_embd),                 # position embeddings
            h    = nn.ModuleList([Block(config) for _ in range(config.n_layer)]),  # stack of transformer blocks
            ln_f = nn.LayerNorm(config.n_embd),                                    # final layer norm
        ))
        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)                 # hidden state -> vocabulary logits

    def forward(self, idx, targets=None):
        """Compute logits for a batch of token ids.

        Args:
            idx: Integer tensor of shape (B, T) with T <= ``config.block_size``.
            targets: Accepted for interface compatibility but currently
                unused; no loss is computed here.

        Returns:
            Logits of shape (B, T, vocab_size).

        Raises:
            AssertionError: If T exceeds ``config.block_size``.
        """
        device = idx.device
        b, t = idx.size()
        # Fail with a clear message rather than an opaque out-of-range lookup
        # in the position embedding.
        assert t <= self.config.block_size, f'Cannot forward sequence of length {t}'

        # Position indices, shape (T,); their embeddings broadcast over the batch.
        pos = torch.arange(0, t, dtype=torch.long, device=device)

        # Padding mask (True for real tokens, False for padding)
        padding_mask = (idx != self.pad_token_id)

        # Sum token and position embeddings
        x = self.transformer.wte(idx) + self.transformer.wpe(pos)

        # Apply transformer blocks with padding mask
        for block in self.transformer.h:
            x = block(x, padding_mask=padding_mask)

        # Final layer norm, then project to vocabulary logits
        x = self.transformer.ln_f(x)
        return self.lm_head(x)

In [None]:
# Seed before the model is built so that the random weight initialisation,
# and everything drawn from the RNG afterwards, is reproducible.
torch.manual_seed(42)
#torch.cuda.manual_seed(42)

# Pass a config INSTANCE. Handing over the bare CLLMConfig class only worked
# because dataclass defaults are also readable as class attributes.
model = CLLM(CLLMConfig())
model.eval()
#model.to('cuda')

# Start from the first batch taken from the dataloader.
x = firstbatch

while x.size(1) < 1024: #max_length:
    with torch.no_grad():
        logits = model(x)