In [5]:
import torch
import os

print(torch.cuda.is_available())
print(torch.backends.mps)

if torch.cuda.is_available():
    device = torch.device('cuda')
    # get number of cuda devices
    print(f"devices: {torch.cuda.device_count()}")
    print(f"device:  {torch.cuda.get_device_name()}")
    print(f"device0: {torch.cuda.get_device_properties(0)}")
    print(f"{torch.cuda.memory_summary()}")
elif torch.backends.mps is not None:
    device = torch.device('mps')
    print(f"{torch.mps.current_allocated_memory()}")
    os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
else:
    device = torch.device('cpu')
    # print a warning that cpu is being used
    print("Warning: Running on CPU. This will be slow.")
print(f"{device}")

False
<module 'torch.backends.mps' from '/Users/oniichan/anaconda3/envs/its530_py38/lib/python3.8/site-packages/torch/backends/mps/__init__.py'>
0
mps


## Architecture

In [6]:
import torch.nn as nn
from torch.nn import functional as F

In [7]:
class Head(nn.Module):

    def __init__(self, head_size, block_size, n_embd, dropout):
        super().__init__()
        
        self.key   = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]
        self.query = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]
        self.value = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]

        tril_def = torch.tril( torch.ones(block_size, block_size) )  ## [40, 40]
        
        self.register_buffer(
                  'tril', 
                  tril_def
               )
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        
        B, T, E = x.shape   ## [batch_size, 40, 512]
        
        k = self.key(   x )            ## k = (B, T, 64)
        q = self.query( x )            ## q = (B, T, 64)

        E2 = 64     ## I think this is 64 and not 512
        ## (B, T, E) @ (B, E, T)  -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * E2 ** -0.5        
        
        wei = wei.masked_fill(
                      self.tril[:T, :T] == 0, 
                      float('-inf')
        )   
        
        ## (B, T, T)
        wei = F.softmax( wei, dim= -1 )         ## (B, T, T)
        wei = self.dropout(   wei   )
        
        ## perform weighted aggregation of values
        
        v   = self.value(  x  )   ## x = (B, 40, E)
        out = wei @ v             ## (B, T, T) @ (B, T, 64) -> (B, T, 64)
        
        return out
        

In [8]:
class FeedForward(nn.Module):

    def __init__(self, n_embd, dropout):         ## 512
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),      ## [512, 4*512]
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),      ## [4*512, 512]
            nn.Dropout(dropout),
        )
        
    def forward(self, x):
        return self.net(x)

In [9]:
class MultiHeadAttention(nn.Module):

    def __init__(self, n_head, head_size, block_size, n_embd, dropout):    ## (8, 64)
        super().__init__()
        self.heads = nn.ModuleList(  [ Head(head_size, block_size, n_embd, dropout) for _ in range(n_head) ] )
        self.proj  = nn.Linear(n_embd, n_embd)   ## 512, 512
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = torch.cat(   [ h(x) for h in self.heads ], dim = -1   )
        out = self.proj(  out   )
        out = self.dropout(   out   )
        return out



In [17]:
class Block(nn.Module):
    
    def __init__(self, n_head, block_size, n_embd, dropout):     ## (512, 8)
        super().__init__()
        head_size = n_embd // n_head        ## 64
        self.sa   = MultiHeadAttention(n_head, head_size, block_size, n_embd, dropout)
        self.ffwd = FeedForward( n_embd, dropout)    ## 512
        self.ln1  = nn.LayerNorm(n_embd)
        self.ln2  = nn.LayerNorm(n_embd)
        
    def forward(self, x):
        x = x + self.sa(     self.ln1(x)      )
        x = x + self.ffwd(   self.ln2(x)      )
        return x

In [18]:
class GPTModel(nn.Module):
    def __init__(self, n_embd, block_size, n_layer, n_head, dropout):
        super().__init__()
        self.token_embedding_table = nn.Embedding(112, n_embd)   ## [65, 512]
        self.pos_emb_table = nn.Embedding(block_size, n_embd)     ## [block, 512]
        
        self.blocks = nn.Sequential(
                *[ Block(n_head, block_size, n_embd, dropout) for _ in range(n_layer) ]
        )
        
        self.ln_f    = nn.LayerNorm(  n_embd    )        
        self.lm_ffw_head = nn.Linear(n_embd, 112)  ## [512, 65] # FFW Layer
        self.block_size = block_size
        
    def forward(self, idx, targets=None):
        B, T = idx.shape     ## (Batch, 40)
        ## ids and targets are both (B, T) tensors of integers
        
        tok_emb = self.token_embedding_table(idx)      
        pos_emb = self.pos_emb_table(torch.arange(T, device=device))  
        
        x = tok_emb + pos_emb    ## [B, T, E] or [64, 40, 512]

        ## This is the architecture
        x = self.blocks(  x  )   ## (B, T, E)        
        x = self.ln_f(    x  )   ## (B, T, E)   ## norm
        logits = self.lm_ffw_head(x)         ## [B, 40, 65] 
        
        if targets is None:
            loss = None
        else:
            B, T, E  = logits.shape
            logits  = logits.view( B*T, E)
            targets = targets.view(B*T)
            loss    = F.cross_entropy(logits, targets)
        return logits, loss
        
    def generate(self, idx, max_new_tokens):    ## idx is (B, T)
        for _ in range(max_new_tokens):
            ## crop idx to the last block_size tokens
            idx_cond = idx[:, -self.block_size:]
            logits, _loss = self(idx_cond)    ## ## get preds
            logits = logits[:, -1, :]    ## focus on last one (B, E)
            probs = F.softmax(logits, dim= -1)    ## (B, E) get probs
            idx_next = torch.multinomial(probs, num_samples=1)     ## (B, 1) selected
            idx = torch.cat(  (idx, idx_next), dim=1  )   ## (B, T+1) append sample to running sequence
        return idx

## Dataset

In [12]:
import pandas as pd

df = pd.read_csv('exchange_rate.txt', header=None)
display(df.head())

norm_df = (df - df.min()) * (50_257-2) / ( df.max() - df.min() )
tokens = norm_df.values.flatten().astype(int)
print(tokens, tokens.shape)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.7855,1.611,0.861698,0.634196,0.211242,0.006838,0.593,0.525486
1,0.7818,1.61,0.861104,0.633513,0.211242,0.006863,0.594,0.523972
2,0.7867,1.6293,0.86103,0.648508,0.211242,0.006975,0.5973,0.526316
3,0.786,1.637,0.862069,0.650618,0.211242,0.006953,0.597,0.523834
4,0.7849,1.653,0.861995,0.656254,0.211242,0.00694,0.5985,0.527426


[24525 22368 25833 ... 16643 30769 27202] (60704,)


In [28]:
data = torch.tensor(tokens[:2056], dtype=torch.long)

n = int(0.9*len(data))

train_data = data[:n]
val_data   = data[n:]

## Train

In [14]:
## every id for a given token is embedded to vector of this size
n_embd            = 768        # GPT-2
n_head            = 12         # GPT-2
n_layer           = 12         # GPT-2
dropout           = 0.1        # GPT-2

learning_rate     = 2.5e-4     # GPT-2
vocab_size        = 50_257     # GPT-2 50_257
block_size        = 1024       # GPT-2 (context) ## N tokens in sequence

batch_size        = 64
max_iters         = 512
eval_interval     = 512
eval_iters        = 128

In [15]:
import tqdm

In [29]:
model = GPTModel(
                    n_embd=n_embd,
                    block_size=block_size,
                    n_layer=n_layer,
                    n_head=n_head,
                    dropout=dropout
                ).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

In [25]:
def get_batch(split):
    if split == "train":
        data = train_data
    else:
        data = val_data
        
    ix = torch.randint(   len(data) - block_size, (batch_size,)   )
    
    x  = torch.stack(    [  data[   i : i+block_size ]     for i in ix ]    ) 
    y  = torch.stack(    [  data[ i+1 : i+1+block_size ]   for i in ix ]    )
    
    x, y = x.to(device), y.to(device)

    return x, y

In [33]:
# i dont think this is doing it right, either way, im just gonna use std
@torch.no_grad()    ## for efficient processing
def estimate_loss():
    out = {}
    model.eval()   ## set to no training
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            print(f"{split}: {k} / {eval_iters}")
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()  ## back to training
    return out

In [34]:
for iter in tqdm.tqdm(range(max_iters)):
    if iter % eval_interval == 0:
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    xb, yb = get_batch('train')
    
    ## eval the loss
    logits, loss = model(xb, yb)
    
    optimizer.zero_grad(set_to_none=True)   ## zero out
    loss.backward()
    optimizer.step()

  0%|          | 0/512 [00:00<?, ?it/s]

train: 0 / 256
train: 1 / 256
train: 2 / 256
train: 3 / 256
train: 4 / 256
train: 5 / 256
train: 6 / 256
train: 7 / 256


  0%|          | 0/512 [00:57<?, ?it/s]


KeyboardInterrupt: 