In [107]:
import torch
import os

print(torch.cuda.is_available())
print(torch.backends.mps)

if torch.cuda.is_available():
    device = torch.device('cuda')
    # get number of cuda devices
    print(f"devices: {torch.cuda.device_count()}")
    print(f"device:  {torch.cuda.get_device_name()}")
    print(f"device0: {torch.cuda.get_device_properties(0)}")
    print(f"{torch.cuda.memory_summary()}")
elif torch.backends.mps is not None:
    device = torch.device('mps')
    print(f"{torch.mps.current_allocated_memory()}")
    os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
else:
    device = torch.device('cpu')
    # print a warning that cpu is being used
    print("Warning: Running on CPU. This will be slow.")
print(f"{device}")

False
<module 'torch.backends.mps' from '/Users/oniichan/anaconda3/envs/its530_py38/lib/python3.8/site-packages/torch/backends/mps/__init__.py'>
1558722304
mps


## Architecture

In [108]:
import torch.nn as nn
from torch.nn import functional as F

In [109]:
class Head(nn.Module):

    def __init__(self, head_size, block_size, n_embd, dropout):
        super().__init__()
        
        self.key   = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]
        self.query = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]
        self.value = nn.Linear(n_embd, head_size, bias=False)  ## [512, 64]

        tril_def = torch.tril( torch.ones(block_size, block_size) )  ## [40, 40]
        
        self.register_buffer(
                  'tril', 
                  tril_def
               )
        
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        
        B, T, E = x.shape   ## [batch_size, 40, 512]
        
        k = self.key(   x )            ## k = (B, T, 64)
        q = self.query( x )            ## q = (B, T, 64)

        E2 = 64     ## I think this is 64 and not 512
        ## (B, T, E) @ (B, E, T)  -> (B, T, T)
        wei = q @ k.transpose(-2, -1) * E2 ** -0.5        
        
        wei = wei.masked_fill(
                      self.tril[:T, :T] == 0, 
                      float('-inf')
        )   
        
        ## (B, T, T)
        wei = F.softmax( wei, dim= -1 )         ## (B, T, T)
        wei = self.dropout(   wei   )
        
        ## perform weighted aggregation of values
        
        v   = self.value(  x  )   ## x = (B, 40, E)
        out = wei @ v             ## (B, T, T) @ (B, T, 64) -> (B, T, 64)
        
        return out
        

In [110]:
class FeedForward(nn.Module):

    def __init__(self, n_embd, dropout):         ## 512
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),      ## [512, 4*512]
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),      ## [4*512, 512]
            nn.Dropout(dropout),
        )
        
    def forward(self, x):
        return self.net(x)

In [111]:
class MultiHeadAttention(nn.Module):

    def __init__(self, n_head, head_size, block_size, n_embd, dropout):    ## (8, 64)
        super().__init__()
        self.heads = nn.ModuleList(  [ Head(head_size, block_size, n_embd, dropout) for _ in range(n_head) ] )
        self.proj  = nn.Linear(n_embd, n_embd)   ## 512, 512
        self.dropout = nn.Dropout(dropout)
    
    def forward(self, x):
        out = torch.cat(   [ h(x) for h in self.heads ], dim = -1   )
        out = self.proj(  out   )
        out = self.dropout(   out   )
        return out



In [112]:
class Block(nn.Module):
    
    def __init__(self, n_head, block_size, n_embd, dropout):     ## (512, 8)
        super().__init__()
        head_size = n_embd // n_head        ## 64
        self.sa   = MultiHeadAttention(n_head, head_size, block_size, n_embd, dropout)
        self.ffwd = FeedForward( n_embd, dropout)    ## 512
        self.ln1  = nn.LayerNorm(n_embd)
        self.ln2  = nn.LayerNorm(n_embd)
        
    def forward(self, x):
        x = x + self.sa(     self.ln1(x)      )
        x = x + self.ffwd(   self.ln2(x)      )
        return x

In [113]:
class GPTModel(nn.Module):
    def __init__(self, n_embd, block_size, n_layer, n_head, dropout):
        super().__init__()
        self.token_embedding_table = nn.Embedding(112, n_embd)   ## [65, 512]
        self.pos_emb_table = nn.Embedding(block_size, n_embd)     ## [block, 512]
        
        self.blocks = nn.Sequential(
                *[ Block(n_head, block_size, n_embd, dropout) for _ in range(n_layer) ]
        )
        
        self.ln_f    = nn.LayerNorm(  n_embd    )        
        self.lm_ffw_head = nn.Linear(n_embd, 112)  ## [512, 65] # FFW Layer
        self.block_size = block_size
        
    def forward(self, idx, targets=None):
        B, T = idx.shape     ## (Batch, 40)
        ## ids and targets are both (B, T) tensors of integers
        
        tok_emb = self.token_embedding_table(idx)      
        pos_emb = self.pos_emb_table(torch.arange(T, device=device))  
        
        x = tok_emb + pos_emb    ## [B, T, E] or [64, 40, 512]

        ## This is the architecture
        x = self.blocks(  x  )   ## (B, T, E)        
        x = self.ln_f(    x  )   ## (B, T, E)   ## norm
        logits = self.lm_ffw_head(x)         ## [B, 40, 65] 
        
        if targets is None:
            loss = None
        else:
            B, T, E  = logits.shape
            logits  = logits.view( B*T, E)
            targets = targets.view(B*T)
            loss    = F.cross_entropy(logits, targets)
        return logits, loss
        
    def generate(self, idx, max_new_tokens):    ## idx is (B, T)
        for _ in range(max_new_tokens):
            ## crop idx to the last block_size tokens
            idx_cond = idx[:, -self.block_size:]
            logits, _loss = self(idx_cond)    ## ## get preds
            logits = logits[:, -1, :]    ## focus on last one (B, E)
            probs = F.softmax(logits, dim= -1)    ## (B, E) get probs
            idx_next = torch.multinomial(probs, num_samples=1)     ## (B, 1) selected
            idx = torch.cat(  (idx, idx_next), dim=1  )   ## (B, T+1) append sample to running sequence
        return idx

## Dataset

In [114]:
import pandas as pd

df = pd.read_csv('exchange_rate.txt', header=None)
display(df.head())

norm_df = (df - df.min()) * (50_257-2) / ( df.max() - df.min() )
tokens = norm_df.values.flatten().astype(int)
print(tokens, tokens.shape)

Unnamed: 0,0,1,2,3,4,5,6,7
0,0.7855,1.611,0.861698,0.634196,0.211242,0.006838,0.593,0.525486
1,0.7818,1.61,0.861104,0.633513,0.211242,0.006863,0.594,0.523972
2,0.7867,1.6293,0.86103,0.648508,0.211242,0.006975,0.5973,0.526316
3,0.786,1.637,0.862069,0.650618,0.211242,0.006953,0.597,0.523834
4,0.7849,1.653,0.861995,0.656254,0.211242,0.00694,0.5985,0.527426


[24525 22368 25833 ... 16643 30769 27202] (60704,)


In [115]:
data = torch.tensor(tokens[:2056], dtype=torch.long)

## Model

In [116]:
## every id for a given token is embedded to vector of this size
n_embd            = 768        # GPT-2
n_head            = 12         # GPT-2
n_layer           = 12         # GPT-2
dropout           = 0.1        # GPT-2

learning_rate     = 2.5e-4     # GPT-2
vocab_size        = 50_257     # GPT-2 50_257
block_size        = 1024       # GPT-2 (context) ## N tokens in sequence

batch_size        = 64
# max_iters         = 512
eval_interval     = 512
# eval_iters        = 128

In [117]:
import tqdm

In [118]:
model = GPTModel(
                    n_embd=n_embd,
                    block_size=block_size,
                    n_layer=n_layer,
                    n_head=n_head,
                    dropout=dropout
                ).to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Hugging Face Transformers

In [119]:
from transformers import PreTrainedModel, PretrainedConfig
from transformers.modeling_outputs import CausalLMOutput

class GPTConfig(PretrainedConfig):
    def __init__(
        self,
        block_size=40,
        vocab_size=98,
        n_embd=512,
        n_head=8,
        n_layer=6,
        dropout=0.2,
        **kwargs
    ):
        super().__init__(**kwargs)
        self.block_size = block_size
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.n_head = n_head
        self.n_layer = n_layer
        self.dropout = dropout

class GPTModelForTrainer(PreTrainedModel):
    def __init__(self, config, gpt_model):
        super().__init__(config)
        self.config = config
        self.model = gpt_model
        
    def forward(self, input_ids, labels=None, **kwargs):
        logits = self.model(input_ids.unsqueeze(0))
        
        loss = None
        if labels is not None:
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            
            loss_fct = torch.nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)),
                          shift_labels.view(-1))
            
        print(labels)
        return CausalLMOutput(
            loss=loss,
            logits=logits,
        )

In [120]:
from transformers import Trainer, TrainingArguments, AutoModelForCausalLM
gpt = AutoModelForCausalLM.from_pretrained('gpt2')

tokens = torch.tensor([1, 2, 3])
print(gpt(  tokens  ))



CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[-32.9011, -31.2024, -34.6622,  ..., -39.4867, -39.8731, -32.2387],
        [-55.5207, -53.4285, -56.4767,  ..., -68.1539, -66.7708, -58.6006],
        [-61.7969, -60.5386, -59.5503,  ..., -75.3206, -72.7731, -65.5706]],
       grad_fn=<MmBackward0>), past_key_values=((tensor([[[[-1.1621,  2.1424,  0.9899,  ..., -1.2493, -0.6088,  1.6558],
          [-1.7831,  2.2802,  2.5158,  ..., -0.2474, -1.4704,  1.5416],
          [-0.8020,  2.3662,  3.0656,  ..., -0.8665, -1.0335,  1.7610]],

         [[-0.3037,  0.1982, -0.4324,  ..., -0.1011,  2.3475,  0.6145],
          [ 0.1066, -0.6560, -0.1682,  ...,  1.6234,  4.6732,  1.6981],
          [-0.1469, -2.1942, -1.2178,  ..., -1.1906,  4.2506,  0.3419]],

         [[ 0.0409, -0.3168,  0.8519,  ..., -1.6563, -1.4363,  0.7900],
          [ 1.2763,  0.1916,  0.7094,  ..., -1.8491,  1.0007,  1.8073],
          [ 3.3321,  0.5271,  1.5881,  ..., -2.8760,  0.2359,  1.7286]],

         ...,

 

In [121]:
from datasets import Dataset

def prepare_dataset(text_data):
    # Convert your text data to token IDs
    # This is just an example - adjust based on your tokenizer
    return Dataset.from_dict({
        "input_ids": text_data,
    })

train_dataset = prepare_dataset(data)
# eval_dataset = prepare_dataset(your_eval_data)

In [126]:
from transformers import Trainer
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./gpt_model",
    num_train_epochs=3,
    per_device_train_batch_size=batch_size,
    # per_device_eval_batch_size=batch_size,
    # eval_steps=eval_interval,
    save_steps=eval_interval,
    save_total_limit=2,
    warmup_steps=100,
    learning_rate=learning_rate,
    # fp16=True,  # if you want to use mixed precision training
    evaluation_strategy="steps",
    logging_dir="./logs",
    logging_steps=eval_interval,
)

# Wrap the model
config = GPTConfig(
    block_size=block_size,
    vocab_size=vocab_size,
    n_embd=n_embd,
    n_head=n_head,
    n_layer=n_layer,
    dropout=dropout,
)

model_for_trainer = GPTModelForTrainer(config, model)

trainer = Trainer(
    model=model_for_trainer,
    args=training_args,
    train_dataset=train_dataset,
    # eval_dataset=eval_dataset,
)

# Start training
trainer.train()

  0%|          | 0/99 [00:00<?, ?it/s]

ValueError: The model did not return a loss from the inputs, only the following keys: logits,past_key_values. For reference, the inputs it received are input_ids.