## Training routine

In [1]:
import torch
import GPT

# Model cunfiguration
GPT_CONFIG_124M = {
    "vocab_size": 50257,    # Vocabulary size
    "context_length": 1024, # context 
    "emb_dim": 768,         # Embedding dimension
    "n_heads": 12,          # Number of attention heads
    "n_layers": 12,         # Number of layers
    "drop_rate": 0.1,       # Dropout rate
    "qkv_bias": False       # Query-key-value bias
}

In [2]:
# Initialize the model
torch.manual_seed(123)
model = GPT.GPTModel(GPT_CONFIG_124M)
model.eval();  # Disable dropout during inference

In [3]:
# Model Performance before training
start_context = "Deep Learning is"
tokenizer = GPT.create_tokenizer()

token_ids = GPT.text_generation(
    model=model,
    idx=GPT.text_to_token_ids(start_context, tokenizer),
    num_token_generation=25,
    context_size=GPT_CONFIG_124M["context_length"],
    temperature=1.3,
    top_k=45
)
print("Output text:\n", GPT.token_ids_to_text(token_ids, tokenizer))

Output text:
 Deep Learning is assertionVEL bishopsaningeff龍oubtedly Maduro entrances thief 46 CEOs compressed Marty ICO Vananing sharpthan speculated MY nationseekingGen Teams


In [4]:
# Load the data to train the model ( this data is just to do the pipeline, a real training will need more data)
text_data = GPT.download_text_sample()
print(text_data[:50])

I HAD always thought Jack Gisburn rather a cheap g


In [5]:
# Prepare the datasets
train_ratio = 0.80
split_idx = int(train_ratio * len(text_data))
train_data = text_data[:split_idx]
val_data = text_data[split_idx:]

train_loader = GPT.create_data_loader(
  train_data,
  batch_size=2,
  max_length=GPT_CONFIG_124M["context_length"],
  stride=GPT_CONFIG_124M["context_length"],
  drop_last=True,
  shuffle=True,
  num_workers=0
)

val_loader = GPT.create_data_loader(
  val_data,
  batch_size=2,
  max_length=GPT_CONFIG_124M["context_length"],
  stride=GPT_CONFIG_124M["context_length"],
  drop_last=False,
  shuffle=False,
  num_workers=0
)

train_tokens = 0
for input_batch, target_batch in train_loader:
    train_tokens += input_batch.numel()

val_tokens = 0
for input_batch, target_batch in val_loader:
    val_tokens += input_batch.numel()

print("Training tokens:", train_tokens)
print("Validation tokens:", val_tokens)
print("All tokens:", train_tokens + val_tokens)

Training tokens: 4096
Validation tokens: 1024
All tokens: 5120


In [6]:
# Get the device 
device = GPT.get_device()
print(device)

mps


In [7]:
# Loos before training
model.to(device)

# No gradients tracking for efficiency, not training.
with torch.no_grad(): 
    train_loss = GPT.calc_loss_loader(train_loader, model, device)
    val_loss = GPT.calc_loss_loader(val_loader, model, device)

print("Training loss:", train_loss)
print("Validation loss:", val_loss)

Training loss: 11.214823722839355
Validation loss: 11.196243286132812


## Training 

In [None]:
optimizer = torch.optim.AdamW(model.parameters(), lr=0.0004, weight_decay=0.1)

num_epochs = 15
train_losses, val_losses, tokens_seen = GPT.train_model_simple(
    model, train_loader, val_loader, optimizer, device,
    num_epochs=num_epochs, eval_freq=5, eval_iter=5,
    start_context=start_context, tokenizer=tokenizer
)

In [None]:
# Plot the loss
epochs_tensor = torch.linspace(0, num_epochs, len(train_losses))
GPT.plot_losses(epochs_tensor, tokens_seen, train_losses, val_losses)

# Load OpenAI Weigths

In [2]:
# Use of the book to - https://github.com/rasbt/LLMs-from-scratch/blob/main/ch05/01_main-chapter-code/gpt_download.py
import urllib.request
url = (
    "https://raw.githubusercontent.com/rasbt/"
    "LLMs-from-scratch/main/ch05/"
    "01_main-chapter-code/gpt_download.py"
)
filename = url.split('/')[-1]
urllib.request.urlretrieve(url, filename)

('gpt_download.py', <http.client.HTTPMessage at 0x17f5f4050>)

In [9]:
from gpt_download import download_and_load_gpt2
settings, params = download_and_load_gpt2(
    model_size="124M", models_dir="gpt2"
)

File already exists and is up-to-date: gpt2/124M/checkpoint
File already exists and is up-to-date: gpt2/124M/encoder.json
File already exists and is up-to-date: gpt2/124M/hparams.json
File already exists and is up-to-date: gpt2/124M/model.ckpt.data-00000-of-00001
File already exists and is up-to-date: gpt2/124M/model.ckpt.index
File already exists and is up-to-date: gpt2/124M/model.ckpt.meta
File already exists and is up-to-date: gpt2/124M/vocab.bpe


In [10]:
# General configuration for multiple models 
model_configs = {
    "gpt2-small (124M)": {"emb_dim": 768, "n_layers": 12, "n_heads": 12},
    "gpt2-medium (355M)": {"emb_dim": 1024, "n_layers": 24, "n_heads": 16},
    "gpt2-large (774M)": {"emb_dim": 1280, "n_layers": 36, "n_heads": 20},
    "gpt2-xl (1558M)": {"emb_dim": 1600, "n_layers": 48, "n_heads": 25},
}
# Model to use
model_name = "gpt2-small (124M)"
NEW_CONFIG = GPT_CONFIG_124M.copy()
NEW_CONFIG.update(model_configs[model_name])
# Bias term on the context - use it because OpenAI use the bias during training (have the same setup), 
# Has been shown not to have an impact on the performance of the model so is not used anymore
NEW_CONFIG.update({"qkv_bias": True})

# Create an instance of the model with the given configuration
gpt = GPT.GPTModel(NEW_CONFIG)
gpt.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [11]:
# Load the weights of the 124M Model 
GPT.load_weights_into_gpt(gpt, params)
gpt.to(device)

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(1024, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=True)
        (W_key): Linear(in_features=768, out_features=768, bias=True)
        (W_value): Linear(in_features=768, out_features=768, bias=True)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=7

In [16]:
# Test the model with pre-Train weights from OpenAI 
token_ids = GPT.text_generation(
    model=gpt,
    idx=GPT.text_to_token_ids(start_context, tokenizer).to(device),
    num_token_generation=25,
    context_size=NEW_CONFIG["context_length"],
    temperature=1.3,
    top_k=45
)
print("Output text:\n", GPT.token_ids_to_text(token_ids, tokenizer))

Output text:
 Deep Learning is a good alternative, however, with the more sophisticated GPUs often relying upon them, they end up being a more cumbersome investment.
