**Exercise 2: Load the pretrained model in a new session**

In [1]:
import torch
from supplementary import GPTModel

In [2]:
# --- Model config ---
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

In [3]:
# --- Load model ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(torch.load("model.pth", map_location=device))
model.to(device)
model.eval()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [5]:
print("=== Exercise 2: Model successfully loaded and ready for inference! ===")

=== Exercise 2: Model successfully loaded and ready for inference! ===


**Exercise 3: Train the LLM on your own favorite texts**

In [6]:
import torch
from supplementary import GPTModel, create_dataloader_v1, calc_loss_batch

# --- Model config ---
GPT_CONFIG_124M = {
    "vocab_size": 50257,
    "context_length": 256,
    "emb_dim": 768,
    "n_heads": 12,
    "n_layers": 12,
    "drop_rate": 0.1,
    "qkv_bias": False
}

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [7]:
custom_file = "the-silver-glen.txt" 

try:
    with open(custom_file, "r", encoding="utf-8") as f:
        text_data = f.read()
except FileNotFoundError:
    print(f"File '{custom_file}' not found. Please add it to your folder.")
    exit()

In [8]:
# --- Data loader ---
train_loader = create_dataloader_v1(
    text_data,
    batch_size=1,
    max_length=64,
    stride=32,
    drop_last=False,
    shuffle=True,
    num_workers=0
)

In [9]:
# --- Load pretrained model ---
model = GPTModel(GPT_CONFIG_124M)
model.load_state_dict(torch.load("model.pth", map_location=device))
model.to(device)
model.train()

GPTModel(
  (tok_emb): Embedding(50257, 768)
  (pos_emb): Embedding(256, 768)
  (drop_emb): Dropout(p=0.1, inplace=False)
  (trf_blocks): Sequential(
    (0): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features=768, out_features=768, bias=False)
        (W_key): Linear(in_features=768, out_features=768, bias=False)
        (W_value): Linear(in_features=768, out_features=768, bias=False)
        (out_proj): Linear(in_features=768, out_features=768, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (ff): FeedForward(
        (layers): Sequential(
          (0): Linear(in_features=768, out_features=3072, bias=True)
          (1): GELU()
          (2): Linear(in_features=3072, out_features=768, bias=True)
        )
      )
      (norm1): LayerNorm()
      (norm2): LayerNorm()
      (drop_shortcut): Dropout(p=0.1, inplace=False)
    )
    (1): TransformerBlock(
      (att): MultiHeadAttention(
        (W_query): Linear(in_features

In [10]:
# --- Fine-tune for 2 epochs ---
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-4)
for epoch in range(2):
    print(f"\nEpoch {epoch+1}")
    for input_batch, target_batch in train_loader:
        optimizer.zero_grad()
        loss = calc_loss_batch(input_batch, target_batch, model, device)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1} finished with loss {loss.item():.4f}")


Epoch 1
Epoch 1 finished with loss 7.7277

Epoch 2
Epoch 2 finished with loss 3.9606


In [11]:
torch.save(model.state_dict(), "model_finetuned.pth")
print("\nFine-tuning complete! Model saved as 'model_finetuned.pth'.")


Fine-tuning complete! Model saved as 'model_finetuned.pth'.
