# Finetuning GPT2
* Named entity recognition task
* N2C2 2006 & 2014 Deidentification Challenge dataset

In [25]:
data_dir = '/media/nvme2/n2c2/'
model_dir = '/media/nvme2/models/redact/'
model_in_fname = 'ckpt.pt'
device_type = 'cuda'
device = 'cuda'
block_size = 1024
batch_size = 6
eval_iters = 200
eval_interval = 1000
log_interval = 100
max_iters = 10000
weight_decay = 1e-2
learning_rate = 6e-5
beta1, beta2 = 0.9, 0.95
warmup_iters = 500
lr_decay_iters = 10000
min_lr = 6e-6
iter_num = 0
best_val_loss = 1e9

### Get data and define simple data loader

In [26]:
import os
import numpy as np
import torch

train_data = np.load(os.path.join(data_dir, 'train.npy'))
val_data = np.load(os.path.join(data_dir, 'val.npy'))

def get_batch(split):
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([torch.from_numpy((data[i:i+block_size, 0]).astype(np.int64)) for i in ix])
    y = torch.stack([torch.from_numpy((data[i:i+block_size, 1]).astype(np.int64)) for i in ix])
    x, y = x.to(device), y.to(device)
    return x, y

In [27]:
import json

with open('redact/data/n2c2/label2id.json', 'r') as f:
    label2id = json.load(f)

In [28]:
n_labels = max(label2id.values())
n_labels

17

## Load pretrained model checkpoint

In [29]:
ckpt_path = os.path.join(model_dir, 'ckpt.pt')
checkpoint = torch.load(ckpt_path, map_location=device)
model_args = checkpoint['model_args']
model_args

{'n_layer': 12,
 'n_head': 12,
 'n_embd': 768,
 'block_size': 1024,
 'dropout': 0.0,
 'vocab_size': 50304,
 'bias': False,
 'out_size': 50304}

## Init model

In [30]:
from redact.gpt2_model import GPTConfig, GPT

gptconf = GPTConfig(**model_args)
model = GPT(gptconf)

number of parameters: 124.37M


In [31]:
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
for k, v in list(state_dict.items()):
    if k.startswith(unwanted_prefix):
        state_dict[k[len(unwanted_prefix):]] = state_dict.pop(k)

In [32]:
model.load_state_dict(state_dict)
model

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=False)
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=False)
          (c_proj): Linear(in_features=3072, out_features=768, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
)

## Replace language model head  
We need the model output features to match the number of NER labels (17).  Since the current head is tied to the token embedding layer, we'll simply add two additional linear layers that reduce our output from vocab_size to n_labels.

In [33]:
model.replace_head(n_labels)

In [34]:
model

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=False)
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=False)
          (c_proj): Linear(in_features=3072, out_features=768, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Sequential(
    (0): Linear(in_features=768, out_features=50304, bias=False)
    (1): Linear(in_features=50304, out_features=1572, bias=False)
    (2): Linear(in_f

## Freeze all layers except new head

In [35]:
for param in model.transformer.parameters():
    param.requires_grad = False
for param in model.lm_head[0].parameters():
    param.requires_grad = False

## Configure optimizer

In [36]:
optimizer = model.configure_optimizers(
    weight_decay, learning_rate, (beta1, beta2), device_type
)

Using fused AdamW: False


## Train

In [37]:
import math

def get_lr(it):
    # During warmup iterations: linear
    if it < warmup_iters:
        return learning_rate * it / warmup_iters
    # After decay iterations: minimum
    if it > lr_decay_iters:
        return min_lr
    # Decay iterations
    decay_ratio = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
    assert 0 <= decay_ratio <= 1
    coeff = 0.5 * (1.0 + math.cos(math.pi * decay_ratio))
    return min_lr + coeff * (learning_rate - min_lr)

In [38]:
model.to(device)
model = torch.compile(model)

In [39]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            logits, loss = model(X, Y)
            if loss.item() < 0:
                print(f'Negative eval loss\n\tlogits: {logits}\n\ttarget: {Y}')
            else:
                losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [40]:
import time

t0 = time.time()

while True:

    # Get learning rate
    lr = get_lr(iter_num)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Eval
    if iter_num % eval_interval == 0:
        losses = estimate_loss()
        print(f"Step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Get batch
    X, Y = get_batch('train')

    # Forward
    _, loss = model(X, Y)

    # Backward
    loss.backward()

    # Step optimizer
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)

    # Log
    t1 = time.time()
    dt = t1-t0
    t0 = t1

    if iter_num % log_interval == 0:
        lossf = loss.item()
        print(f'Iter {iter_num}: loss {lossf:.4f}, time{dt*1000:.2f}ms')

    # Update iter_num and check for termination conditions
    iter_num +=1
    if iter_num > max_iters:
        break

Step 0: train loss 57.8851, val loss 58.1419
Iter 0: loss 61.2802, time145445.54ms
Iter 100: loss 6.9475, time419.86ms
Iter 200: loss 2.0517, time428.57ms
Iter 300: loss 6.6653, time428.03ms
Iter 400: loss 18.2460, time422.32ms
Iter 500: loss 5.4727, time418.16ms
Iter 600: loss 41.0305, time419.49ms
Iter 700: loss 3.2128, time422.63ms
Iter 800: loss 5.4458, time426.02ms
Iter 900: loss 3.8344, time423.53ms
Step 1000: train loss 1.3502, val loss 1.3177
Iter 1000: loss 0.3833, time129602.49ms
Iter 1100: loss 3.0226, time423.52ms
Iter 1200: loss 2.2068, time419.56ms
Iter 1300: loss 1.2828, time426.56ms
Iter 1400: loss 1.9225, time423.90ms
Iter 1500: loss 3.9992, time425.93ms
Iter 1600: loss 0.2880, time418.41ms
Iter 1700: loss 1.0225, time422.67ms
Iter 1800: loss 1.1405, time423.41ms
Iter 1900: loss 0.1888, time421.22ms
Step 2000: train loss 0.9422, val loss 0.9615
Iter 2000: loss 0.4805, time129643.82ms
Iter 2100: loss 0.3215, time419.07ms
Iter 2200: loss 0.4267, time425.55ms
Iter 2300: l