# Finetuning GPT2
* Named entity recognition task
* N2C2 2006 & 2014 Deidentification Challenge dataset

In [1]:
import os
import numpy as np
import torch
import torch.nn as nn 
from torch.nn import functional as F
import json
import math
from dataclasses import dataclass
from redact.gpt2_model import GPTConfig, GPT, LayerNorm
import time

In [78]:
# Directories: data_dir holds the train.npy / val.npy arrays, model_dir the checkpoints.
data_dir = '/media/nvme2/n2c2/'
model_dir = '/media/nvme2/models/redact/'

# Batch geometry: sequences per batch and positions per sequence.
batch_size = 6
block_size = 1024

# Both names carry the same device string.
device = device_type = 'cuda'

### Get data and define simple data loader

In [79]:
# Load the train / validation arrays from data_dir. get_batch (defined right
# below) indexes them as data[rows, 0] and data[rows, 1], so each array is
# expected to be 2-D with at least two columns.
train_data = np.load(os.path.join(data_dir, 'train.npy'))
val_data = np.load(os.path.join(data_dir, 'val.npy'))

def get_batch(split):
    """Sample a random batch of contiguous windows from one data split.

    Args:
        split: 'train' selects train_data; any other value selects val_data.

    Returns:
        (x, y): two int64 tensors of shape (batch_size, block_size) placed on
        `device`. x is read from column 0 and y from column 1 of the same rows
        (presumably token ids and per-token label ids -- confirm against the
        preprocessing step).
    """
    source = val_data if split != 'train' else train_data
    # One random window start per batch element; the upper bound keeps every
    # window of block_size rows inside the array.
    starts = torch.randint(len(source) - block_size, (batch_size,))

    def _windows(column):
        # Cut one block_size-long window per start offset and stack them.
        rows = [
            torch.from_numpy(source[start:start + block_size, column].astype(np.int64))
            for start in starts
        ]
        return torch.stack(rows)

    return _windows(0).to(device), _windows(1).to(device)

In [80]:
# Smoke test: draw one training batch and display both tensor shapes
# (each should be batch_size x block_size).
X, Y = get_batch('train')
X.shape, Y.shape

(torch.Size([3, 1024]), torch.Size([3, 1024]))

In [81]:
# Load the label2id mapping from JSON (path is relative to the working
# directory); its values are treated as integer ids in the next cell.
with open('redact/data/n2c2/label2id.json', 'r') as f:
    label2id = json.load(f)

In [82]:
# Size of the classifier output, taken as the largest label id.
# NOTE(review): this equals the number of classes only if the ids are 1-based
# and the targets are shifted accordingly; with 0-based ids the count would be
# max + 1 (or len(label2id)). Confirm against label2id.json and the targets
# in column 1 of the data before changing.
n_labels = max(label2id.values())
n_labels

17

## Load pretrained model checkpoint

In [83]:
# Default GPT hyperparameters. The architecture fields are overwritten from
# the checkpoint a few cells below; dropout is not, so it stays as set here.
model_args = dict(n_layer=12, n_head=12, n_embd=768,
                  block_size=1024, dropout=0.0, 
                  vocab_size=None, bias=False)

In [84]:
# Load the pretrained checkpoint onto `device` and display the model
# arguments it was saved with.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
ckpt_path = os.path.join(model_dir, 'ckpt.pt')
checkpoint = torch.load(ckpt_path, map_location=device)
checkpoint_model_args = checkpoint['model_args']
checkpoint_model_args

{'n_layer': 12,
 'n_head': 12,
 'n_embd': 768,
 'block_size': 1024,
 'dropout': 0.0,
 'vocab_size': 50304,
 'bias': False}

In [85]:
# The architecture-defining fields must match the checkpoint, so copy them
# over the defaults (dropout is deliberately left as configured above).
arch_keys = ('n_layer', 'n_head', 'n_embd', 'block_size', 'bias', 'vocab_size')
model_args.update({key: checkpoint_model_args[key] for key in arch_keys})
model_args

{'n_layer': 12,
 'n_head': 12,
 'n_embd': 768,
 'block_size': 1024,
 'dropout': 0.0,
 'vocab_size': 50304,
 'bias': False}

## Init model

In [99]:
# Build the GPT from the merged arguments; the pretrained weights are loaded
# into it in a later cell.
gptconf = GPTConfig(**model_args)
model = GPT(gptconf)

number of parameters: 124.37M


In [100]:
# Some checkpoint keys carry an '_orig_mod.' prefix (the wrapper attribute name
# used by torch.compile). Strip it so the keys match a plain GPT instance.
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
prefixed_keys = [key for key in state_dict if key.startswith(unwanted_prefix)]
for key in prefixed_keys:
    stripped = key[len(unwanted_prefix):]
    state_dict[stripped] = state_dict.pop(key)

In [101]:
# Copy the checkpoint weights into the model (load_state_dict is strict by
# default, so missing or unexpected keys raise), then display the module tree.
model.load_state_dict(state_dict)
model

GPT(
  (transformer): ModuleDict(
    (wte): Embedding(50304, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.0, inplace=False)
    (h): ModuleList(
      (0-11): 12 x Block(
        (ln_1): LayerNorm()
        (attn): CausalSelfAttention(
          (c_attn): Linear(in_features=768, out_features=2304, bias=False)
          (c_proj): Linear(in_features=768, out_features=768, bias=False)
          (attn_dropout): Dropout(p=0.0, inplace=False)
          (resid_dropout): Dropout(p=0.0, inplace=False)
        )
        (ln_2): LayerNorm()
        (mlp): MLP(
          (c_fc): Linear(in_features=768, out_features=3072, bias=False)
          (c_proj): Linear(in_features=3072, out_features=768, bias=False)
          (dropout): Dropout(p=0.0, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm()
  )
  (lm_head): Linear(in_features=768, out_features=50304, bias=False)
)

## Replace language model head  
We need the model output features to match the number of NER labels (17).  Since the current head is tied to the token embedding layer, we keep it in place and simply append three additional linear layers that reduce our output from vocab_size to n_labels.

In [102]:
# Keep the pretrained (weight-tied) vocabulary projection as stage 0 and append
# three bias-free linear layers that shrink vocab_size -> vocab_size//32 ->
# vocab_size//64 -> n_labels.
# NOTE(review): there is no activation between the appended layers, so together
# they compose to a single linear map. Also, this cell is not idempotent:
# re-running it would wrap the Sequential again.
pretrained_head = model.lm_head
vocab_out = pretrained_head.out_features
model.lm_head = nn.Sequential(
    pretrained_head,
    nn.Linear(vocab_out, vocab_out // 32, bias=False),
    nn.Linear(vocab_out // 32, vocab_out // 64, bias=False),
    nn.Linear(vocab_out // 64, n_labels, bias=False),
)
    

In [103]:
# Point the token embedding at the weight of the original projection, which is
# now stage 0 of the new head, so both share one tensor.
# NOTE(review): if GPT already ties these two weights at construction this is
# a no-op -- confirm in redact.gpt2_model.
model.transformer.wte.weight = model.lm_head[0].weight

In [122]:
# Scratch check of the indices used for the appended head layers (1, 2, 3).
for layer_idx in (1, 2, 3):
    print(layer_idx)

1
2
3


In [123]:
# Re-initialise the three appended projections (head stages 1-3) from a normal
# distribution with mean 0 and std 0.02; stage 0 keeps its pretrained weights.
for head_idx in (1, 2, 3):
    appended_layer = model.lm_head[head_idx]
    torch.nn.init.normal_(appended_layer.weight, mean=0.0, std=0.02)

## Freeze all layers except new head

In [105]:
# Turn off gradients for the whole transformer body and for the original
# vocabulary projection (head stage 0); only head stages 1-3 stay trainable.
model.transformer.requires_grad_(False)
model.lm_head[0].requires_grad_(False)

## Configure optimizer

In [106]:
# AdamW settings: peak learning rate (also the top of the schedule in get_lr),
# decoupled weight decay for the "decay" group, and the two Adam betas.
learning_rate = 6e-5
weight_decay = 1e-2
beta1 = 0.9
beta2 = 0.95

In [107]:
# Split the parameters into two AdamW groups: Linear weights receive weight
# decay, while biases and LayerNorm / Embedding weights do not.
decay = set()
no_decay = set()
whitelist_weight_modules = (torch.nn.Linear, )
blacklist_weight_modules = (torch.nn.LayerNorm, LayerNorm, torch.nn.Embedding)
for mn, m in model.named_modules():
    for pn, p in m.named_parameters():
        # The root module is named '', so only prefix with '<module>.' for real
        # submodules. Otherwise the name gets a leading '.' that never matches
        # a key of model.named_parameters() and the param_dict lookup below
        # fails as soon as any bias parameter exists.
        fpn = f'{mn}.{pn}' if mn else pn
        if pn.endswith('bias'):
            no_decay.add(fpn)
        elif pn.endswith('weight') and isinstance(m, whitelist_weight_modules):
            decay.add(fpn)
        elif pn.endswith('weight') and isinstance(m, blacklist_weight_modules):
            no_decay.add(fpn)
# Head stage 0 shares its weight tensor with transformer.wte (tied above).
# model.named_parameters() reports a shared tensor only once, under the
# embedding's name, so the Linear alias must not remain in the decay set.
decay.remove('lm_head.0.weight')

param_dict = {pn: p for pn, p in model.named_parameters()}
inter_params = decay & no_decay
union_params = decay | no_decay
# Plain {} formatting: applying a ':s' format spec to a set raises TypeError,
# which would mask the assertion message exactly when it is needed.
assert len(inter_params) == 0, f'Parameters {inter_params} made it into both decay/no_decay sets'
assert len(param_dict.keys() - union_params) == 0, f'Parameters {param_dict.keys() - union_params} were not separated into either decay/no_decay set'

# Sorted names give a deterministic parameter order inside each group.
# Frozen parameters are included too; they never receive gradients, so the
# optimizer leaves them untouched.
optim_groups = [
    {'params': [param_dict[pn] for pn in sorted(decay)], 'weight_decay': weight_decay},
    {'params': [param_dict[pn] for pn in sorted(no_decay)], 'weight_decay': 0.0}
]

optimizer = torch.optim.AdamW(
    optim_groups, lr=learning_rate, betas=(beta1, beta2)
)

In [124]:
# Sanity check: the head should hold the original projection plus the three
# appended Linear layers.
len(model.lm_head)

4

## Train

In [108]:
# Evaluation / logging cadence, in training iterations. eval_iters is the
# number of batches averaged per split inside estimate_loss.
eval_interval = 200
eval_iters = 100
log_interval = 100

# Learning-rate schedule consumed by get_lr: linear warmup, cosine decay down
# to min_lr.
warmup_iters = 100
lr_decay_iters = 5000
min_lr = 6e-6

# Loop bookkeeping. iter_num persists across cell runs; best_val_loss is
# initialised here for tracking the best validation loss.
max_iters = 5000
iter_num = 0
best_val_loss = 1e9

In [109]:
def get_lr(it):
    """Return the learning rate for training iteration `it`.

    Schedule (driven by the notebook-level learning_rate, min_lr, warmup_iters
    and lr_decay_iters):
      * it < warmup_iters          -> linear ramp from 0 up to learning_rate
      * warmup .. lr_decay_iters   -> cosine decay from learning_rate to min_lr
      * it > lr_decay_iters        -> constant min_lr
    """
    if it >= warmup_iters:
        if it <= lr_decay_iters:
            # Fraction of the decay phase completed so far, in [0, 1].
            progress = (it - warmup_iters) / (lr_decay_iters - warmup_iters)
            assert 0 <= progress <= 1
            # Half-cosine weight going from 1 (start of decay) to 0 (end).
            cosine = 0.5 * (1.0 + math.cos(math.pi * progress))
            return min_lr + cosine * (learning_rate - min_lr)
        return min_lr
    return learning_rate * it / warmup_iters

In [110]:
# Move the model to the target device (Module.to returns the module itself),
# then wrap it with torch.compile.
model = torch.compile(model.to(device))

In [111]:
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ['train', 'val']:
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss.item()
        out[split] = losses.mean()
    model.train()
    return out

In [112]:
# Training loop: per iteration, set the scheduled learning rate, optionally
# evaluate, run one forward/backward pass on a fresh batch and step AdamW.
# iter_num is kernel-global and is not reset here, so re-running this cell
# resumes from wherever the previous run stopped.
# NOTE(review): best_val_loss is never updated and no checkpoint is written in
# this loop. There is also no gradient clipping, which may be related to the
# loss spikes in the log below -- confirm.
# NOTE(review): the saved log shows "Step" lines every 1000 iterations although
# eval_interval is 200, so the output appears to come from an earlier run.
t0 = time.time()

while True:

    # Get learning rate
    lr = get_lr(iter_num)
    for param_group in optimizer.param_groups:
        param_group['lr'] = lr

    # Eval
    if iter_num % eval_interval == 0:
        losses = estimate_loss()
        print(f"Step {iter_num}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # Get batch
    X, Y = get_batch('train')

    # Forward
    _, loss = model(X, Y)

    # Backward
    loss.backward()

    # Step optimizer
    optimizer.step()
    optimizer.zero_grad(set_to_none=True)

    # Log
    # dt is the wall time since the previous iteration finished, so it also
    # includes any evaluation that ran at the top of this iteration.
    t1 = time.time()
    dt = t1-t0
    t0 = t1

    if iter_num % log_interval == 0:
        lossf = loss.item()
        print(f'Iter {iter_num}: loss {lossf:.4f}, time{dt*1000:.2f}ms')

    # Update iter_num and check for termination conditions
    iter_num +=1
    if iter_num > max_iters:
        break

Step 0: train loss 24.9400, val loss 26.6054
Iter 0: loss 22.2198, time70493.86ms
Iter 100: loss 1.2670, time207.55ms
Iter 200: loss 1.7076, time206.50ms
Iter 300: loss 3.5656, time204.34ms
Iter 400: loss 8.0180, time205.78ms
Iter 500: loss 11.1423, time207.14ms
Iter 600: loss 4.0604, time207.17ms
Iter 700: loss 2.2430, time205.41ms
Iter 800: loss 12.6018, time207.85ms
Iter 900: loss 5.9803, time208.02ms
Step 1000: train loss 50.6278, val loss 48.6745
Iter 1000: loss 39.1062, time61435.46ms
Iter 1100: loss 2.9812, time207.97ms
Iter 1200: loss 0.4911, time208.05ms
Iter 1300: loss 2.2143, time206.36ms
Iter 1400: loss 1.2035, time206.91ms
Iter 1500: loss 1.6806, time205.71ms
Iter 1600: loss 0.5251, time210.84ms
Iter 1700: loss 0.5683, time206.79ms
Iter 1800: loss 1.5743, time207.67ms
Iter 1900: loss 1.9710, time207.70ms
Step 2000: train loss 0.7900, val loss 0.7401
Iter 2000: loss 0.2554, time61354.58ms
Iter 2100: loss 1.4368, time207.28ms
Iter 2200: loss 1.7429, time208.63ms
Iter 2300: l

KeyboardInterrupt: 

If we load a model that was saved with frozen parameters, are they still frozen?

In [2]:
# Hard-coded to the n_labels value displayed earlier in the notebook; the
# execution counts restart here, so this section appears to run in a fresh
# kernel where label2id has not been loaded.
n_labels = 17

In [3]:
# Model init
model_args = dict(n_layer=12, n_head=12, n_embd=768,
                  block_size=1024, dropout=0.0, 
                  vocab_size=None, out_size=n_labels, bias=False)
model_args

{'n_layer': 12,
 'n_head': 12,
 'n_embd': 768,
 'block_size': 1024,
 'dropout': 0.0,
 'vocab_size': None,
 'out_size': 17,
 'bias': False}

In [4]:
# Load the checkpoint saved after finetuning with a frozen backbone and
# display the model arguments stored alongside it.
# NOTE(review): torch.load unpickles the file, so only load checkpoints that
# come from a trusted source.
ckpt_path = os.path.join('/media/nvme2/models/redact', 'ckpt_n2c2_frozen.pt')
checkpoint = torch.load(ckpt_path, map_location='cuda')
checkpoint_model_args = checkpoint['model_args']
checkpoint_model_args


{'n_layer': 12,
 'n_head': 12,
 'n_embd': 768,
 'block_size': 1024,
 'dropout': 0.0,
 'vocab_size': 50304,
 'bias': False}

In [5]:
# vocab_size is copied by hand from the checkpoint's model_args shown above;
# the checkpoint args carry no out_size, so that stays as set in model_args.
model_args['vocab_size'] = 50304

gptconf = GPTConfig(**model_args)
model = GPT(gptconf)

number of parameters: 204.70M


In [6]:
# Strip the '_orig_mod.' prefix (the wrapper attribute name used by
# torch.compile) from the checkpoint keys so they match a plain GPT instance,
# then load the weights.
state_dict = checkpoint['model']
unwanted_prefix = '_orig_mod.'
prefixed_keys = [key for key in state_dict if key.startswith(unwanted_prefix)]
for key in prefixed_keys:
    stripped = key[len(unwanted_prefix):]
    state_dict[stripped] = state_dict.pop(key)
model.load_state_dict(state_dict)

<All keys matched successfully>

In [9]:
# Count the elements of every parameter whose requires_grad flag is off,
# i.e. the parameters that are frozen after loading the checkpoint.
model_parameters = (p for p in model.parameters() if not p.requires_grad)
params = sum(np.prod(p.size()) for p in model_parameters)
print(f"Number of Parameter: {params/1000000:.2f}M")

Number of Parameter: 0.00M


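Nope! None of the loaded parameters has `requires_grad == False`: the state dict stores only tensor values, not the `requires_grad` flags, so the parameters of a freshly constructed model default to trainable and must be frozen again after loading.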