初始化

In [1]:
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))
import time
import pytorch_lightning as pl
import torch
from model.model_interface import LLM
import torch.utils.data as tud
from torch.utils.data import DataLoader
from lightning.pytorch.loggers import WandbLogger
from tqdm.notebook import tqdm
from utils.my_utils import *
import torch.nn.functional as F
import random
import regex as re
from dataset import *
import ipywidgets as widgets
from IPython.display import display
from typing import Union, List

pl.seed_everything(42)
torch.set_float32_matmul_precision('medium')
os.environ['TOKENIZERS_PARALLELISM'] = 'true'


model_list = list({
    "gpt2": "/nvme/guoyiqiu/coding/huggingface/hub/models--gpt2/snapshots/e7da7f221d5bf496a48136c0cd264e630fe9fcc8",
    "gpt2_xl": "/nvme/guoyiqiu/coding/huggingface/hub/models--gpt2-xl/snapshots/33cdb5c0db5423c1879b1b9f16c352988e8754a8",
    "llama_7b": "/nvme/share/guoyiqiu/llama-7b",
    "llama_13b": "/nvme/share/guoyiqiu/llama-13b",
    "vicuna_7b": "/nvme/share/guoyiqiu/vicuna-7b",
    "vicuna_13b": "/nvme/share/guoyiqiu/vicuna-13b-v1.1",
}.items())

llm_config = {
    "optimizer": "adamw",
    "lr": 1e-4,
}

hook_config = {
    "retain_output": True,
    "retain_input": False,
    "edit_output": None,
    "clone": True,
    "float": True,
    "detach": True,
    "device": "cpu"
}

def init_mt():
    global mt
    mt = LLM(model_name=mt_dropdown.value, 
             fp16=precision_tbtn.value == "half", 
             from_pretrained=from_pretrained_cb.value,
             **llm_config)


def init_modules():
    global n_layer
    global lm_head
    global embedding
    global ln_f
    global blocks
    global ATTN
    global MLP
    global LN1
    global LN2
    if "gpt2" in mt.model.__class__.__name__.lower():
        # gpt2 config
        n_layer = mt.model.config.num_hidden_layers
        lm_head = mt.model.lm_head
        embedding = mt.model.transformer.wte
        ln_f = mt.model.transformer.ln_f
        blocks = mt.model.transformer.h
        ATTN = 'attn'
        MLP = 'mlp'
        LN1 = 'ln_1'
        LN2 = 'ln_2'
    elif "llama" in mt.model.__class__.__name__.lower():
        # llama config
        n_layer = mt.model.config.num_hidden_layers
        lm_head = mt.model.lm_head
        embedding = mt.model.model.embed_tokens
        ln_f = mt.model.model.norm
        blocks = mt.model.model.layers
        ATTN = 'self_attn'
        MLP = 'mlp'
        LN1 = 'input_layernorm'
        LN2 = 'post_self_attn_layernorm'
        


def init_hook(mt):
    mt.clear_hook()
    for i in range(n_layer):
        mt.add_hook(module=blocks[i], name=f"block_{i}", **hook_config)
        mt.add_hook(module=getattr(blocks[i], ATTN), name=f"attn_{i}", **hook_config)
        mt.add_hook(module=getattr(blocks[i], MLP), name=f"mlp_{i}", **hook_config)


def setup(btn):
    time_st = time.time()
    btn.description = "Loading model..."
    init_mt()
    btn.description = "init modules..."
    init_modules()
    btn.description = "init hooks..."
    init_hook(mt)
    btn.description = "Everything is ready."
    device_tbtn.value = 'cpu'
    print(f"Time cost: {time.time() - time_st:.2f}s")

# setup widgets


# model dropdown
mt_dropdown = widgets.Dropdown(
    options=model_list,
    description='Model:',
    disabled=False,
)

from_pretrained_cb = widgets.Checkbox(
    value=True,
    description='from pretrained',
)

# setup button
setup_btn = widgets.Button(
    description="Setup everything",
    disabled=False,
)
setup_btn.on_click(setup)

# switch deivce
device_tbtn = widgets.ToggleButtons(
    options=['cpu', f'cuda',],
    disabled=False,
)


def switch_device(change):
    device_tbtn.disabled = True
    mt.model.to(change.new)
    torch.cuda.empty_cache() if change.new == 'cpu' else None
    device_tbtn.disabled = False


device_tbtn.observe(switch_device, names='value')

# switch precision

precision_tbtn = widgets.ToggleButtons(
    options=['float', 'half'],
    disabled=False,
)


def switch_precision(change):
    precision_tbtn.disabled = True
    if mt is not None:
        mt.model = mt.model.half() if change.new == 'half' else mt.model.float()
        init_modules()
    precision_tbtn.disabled = False


precision_tbtn.observe(switch_precision, names='value')


mnt_slider = widgets.IntSlider(
    value=128,
    min=1,
    max=512,
    step=1,
    description='new token:',
    disabled=False,
    continuous_update=False,
    orientation='horizontal',
    readout=True,
    readout_format='d',
)

input_textarea = widgets.Textarea(
    value='',
    description='Input:',
    layout=widgets.Layout(width='30%', height='250px'),
    disabled=False
)
output_textarea = widgets.Textarea(
    value='',
    description='Output:',
    layout=widgets.Layout(width='30%', height='250px'),
    disabled=False
)

submit_btn = widgets.Button(
    description="generate",
    disabled=False,
)


def generate(btn):
    input_text = input_textarea.value
    max_new_tokens = mnt_slider.value
    btn.disabled = True
    submit_btn.description = "Generating..."
    result = mt.generate(input_text, max_new_tokens=max_new_tokens)
    btn.disabled = False
    submit_btn.description = "generate"
    output_text = result[0]
    output_textarea.value = output_text


submit_btn.on_click(generate)

control_panel = widgets.HBox([mt_dropdown, from_pretrained_cb ,setup_btn, precision_tbtn, device_tbtn])
talk_panel = widgets.HBox([input_textarea, widgets.VBox([mnt_slider, submit_btn]), output_textarea])
all_panel = widgets.VBox([control_panel, talk_panel])
display(all_panel)

Global seed set to 42


VBox(children=(HBox(children=(Dropdown(description='Model:', options=(('gpt2', '/nvme/guoyiqiu/coding/huggingf…

Time cost: 1.84s


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Time cost: 56.41s


LORA Tune MedQA

In [5]:
from peft import get_peft_model, LoraConfig, TaskType

peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=
)

mt.model = get_peft_model(mt.model, peft_config)
mt.model.print_trainable_parameters()

trainable params: 4194304 || all params: 6742609920 || trainable%: 0.06220594176090199


In [17]:
from transformers import AutoModel
glm = AutoModel.from_pretrained('THUDM/glm-2b',trust_remote_code=True)


Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.
Some weights of the model checkpoint at THUDM/glm-2b were not used when initializing GLMModel: ['out_proj.weight', 'out_proj.bias', 'dense.bias', 'dense.weight']
- This IS expected if you are initializing GLMModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GLMModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
import os

GLMModel(
  (word_embeddings): VocabEmbedding()
  (transformer): GLMStack(
    (embedding_dropout): Dropout(p=0.1, inplace=False)
    (position_embeddings): Embedding(1025, 2048)
    (block_position_embeddings): Embedding(1025, 2048)
    (layers): ModuleList(
      (0-35): 36 x GLMBlock(
        (input_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (attention): SelfAttention(
          (query_key_value): Linear(in_features=2048, out_features=6144, bias=True)
          (attention_dropout): Dropout(p=0.1, inplace=False)
          (dense): Linear(in_features=2048, out_features=2048, bias=True)
          (output_dropout): Dropout(p=0.1, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((2048,), eps=1e-05, elementwise_affine=True)
        (mlp): MLP(
          (dense_h_to_4h): Linear(in_features=2048, out_features=8192, bias=True)
          (dense_4h_to_h): Linear(in_features=8192, out_features=2048, bias=True)
          (dropout): Dropout(p=

In [None]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=
)

md = get_peft_model(glm, peft_config)
md.print_trainable_parameters()

In [9]:
import torch

bin = torch.load('/nvme/share/guoyiqiu/lora_Book_7B_0508/checkpoint-5200/pytorch_model.bin')


In [13]:
for (k,v) in bin.items():
    print(k)
    print(v.shape)
    print(v.dtype)

base_model.model.model.layers.0.self_attn.q_proj.lora_A.weight
torch.Size([8, 4096])
torch.float32
base_model.model.model.layers.0.self_attn.q_proj.lora_B.weight
torch.Size([4096, 8])
torch.float32
base_model.model.model.layers.0.self_attn.v_proj.lora_A.weight
torch.Size([8, 4096])
torch.float32
base_model.model.model.layers.0.self_attn.v_proj.lora_B.weight
torch.Size([4096, 8])
torch.float32
base_model.model.model.layers.1.self_attn.q_proj.lora_A.weight
torch.Size([8, 4096])
torch.float32
base_model.model.model.layers.1.self_attn.q_proj.lora_B.weight
torch.Size([4096, 8])
torch.float32
base_model.model.model.layers.1.self_attn.v_proj.lora_A.weight
torch.Size([8, 4096])
torch.float32
base_model.model.model.layers.1.self_attn.v_proj.lora_B.weight
torch.Size([4096, 8])
torch.float32
base_model.model.model.layers.2.self_attn.q_proj.lora_A.weight
torch.Size([8, 4096])
torch.float32
base_model.model.model.layers.2.self_attn.q_proj.lora_B.weight
torch.Size([4096, 8])
torch.float32
base_model

In [6]:
mt.model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): PeftModelForCausalLM(
      (base_model): LoraModel(
        (model): LlamaForCausalLM(
          (model): LlamaModel(
            (embed_tokens): Embedding(32000, 4096, padding_idx=0)
            (layers): ModuleList(
              (0-31): 32 x LlamaDecoderLayer(
                (self_attn): LlamaAttention(
                  (q_proj): Linear(
                    in_features=4096, out_features=4096, bias=False
                    (lora_dropout): ModuleDict(
                      (default): Dropout(p=0.1, inplace=False)
                    )
                    (lora_A): ModuleDict(
                      (default): Linear(in_features=4096, out_features=8, bias=False)
                    )
                    (lora_B): ModuleDict(
                      (default): Linear(in_features=8, out_features=4096, bias=False)
                    )
                  )
                  (k_proj): Linear(in_features=4096, out_features=4096,

Lite Tune

In [6]:
def freeze_all(model):
    for param in model.parameters():
        param.requires_grad = False

def unfreeze_all(model):
    for param in model.parameters():
        param.requires_grad = True

def set_module_requires_grad(model, layers: Union[int, List[int]], names: Union[str, List[str]], requires_grad: bool):
    layers = [layers] if isinstance(layers, int) else layers
    names = [names] if isinstance(names, str) else names
    for layer in layers:
        for name in names:
            assert name in [ATTN, MLP, LN1, LN2]
            module = getattr(blocks[layer], name)
            for param in module.parameters():
                param.requires_grad = requires_grad

def my_training_step(self, batch, batch_idx):
    '''batch: (input_ids, attention_mask, labels) **padding already** '''
    input_ids, attention_mask, labels = batch
    input_ids = input_ids.unsqueeze(0) if len(input_ids.shape) == 1 else input_ids
    attention_mask = attention_mask.unsqueeze(0) if len(attention_mask.shape) == 1 else attention_mask
    labels = labels.unsqueeze(0) if len(labels.shape) == 1 else labels
    gt_id = labels[0, -1].item()

    bsz = input_ids.shape[0]
    assert bsz == 1
    
    def set_require_grad(module, input, output):
        ''' output: (bsz, seq_len, hidden_size) '''
        with torch.no_grad():
            topk_logits, topk_indices = torch.topk(lm_head(ln_f(output[0])), k=10, dim=-1) # [bsz, seq_len, k]
        is_important = gt_id in topk_indices
        if is_important:
            print(f'{module.name} is_important')
            # for param in module.parameters():
            #     param.requires_grad = True
        return output
    
    self.clear_hook()
    hook_config = {
        "retain_output": False,
        "retain_input": False,
        "edit_output": set_require_grad,
        "clone": False,
        "float": False,
        "detach": False,
        "device": "cpu"
    }
    for i in range(n_layer):
        self.add_hook(module=getattr(blocks[i], ATTN), name=f"attn_{i}", **hook_config)
    print(batch_idx)
    res = self(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
    
    lm_logits = res['logits']
    shift_logits = lm_logits[..., :-1, :].contiguous()  # Shift so that tokens < n predict n
    shift_labels = labels[..., 1:].contiguous()

    if isinstance(res.get('loss'), torch.Tensor):
        loss = res['loss']
    else:
        loss = self.loss_func(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))

    acc = self._acc(shift_logits, shift_labels)

    self.log('train_loss', loss, on_step=True, on_epoch=True, sync_dist=True, prog_bar=True)
    self.log('train_acc', acc, on_step=False, sync_dist=True, on_epoch=True, prog_bar=True)
    print("\n")
    return loss


In [19]:
bsz = 32

# train_dst = MedQA('/nvme/guoyiqiu/coding/datasets/MedQA/data_clean/questions/US/train.jsonl',tokenizer=mt.tokenizer, max_len=512,size=1000)
train_dst = CounterFact('/nvme/guoyiqiu/coding/datasets/rome datasets/counterfact.json', mt.tokenizer,size=1000)
train_dl = DataLoader(train_dst, batch_size=bsz, shuffle=True, collate_fn=train_dst.collate_fn, num_workers=1)

100%|██████████| 1000/1000 [00:00<00:00, 2887.15it/s]

Loaded dataset with 1000 elements





In [21]:
trainer_config = {
    "precision": "16-mixed",
    "accelerator": "auto",
    "devices": [7],
    "enable_checkpointing":False,
    # 'accumulate_grad_batches': 8,
    "max_epochs":20,
}

# mt.clear_hook()
# mt.set_func('training_step', my_training_step)
# freeze_all(mt.model)
# set_module_requires_grad(mt.model, list(range(n_layer)), ATTN, True)
# trainer = pl.Trainer(**trainer_config, logger=WandbLogger(project='tune medqa', name='litetune_5ep_vicuna7b'))
trainer = pl.Trainer(**trainer_config)
trainer.fit(mt, train_dl)

Using 16bit Automatic Mixed Precision (AMP)
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3,4,5,6,7]

  | Name  | Type                 | Params
-----------------------------------------------
0 | model | PeftModelForCausalLM | 1.6 B 
-----------------------------------------------
2.5 M     Trainable params
1.6 B     Non-trainable params
1.6 B     Total params
6,240.275 Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

In [22]:
trainer.save_checkpoint('./lora_gpt2xl_counterfact_50ep.ckpt')