In [1]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )    

In [2]:
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import PeftModel

MODEL_ID = "LoftQ/Mistral-7B-v0.1-4bit-64rank"

tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
tokenizer.add_special_tokens({'pad_token': '[PAD]'})

base_model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID, 
    torch_dtype=torch.bfloat16, 
    attn_implementation="flash_attention_2",
    quantization_config=BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_compute_dtype=torch.bfloat16,  
        bnb_4bit_use_double_quant=False,
        bnb_4bit_quant_type='nf4',
    ),
    device_map={"":0}
)
base_model.resize_token_embeddings(len(tokenizer))

peft_model = PeftModel.from_pretrained(
    base_model,
    MODEL_ID,
    subfolder="loftq_init",
    is_trainable=True,
)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [11]:
from datamodule import datamodule

#path = ["/home/elicer/M-LLM/data/BoolQA.csv", "/home/elicer/M-LLM/data/NLI_CB.csv", "/home/elicer/M-LLM/data/sc_amazon.csv"]
path = "/home/elicer/M-LLM/data/BoolQA.csv"

train_dataset, val_dataset, test_dataset = datamodule.preprare_dataset(path)
# train_dataset = train_dataset.map(lambda samples: tokenizer(samples["text"]), batched=True)
# val_dataset = val_dataset.map(lambda samples: tokenizer(samples["text"]), batched=True)
# test_dataset = test_dataset.map(lambda samples: tokenizer(samples["text"]), batched=True)

100%|██████████| 2000/2000 [00:00<00:00, 15134.68ex/s]
100%|██████████| 100/100 [00:00<00:00, 16078.75ex/s]
100%|██████████| 100/100 [00:00<00:00, 16258.88ex/s]


In [4]:
from datasets import Dataset

text_data = {'text': train_dataset['text']}
train_dataset = Dataset.from_dict(text_data)
train_dataset = train_dataset.map(lambda samples: tokenizer(samples["text"]), batched=True)

  0%|          | 0/2 [00:00<?, ?ba/s]

In [5]:
import wandb
import os

wandb.login()
os.environ["WANDB_PROJECT"]="M-LLM"

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mrion_[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [9]:
import transformers

args = transformers.TrainingArguments(
    num_train_epochs = 1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=4,
    learning_rate=5e-5,
    fp16=True,
    logging_steps=10,
    output_dir="outputs",
    optim="sgd",
    report_to="wandb",
    run_name="Mistral-BoolQA",
    lr_scheduler_type="cosine",
)

trainer = transformers.Trainer(
    model=peft_model,
    train_dataset=train_dataset,
    args=args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
peft_model.config.use_cache = False  # silence the warnings. Please re-enable for inference!
trainer.train()
wandb.finish()

Step,Training Loss
10,2.0883
20,2.272
30,2.0697
40,2.1489
50,2.2498
60,2.0523
70,2.1259
80,2.1132
90,2.0357
100,2.1493


VBox(children=(Label(value='0.001 MB of 0.001 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
train/epoch,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇████
train/learning_rate,███████▇▇▇▇▇▆▆▆▆▅▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
train/loss,▄█▄▅▃▅▅▃▇▁▇▄▆▅▆▄▃▆▃▆▅▆▃▄▆▄▅▇▅▄▅▄▃▇▅▄▆▅▅▅
train/total_flos,▁
train/train_loss,▁
train/train_runtime,▁
train/train_samples_per_second,▁
train/train_steps_per_second,▁

0,1
train/epoch,1.0
train/global_step,500.0
train/learning_rate,0.0
train/loss,2.1291
train/total_flos,1.7327453172006912e+16
train/train_loss,2.12541
train/train_runtime,1045.7157
train/train_samples_per_second,1.913
train/train_steps_per_second,0.478


In [11]:
peft_model.push_to_hub("JD97/BoolQA")

adapter_model.safetensors:   0%|          | 0.00/671M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/JD97/BoolQA/commit/bf9d1facb2ece0a13276581f84d8e564f9cbe7cd', commit_message='Upload model', commit_description='', oid='bf9d1facb2ece0a13276581f84d8e564f9cbe7cd', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
trainer.evaluate(test_dataset)

In [7]:

def lora_reassign_weights(model, state_dict, r, lora_alpha, fan_in_fan_out=False, merge=True):
    is_merged = getattr(model, "is_merged", False)
    assert is_merged != merge, f'{is_merged} != {merge}: if is_merged, then must be unmerge; if not is_merged, then must merge'
    named_params = [(n, p) for n, p in model.named_parameters()]
    scaling = lora_alpha / r
    print(f'Lora configs: alpha={lora_alpha}, r={r}, scaling={scaling}')
    state_dict = {k.replace("base_model.model.", ""): v for k, v in state_dict.items()}
    replaced = set()
    merged_names = {
        # these are projector weights that got combined into single matrix in vllm
        "qkv_proj": ["q_proj", "k_proj", "v_proj"],
        "gate_up_proj": ["gate_proj", "up_proj"]
    }
    non_merged_names = ['o_proj', 'down_proj']
    for name, param in named_params:
        param.requires_grad = False
        if "_proj.weight" not in name:
            continue
        for wn, wn_series in merged_names.items():
            if name.endswith(f"{wn}.weight"):
                for stride_id, att_weight_name in enumerate(wn_series):
                    lora_a = name.replace(f"{wn}.weight", f"{att_weight_name}.lora_A.weight")
                    lora_b = name.replace(f"{wn}.weight", f"{att_weight_name}.lora_B.weight")
                    shard_size = param.shape[0] // len(wn_series)
                    if lora_a in state_dict:
                        assert lora_b in state_dict, f'{lora_b} not in state_dict'
                        assert state_dict[lora_b].shape[1] == r, f'{r=} != {state_dict[lora_b].shape}'
                        matrix = transpose(state_dict[lora_b] @ state_dict[lora_a], fan_in_fan_out) * scaling
                        assert param.data[shard_size * stride_id:shard_size * (stride_id + 1)].shape == matrix.shape
                        if merge:
                            param.data[shard_size * stride_id:shard_size * (stride_id + 1)] += matrix
                        else:
                            param.data[shard_size * stride_id:shard_size * (stride_id + 1)] -= matrix
                        replaced.add(lora_a)
                        replaced.add(lora_b)
        for wn in non_merged_names:
            if name.endswith(f"{wn}.weight"):
                lora_a = name.replace(f"{wn}.weight", f"{wn}.lora_A.weight")
                lora_b = name.replace(f"{wn}.weight", f"{wn}.lora_B.weight")
                if lora_a in state_dict:
                    assert lora_b in state_dict
                    matrix = transpose(state_dict[lora_b] @ state_dict[lora_a], fan_in_fan_out) * scaling
                    assert param.data.shape == matrix.shape, f'invalid shape: {name} {param.data.shape} != {matrix.shape}'
                    if merge:
                        param.data += matrix
                    else:
                        param.data -= matrix
                    replaced.add(lora_a)
                    replaced.add(lora_b)
    no_replaced = [k for k in state_dict.keys() if k not in replaced]
    assert len(no_replaced) == 0, f'some lora states not loaded, check again!: {no_replaced}'
    model.is_merged = merge


def lora_merge_unmerge_state_dict(llm, state_dict, peft_config, merge=True):
    # merge lora states to weights
    for worker in llm.llm_engine.workers:
        lora_reassign_weights(worker.model, state_dict, 
            r=peft_config.r, 
            lora_alpha=peft_config.lora_alpha, 
            fan_in_fan_out=peft_config.fan_in_fan_out, 
            merge=merge
        )

In [6]:
from vllm import LLM, SamplingParams

model_id = "LoftQ/Mistral-7B-v0.1-4bit-64rank"
peft_id = "JD97/BoolQA"

llm = LLM(model=model_id)
adapter_state_dict = load_peft_weights(peft_id)
config = PeftConfig(peft_id)
lora_merge_unmerge_state_dict(llm, adapter_state_dict, config, merge=True)

2024-01-06 19:09:11,766	INFO util.py:154 -- Missing packages: ['ipywidgets']. Run `pip install -U ipywidgets`, then restart the notebook server for rich notebook output.


INFO 01-06 19:09:11 llm_engine.py:70] Initializing an LLM engine with config: model='LoftQ/Mistral-7B-v0.1-4bit-64rank', tokenizer='LoftQ/Mistral-7B-v0.1-4bit-64rank', tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, quantization=None, enforce_eager=False, seed=0)
INFO 01-06 19:09:25 llm_engine.py:275] # GPU blocks: 8145, # CPU blocks: 2048
INFO 01-06 19:09:27 model_runner.py:501] Capturing the model for CUDA graphs. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
INFO 01-06 19:09:27 model_runner.py:505] CUDA graphs can take additional 1~3 GiB memory per GPU. If you are running out of memory, consider decreasing `gpu_memory_utilization` or enforcing eager mode.
INFO 01-06 19:09:32 model_runner.py:547] Graph capturing finished in 5 secs.

In [40]:
# prompts = [
#     "Hello, my name is",
#     "The capital of France is",
#     "The future of AI is",
# ]

prompts = [train_dataset[73]['text'][:-15]]

sampling_params = SamplingParams(temperature=0, top_k=-1)

outputs = llm.generate(prompts, sampling_params)

# for output in outputs:
#     prompt = output.prompt
#     generated_text = output.outputs[0].text
#     print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Generated text: {generated_text!r}")

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts: 100%|██████████| 1/1 [00:00<00:00,  2.72it/s]

Generated text: ' option:\n      a. true\n      b. false\n      c.'





In [1]:
import torch
import transformers
from datasets import load_dataset
from peft import LoraConfig, PeftConfig, PeftModel, get_peft_model, load_peft_weights
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# model_id = "LoftQ/Mistral-7B-v0.1-4bit-64rank"  

# tokenizer = AutoTokenizer.from_pretrained(model_id)
# model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float16)

# peft_model_id = "JD97/BoolQA"
# config = PeftConfig.from_pretrained(peft_model_id)
# model = PeftModel.from_pretrained(model, peft_model_id, torch_dtype=torch.float16).to("cuda")
# model.eval()

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
adapter_state_dict = load_peft_weights("JD97/BoolQA")

In [None]:
lora_state_dict = torch.load("lora_states.pt")['module']
lora_merge_unmerge_state_dict(llm, lora_state_dict, merge=True)

In [None]:
Load original llama to vllm with llm = LLM("llama-7b") ...
Load lora states dict lora_state_dict = torch.load("lora_states.pt")['module'].
Merge lora states to llm do lora_merge_unmerge_state_dict(llm, lora_state_dict, merge=True)
Do whatever inference job with llm ...
To unmerge and obtain the original llama, run lora_merge_unmerge_state_dict(llm, lora_state_dict, merge=False)

In [None]:
from vllm import LLM, SamplingParams

# Create an LLM.
llm = LLM(model="facebook/opt-125m", gpu_memory_utilization=0.05)

# Add LoRA adapter
lora.LoRAModel.from_pretrained(llm.llm_engine.workers[0].model, "edbeeching/opt-125m-imdb-lora")

prompts = [
    "Hello, my name is",
    "The capital of France is",
    "The future of AI is",
]

sampling_params = SamplingParams(temperature=0, top_k=-1)

outputs = llm.generate(prompts, sampling_params)

for output in outputs:
    prompt = output.prompt
    generated_text = output.outputs[0].text
    print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")