In [1]:
import subprocess
import os
os.environ['CURL_CA_BUNDLE'] = ''

result = subprocess.run('bash -c "source /etc/network_turbo && env | grep proxy"', shell=True, capture_output=True, text=True)
output = result.stdout
for line in output.splitlines():
    if '=' in line:
        var, value = line.split('=', 1)
        os.environ[var] = value

In [2]:
import transformers

In [3]:
transformers.__version__

'4.45.2'

## Basics

- hf（huggingface）中使用 llama
- llama => alpaca
- lora on alpaca
- inference：推理
    - alpaca 标准 prompt 格式

## Load model/tokenizer

- https://github.com/tloen/alpaca-lora/blob/main/generate.py

In [4]:
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig

In [5]:
# import os
# os.environ['HTTP_PROXY'] = 'http://127.0.0.1:7890'
# os.environ['HTTPS_PROXY'] = 'http://127.0.0.1:7890'

### load model

In [6]:
save_path = "../../autodl-fs/model_path"

model = LlamaForCausalLM.from_pretrained("yahma/llama-7b-hf",
    load_in_8bit=True,    # 导致混合精度
    device_map="auto",    # 自动模型并行
    cache_dir=save_path
)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [7]:
model

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32000, 4096, padding_idx=0)
    (layers): ModuleList(
      (0-31): 32 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (k_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (v_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (o_proj): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (up_proj): Linear8bitLt(in_features=4096, out_features=11008, bias=False)
          (down_proj): Linear8bitLt(in_features=11008, out_features=4096, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((4096,), eps=1e-06)
        (post_attention_layernorm

In [8]:
for i, para in enumerate(model.named_parameters()):
    print(f'{i}, {para[0]}\t {para[1].device} \t{para[1].dtype}')
    # print(f'{i}, \t {para[1].device} \t{para[1].dtype}')

0, model.embed_tokens.weight	 cuda:0 	torch.float16
1, model.layers.0.self_attn.q_proj.weight	 cuda:0 	torch.int8
2, model.layers.0.self_attn.k_proj.weight	 cuda:0 	torch.int8
3, model.layers.0.self_attn.v_proj.weight	 cuda:0 	torch.int8
4, model.layers.0.self_attn.o_proj.weight	 cuda:0 	torch.int8
5, model.layers.0.mlp.gate_proj.weight	 cuda:0 	torch.int8
6, model.layers.0.mlp.up_proj.weight	 cuda:0 	torch.int8
7, model.layers.0.mlp.down_proj.weight	 cuda:0 	torch.int8
8, model.layers.0.input_layernorm.weight	 cuda:0 	torch.float16
9, model.layers.0.post_attention_layernorm.weight	 cuda:0 	torch.float16
10, model.layers.1.self_attn.q_proj.weight	 cuda:0 	torch.int8
11, model.layers.1.self_attn.k_proj.weight	 cuda:0 	torch.int8
12, model.layers.1.self_attn.v_proj.weight	 cuda:0 	torch.int8
13, model.layers.1.self_attn.o_proj.weight	 cuda:0 	torch.int8
14, model.layers.1.mlp.gate_proj.weight	 cuda:0 	torch.int8
15, model.layers.1.mlp.up_proj.weight	 cuda:0 	torch.int8
16, model.layers.1

### load tokenizer

In [9]:
tokenizer = LlamaTokenizer.from_pretrained("yahma/llama-7b-hf", cache_dir=save_path)

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565 - if you loaded a llama tokenizer from a GGUF file you can ignore this message


In [10]:
tokenizer

LlamaTokenizer(name_or_path='yahma/llama-7b-hf', vocab_size=32000, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## With lora

- https://github.com/tloen/alpaca-lora

In [18]:
from peft import PeftModel, PeftConfig

下面代码如果遇到 ` __init__() got an unexpected keyword argument 'enable_lora'` 问题，是由于新版本的 peft 不再支持旧版本配置中的部分字段。需要手动打开模型的 `adapter_config.json` 文件，删除 `"enable_lora": null,` 这行。

In [23]:
model = PeftModel.from_pretrained(model, "yahma/alpaca-7b-lora", cache_dir=save_path)

adapter_model.bin:   0%|          | 0.00/67.2M [00:00<?, ?B/s]

In [24]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): LlamaForCausalLM(
      (model): LlamaModel(
        (embed_tokens): Embedding(32000, 4096, padding_idx=0)
        (layers): ModuleList(
          (0-31): 32 x LlamaDecoderLayer(
            (self_attn): LlamaSdpaAttention(
              (q_proj): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=4096, out_features=4096, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=4096, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
     

In [26]:
from peft import mapping
from peft.utils import other

print('model_type', model.config.model_type)
print(model.peft_config['default'].target_modules)

# 默认的 target module
other.TRANSFORMERS_MODELS_TO_LORA_TARGET_MODULES_MAPPING

model_type llama
{'v_proj', 'k_proj', 'q_proj', 'o_proj'}


{'t5': ['q', 'v'],
 'mt5': ['q', 'v'],
 'bart': ['q_proj', 'v_proj'],
 'gpt2': ['c_attn'],
 'bloom': ['query_key_value'],
 'blip-2': ['q', 'v', 'q_proj', 'v_proj'],
 'opt': ['q_proj', 'v_proj'],
 'gptj': ['q_proj', 'v_proj'],
 'gpt_neox': ['query_key_value'],
 'gpt_neo': ['q_proj', 'v_proj'],
 'bert': ['query', 'value'],
 'roberta': ['query', 'value'],
 'xlm-roberta': ['query', 'value'],
 'electra': ['query', 'value'],
 'deberta-v2': ['query_proj', 'value_proj'],
 'deberta': ['in_proj'],
 'layoutlm': ['query', 'value'],
 'llama': ['q_proj', 'v_proj'],
 'chatglm': ['query_key_value'],
 'gpt_bigcode': ['c_attn'],
 'mpt': ['Wqkv'],
 'RefinedWebModel': ['query_key_value'],
 'RefinedWeb': ['query_key_value'],
 'falcon': ['query_key_value'],
 'btlm': ['c_proj', 'c_attn'],
 'codegen': ['qkv_proj'],
 'mistral': ['q_proj', 'v_proj'],
 'mixtral': ['q_proj', 'v_proj'],
 'stablelm': ['q_proj', 'v_proj'],
 'phi': ['q_proj', 'v_proj', 'fc1', 'fc2'],
 'gemma': ['q_proj', 'v_proj'],
 'gemma2': ['q_pro

## Alpaca examples

- https://github.com/tatsu-lab/stanford_alpaca

In [27]:
def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""

In [28]:
generation_config = GenerationConfig(
    temperature=1.5,
    # nucleus sampling
    top_p=0.8,
    num_beams=4,
)

def inference(instruction, input=None):
    prompt = generate_prompt(instruction, input)
    # print(prompt)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].cuda()
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=256
    )
    for s in generation_output.sequences:
        output = tokenizer.decode(s)
        print("Response:", output.split("### Response:")[1].strip())



In [32]:
inference(input("Instruction: "))

Instruction:  tell me a joke about an animal


Response: Why don't elephants go to the movies?

Because they can't sit still for more than 20 minutes!</s>
