In [27]:
!pip list

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Package                        Version
------------------------------ ---------------
absl-py                        2.0.0
accelerate                     0.28.0
addict                         2.4.0
aiohttp                        3.9.3
aiosignal                      1.3.1
aliyun-python-sdk-core         2.15.0
aliyun-python-sdk-kms          2.16.2
anyio                          4.2.0
argon2-cffi                    23.1.0
argon2-cffi-bindings           21.2.0
arrow                          1.3.0
asttokens                      2.4.1
async-lru                      2.0.4
async-timeout                  4.0.3
attrs                          23.2.0
Babel                          2.14.0
beautifulsoup4                 4

In [None]:
!pip install transformers==4.30.0 datasets peft accelerate==0.28.0 bitsandbytes==0.39.0 safetensors
# python 3.8 + torch 2.0.0 + cuda 11.8
#或者 python 3.10 + torch 2.1.0 + cuda 12.1

In [2]:
import os, sys
import torch
import datasets
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    DataCollatorForLanguageModeling,
    DataCollatorForSeq2Seq,
    Trainer,
    TrainingArguments,
    GenerationConfig
)
from peft import PeftModel, LoraConfig, prepare_model_for_kbit_training, get_peft_model

In [3]:
### 模型配置 ###
#model_id = "./llama2/shakechen/Llama-2-7b-hf"
model_id = "./xuanyuan/Duxiaoman-DI/XuanYuan-13B-Chat"
# max_length = 512
# device_map = "auto"
# batch_size = 128
# micro_batch_size = 32
max_length = 512
device_map = "auto"
batch_size = 8
micro_batch_size = 2
gradient_accumulation_steps = batch_size // micro_batch_size

# nf4" use a symmetric quantization scheme with 4 bits precision
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# load model from huggingface
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False,
    device_map=device_map
)

# load tokenizer from huggingface
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

def print_number_of_trainable_model_parameters(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    print(f"trainable model parameters: {trainable_model_params}. All model parameters: {all_model_params} ")
    return trainable_model_params
    
ori_p = print_number_of_trainable_model_parameters(model)

# LoRA config
model = prepare_model_for_kbit_training(model)
peft_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    target_modules=["q_proj", "v_proj"],
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, peft_config)

### compare trainable parameters #
peft_p = print_number_of_trainable_model_parameters(model)
print(f"# Trainable Parameter \nBefore: {ori_p} \nAfter: {peft_p} \nPercentage: {round(peft_p / ori_p * 100, 2)}")

#数据处理
#dataset = datasets.load_dataset('json', data_files='data/databricks.jsonl', split='train')
dataset = datasets.load_dataset('json', data_files='data/fine-tuning-data1.jsonl', split='train')

### generate prompt based on template ###
prompt_template = {
    "prompt_input": \
    "Below is an instruction that describes a task, paired with an input that provides further context.\
    Write a response that appropriately completes the request.\
    \n\n### Instruction:\n{instruction}\n\n### Input:\n{input}\n\n### Response:\n",
    
    "prompt_no_input": \
    "Below is an instruction that describes a task.\
    Write a response that appropriately completes the request.\
    \n\n### Instruction:\n{instruction}\n\n### Response:\n",
    "response_split": "### Response:"
}

def generate_prompt(instruction, input=None, label=None, prompt_template=prompt_template):
    if input:
        res = prompt_template["prompt_input"].format(
            instruction=instruction, input=input)
    else:
        res = prompt_template["prompt_no_input"].format(
            instruction=instruction)
    if label:
        res = f"{res}{label}"
    return res

def tokenize(tokenizer, prompt, max_length=max_length, add_eos_token=False):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=max_length,
        padding=False,
        return_tensors=None)
    
    result["labels"] = result["input_ids"].copy()
    return result
    
def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(
        data_point["instruction"],
        data_point["context"],
        data_point["response"],
    )
    tokenized_full_prompt = tokenize(tokenizer, full_prompt)
    user_prompt = generate_prompt(data_point["instruction"], data_point["context"])
    tokenized_user_prompt = tokenize(tokenizer, user_prompt)
    user_prompt_len = len(tokenized_user_prompt["input_ids"])
    mask_token = [-100] * user_prompt_len
    tokenized_full_prompt["labels"] = mask_token + tokenized_full_prompt["labels"][user_prompt_len:]
    return tokenized_full_prompt
    
dataset = dataset.train_test_split(test_size=16, shuffle=True, seed=42)
cols = ["instruction", "context", "response", "category"]
train_data = dataset["train"].shuffle().map(generate_and_tokenize_prompt, remove_columns=cols)
val_data = dataset["test"].shuffle().map(generate_and_tokenize_prompt, remove_columns=cols,)



Welcome to bitsandbytes. For bug reports, please run

python -m bitsandbytes

 and submit this information together with your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.6
CUDA SETUP: Detected CUDA version 121
CUDA SETUP: Loading binary /root/miniconda3/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda121.so...


  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)
  warn(msg)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

trainable model parameters: 404280320. All model parameters: 6748165120 
trainable model parameters: 6553600. All model parameters: 6754718720 
# Trainable Parameter 
Before: 404280320 
After: 6553600 
Percentage: 1.62


Map:   0%|          | 0/18 [00:00<?, ? examples/s]

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

In [4]:
#模型训练步骤
# args = TrainingArguments(
#     output_dir="./llama-7b-int4-dolly",
#     #output_dir="./xuanyuan-13B-int4",
#     num_train_epochs=20,
#     max_steps=200,
#     fp16=True,
#     optim="paged_adamw_32bit",
#     learning_rate=2e-4,
#     lr_scheduler_type="constant",
#     per_device_train_batch_size=micro_batch_size,
#     gradient_accumulation_steps=gradient_accumulation_steps,
#     gradient_checkpointing=True,
#     group_by_length=False,
#     logging_steps=10,
#     save_strategy="epoch",
#     save_total_limit=3,
#     disable_tqdm=False,
# )
args = TrainingArguments(
    #output_dir="./llama-7b-int4-dolly",
    output_dir="./xuanyuan-13B-int4",
    num_train_epochs=10,
    max_steps=30,
    fp16=True,
    optim="paged_adamw_8bit",
    learning_rate=2e-4,
    lr_scheduler_type="constant",
    per_device_train_batch_size=micro_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    gradient_checkpointing=True,
    group_by_length=False,
    logging_steps=10,
    save_strategy="epoch",
    save_total_limit=3,
    disable_tqdm=False,
)
trainer = Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=args,
    data_collator=DataCollatorForSeq2Seq(
      tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True),
)

# silence the warnings. re-enable for inference!
with torch.no_grad():
    model.config.use_cache = False
    trainer.train()
    model.save_pretrained("xuanyuan-13B-int4")
    #model.save_pretrained("llama-7b-int4-dolly")

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False)
You are using 8-bit optimizers with a version of `bitsandbytes` < 0.41.1. It is recommended to update your version as a major bug has been fixed in 8-bit optimizers.
You're using a LlamaTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn

In [9]:
#模型测试，就可以和没训练的模型进行比较
# model path and weight
model_id = "./llama2/shakechen/Llama-2-7b-hf"
peft_path = "./llama-7b-int4-dolly"
#model_id = "./xuanyuan/Duxiaoman-DI/XuanYuan-13B-Chat"
#peft_path = "./xuanyuan-13B-int4"

# loading model
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    use_cache=False,
    device_map="auto"
)

# loading peft weight
model = PeftModel.from_pretrained(
    model,
    peft_path,
    torch_dtype=torch.bfloat16,
    device_map="auto"
)
model.eval()

# generation config
generation_config = GenerationConfig(
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4, # beam search
)

#generating reply
with torch.no_grad():
    prompt = "什么是币币交易？"
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")
    generation_output = model.generate(
        input_ids=inputs.input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        #max_new_tokens=320,
    )
    print('\nAnswer: ', tokenizer.decode(generation_output.sequences[0]))

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]




Answer:  <s> 什么是币币交易？OKX


In [1]:
#没训练的原始模型
import torch
from transformers import LlamaForCausalLM, LlamaTokenizer
#model_name_or_path = "./llama2/shakechen/Llama-2-7b-hf"
model_name_or_path = "./xuanyuan/Duxiaoman-DI/XuanYuan-13B-Chat"
tokenizer = LlamaTokenizer.from_pretrained(model_name_or_path, use_fast=False, legacy=True)
model = LlamaForCausalLM.from_pretrained(model_name_or_path,torch_dtype=torch.float16, device_map="auto")
inputs = tokenizer("问题：什么是币币交易？", return_tensors="pt").to("cuda")
outputs = model.generate(**inputs, max_new_tokens=1280, repetition_penalty=1.1)
outputs = tokenizer.decode(outputs.cpu()[0][len(inputs.input_ids[0]):], skip_special_tokens=True)
print(outputs)

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]


回答：在区块链领域，"币币交易"指的是一种特殊类型的数字货币兑换方式。这个过程涉及到两种或多种不同的加密货币之间进行直接转移和互相兑换。与传统金融市场中使用法定货币（如美元、欧元等）作为媒介来买卖其他资产形成鲜明对比的是，“币币”交易完全绕开了任何实体货币参与，而仅依赖于各自独立存在且具有价值属性的虚拟代码—即所谓的 "去中心化" 数字货币本身就可以被视为支付手段并执行购物消费功能。因此, 币币交易也常称为 “P2P 交易”, 即点对点(Peer-to-peer) 的电子现金系统模式下发生的事件。
