In [1]:
import os
from datasets import load_dataset
path = os.path.join("..","datasets")
data = load_dataset("json",data_files={"train":os.path.join(path,"final_data.jsonl")})

In [2]:
from transformers import AutoTokenizer
basemodel = "baichuan-inc/Baichuan2-7B-Chat"
tokenzier = AutoTokenizer.from_pretrained(basemodel,model_max_length = 512,trust_remote_code=True)
tokenzier.pad_token = tokenzier.eos_token
tokenzier.padding_side = "right"

In [3]:
data['train']

Dataset({
    features: ['question', 'answer'],
    num_rows: 160
})

In [4]:

def preprocess_function(examples):
    inputs = ["#" + examples['question'][i] + examples['answer'][i] for i in range(len(examples['question']))]
    model_inputs = tokenzier(inputs,padding="max_length",truncation=True)  
    return model_inputs

In [5]:
tokenzied_data = data.map(preprocess_function,batched=True,remove_columns=['question','answer'])

In [6]:
''.join(tokenzier.batch_decode(tokenzied_data['train']['input_ids'][0]))

'#请问可能是什么原因造成的什么状态栏，你那个是鼠标的圈还是再次重启也一样吗鼠标的圈对，重启了几次wifi不能连具体指的是什么症状状态栏点不动，点开了也没有任何WiFi选项，跟没插网卡一样，只有个飞行模式那你的壁纸是本来就是黑的吗壁纸没问题，就是状态栏这一行又重启了几次，还用电脑管家扫了，再重启不知道为啥就正常了看看设备管理器\n</s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></s></

In [7]:
block_size = 512
def group_texts(examples):
    examples["labels"] = examples["input_ids"].copy()
    return examples

In [8]:
tokenzied_data = tokenzied_data.map(group_texts,batched=True)

In [9]:
from transformers import AutoModelForCausalLM,TrainingArguments, Trainer
from peft import (
    LoraConfig,
    get_peft_model,
    prepare_model_for_int8_training,
    PeftType,
    TaskType
)
import torch
peft_type = PeftType.LORA
config = LoraConfig(
        r=8,
        lora_alpha=16,
        inference_mode=False,
        lora_dropout=0.1,
        task_type=TaskType.CAUSAL_LM,
        target_modules=[
        "o_proj",
        "gate_proj",
    ],
    )

In [10]:
from transformers import BitsAndBytesConfig
babcfig = BitsAndBytesConfig(load_in_4bit=True,llm_int4_enable_fp32_cpu_offload=True)
model = AutoModelForCausalLM.from_pretrained(basemodel,
        device_map = "cuda:0",
        quantization_config = babcfig,
        trust_remote_code=True)

#model.resize_token_embeddings(len(tokenzier))
model = prepare_model_for_int8_training(model)
model = get_peft_model(model, config)
model.print_trainable_parameters()


Xformers is not installed correctly. If you want to use memory_efficient_attention to accelerate training use the following command to install Xformers
pip install xformers.
You are using an old version of the checkpointing format that is deprecated (We will also silently ignore `gradient_checkpointing_kwargs` in case you passed it).Please update to the new format on your modeling file. To use the new format, you need to completely remove the definition of the method `_set_gradient_checkpointing` in your model.


trainable params: 5,963,776 || all params: 7,511,937,024 || trainable%: 0.07939065491292382


In [11]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [12]:
num_epochs = 20
training_args = TrainingArguments(
    output_dir="BaichuanFor_ITxia_chat_help",
    save_strategy = "epoch",
    #evaluation_strategy = "epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    num_train_epochs=num_epochs,
    warmup_steps=0.06 * (len(tokenzied_data['train']) * num_epochs),
    fp16=True,
    push_to_hub = True,
    logging_strategy="steps",
    logging_steps=10,
    save_total_limit=1,
    #load_best_model_at_end=True,
    report_to='none'
)

In [13]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenzier,mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenzied_data["train"],
    tokenizer=tokenzier,
    data_collator=data_collator,
    #eval_dataset = tokenzied_data['test']
)

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
100,5.3246
200,4.4192
300,3.5283
400,2.8607


Checkpoint destination directory BaichuanFor_ITxia_chat_help/checkpoint-100 already exists and is non-empty. Saving will proceed but saved results may be invalid.


TrainOutput(global_step=400, training_loss=4.033191986083985, metrics={'train_runtime': 2182.1914, 'train_samples_per_second': 1.466, 'train_steps_per_second': 0.183, 'total_flos': 6.87841562591232e+16, 'train_loss': 4.033191986083985, 'epoch': 20.0})

In [14]:
model.save_pretrained("ljcoutput")

In [19]:
model = model.merge_and_unload()

In [20]:
model.push_to_hub("Jchew/BaichuanFor_ITxia_chat_help")
tokenzier.push_to_hub("Jchew/BaichuanFor_ITxia_chat_help")

model-00002-of-00002.safetensors:   0%|          | 0.00/2.77G [00:00<?, ?B/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

CommitInfo(commit_url='https://huggingface.co/Jchew/BaichuanFor_ITxia_chat_help/commit/d4302b702de8ba55f0ef4e5168cb48da4094e6ef', commit_message='Upload tokenizer', commit_description='', oid='d4302b702de8ba55f0ef4e5168cb48da4094e6ef', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
model.push_to_hub("Jchew/BaichuanFor_ITxia_chat_help2")
tokenzier.push_to_hub("Jchew/BaichuanFor_ITxia_chat_help2")

model-00001-of-00002.safetensors:   0%|          | 0.00/4.99G [00:00<?, ?B/s]

Upload 2 LFS files:   0%|          | 0/2 [00:00<?, ?it/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.77G [00:00<?, ?B/s]

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/2.00M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Jchew/BaichuanFor_ITxia_chat_help2/commit/21bf8cf8cb7a5ebfefe148f5e044fbd9ce70ebd8', commit_message='Upload tokenizer', commit_description='', oid='21bf8cf8cb7a5ebfefe148f5e044fbd9ce70ebd8', pr_url=None, pr_revision=None, pr_num=None)