From 3d5b4f32a43e02b2bfcab2954bb4b60b7aeec27c Mon Sep 17 00:00:00 2001 From: LZHgrla Date: Wed, 6 Sep 2023 18:47:23 +0800 Subject: [PATCH 1/5] add Baichuan2 7b-chat, 13b-base, 13b-chat --- .../baichuan2_13b_base_qlora_alpaca_e3.py | 180 +++++++++++++++ ...baichuan2_13b_base_qlora_alpaca_enzh_e3.py | 198 ++++++++++++++++ ...n2_13b_base_qlora_alpaca_enzh_oasst1_e3.py | 211 +++++++++++++++++ .../baichuan2_13b_base_qlora_alpaca_zh_e3.py | 180 +++++++++++++++ ...chuan2_13b_base_qlora_arxiv_gentitle_e3.py | 215 ++++++++++++++++++ ...baichuan2_13b_base_qlora_code_alpaca_e3.py | 184 +++++++++++++++ .../baichuan2_13b_base_qlora_colorist_e5.py | 180 +++++++++++++++ .../baichuan2_13b_base_qlora_lawyer_e3.py | 206 +++++++++++++++++ .../baichuan2_13b_base_qlora_oasst1_512_e3.py | 180 +++++++++++++++ .../baichuan2_13b_base_qlora_oasst1_e3.py | 180 +++++++++++++++ ...ichuan2_13b_base_qlora_open_platypus_e3.py | 180 +++++++++++++++ .../baichuan2_13b_base_qlora_sql_e3.py | 184 +++++++++++++++ .../baichuan2_13b_chat_qlora_alpaca_e3.py | 180 +++++++++++++++ ...baichuan2_13b_chat_qlora_alpaca_enzh_e3.py | 198 ++++++++++++++++ ...n2_13b_chat_qlora_alpaca_enzh_oasst1_e3.py | 211 +++++++++++++++++ .../baichuan2_13b_chat_qlora_alpaca_zh_e3.py | 180 +++++++++++++++ ...baichuan2_13b_chat_qlora_code_alpaca_e3.py | 184 +++++++++++++++ .../baichuan2_13b_chat_qlora_lawyer_e3.py | 206 +++++++++++++++++ .../baichuan2_13b_chat_qlora_oasst1_512_e3.py | 180 +++++++++++++++ .../baichuan2_13b_chat_qlora_oasst1_e3.py | 180 +++++++++++++++ ...ichuan2_13b_chat_qlora_open_platypus_e3.py | 180 +++++++++++++++ .../baichuan2_7b_chat_qlora_alpaca_e3.py | 180 +++++++++++++++ .../baichuan2_7b_chat_qlora_alpaca_enzh_e3.py | 198 ++++++++++++++++ ...an2_7b_chat_qlora_alpaca_enzh_oasst1_e3.py | 211 +++++++++++++++++ .../baichuan2_7b_chat_qlora_alpaca_zh_e3.py | 180 +++++++++++++++ .../baichuan2_7b_chat_qlora_code_alpaca_e3.py | 184 +++++++++++++++ .../baichuan2_7b_chat_qlora_lawyer_e3.py | 206 +++++++++++++++++ .../baichuan2_7b_chat_qlora_oasst1_512_e3.py | 180 +++++++++++++++ .../baichuan2_7b_chat_qlora_oasst1_e3.py | 180 +++++++++++++++ ...aichuan2_7b_chat_qlora_open_platypus_e3.py | 180 +++++++++++++++ 30 files changed, 5676 insertions(+) create mode 100644 xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_enzh_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_enzh_oasst1_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_zh_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_arxiv_gentitle_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_code_alpaca_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_colorist_e5.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_lawyer_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_oasst1_512_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_oasst1_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_open_platypus_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_sql_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_enzh_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_enzh_oasst1_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_zh_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_code_alpaca_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_lawyer_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_oasst1_512_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_oasst1_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_open_platypus_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_enzh_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_enzh_oasst1_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_zh_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_code_alpaca_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_lawyer_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_oasst1_512_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_oasst1_e3.py create mode 100644 xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_open_platypus_e3.py diff --git a/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_e3.py b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_e3.py new file mode 100644 index 000000000..6e657c2ce --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base' + +# Data +alpaca_en_path = 'tatsu-lab/alpaca' +prompt_template = PROMPT_TEMPLATE.alpaca +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_en, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_enzh_e3.py b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_enzh_e3.py new file mode 100644 index 000000000..738bdb411 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_enzh_e3.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import ConcatDataset, process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn, + template_map_fn_factory) +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base' + +# Data +alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese' +alpaca_en_path = 'tatsu-lab/alpaca' +prompt_template = PROMPT_TEMPLATE.alpaca +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +alpaca_zh = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_zh_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_zh_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataset = dict( + type=ConcatDataset, + datasets_cfg=dict(alpaca_en=alpaca_en, alpaca_zh=alpaca_zh)) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_enzh_oasst1_e3.py b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_enzh_oasst1_e3.py new file mode 100644 index 000000000..a716ab5c6 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_enzh_oasst1_e3.py @@ -0,0 +1,211 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import ConcatDataset, process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn, + oasst1_map_fn, template_map_fn_factory) +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base' + +# Data +alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese' +alpaca_en_path = 'tatsu-lab/alpaca' +oasst1_path = 'timdettmers/openassistant-guanaco' +prompt_template = PROMPT_TEMPLATE.alpaca +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +alpaca_zh = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_zh_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_zh_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +oasst1 = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=oasst1_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=oasst1_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataset = dict( + type=ConcatDataset, + datasets_cfg=dict(alpaca_en=alpaca_en, alpaca_zh=alpaca_zh, oasst1=oasst1)) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_zh_e3.py b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_zh_e3.py new file mode 100644 index 000000000..be10bbdbb --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_alpaca_zh_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_zh_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base' + +# Data +alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese' +prompt_template = PROMPT_TEMPLATE.alpaca +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_zh = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_zh_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_zh_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_zh, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_arxiv_gentitle_e3.py b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_arxiv_gentitle_e3.py new file mode 100644 index 000000000..587088594 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_arxiv_gentitle_e3.py @@ -0,0 +1,215 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import arxiv_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base' + +# Data +# 1. Download data from https://kaggle.com/datasets/Cornell-University/arxiv +# 2. Process data by `xtuner preprocess arxiv ${DOWNLOADED_DATA} ./data/arxiv_data.json [optional arguments]` # noqa: E501 +data_path = './data/arxiv_data.json' +prompt_template = PROMPT_TEMPLATE.title +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + ('We present InternLM, a multilingual foundational language ' + 'model with 104B parameters. InternLM is pre-trained on a large ' + 'corpora with 1.6T tokens with a multi-phase progressive ' + 'process, and then fine-tuned to align with human preferences. ' + 'We also developed a training system called Uniscale-LLM for ' + 'efficient large language model training. The evaluation on a ' + 'number of benchmarks shows that InternLM achieves ' + 'state-of-the-art performance in multiple aspects, including ' + 'knowledge understanding, reading comprehension, mathematics, ' + 'and coding. With such well-rounded capabilities, InternLM ' + 'achieves outstanding performances on comprehensive exams, ' + 'including MMLU, AGIEval, C-Eval and GAOKAO-Bench, without ' + 'resorting to external tools. On these benchmarks, InternLM ' + 'not only significantly outperforms open-source models, but ' + 'also obtains superior performance compared to ChatGPT. Also, ' + 'InternLM demonstrates excellent capability of understanding ' + 'Chinese language and Chinese culture, which makes it a ' + 'suitable foundation model to support Chinese-oriented language ' + 'applications. This manuscript gives a detailed study of ' + 'our results, with benchmarks and examples across a diverse ' + 'set of knowledge domains and tasks.'), + ('In this work, we develop and release Llama 2, a collection of ' + 'pretrained and fine-tuned large language models (LLMs) ranging ' + 'in scale from 7 billion to 70 billion parameters.\nOur ' + 'fine-tuned LLMs, called LLAMA 2-CHAT, are optimized for ' + 'dialogue use cases. Our models outperform open-source chat ' + 'models on most benchmarks we tested, and based on our human ' + 'evaluations for helpfulness and safety, may be a suitable ' + 'substitute for closedsource models. We provide a detailed ' + 'description of our approach to fine-tuning and safety ' + 'improvements of LLAMA 2-CHAT in order to enable the community ' + 'to build on our work and contribute to the responsible ' + 'development of LLMs.') +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict( + type=load_dataset, path='json', data_files=dict(train=data_path)), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=arxiv_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_code_alpaca_e3.py b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_code_alpaca_e3.py new file mode 100644 index 000000000..1b01ddd23 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_code_alpaca_e3.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import code_alpaca_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base' + +# Data +data_path = 'HuggingFaceH4/CodeAlpaca_20K' +prompt_template = PROMPT_TEMPLATE.coder +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 100 +evaluation_inputs = [ + ('写一个Python函数,将十六进制颜色代码(如#0066ee)转换为对应的' + '红、绿、蓝(RGB)三个颜色分量值,并以元组的形式返回。'), + ('Write a Python function that takes a hexadecimal color code ' + '(e.g., #0066ee) as input and converts it into the corresponding ' + 'red, green, and blue (RGB) color component values.') +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=code_alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_colorist_e5.py b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_colorist_e5.py new file mode 100644 index 000000000..ff015a263 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_colorist_e5.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import colors_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base' + +# Data +data_path = 'burkelibbey/colors' +prompt_template = PROMPT_TEMPLATE.colorist +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 5 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 200 +evaluation_inputs = [ + '请给我一个像天空一样清澈透明的蓝色。', 'Please give me a clear blue like the sky.' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=colors_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_lawyer_e3.py b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_lawyer_e3.py new file mode 100644 index 000000000..7c5da43da --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_lawyer_e3.py @@ -0,0 +1,206 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import ConcatDataset, process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import (crime_kg_assitant_map_fn, + law_reference_map_fn, + template_map_fn_factory) +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base' + +# Data +# download data from https://github.com/LiuHC0428/LAW-GPT +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' +prompt_template = PROMPT_TEMPLATE.lawyer +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = ['请问离婚需要准备什么材料?', '销售鳄鱼皮包违法吗?'] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +crime_kg_assitant = dict( + type=process_hf_dataset, + dataset=dict( + type=load_dataset, + path='json', + data_files=dict(train=crime_kg_assitant_path)), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=crime_kg_assitant_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +law_reference_data = dict( + type=process_hf_dataset, + dataset=dict( + type=load_dataset, + path='json', + data_files=dict(train=law_reference_data_path)), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=law_reference_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataset = dict( + type=ConcatDataset, + datasets_cfg=dict( + crime_kg_assitant=crime_kg_assitant, + law_reference_data=law_reference_data)) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_oasst1_512_e3.py b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_oasst1_512_e3.py new file mode 100644 index 000000000..eb604b8e9 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_oasst1_512_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base' + +# Data +data_path = 'timdettmers/openassistant-guanaco' +prompt_template = PROMPT_TEMPLATE.openassistant +max_length = 512 +pack_to_max_length = False + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=oasst1_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_oasst1_e3.py b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_oasst1_e3.py new file mode 100644 index 000000000..71537289d --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_oasst1_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base' + +# Data +data_path = 'timdettmers/openassistant-guanaco' +prompt_template = PROMPT_TEMPLATE.openassistant +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=oasst1_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_open_platypus_e3.py b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_open_platypus_e3.py new file mode 100644 index 000000000..9a9bfa90d --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_open_platypus_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base' + +# Data +data_path = 'garage-bAInd/Open-Platypus' +prompt_template = PROMPT_TEMPLATE.alpaca +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_sql_e3.py b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_sql_e3.py new file mode 100644 index 000000000..f4b51853c --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_base/baichuan2_13b_base_qlora_sql_e3.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import sql_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Base' + +# Data +data_path = 'b-mc2/sql-create-context' +prompt_template = PROMPT_TEMPLATE.sql +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + ('CREATE TABLE station (name VARCHAR, lat VARCHAR, city VARCHAR)\n' + 'Find the name, latitude, and city of stations with latitude ' + 'above 50.'), + ('CREATE TABLE weather (zip_code VARCHAR, mean_visibility_miles ' + 'INTEGER)\n找到mean_visibility_miles最大的zip_code。') +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=sql_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_e3.py b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_e3.py new file mode 100644 index 000000000..48f676012 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat' + +# Data +alpaca_en_path = 'tatsu-lab/alpaca' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_en, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_enzh_e3.py b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_enzh_e3.py new file mode 100644 index 000000000..d8e3eb546 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_enzh_e3.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import ConcatDataset, process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn, + template_map_fn_factory) +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat' + +# Data +alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese' +alpaca_en_path = 'tatsu-lab/alpaca' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +alpaca_zh = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_zh_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_zh_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataset = dict( + type=ConcatDataset, + datasets_cfg=dict(alpaca_en=alpaca_en, alpaca_zh=alpaca_zh)) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_enzh_oasst1_e3.py b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_enzh_oasst1_e3.py new file mode 100644 index 000000000..c4f9d7470 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_enzh_oasst1_e3.py @@ -0,0 +1,211 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import ConcatDataset, process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn, + oasst1_map_fn, template_map_fn_factory) +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat' + +# Data +alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese' +alpaca_en_path = 'tatsu-lab/alpaca' +oasst1_path = 'timdettmers/openassistant-guanaco' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +alpaca_zh = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_zh_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_zh_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +oasst1 = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=oasst1_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=oasst1_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataset = dict( + type=ConcatDataset, + datasets_cfg=dict(alpaca_en=alpaca_en, alpaca_zh=alpaca_zh, oasst1=oasst1)) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_zh_e3.py b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_zh_e3.py new file mode 100644 index 000000000..84560a36f --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_alpaca_zh_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_zh_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat' + +# Data +alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_zh = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_zh_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_zh_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_zh, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_code_alpaca_e3.py b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_code_alpaca_e3.py new file mode 100644 index 000000000..17f863450 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_code_alpaca_e3.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import code_alpaca_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat' + +# Data +data_path = 'HuggingFaceH4/CodeAlpaca_20K' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 100 +evaluation_inputs = [ + ('写一个Python函数,将十六进制颜色代码(如#0066ee)转换为对应的' + '红、绿、蓝(RGB)三个颜色分量值,并以元组的形式返回。'), + ('Write a Python function that takes a hexadecimal color code ' + '(e.g., #0066ee) as input and converts it into the corresponding ' + 'red, green, and blue (RGB) color component values.') +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=code_alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_lawyer_e3.py b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_lawyer_e3.py new file mode 100644 index 000000000..ea3d6996c --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_lawyer_e3.py @@ -0,0 +1,206 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import ConcatDataset, process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import (crime_kg_assitant_map_fn, + law_reference_map_fn, + template_map_fn_factory) +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat' + +# Data +# download data from https://github.com/LiuHC0428/LAW-GPT +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = ['请问离婚需要准备什么材料?', '销售鳄鱼皮包违法吗?'] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +crime_kg_assitant = dict( + type=process_hf_dataset, + dataset=dict( + type=load_dataset, + path='json', + data_files=dict(train=crime_kg_assitant_path)), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=crime_kg_assitant_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +law_reference_data = dict( + type=process_hf_dataset, + dataset=dict( + type=load_dataset, + path='json', + data_files=dict(train=law_reference_data_path)), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=law_reference_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataset = dict( + type=ConcatDataset, + datasets_cfg=dict( + crime_kg_assitant=crime_kg_assitant, + law_reference_data=law_reference_data)) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_oasst1_512_e3.py b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_oasst1_512_e3.py new file mode 100644 index 000000000..97bb8993c --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_oasst1_512_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat' + +# Data +data_path = 'timdettmers/openassistant-guanaco' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 512 +pack_to_max_length = False + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=oasst1_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_oasst1_e3.py b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_oasst1_e3.py new file mode 100644 index 000000000..b7ad395c4 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_oasst1_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat' + +# Data +data_path = 'timdettmers/openassistant-guanaco' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=oasst1_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_open_platypus_e3.py b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_open_platypus_e3.py new file mode 100644 index 000000000..5d15e186a --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_13b_chat/baichuan2_13b_chat_qlora_open_platypus_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-13B-Chat' + +# Data +data_path = 'garage-bAInd/Open-Platypus' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_e3.py b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_e3.py new file mode 100644 index 000000000..4c6e309e3 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Chat' + +# Data +alpaca_en_path = 'tatsu-lab/alpaca' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_en, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_enzh_e3.py b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_enzh_e3.py new file mode 100644 index 000000000..cac420819 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_enzh_e3.py @@ -0,0 +1,198 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import ConcatDataset, process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn, + template_map_fn_factory) +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Chat' + +# Data +alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese' +alpaca_en_path = 'tatsu-lab/alpaca' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +alpaca_zh = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_zh_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_zh_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataset = dict( + type=ConcatDataset, + datasets_cfg=dict(alpaca_en=alpaca_en, alpaca_zh=alpaca_zh)) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_enzh_oasst1_e3.py b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_enzh_oasst1_e3.py new file mode 100644 index 000000000..16cb2396c --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_enzh_oasst1_e3.py @@ -0,0 +1,211 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import ConcatDataset, process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import (alpaca_map_fn, alpaca_zh_map_fn, + oasst1_map_fn, template_map_fn_factory) +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Chat' + +# Data +alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese' +alpaca_en_path = 'tatsu-lab/alpaca' +oasst1_path = 'timdettmers/openassistant-guanaco' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_en = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_en_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +alpaca_zh = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_zh_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_zh_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +oasst1 = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=oasst1_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=oasst1_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataset = dict( + type=ConcatDataset, + datasets_cfg=dict(alpaca_en=alpaca_en, alpaca_zh=alpaca_zh, oasst1=oasst1)) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_zh_e3.py b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_zh_e3.py new file mode 100644 index 000000000..1e91bb7f3 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_alpaca_zh_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_zh_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Chat' + +# Data +alpaca_zh_path = 'silk-road/alpaca-data-gpt4-chinese' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +alpaca_zh = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=alpaca_zh_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_zh_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=alpaca_zh, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_code_alpaca_e3.py b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_code_alpaca_e3.py new file mode 100644 index 000000000..8d29890df --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_code_alpaca_e3.py @@ -0,0 +1,184 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import code_alpaca_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Chat' + +# Data +data_path = 'HuggingFaceH4/CodeAlpaca_20K' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 100 +evaluation_inputs = [ + ('写一个Python函数,将十六进制颜色代码(如#0066ee)转换为对应的' + '红、绿、蓝(RGB)三个颜色分量值,并以元组的形式返回。'), + ('Write a Python function that takes a hexadecimal color code ' + '(e.g., #0066ee) as input and converts it into the corresponding ' + 'red, green, and blue (RGB) color component values.') +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=code_alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_lawyer_e3.py b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_lawyer_e3.py new file mode 100644 index 000000000..387aa9d79 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_lawyer_e3.py @@ -0,0 +1,206 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import ConcatDataset, process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import (crime_kg_assitant_map_fn, + law_reference_map_fn, + template_map_fn_factory) +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Chat' + +# Data +# download data from https://github.com/LiuHC0428/LAW-GPT +crime_kg_assitant_path = './data/CrimeKgAssitant清洗后_52k.json' +law_reference_data_path = './data/训练数据_带法律依据_92k.json' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = ['请问离婚需要准备什么材料?', '销售鳄鱼皮包违法吗?'] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +crime_kg_assitant = dict( + type=process_hf_dataset, + dataset=dict( + type=load_dataset, + path='json', + data_files=dict(train=crime_kg_assitant_path)), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=crime_kg_assitant_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +law_reference_data = dict( + type=process_hf_dataset, + dataset=dict( + type=load_dataset, + path='json', + data_files=dict(train=law_reference_data_path)), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=law_reference_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataset = dict( + type=ConcatDataset, + datasets_cfg=dict( + crime_kg_assitant=crime_kg_assitant, + law_reference_data=law_reference_data)) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_oasst1_512_e3.py b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_oasst1_512_e3.py new file mode 100644 index 000000000..aafc7a093 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_oasst1_512_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Chat' + +# Data +data_path = 'timdettmers/openassistant-guanaco' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 512 +pack_to_max_length = False + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=oasst1_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_oasst1_e3.py b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_oasst1_e3.py new file mode 100644 index 000000000..bbde74682 --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_oasst1_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import oasst1_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Chat' + +# Data +data_path = 'timdettmers/openassistant-guanaco' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=oasst1_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) diff --git a/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_open_platypus_e3.py b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_open_platypus_e3.py new file mode 100644 index 000000000..a7483b9da --- /dev/null +++ b/xtuner/configs/baichuan/baichuan2_7b_chat/baichuan2_7b_chat_qlora_open_platypus_e3.py @@ -0,0 +1,180 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import torch +from bitsandbytes.optim import PagedAdamW32bit +from datasets import load_dataset +from mmengine.dataset import DefaultSampler +from mmengine.hooks import (CheckpointHook, DistSamplerSeedHook, IterTimerHook, + LoggerHook, ParamSchedulerHook) +from mmengine.optim import AmpOptimWrapper, CosineAnnealingLR +from peft import LoraConfig +from transformers import (AutoModelForCausalLM, AutoTokenizer, + BitsAndBytesConfig) + +from xtuner.dataset import process_hf_dataset +from xtuner.dataset.collate_fns import default_collate_fn +from xtuner.dataset.map_fns import alpaca_map_fn, template_map_fn_factory +from xtuner.engine import DatasetInfoHook, EvaluateChatHook +from xtuner.model import SupervisedFinetune +from xtuner.utils import PROMPT_TEMPLATE + +####################################################################### +# PART 1 Settings # +####################################################################### +# Model +pretrained_model_name_or_path = 'baichuan-inc/Baichuan2-7B-Chat' + +# Data +data_path = 'garage-bAInd/Open-Platypus' +prompt_template = PROMPT_TEMPLATE.baichuan2_chat +max_length = 2048 +pack_to_max_length = True + +# Scheduler & Optimizer +batch_size = 1 # per_device +accumulative_counts = 16 +dataloader_num_workers = 0 +max_epochs = 3 +optim_type = PagedAdamW32bit +lr = 2e-4 +betas = (0.9, 0.999) +weight_decay = 0 +max_norm = 1 # grad clip + +# Evaluate the generation performance during the training +evaluation_freq = 500 +evaluation_inputs = [ + '请给我介绍五个上海的景点', 'Please tell me five scenic spots in Shanghai' +] + +####################################################################### +# PART 2 Model & Tokenizer # +####################################################################### +tokenizer = dict( + type=AutoTokenizer.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + padding_side='right') + +model = dict( + type=SupervisedFinetune, + llm=dict( + type=AutoModelForCausalLM.from_pretrained, + pretrained_model_name_or_path=pretrained_model_name_or_path, + trust_remote_code=True, + torch_dtype=torch.float16, + quantization_config=dict( + type=BitsAndBytesConfig, + load_in_4bit=True, + load_in_8bit=False, + llm_int8_threshold=6.0, + llm_int8_has_fp16_weight=False, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4')), + lora=dict( + type=LoraConfig, + r=64, + lora_alpha=16, + lora_dropout=0.1, + bias='none', + task_type='CAUSAL_LM')) + +####################################################################### +# PART 3 Dataset & Dataloader # +####################################################################### +train_dataset = dict( + type=process_hf_dataset, + dataset=dict(type=load_dataset, path=data_path), + tokenizer=tokenizer, + max_length=max_length, + dataset_map_fn=alpaca_map_fn, + template_map_fn=dict( + type=template_map_fn_factory, template=prompt_template), + remove_unused_columns=True, + shuffle_before_pack=True, + pack_to_max_length=pack_to_max_length) + +train_dataloader = dict( + batch_size=batch_size, + num_workers=dataloader_num_workers, + dataset=train_dataset, + sampler=dict(type=DefaultSampler, shuffle=True), + collate_fn=dict(type=default_collate_fn)) + +####################################################################### +# PART 4 Scheduler & Optimizer # +####################################################################### +# optimizer +optim_wrapper = dict( + type=AmpOptimWrapper, + optimizer=dict( + type=optim_type, lr=lr, betas=betas, weight_decay=weight_decay), + clip_grad=dict(max_norm=max_norm, error_if_nonfinite=False), + accumulative_counts=accumulative_counts, + loss_scale='dynamic', + dtype='float16') + +# learning policy +# More information: https://github.com/open-mmlab/mmengine/blob/main/docs/en/tutorials/param_scheduler.md # noqa: E501 +param_scheduler = dict( + type=CosineAnnealingLR, + eta_min=lr * 0.1, + by_epoch=True, + T_max=max_epochs, + convert_to_iter_based=True) + +# train, val, test setting +train_cfg = dict(by_epoch=True, max_epochs=max_epochs, val_interval=1) + +####################################################################### +# PART 5 Runtime # +####################################################################### +# Log the dialogue periodically during the training process, optional +custom_hooks = [ + dict(type=DatasetInfoHook, tokenizer=tokenizer), + dict( + type=EvaluateChatHook, + tokenizer=tokenizer, + every_n_iters=evaluation_freq, + evaluation_inputs=evaluation_inputs, + instruction=prompt_template.INSTRUCTION_START) +] + +# configure default hooks +default_hooks = dict( + # record the time of every iteration. + timer=dict(type=IterTimerHook), + # print log every 100 iterations. + logger=dict(type=LoggerHook, interval=10), + # enable the parameter scheduler. + param_scheduler=dict(type=ParamSchedulerHook), + # save checkpoint per epoch. + checkpoint=dict(type=CheckpointHook, interval=1), + # set sampler seed in distributed evrionment. + sampler_seed=dict(type=DistSamplerSeedHook), +) + +# configure environment +env_cfg = dict( + # whether to enable cudnn benchmark + cudnn_benchmark=False, + # set multi process parameters + mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0), + # set distributed parameters + dist_cfg=dict(backend='nccl'), +) + +# set visualizer +visualizer = None + +# set log level +log_level = 'INFO' + +# load from which checkpoint +load_from = None + +# whether to resume training from the loaded checkpoint +resume = False + +# Defaults to use random seed and disable `deterministic` +randomness = dict(seed=None, deterministic=False) From 87746b9d72d0c0e8d6b2798408e1da66141e0d61 Mon Sep 17 00:00:00 2001 From: LZHgrla Date: Wed, 6 Sep 2023 18:49:33 +0800 Subject: [PATCH 2/5] update readme --- README.md | 3 +++ README_zh-CN.md | 3 +++ 2 files changed, 6 insertions(+) diff --git a/README.md b/README.md index c08a1eb3c..a07a4f5bd 100644 --- a/README.md +++ b/README.md @@ -83,6 +83,9 @@ XTuner is a toolkit for efficiently fine-tuning LLM, developed by the [MMRazor](
  • Baichuan-13B-Base
  • Baichuan-13B-Chat
  • Baichuan2-7B-Base
  • +
  • Baichuan2-7B-Chat
  • +
  • Baichuan2-13B-Base
  • +
  • Baichuan2-13B-Chat
  • ...
  • diff --git a/README_zh-CN.md b/README_zh-CN.md index 2bba9ad54..20b2fd6cb 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -83,6 +83,9 @@ XTuner 是一个轻量级微调大语言模型的工具库,由 [MMRazor](https
  • Baichuan-13B-Base
  • Baichuan-13B-Chat
  • Baichuan2-7B-Base
  • +
  • Baichuan2-7B-Chat
  • +
  • Baichuan2-13B-Base
  • +
  • Baichuan2-13B-Chat
  • ...
  • From b405117dd39c26e50ec190e447a984861e755e4a Mon Sep 17 00:00:00 2001 From: Zhihao Lin <36994684+LZHgrla@users.noreply.github.com> Date: Wed, 6 Sep 2023 18:53:08 +0800 Subject: [PATCH 3/5] Update README.md --- README.md | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a07a4f5bd..1783d517a 100644 --- a/README.md +++ b/README.md @@ -72,20 +72,12 @@ XTuner is a toolkit for efficiently fine-tuning LLM, developed by the [MMRazor]( From 8d3e8b03ee5f7cc2a8775981ad0dc6de6f30f4fb Mon Sep 17 00:00:00 2001 From: Zhihao Lin <36994684+LZHgrla@users.noreply.github.com> Date: Wed, 6 Sep 2023 18:54:00 +0800 Subject: [PATCH 4/5] Update README_zh-CN.md --- README_zh-CN.md | 12 ++---------- 1 file changed, 2 insertions(+), 10 deletions(-) diff --git a/README_zh-CN.md b/README_zh-CN.md index 20b2fd6cb..443e72beb 100644 --- a/README_zh-CN.md +++ b/README_zh-CN.md @@ -72,20 +72,12 @@ XTuner 是一个轻量级微调大语言模型的工具库,由 [MMRazor](https From 5330591f332a2d3570dd116d3023464eda8d1c8a Mon Sep 17 00:00:00 2001 From: LZHgrla Date: Thu, 7 Sep 2023 13:20:22 +0800 Subject: [PATCH 5/5] add dispatch modules for baichuan flash_attn and norm_head --- xtuner/model/fast_forward/__init__.py | 46 ------- xtuner/model/modules/__init__.py | 5 + xtuner/model/modules/baichuan.py | 118 ++++++++++++++++++ xtuner/model/modules/dispatch.py | 82 ++++++++++++ .../internlm_attn.py => modules/internlm.py} | 0 xtuner/model/modules/internlm_attn.py | 77 ++++++++++++ .../llama_attn.py => modules/llama.py} | 0 xtuner/model/sft.py | 12 +- xtuner/tools/train.py | 5 +- 9 files changed, 287 insertions(+), 58 deletions(-) delete mode 100644 xtuner/model/fast_forward/__init__.py create mode 100644 xtuner/model/modules/__init__.py create mode 100644 xtuner/model/modules/baichuan.py create mode 100644 xtuner/model/modules/dispatch.py rename xtuner/model/{fast_forward/internlm_attn.py => modules/internlm.py} (100%) create mode 100644 xtuner/model/modules/internlm_attn.py rename xtuner/model/{fast_forward/llama_attn.py => modules/llama.py} (100%) diff --git a/xtuner/model/fast_forward/__init__.py b/xtuner/model/fast_forward/__init__.py deleted file mode 100644 index 86d5fb426..000000000 --- a/xtuner/model/fast_forward/__init__.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) OpenMMLab. All rights reserved. -import types -import warnings - -import torch -from mmengine import print_log -from mmengine.utils import digit_version - -from .internlm_attn import internlm_attn_forward -from .llama_attn import llama_attn_forward - - -def dispatch_llama_attn_forward(model): - if digit_version(torch.__version__) < digit_version('2.0.0'): - # flash attention is only supported after pytorch2.0 - return - print_log('dispatch llama attn forward', 'current') - warnings.warn( - 'Due to the implementation of the PyTorch version of ' - 'flash attention, even when the `output_attentions` flag is set to ' - 'True, it is not possible to return the `attn_weights`.') - for module in model.modules(): - if type(module).__name__ == 'LlamaAttention': - module.forward = types.MethodType(llama_attn_forward, module) - - -def dispatch_internlm_attn_forward(model): - if digit_version(torch.__version__) < digit_version('2.0.0'): - # flash attention is only supported after pytorch2.0 - return - print_log('dispatch internlm attn forward', 'current') - warnings.warn( - 'Due to the implementation of the PyTorch version of ' - 'flash attention, even when the `output_attentions` flag is set to ' - 'True, it is not possible to return the `attn_weights`.') - for module in model.modules(): - if type(module).__name__ == 'InternLMAttention': - module.forward = types.MethodType(internlm_attn_forward, module) - - -def dispatch_fast_forward(model): - dispatch_llama_attn_forward(model) - dispatch_internlm_attn_forward(model) - - -__all__ = ['dispatch_fast_forward'] diff --git a/xtuner/model/modules/__init__.py b/xtuner/model/modules/__init__.py new file mode 100644 index 000000000..0c41bc597 --- /dev/null +++ b/xtuner/model/modules/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) OpenMMLab. All rights reserved. + +from .dispatch import dispatch_modules + +__all__ = ['dispatch_modules'] diff --git a/xtuner/model/modules/baichuan.py b/xtuner/model/modules/baichuan.py new file mode 100644 index 000000000..738c49869 --- /dev/null +++ b/xtuner/model/modules/baichuan.py @@ -0,0 +1,118 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def baichuan2_norm_head_forward(self, hidden_states): + norm_weight = nn.functional.normalize(self.weight) + return nn.functional.linear(hidden_states, norm_weight) + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos_, sin_, position_ids): + cos = cos_.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin_.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q.float() * cos) + (rotate_half(q.float()) * sin) + k_embed = (k.float() * cos) + (rotate_half(k.float()) * sin) + return q_embed.to(q.dtype), k_embed.to(k.dtype) + + +def baichuan_7b_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], + Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + proj = self.W_pack(hidden_states) + proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose( + 0, -2).squeeze(-2) + query_states = proj[0].view(bsz, q_len, self.num_heads, + self.head_dim).transpose(1, 2) + key_states = proj[1].view(bsz, q_len, self.num_heads, + self.head_dim).transpose(1, 2) + value_states = proj[2].view(bsz, q_len, self.num_heads, + self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, + cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + attn_output = F.scaled_dot_product_attention( + query_states, key_states, value_states, attn_mask=attention_mask) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + return attn_output, None, past_key_value + + +def baichuan_13b_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], + Optional[Tuple[torch.Tensor]]]: + bsz, q_len, _ = hidden_states.size() + + proj = self.W_pack(hidden_states) + proj = proj.unflatten(-1, (3, self.hidden_size)).unsqueeze(0).transpose( + 0, -2).squeeze(-2) + query_states = proj[0].view(bsz, q_len, self.num_heads, + self.head_dim).transpose(1, 2) + key_states = proj[1].view(bsz, q_len, self.num_heads, + self.head_dim).transpose(1, 2) + value_states = proj[2].view(bsz, q_len, self.num_heads, + self.head_dim).transpose(1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + if attention_mask is not None: + if q_len == 1: # inference with cache + if len(attention_mask.size()) == 4: + attention_mask = attention_mask[:, :, -1:, :] + else: + attention_mask = attention_mask[:, -1:, :] + attn_output = F.scaled_dot_product_attention( + query_states, key_states, value_states, attn_mask=attention_mask) + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + attn_output = self.o_proj(attn_output) + + return attn_output, None, past_key_value diff --git a/xtuner/model/modules/dispatch.py b/xtuner/model/modules/dispatch.py new file mode 100644 index 000000000..67111872a --- /dev/null +++ b/xtuner/model/modules/dispatch.py @@ -0,0 +1,82 @@ +# Copyright (c) OpenMMLab. All rights reserved. +import logging +import types + +import torch +from mmengine import print_log +from mmengine.utils import digit_version + +from .baichuan import (baichuan2_norm_head_forward, baichuan_7b_attn_forward, + baichuan_13b_attn_forward) +from .internlm import internlm_attn_forward +from .llama import llama_attn_forward + +NO_ATTN_WEIGHTS_MSG = ( + 'Due to the implementation of the PyTorch version of flash attention, ' + 'even when the `output_attentions` flag is set to True, it is not ' + 'possible to return the `attn_weights`.') + + +def dispatch_llama_attn_forward(model): + if digit_version(torch.__version__) < digit_version('2.0.0'): + # flash attention is only supported after pytorch2.0 + return + print_log('dispatch llama attn forward', 'current') + print_log(NO_ATTN_WEIGHTS_MSG, 'current', logging.WARNING) + for module in model.modules(): + if type(module).__name__ == 'LlamaAttention': + module.forward = types.MethodType(llama_attn_forward, module) + + +def dispatch_internlm_attn_forward(model): + if digit_version(torch.__version__) < digit_version('2.0.0'): + # flash attention is only supported after pytorch2.0 + return + print_log('dispatch internlm attn forward', 'current') + print_log(NO_ATTN_WEIGHTS_MSG, 'current', logging.WARNING) + for module in model.modules(): + if type(module).__name__ == 'InternLMAttention': + module.forward = types.MethodType(internlm_attn_forward, module) + + +def dispath_baichuan2_norm_head_forward(model): + print_log('dispatch baichuan2 NormHead forward', 'current') + for module in model.modules(): + if type(module).__name__ == 'NormHead': + module.forward = types.MethodType(baichuan2_norm_head_forward, + module) + + +def dispath_baichuan_7b_attn_forward(model): + if digit_version(torch.__version__) < digit_version('2.0.0'): + # flash attention is only supported after pytorch2.0 + return + print_log('dispatch baichuan2-7B attn forward', 'current') + print_log(NO_ATTN_WEIGHTS_MSG, 'current', logging.WARNING) + for module in model.modules(): + if type(module).__name__ == 'Attention': + module.forward = types.MethodType(baichuan_7b_attn_forward, module) + + +def dispath_baichuan_13b_attn_forward(model): + if digit_version(torch.__version__) < digit_version('2.0.0'): + # flash attention is only supported after pytorch2.0 + return + print_log('dispatch baichuan2-13B attn forward', 'current') + print_log(NO_ATTN_WEIGHTS_MSG, 'current', logging.WARNING) + for module in model.modules(): + if type(module).__name__ == 'BaichuanAttention': + module.forward = types.MethodType(baichuan_13b_attn_forward, + module) + + +def dispatch_modules(model): + model_name = model.__class__.__name__.lower() + if 'internlm' in model_name: + dispatch_internlm_attn_forward(model) + if 'llama' in model_name: + dispatch_llama_attn_forward(model) + if 'baichuan' in model_name: + dispath_baichuan2_norm_head_forward(model) + dispath_baichuan_7b_attn_forward(model) + dispath_baichuan_13b_attn_forward(model) diff --git a/xtuner/model/fast_forward/internlm_attn.py b/xtuner/model/modules/internlm.py similarity index 100% rename from xtuner/model/fast_forward/internlm_attn.py rename to xtuner/model/modules/internlm.py diff --git a/xtuner/model/modules/internlm_attn.py b/xtuner/model/modules/internlm_attn.py new file mode 100644 index 000000000..438ba2d89 --- /dev/null +++ b/xtuner/model/modules/internlm_attn.py @@ -0,0 +1,77 @@ +# Copyright (c) OpenMMLab. All rights reserved. +from typing import Optional, Tuple + +import torch +import torch.nn.functional as F + + +def rotate_half(x): + """Rotates half the hidden dims of the input.""" + x1 = x[..., :x.shape[-1] // 2] + x2 = x[..., x.shape[-1] // 2:] + return torch.cat((-x2, x1), dim=-1) + + +def apply_rotary_pos_emb(q, k, cos, sin, position_ids): + # The first two dimensions of cos and sin are always 1, so we can + # `squeeze` them. + cos = cos.squeeze(1).squeeze(0) # [seq_len, dim] + sin = sin.squeeze(1).squeeze(0) # [seq_len, dim] + cos = cos[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + sin = sin[position_ids].unsqueeze(1) # [bs, 1, seq_len, dim] + q_embed = (q * cos) + (rotate_half(q) * sin) + k_embed = (k * cos) + (rotate_half(k) * sin) + return q_embed, k_embed + + +def internlm_attn_forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_value: Optional[Tuple[torch.Tensor]] = None, + output_attentions: bool = False, + use_cache: bool = False, +) -> Tuple[torch.Tensor, Optional[torch.Tensor], + Optional[Tuple[torch.Tensor]]]: + # Modified from https://huggingface.co/internlm/internlm-7b/blob/939a68c0dc1bd5f35b63c87d44af05ce33379061/modeling_internlm.py#L161 # noqa:E501 + bsz, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, + self.head_dim).transpose( + 1, 2) + key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, + self.head_dim).transpose( + 1, 2) + value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, + self.head_dim).transpose( + 1, 2) + + kv_seq_len = key_states.shape[-2] + if past_key_value is not None: + kv_seq_len += past_key_value[0].shape[-2] + cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len) + query_states, key_states = apply_rotary_pos_emb(query_states, key_states, + cos, sin, position_ids) + # [bsz, nh, t, hd] + + if past_key_value is not None: + # reuse k, v, self_attention + key_states = torch.cat([past_key_value[0], key_states], dim=2) + value_states = torch.cat([past_key_value[1], value_states], dim=2) + + past_key_value = (key_states, value_states) if use_cache else None + + # use flash attention implemented by pytorch + attn_output = F.scaled_dot_product_attention( + query_states, key_states, value_states, attn_mask=attention_mask) + + attn_output = attn_output.transpose(1, 2) + attn_output = attn_output.reshape(bsz, q_len, self.hidden_size) + + attn_output = self.o_proj(attn_output) + + # Due to the implementation of the PyTorch version of flash attention, + # even when the output_attentions flag is set to True, it is not possible + # to return the attn_weights. + return attn_output, None, past_key_value diff --git a/xtuner/model/fast_forward/llama_attn.py b/xtuner/model/modules/llama.py similarity index 100% rename from xtuner/model/fast_forward/llama_attn.py rename to xtuner/model/modules/llama.py diff --git a/xtuner/model/sft.py b/xtuner/model/sft.py index ea3a95679..55f21fe7e 100644 --- a/xtuner/model/sft.py +++ b/xtuner/model/sft.py @@ -8,7 +8,7 @@ from torch import nn from xtuner.registry import BUILDER -from .fast_forward import dispatch_fast_forward +from .modules import dispatch_modules from .utils import LoadWoInit, find_all_linear_names, traverse_dict @@ -23,6 +23,7 @@ def __init__(self, with LoadWoInit(): self.llm = self._build_from_cfg_or_module(llm) self.llm.config.use_cache = False + dispatch_modules(self.llm) if isinstance(lora, dict) or isinstance(lora, Config) or isinstance( lora, ConfigDict): @@ -33,13 +34,6 @@ def __init__(self, self.use_lora = lora is not None if self.use_lora: self._prepare_for_lora(peft_model, use_gradient_checkpointing) - try: - # for BaiChuan2, set first_flag to False to disable weight init - if self.llm.base_model.model.__class__.__name__.lower( - ) == 'BaichuanForCausalLM'.lower(): - self.llm.base_model.model.lm_head.first_flag = False - except Exception: - pass elif use_gradient_checkpointing: # For backward compatibility if hasattr(self.llm, 'enable_input_require_grads'): @@ -55,8 +49,6 @@ def make_inputs_require_grad(module, input, output): # enable gradient checkpointing for memory efficiency self.llm.gradient_checkpointing_enable() - dispatch_fast_forward(self.llm) - self._is_init = True def _prepare_for_lora(self, diff --git a/xtuner/tools/train.py b/xtuner/tools/train.py index 88f51affc..52ca9919a 100644 --- a/xtuner/tools/train.py +++ b/xtuner/tools/train.py @@ -15,7 +15,7 @@ from xtuner.configs import cfgs_name_path from xtuner.dataset.collate_fns import default_collate_fn -from xtuner.model.fast_forward import dispatch_fast_forward +from xtuner.model.modules import dispatch_modules from xtuner.model.utils import LoadWoInit, find_all_linear_names, traverse_dict from xtuner.registry import BUILDER @@ -95,6 +95,7 @@ def main(): traverse_dict(cfg.model) model = BUILDER.build(cfg.model) model.config.use_cache = False + dispatch_modules(model) if cfg.get('lora', None): lora = BUILDER.build(cfg.lora) model = prepare_model_for_kbit_training(model) @@ -102,7 +103,7 @@ def main(): modules = find_all_linear_names(model) lora.target_modules = modules model = get_peft_model(model, lora) - dispatch_fast_forward(model) + # build dataset train_dataset = BUILDER.build(cfg.train_dataset) data_collator = partial(default_collate_fn, return_hf_format=True)