# 开源指令微调数据集

## 加载数据集
在配置文件中将 `data_path` 修改为指定的数据集即可：
```python
data_path = 'timdettmers/openassistant-guanaco'
train_dataset = dict(
    type=process_hf_dataset,
    dataset=dict(type=load_dataset, path=data_path),
    ...)
```

## 字段格式
将数据集格式转换为模型需要的格式
```python
from xtuner.dataset.map_fns import oasst1_map_fn
train_dataset = dict(
    type=process_hf_dataset,
    ...
    dataset_map_fn=oasst1_map_fn,
    ...)
```

In [None]:
def oasst1_map_fn(example):
    r"""Convert one OpenAssistant-Guanaco style sample into conversation turns.

    Example before preprocessing:
        example['text'] = ('### Human: Can you explain xxx'
                           '### Assistant: Sure! xxx'
                           '### Human: I didn't understand how xxx'
                           '### Assistant: It has to do with a process xxx.')

    Example after preprocessing:
        example['conversation'] = [
            {
                'input': 'Can you explain xxx',
                'output': 'Sure! xxx'
            },
            {
                'input': 'I didn't understand how xxx',
                'output': 'It has to do with a process xxx.'
            }
        ]

    Args:
        example: Mapping with a ``'text'`` entry holding the whole dialogue
            as a single string, each message introduced by ``### Human:``
            or ``### Assistant:``.

    Returns:
        ``{'conversation': [...]}`` where every item is a dict with the keys
        ``'input'`` and ``'output'``.

    Notes:
        * Only ``###`` immediately followed (ignoring whitespace) by a role
          tag starts a new message. Any other ``###`` -- for instance a
          markdown heading inside an answer -- stays part of the message
          instead of cutting it short.
        * Roles are not validated: messages are paired strictly by position
          (1st with 2nd, 3rd with 4th, ...).
    """
    import re

    # A message boundary is a role tag at the very start of the text, or a
    # role tag preceded by exactly three '#' (a longer run of '#' is treated
    # as ordinary content, e.g. a '####' markdown heading).
    turn_marker = re.compile(r'(?:^|(?<!#)###)\s*(?:Human:|Assistant:)')

    # split() yields [text before the first tag, message 1, message 2, ...];
    # whatever precedes the first role tag belongs to no message.
    segments = turn_marker.split(example['text'].strip())
    messages = [segment.strip() for segment in segments[1:]]

    # zip() stops at the shorter slice, so a trailing input that has no
    # matching output is discarded; that part would be ignored in the loss
    # calculation anyway.
    conversation = [
        {'input': human_msg, 'output': assistant_msg}
        for human_msg, assistant_msg in zip(messages[0::2], messages[1::2])
    ]
    return {'conversation': conversation}

## 训练

单机单卡  
xtuner train ./config.py --deepspeed deepspeed_zero2

单机多卡  
NPROC_PER_NODE=${GPU_NUM} xtuner train ./config.py --deepspeed deepspeed_zero2

### 多机多卡

torchrun

In [None]:
# Execute on node 0. Both nodes use the same NPROC_PER_NODE / NNODES / PORT /
# ADDR values; only NODE_RANK differs between the two commands.
NPROC_PER_NODE=8 NNODES=2 PORT=$PORT ADDR=$NODE_0_ADDR NODE_RANK=0 xtuner train mixtral_8x7b_instruct_full_oasst1_e3 --deepspeed deepspeed_zero2

# Execute on node 1 (same settings, NODE_RANK=1).
NPROC_PER_NODE=8 NNODES=2 PORT=$PORT ADDR=$NODE_0_ADDR NODE_RANK=1 xtuner train mixtral_8x7b_instruct_full_oasst1_e3 --deepspeed deepspeed_zero2

slurm

In [None]:
# Launch through Slurm on partition $PARTITION: 2 nodes, each requesting 8 GPUs
# (--gres=gpu:8) and running 8 tasks (--ntasks-per-node=8).
srun -p $PARTITION --nodes=2 --gres=gpu:8 --ntasks-per-node=8 xtuner train internlm2_chat_7b_qlora_oasst1_e3 --launcher slurm --deepspeed deepspeed_zero2

## 模型转换

In [None]:
# Usage: pass the config (name or path), the .pth checkpoint and the save path.
xtuner convert pth_to_hf ${CONFIG_NAME_OR_PATH} ${PTH} ${SAVE_PATH}
# Example: xtuner convert pth_to_hf ./config.py ./iter_500.pth ./iter_500_hf

## 模型合并

In [None]:
# Usage: pass the base LLM, the adapter path and the save path for the merged model.
xtuner convert merge ${LLM} ${ADAPTER_PATH} ${SAVE_PATH}
# Example: xtuner convert merge internlm/internlm2-chat-7b ./iter_500_hf ./iter_500_merged_llm

## 对话

In [None]:
# General form of the chat command; "[optional arguments]" is a placeholder
# for further flags, not literal syntax.
xtuner chat ${NAME_OR_PATH_TO_LLM} --adapter ${NAME_OR_PATH_TO_ADAPTER} --prompt-template ${PROMPT_TEMPLATE} [optional arguments]

In [None]:
# Chat with the base LLM plus the converted adapter (./iter_500_hf).
# The Hugging Face model ID is "internlm/internlm2-chat-7b" (organisation
# "internlm"), matching the ID used in the merge example.
xtuner chat internlm/internlm2-chat-7b --adapter ./iter_500_hf --prompt-template internlm2_chat
# Chat with the merged model directly; no --adapter is passed.
xtuner chat ./iter_500_merged_llm --prompt-template internlm2_chat