In [1]:
from datasets import load_dataset

raw_datasets = load_dataset("Logic123456789/Luotuo-QA-B")
dataset = raw_datasets
raw_datasets

Found cached dataset json (/root/.cache/huggingface/datasets/Logic123456789___json/Logic123456789--Luotuo-QA-B-6abac8029306cc7b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['story', 'questions', 'answers', 'language'],
        num_rows: 31464
    })
})

In [2]:
# filter language is "Chinese"
dataset = dataset.filter(lambda example: example["language"] == "Chinese")
dataset

Loading cached processed dataset at /root/.cache/huggingface/datasets/Logic123456789___json/Logic123456789--Luotuo-QA-B-6abac8029306cc7b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-8b8b992eafd9566b.arrow


DatasetDict({
    train: Dataset({
        features: ['story', 'questions', 'answers', 'language'],
        num_rows: 25836
    })
})

In [3]:
dataset = dataset["train"].train_test_split(train_size=0.9, seed=42)
dataset

Loading cached split indices for dataset at /root/.cache/huggingface/datasets/Logic123456789___json/Logic123456789--Luotuo-QA-B-6abac8029306cc7b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-af775d9e504fc7fe.arrow and /root/.cache/huggingface/datasets/Logic123456789___json/Logic123456789--Luotuo-QA-B-6abac8029306cc7b/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51/cache-6802697e96f2214b.arrow


DatasetDict({
    train: Dataset({
        features: ['story', 'questions', 'answers', 'language'],
        num_rows: 23252
    })
    test: Dataset({
        features: ['story', 'questions', 'answers', 'language'],
        num_rows: 2584
    })
})

In [4]:
from transformers import AutoTokenizer

base_model = "THUDM/chatglm-6b"
base_model_revision = "969290547e761b20fdb96b0602b4fd8d863bbb85"
base_model_revision = None

tokenizer = AutoTokenizer.from_pretrained(
    base_model, 
    revision=base_model_revision, 
    trust_remote_code=True,
    # padding_side="right",
)

Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading (…)enization_chatglm.py:   0%|          | 0.00/17.0k [00:00<?, ?B/s]

In [5]:
import transformers
from transformers import PreTrainedTokenizer

def preprocess(tokenizer: PreTrainedTokenizer, config, context, target, max_seq_length):
    context_tokens = tokenizer.encode(
        context,
        max_length=max_seq_length,
        truncation=True,
    )
    target_tokens = tokenizer.encode(
        target,
        max_length=max_seq_length,
        truncation=True,
        add_special_tokens=False,
    )
    return dict(
        input_ids=context_tokens + target_tokens + [config.eos_token_id],
        seq_len=len(context_tokens),
    )


def get_context_item(story: str, q1, answer):
    if q1 == "" or answer == "":
        return None, None, None
    origin_question = q1
    context = f"""给你下面的文本和问题，请先给出一个对应问题的同义转述，再给出问题的答案。
文本为：{story}
问题为：{origin_question}
"""
    target = f"""答案为：{answer}"""
    return context, origin_question, target

max_seq_length = 1024
skip_overlength = False
device_map = "auto"
config = transformers.AutoConfig.from_pretrained(base_model, trust_remote_code=True, device_map=device_map)
def tokenize_and_split(examples):
    result = {
        "input_ids": [],
        "seq_len": [],
    }
    for i, story in enumerate(examples["story"]):
        for j, (question, answer) in enumerate(zip(examples["questions"][i], examples["answers"][i])):
            # print(i, story, question, answer)
            context, origin_question, target = get_context_item(story, question, answer)
            if context is not None:
                feature = preprocess(tokenizer, config, context, target, max_seq_length)
                if skip_overlength and len(feature["input_ids"]) > max_seq_length:
                    continue
                feature["input_ids"] = feature["input_ids"][:max_seq_length]
                result["input_ids"].append(feature["input_ids"])
                result["seq_len"].append(feature["seq_len"])
    return result

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.


In [6]:
tokenized_dataset = dataset.map(tokenize_and_split, batched=True, num_proc=16, remove_columns=dataset['train'].column_names)
tokenized_dataset

Map (num_proc=16):   0%|          | 0/23252 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/2584 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'seq_len'],
        num_rows: 116260
    })
    test: Dataset({
        features: ['input_ids', 'seq_len'],
        num_rows: 12920
    })
})

In [7]:
tokenized_dataset["train"][10]

{'input_ids': [5,
  65696,
  72751,
  71455,
  63826,
  63963,
  6,
  106748,
  71432,
  63866,
  68823,
  71167,
  64095,
  64899,
  64191,
  69568,
  6,
  63921,
  71432,
  122021,
  63823,
  4,
  71455,
  63834,
  12,
  22,
  63827,
  69311,
  78955,
  72576,
  74387,
  75604,
  67215,
  867,
  63992,
  64616,
  76549,
  116774,
  63826,
  82723,
  66815,
  76549,
  80018,
  6,
  69374,
  64829,
  64276,
  68304,
  63825,
  867,
  63992,
  70974,
  7,
  63827,
  64602,
  65278,
  63839,
  68231,
  64276,
  84352,
  65278,
  68427,
  71131,
  64536,
  6,
  71432,
  68659,
  65646,
  63833,
  17,
  65520,
  64166,
  17,
  66076,
  93964,
  103725,
  64638,
  65278,
  64536,
  76968,
  68486,
  65461,
  6,
  64344,
  75511,
  66156,
  76549,
  70424,
  65690,
  66815,
  76549,
  66905,
  6,
  78930,
  66459,
  78368,
  66385,
  114895,
  67110,
  64398,
  64285,
  7,
  65951,
  67124,
  6,
  64030,
  64113,
  108580,
  67110,
  72576,
  74387,
  63833,
  100630,
  64179,
  7,
  22,
  4

In [8]:
import torch
import peft
from peft import LoraConfig, TaskType, get_peft_model
from transformers import AutoModel, AutoConfig
import torch.nn as nn

class CastOutputToFloat(nn.Sequential):
    def forward(self, x):
        return super().forward(x).to(torch.float32)

def load_train_model(model_path, lora_rank, model_revision: str = None, cache_dir: str = None, device_map: str=None, ddp: bool = False, load_in_8bit: bool = False):
    # init model
    model = AutoModel.from_pretrained(
        model_path, trust_remote_code=True, device_map=device_map, revision=model_revision, cache_dir = cache_dir
    )
    model.gradient_checkpointing_enable()
    model.enable_input_require_grads()
    model.is_parallelizable = True
    model.model_parallel = True
    model.lm_head = CastOutputToFloat(model.lm_head)
    model.config.use_cache = (
        False  # silence the warnings. Please re-enable for inference!
    )
    # setup peft
    peft_config = LoraConfig(
        task_type=TaskType.CAUSAL_LM,
        inference_mode=False,
        r=lora_rank,
        lora_alpha=32,
        lora_dropout=0.1,
    )
    model = get_peft_model(model, peft_config)
    return model



Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues
CUDA SETUP: CUDA runtime path found: /usr/local/cuda-11.7/lib64/libcudart.so
CUDA SETUP: Highest compute capability among GPUs detected: 8.0
CUDA SETUP: Detected CUDA version 117
CUDA SETUP: Loading binary /usr/local/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...


In [9]:
def data_collator(tokenizer, features: list, to_device = None) -> dict:
    len_ids = [len(feature["input_ids"]) for feature in features]
    longest = max(len_ids)
    input_ids = []
    labels_list = []
    for ids_l, feature in sorted(zip(len_ids, features), key=lambda x: -x[0]):
        ids = feature["input_ids"]
        seq_len = feature["seq_len"]
        labels = (
            [-100] * (seq_len - 1) + ids[(seq_len - 1) :] + [-100] * (longest - ids_l)
        )
        ids = ids + [tokenizer.pad_token_id] * (longest - ids_l)
        _ids = torch.LongTensor(ids)
        labels_list.append(torch.LongTensor(labels))
        input_ids.append(_ids)
    input_ids = torch.stack(input_ids)
    labels = torch.stack(labels_list)
    if to_device:
        input_ids = input_ids.to(to_device)
        labels = labels.to(to_device)
    return {
        "input_ids": input_ids,
        "labels": labels,
    }

In [10]:
from transformers import Trainer, TrainingArguments
from transformers import Trainer, TrainingArguments, IntervalStrategy
from os import path

per_device_train_batch_size=32
num_train_epochs = 8
max_steps = -1
project_name = "luotuo-qa-b"

output_path = "output/"+project_name+"-"+str("v1")

def get_steps_in_epoch(pos: float):
    res = int(pos * (dataset["train"].num_rows // per_device_train_batch_size))
    return res if res > 0 else 1

model = load_train_model(base_model, 8, model_revision=base_model_revision, device_map=device_map, load_in_8bit=False)

Explicitly passing a `revision` is encouraged when loading a configuration with custom code to ensure no malicious code has been contributed in a newer revision.
Explicitly passing a `revision` is encouraged when loading a model with custom code to ensure no malicious code has been contributed in a newer revision.


Downloading (…)/modeling_chatglm.py:   0%|          | 0.00/57.6k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/8 [00:00<?, ?it/s]

Downloading (…)l-00001-of-00008.bin:   0%|          | 0.00/1.74G [00:00<?, ?B/s]

Downloading (…)l-00002-of-00008.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading (…)l-00003-of-00008.bin:   0%|          | 0.00/1.98G [00:00<?, ?B/s]

Downloading (…)l-00004-of-00008.bin:   0%|          | 0.00/1.91G [00:00<?, ?B/s]

Downloading (…)l-00005-of-00008.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading (…)l-00006-of-00008.bin:   0%|          | 0.00/1.88G [00:00<?, ?B/s]

Downloading (…)l-00007-of-00008.bin:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Downloading (…)l-00008-of-00008.bin:   0%|          | 0.00/1.07G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/8 [00:00<?, ?it/s]

In [11]:
class ModifiedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        return model(
            input_ids=inputs["input_ids"],
            labels=inputs["labels"],
        ).loss

training_args = TrainingArguments(
    output_dir=output_path,
    remove_unused_columns=False,
    # evaluation_strategy=IntervalStrategy.STEPS,
    # eval_steps=get_steps_in_epoch(0.1),
    save_strategy=IntervalStrategy.STEPS,
    save_steps=get_steps_in_epoch(0.1),
    logging_strategy=IntervalStrategy.STEPS,
    logging_steps=get_steps_in_epoch(0.02),
    learning_rate=5e-5,
    per_device_train_batch_size=per_device_train_batch_size,
    # gradient_accumulation_steps=1,
    # per_device_eval_batch_size=per_device_train_batch_size,
    num_train_epochs=num_train_epochs,
    max_steps=max_steps,
    warmup_ratio=0.1,
    # load_best_model_at_end=True,
    # metric_for_best_model="accuracy",
    torch_compile=True,
    dataloader_pin_memory=True,
    dataloader_num_workers=8,
    fp16=True,
)

def inner_data_collator(features: list) -> dict:
    return data_collator(tokenizer, features)

trainer = ModifiedTrainer(
    model=model,
    train_dataset=tokenized_dataset["train"],
    args=training_args,
    data_collator=inner_data_collator,
)

resume_from_checkpoint = None
# resume_from_checkpoint = path.join(output_path, "checkpoint-6410")
trainer.train(resume_from_checkpoint=resume_from_checkpoint)
# save model
model.save_pretrained(output_path)

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mliasece[0m. Use [1m`wandb login --relogin`[0m to force relogin


TorchRuntimeError: 

from user code:
   File "/root/.cache/huggingface/modules/transformers_modules/THUDM/chatglm-6b/1d240ba371910e9282298d4592532d7f0f3e9f3e/modeling_chatglm.py", line 926, in forward
    inputs_embeds = self.word_embeddings(input_ids)

Set torch._dynamo.config.verbose=True for more information


You can suppress this exception and fall back to eager by setting:
    torch._dynamo.config.suppress_errors = True
