In [1]:
from google.colab import drive
drive.mount('/content/gdrive')
import os
os.chdir('/content/gdrive/MyDrive/instruct-caption')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [4]:
!pip install -r requirements.txt

Collecting accelerate@ git+https://github.com/huggingface/accelerate.git (from -r requirements.txt (line 2))
  Cloning https://github.com/huggingface/accelerate.git to /tmp/pip-install-x_mhvziz/accelerate_70cd1bf203e24524b04013b628e2ccac
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/accelerate.git /tmp/pip-install-x_mhvziz/accelerate_70cd1bf203e24524b04013b628e2ccac
  Resolved https://github.com/huggingface/accelerate.git to commit 2c767338f29989e54ce93b3036b41e02013af7a7
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting transformers@ git+https://github.com/huggingface/transformers.git (from -r requirements.txt (line 5))
  Cloning https://github.com/huggingface/transformers.git to /tmp/pip-install-x_mhvziz/transformers_f88d30b7c46044adaa6ded282a1dc893
  Running command git clone --filter=blob:none --quiet https://git

In [2]:
import json, tqdm, torch, transformers, datasets
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import get_peft_model, LoraConfig, TaskType

In [3]:
def preprocess(tokenizer, config, example, max_seq_length):
    '''
    args:
    tokenizer
    config
    example
    max_seq_length
    returns: include inputs_id 和 seq_len
    '''
    # prompt template
    prompt = "<s>Human: " + example["instruction"] + example["input"] + "\n" + "</s><s>Assistant:"
    target = example["output"]
    # tokenizer, set truncation to be true
    prompt_ids = tokenizer.encode(prompt, max_length=max_seq_length, truncation=True)
    target_ids = tokenizer.encode(
        target,
        max_length=max_seq_length,
        truncation=True,
        add_special_tokens=False)
    # Add EOS
    input_ids = prompt_ids + target_ids + [config.eos_token_id]
    # inputs and sequence_length, labels could be obtained from these two values
    return {"input_ids": input_ids, "seq_len": len(prompt_ids)}

In [4]:
# read json files, return a iterator
def read_jsonl(path, max_seq_length, model_path, skip_overlength=False):
    '''
    args:
    path: training data path
    max_seq_length: max length
    model_path: model path
    returns
    '''
    # Load tokenizer
    tokenizer = transformers.AutoTokenizer.from_pretrained(
        model_path,trust_remote_code=True)
    config = transformers.AutoConfig.from_pretrained(
        model_path, trust_remote_code=True, device_map='auto')
    # Read the file
    with open(path, "r") as f:
        # read jsonl line by line
        lst = [json.loads(line) for line in f.readlines()]
        print("Upload jsonl dataset，there are {} samples in total".format(len(lst)))
        for example in tqdm.tqdm(lst):
            # preprocess
            feature = preprocess(tokenizer, config, example, max_seq_length)
            if skip_overlength and len(feature["input_ids"]) > max_seq_length:
                continue
            # cutoff
            feature["input_ids"] = feature["input_ids"][:max_seq_length]
            # yield iterator
            yield feature

In [5]:
def data_collator(features: list, tokenizer) -> dict:
    len_ids = [len(feature["input_ids"]) for feature in features]
    longest = max(len_ids)
    input_ids = []
    labels_list = []
    for ids_l, feature in sorted(zip(len_ids, features), key=lambda x: -x[0]):
        ids = feature["input_ids"]
        seq_len = feature["seq_len"]
        labels = (
            [-100] * (seq_len - 1) + ids[(seq_len - 1) :] + [-100] * (longest - ids_l)
        )
        ids = ids + [tokenizer.pad_token_id] * (longest - ids_l)
        _ids = torch.LongTensor(ids)
        labels_list.append(torch.LongTensor(labels))
        input_ids.append(_ids)
    input_ids = torch.stack(input_ids)
    labels = torch.stack(labels_list)
    return {
        "input_ids": input_ids,
        "labels": labels,
    }

In [6]:
class ModifiedTrainer(Trainer):

    def compute_loss(self, model, inputs, return_outputs=False):
        return model(
            input_ids=inputs["input_ids"],
            labels=inputs["labels"],
        ).loss

In [8]:
target_modules = ['W_pack', 'down_proj', 'o_proj', 'gate_proj', 'up_proj']
# Lora config
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM, # task
    inference_mode=False, # training mode
    r=2, # Lora rank
    lora_alpha=4, # Lora alaph，
    lora_dropout=0.1,# Dropout
    target_modules= target_modules # Lora layers
)
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)
model_path = "Qwen/Qwen1.5-1.8B-Chat"
dataset_path = "dataset.jsonl"
max_seq_length = 256
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_path, trust_remote_code=True,quantization_config=bnb_config,
    device_map="auto")

model = get_peft_model(model, peft_config)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.67G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/206 [00:00<?, ?B/s]

In [9]:
print(model)

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 2048)
        (layers): ModuleList(
          (0-23): 24 x Qwen2DecoderLayer(
            (self_attn): Qwen2SdpaAttention(
              (q_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
              (k_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
              (v_proj): Linear4bit(in_features=2048, out_features=2048, bias=True)
              (o_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=2048, out_features=2048, bias=False)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=2048, out_features=2, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=2, 

In [10]:
def print_trainable_parameters(model):
    """
    Prints the number of trainable parameters in the model.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

print_trainable_parameters(model)

trainable params: 1284096 || all params: 1230987264 || trainable%: 0.1043143205094931


In [11]:
training_args = TrainingArguments(
    "output",
    fp16=True,
    gradient_accumulation_steps=1,
    per_device_train_batch_size = 8,
    learning_rate = 2e-5,
    num_train_epochs=10,
    logging_steps=50,
    remove_unused_columns=False,
    seed=0,
    data_seed=0,
    group_by_length=False,
)
dataset = datasets.Dataset.from_generator(
        lambda: read_jsonl(
            dataset_path, max_seq_length, model_path, skip_overlength=False
            )
    )
trainer = ModifiedTrainer(
    model=model,
    train_dataset=dataset,
    args=training_args,
    data_collator=lambda x : data_collator(x, tokenizer),
)
trainer.train()
output_dir = 'saved_model'
print("Finish training, saved to the folder {}".format(output_dir))
# save the model
model.save_pretrained(output_dir)

Downloading and preparing dataset generator/default to /root/.cache/huggingface/datasets/generator/default-626589e2c3e62248/0.0.0...


Generating train split: 0 examples [00:00, ? examples/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Upload jsonl dataset，there are 989 samples in total



  0%|          | 0/989 [00:00<?, ?it/s][A
 10%|█         | 101/989 [00:00<00:00, 990.82it/s][A
 20%|██        | 201/989 [00:00<00:00, 888.45it/s][A
 30%|███       | 297/989 [00:00<00:00, 918.80it/s][A
 40%|███▉      | 393/989 [00:00<00:00, 932.38it/s][A
 49%|████▉     | 487/989 [00:00<00:00, 800.30it/s][A
 59%|█████▉    | 582/989 [00:00<00:00, 843.88it/s][A
 68%|██████▊   | 677/989 [00:00<00:00, 870.47it/s][A
 77%|███████▋  | 766/989 [00:00<00:00, 846.64it/s][A
 86%|████████▌ | 852/989 [00:01<00:00, 772.88it/s][A
100%|██████████| 989/989 [00:01<00:00, 807.55it/s]


Dataset generator downloaded and prepared to /root/.cache/huggingface/datasets/generator/default-626589e2c3e62248/0.0.0. Subsequent calls will reuse this data.


Step,Training Loss
50,4.5732
100,0.2589
150,0.2021
200,0.1886
250,0.2053
300,0.1928
350,0.1779
400,0.1751
450,0.1666
500,0.1742


Finish training, saved to the folder saved_model


In [None]:
!tensorboard --logdir output/runs --bind_all


2024-05-01 21:14:20.697410: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-01 21:14:20.704529: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-01 21:14:20.706695: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered

NOTE: Using experimental fast data loading logic. To disable, pass
    "--load_fast=false" and report issues on GitHub. More details:
    https://github.com/tensorflow/tensorboard/issues/4784

TensorBoard 2.15.2 at http://7ae58b219732:6006/ (Press CTRL+C to quit)
