# Training
Refer:https://colab.research.google.com/github/mymusise/ChatGLM-Tuning/blob/master/examples/finetune.ipynb

In [1]:
%pip install -qU protobuf transformers==4.30.2 cpm_kernels torch>=2.0 mdtex2html sentencepiece accelerate
%pip install -qU datasets loralib jupyter ipywidgets
%pip install -qU git+https://github.com/huggingface/peft.git

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [4]:
from huggingface_hub import login
login(token= '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',
      add_to_git_credential= True)

Token is valid (permission: write).
[1m[31mCannot authenticate through git-credential as no helper is defined on your machine.
You might have to re-authenticate when pushing to the Hugging Face Hub.
Run the following command in your terminal in case you want to set the 'store' credential helper as default.

git config --global credential.helper store

Read https://git-scm.com/book/en/v2/Git-Tools-Credential-Storage for more details.[0m
Token has not been saved to git credential helper.
Your token has been saved to /workspace/.cache/huggingface/token
Login successful


In [5]:
from transformers import AutoTokenizer, AutoModel
tokenizer = AutoTokenizer.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True)
basemodel = AutoModel.from_pretrained("THUDM/chatglm2-6b", trust_remote_code=True).half().cuda()
print(basemodel)

Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]

ChatGLMForConditionalGeneration(
  (transformer): ChatGLMModel(
    (embedding): Embedding(
      (word_embeddings): Embedding(65024, 4096)
    )
    (rotary_pos_emb): RotaryEmbedding()
    (encoder): GLMTransformer(
      (layers): ModuleList(
        (0-27): 28 x GLMBlock(
          (input_layernorm): RMSNorm()
          (self_attention): SelfAttention(
            (query_key_value): Linear(in_features=4096, out_features=4608, bias=True)
            (core_attention): CoreAttention(
              (attention_dropout): Dropout(p=0.0, inplace=False)
            )
            (dense): Linear(in_features=4096, out_features=4096, bias=False)
          )
          (post_attention_layernorm): RMSNorm()
          (mlp): MLP(
            (dense_h_to_4h): Linear(in_features=4096, out_features=27392, bias=False)
            (dense_4h_to_h): Linear(in_features=13696, out_features=4096, bias=False)
          )
        )
      )
      (final_layernorm): RMSNorm()
    )
    (output_layer): Linear(in_

In [6]:
from datasets import load_dataset
train_dataset = load_dataset('csv',data_files='./train_data.csv')
def preprocess_dialogue(example):
    # Tokenize the query and response
    prompt = example["context"]
    target = example["target"]
    prompt_ids = tokenizer.encode(prompt,truncation=True,add_special_tokens=True)
    target_ids = tokenizer.encode(target,truncation=True,add_special_tokens=False)
    input_ids = prompt_ids + target_ids
    return {"input_ids": input_ids, "seq_len": len(prompt_ids)}

# Preprocess the datasets
train_dataset = train_dataset.map(preprocess_dialogue)
print(train_dataset)

Downloading and preparing dataset csv/default to /workspace/.cache/huggingface/datasets/csv/default-ccf32f2f89647e90/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset csv downloaded and prepared to /workspace/.cache/huggingface/datasets/csv/default-ccf32f2f89647e90/0.0.0/eea64c71ca8b46dd3f537ed218fc9bf495d5707789152eb2764f5c78fa66d59d. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Map:   0%|          | 0/30885 [00:00<?, ? examples/s]

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


DatasetDict({
    train: Dataset({
        features: ['context', 'target', 'input_ids', 'seq_len'],
        num_rows: 30885
    })
})


In [7]:
model_inputs = train_dataset
#model_inputs = train_dataset['train'].train_test_split(test_size=0.1,shuffle=True,seed=2023)
model_inputs

DatasetDict({
    train: Dataset({
        features: ['context', 'target', 'input_ids', 'seq_len'],
        num_rows: 30885
    })
})

In [8]:
import torch
import torch.nn as nn
for param in basemodel.parameters():
  param.requires_grad = False  # freeze the model - train adapters later
  if param.ndim == 1:
    # cast the small parameters (e.g. layernorm) to fp32 for stability
    param.data = param.data.to(torch.float32)

basemodel.gradient_checkpointing_enable()  # reduce number of stored activations
basemodel.enable_input_require_grads()
basemodel.is_parallelizable = True
basemodel.model_parallel = True

In [9]:
def print_trainable_parameters(model):
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        all_param += param.numel()
        if param.requires_grad:
            trainable_params += param.numel()
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [10]:
from peft import LoraConfig, get_peft_model, PeftModel

config = LoraConfig(
    r=16,
    lora_alpha=32,
    inference_mode=False,
    lora_dropout=0.05,
    #bias="none",
    task_type="CAUSAL_LM"
)
lora_dir = 'Jyshen/Chat_Suzumiya_GLM2LoRA'
basemodel = PeftModel.from_pretrained(basemodel, lora_dir)
model = get_peft_model(basemodel, config)
print_trainable_parameters(model)

trainable params: 3899392 || all params: 6247483392 || trainable%: 0.06241540401681151


In [11]:
from transformers import Trainer, TrainingArguments
def data_collator(features: list) -> dict:
    len_ids = [len(feature["input_ids"]) for feature in features]
    longest = max(len_ids)
    input_ids = []
    labels_list = []
    for ids_l, feature in sorted(zip(len_ids, features), key=lambda x: -x[0]):
        ids = feature["input_ids"]
        seq_len = feature["seq_len"]
        labels = (
            [-100] * (seq_len - 1) + ids[(seq_len - 1) :] + [-100] * (longest - ids_l)
        )
        ids = ids + [tokenizer.pad_token_id] * (longest - ids_l)
        _ids = torch.LongTensor(ids)
        labels_list.append(torch.LongTensor(labels))
        input_ids.append(_ids)
    input_ids = torch.stack(input_ids)
    labels = torch.stack(labels_list)
    return {
        "input_ids": input_ids,
        "labels": labels,
    }

class ModifiedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        return model(
            input_ids=inputs["input_ids"],
            labels=inputs["labels"],
        ).loss

In [12]:
training_args = TrainingArguments(
    num_train_epochs = 2,
    max_steps = -1,
    evaluation_strategy = "no",
    gradient_accumulation_steps = 1,
    group_by_length=False,
    save_strategy = "steps",
    save_steps = 500,
    output_dir = 'output',
    remove_unused_columns = False,
    per_device_train_batch_size = 32,
    per_device_eval_batch_size = 32,
    learning_rate = 1e-4,
    fp16 = True,
    seed=2023,
    data_seed=2023
)


trainer = ModifiedTrainer(
    model=model,
    train_dataset=model_inputs['train'],
    #eval_dataset=model_inputs['test'],
    args=training_args,
    data_collator=data_collator,
)
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Step,Training Loss
500,1.5634
1000,1.4177
1500,1.3488


TrainOutput(global_step=1932, training_loss=1.4167814077057452, metrics={'train_runtime': 19850.1086, 'train_samples_per_second': 3.112, 'train_steps_per_second': 0.097, 'total_flos': 2.3950802649049006e+18, 'train_loss': 1.4167814077057452, 'epoch': 2.0})

In [14]:
model.push_to_hub("Jyshen/Chat_Suzumiya_GLM2LoRA", use_auth_token=True)

Upload 1 LFS files:   0%|          | 0/1 [00:00<?, ?it/s]

adapter_model.bin:   0%|          | 0.00/15.6M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/Jyshen/Chat_Suzumiya_GLM2LoRA/commit/5eb2b9e6c6ab50d8085118a567c53422969410a0', commit_message='Upload model', commit_description='', oid='5eb2b9e6c6ab50d8085118a567c53422969410a0', pr_url=None, pr_revision=None, pr_num=None)