In [3]:
import subprocess
import os
# NOTE(review): an empty CURL_CA_BUNDLE presumably turns off TLS certificate
# verification for HTTP clients that honour it — confirm this is intended.
os.environ['CURL_CA_BUNDLE'] = ''

# Source the network script in a bash subshell and capture the lines of its
# environment that contain "proxy".
proxy_dump = subprocess.run(
    'bash -c "source /etc/network_turbo && env | grep proxy"',
    shell=True,
    capture_output=True,
    text=True,
).stdout

# Copy every NAME=value line into this process's environment; lines without
# an '=' are ignored.
os.environ.update(
    entry.split('=', 1) for entry in proxy_dump.splitlines() if '=' in entry
)

In [1]:
# !pip install -q bitsandbytes datasets accelerate loralib
# !pip install -q git+https://github.com/huggingface/transformers.git@main 
# !pip install -q git+https://github.com/huggingface/peft.git

# 任务概要

- bigscience/bloom-7b1
- lora fine-tune bloom: 可插拔式的（plugin/adapter）
    - freeze original weights
    - plugin lora adapters (peft)
- huggingface transformers 库
    - trainer.train 的参数及过程
    - mlm 与 clm（都是 unsupervised learning，都可以自动地构建 input/labels）
        - mlm：bert 等
        - clm：gpt、bloom 等
    - pipeline
        - dataset/tasks
        - tokenizer
        - training (fine-tune by lora)
        - inference

# Base model & Lora adapters

In [2]:
import torch
import torch.nn as nn
import bitsandbytes as bnb 
from transformers import AutoTokenizer, AutoConfig, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model 

In [5]:
from transformers import BitsAndBytesConfig

# Load the base causal LM with 8-bit quantized linear layers.
# The bare `load_in_8bit=True` keyword is deprecated (transformers emits a
# warning asking for a `BitsAndBytesConfig` passed as `quantization_config`),
# so the quantization settings are supplied through that object instead.
model = AutoModelForCausalLM.from_pretrained(
    "bigscience/bloom-7b1",
    quantization_config=BitsAndBytesConfig(load_in_8bit=True),
    device_map='auto',
)

# Tokenizer matching the checkpoint loaded above.
tokenizer = AutoTokenizer.from_pretrained("bigscience/bloom-7b1")

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# model.config
AutoConfig.from_pretrained("bigscience/bloom-7b1")

BloomConfig {
  "_name_or_path": "bigscience/bloom-7b1",
  "apply_residual_connection_post_layernorm": false,
  "architectures": [
    "BloomForCausalLM"
  ],
  "attention_dropout": 0.0,
  "attention_softmax_in_fp32": true,
  "bias_dropout_fusion": true,
  "bos_token_id": 1,
  "eos_token_id": 2,
  "hidden_dropout": 0.0,
  "hidden_size": 4096,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "masked_softmax_fusion": true,
  "model_type": "bloom",
  "n_head": 32,
  "n_inner": null,
  "n_layer": 30,
  "offset_alibi": 100,
  "pad_token_id": 3,
  "pretraining_tp": 1,
  "skip_bias_add": true,
  "skip_bias_add_qkv": false,
  "slow_but_exact": false,
  "torch_dtype": "float16",
  "transformers_version": "4.45.2",
  "unk_token_id": 0,
  "use_cache": true,
  "vocab_size": 250880
}

In [7]:
model

BloomForCausalLM(
  (transformer): BloomModel(
    (word_embeddings): Embedding(250880, 4096)
    (word_embeddings_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
    (h): ModuleList(
      (0-29): 30 x BloomBlock(
        (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (self_attention): BloomAttention(
          (query_key_value): Linear8bitLt(in_features=4096, out_features=12288, bias=True)
          (dense): Linear8bitLt(in_features=4096, out_features=4096, bias=True)
          (attention_dropout): Dropout(p=0.0, inplace=False)
        )
        (post_attention_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (mlp): BloomMLP(
          (dense_h_to_4h): Linear8bitLt(in_features=4096, out_features=16384, bias=True)
          (gelu_impl): BloomGelu()
          (dense_4h_to_h): Linear8bitLt(in_features=16384, out_features=4096, bias=True)
        )
      )
    )
    (ln_f): LayerNorm((4096,), eps=1e-05, eleme

In [8]:
# model.transformer.word_embeddings
model.get_input_embeddings()

Embedding(250880, 4096)

In [9]:
tokenizer

BloomTokenizerFast(name_or_path='bigscience/bloom-7b1', vocab_size=250680, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

## Freeze original weights

In [10]:
list(model.parameters())[0].dtype

torch.float16

In [12]:
# Freeze every weight of the base model; the adapters added later are what
# gets trained.
for weight in model.parameters():
    weight.requires_grad = False
    if weight.ndim != 1:
        continue
    # Small 1-D parameters (e.g. layernorm) are upcast to fp32 for stability.
    weight.data = weight.data.to(torch.float32)

In [13]:
# Reduce the number of stored activations by enabling gradient checkpointing.
model.gradient_checkpointing_enable()
# NOTE(review): presumably required so gradients can flow from the inputs
# through the frozen base model when checkpointing is on — confirm against the
# transformers documentation.
model.enable_input_require_grads()

In [14]:
class CastOutputToFloat(nn.Sequential):
    """Sequential container whose forward result is always returned as float32."""

    def forward(self, x):
        # Run the wrapped modules as a normal nn.Sequential, then upcast.
        result = super().forward(x)
        return result.to(dtype=torch.float32)

model.lm_head = CastOutputToFloat(model.lm_head)

## LoRa Adapters

In [15]:
def print_trainable_parameters(model):
    """
    Print the number of trainable parameters in the model.

    Element counts are summed over ``model.named_parameters()``; a parameter
    counts as trainable when its ``requires_grad`` flag is set.

    Args:
        model: any object exposing ``named_parameters()`` that yields
            ``(name, parameter)`` pairs (e.g. a ``torch.nn.Module``).

    Returns:
        None. One summary line is written to stdout.
    """
    trainable_params = 0
    all_param = 0
    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count
    # A model without parameters would otherwise raise ZeroDivisionError;
    # report 0.0 % in that case.
    percentage = round(100 * trainable_params / all_param, 2) if all_param else 0.0
    print(f"trainable params: {trainable_params} || all params: {all_param} || trainable: {percentage} %")

In [16]:
print_trainable_parameters(model)

trainable params: 0 || all params: 7069016064 || trainable: 0.0 %


In [17]:
from peft import LoraConfig, get_peft_model

# LoRA adapter hyper-parameters; this object is handed to get_peft_model().
config = LoraConfig(
    r=16,                  # rank of the low-rank adapter matrices
    lora_alpha=32,         # alpha scaling: scales the lora weights/outputs
    # target_modules=["q_proj", "v_proj"],
    # target_modules is left unset. NOTE(review): presumably PEFT falls back
    # to its built-in default for BLOOM (query_key_value) — confirm.
    lora_dropout=0.05,     # dropout applied inside the adapter branch
    bias="none",
    task_type="CAUSAL_LM"  # set this for CLM or Seq2Seq
)

In [18]:
model = get_peft_model(model, config)

In [19]:
model

PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): BloomForCausalLM(
      (transformer): BloomModel(
        (word_embeddings): Embedding(250880, 4096)
        (word_embeddings_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
        (h): ModuleList(
          (0-29): 30 x BloomBlock(
            (input_layernorm): LayerNorm((4096,), eps=1e-05, elementwise_affine=True)
            (self_attention): BloomAttention(
              (query_key_value): lora.Linear8bitLt(
                (base_layer): Linear8bitLt(in_features=4096, out_features=12288, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.05, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=4096, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=12288, bias=False)
                )
       

In [20]:
print_trainable_parameters(model)

trainable params: 7864320 || all params: 7076880384 || trainable: 0.11 %


# Pipeline

## data

In [24]:
import transformers
from datasets import load_dataset, load_from_disk

# Read the dataset from a local on-disk copy (path is relative to the working
# directory). The commented-out call is the alternative loader.
# NOTE(review): presumably that call downloads the same data from the Hub — confirm.
# dataset = load_dataset("Abirate/english_quotes")
dataset = load_from_disk("data/english_quotes")

In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags'],
        num_rows: 2508
    })
})

In [26]:
dataset['train']

Dataset({
    features: ['quote', 'author', 'tags'],
    num_rows: 2508
})

In [27]:
dataset['train'].to_pandas()

Unnamed: 0,quote,author,tags
0,“Be yourself; everyone else is already taken.”,Oscar Wilde,"[be-yourself, gilbert-perreira, honesty, inspi..."
1,"“I'm selfish, impatient and a little insecure....",Marilyn Monroe,"[best, life, love, mistakes, out-of-control, t..."
2,“Two things are infinite: the universe and hum...,Albert Einstein,"[human-nature, humor, infinity, philosophy, sc..."
3,"“So many books, so little time.”",Frank Zappa,"[books, humor]"
4,“A room without books is like a body without a...,Marcus Tullius Cicero,"[books, simile, soul]"
...,...,...,...
2503,“Morality is simply the attitude we adopt towa...,"Oscar Wilde,","[morality, philosophy]"
2504,“Don't aim at success. The more you aim at it ...,"Viktor E. Frankl,","[happiness, success]"
2505,"“In life, finding a voice is speaking and livi...",John Grisham,[inspirational-life]
2506,"“Winter is the time for comfort, for good food...",Edith Sitwell,"[comfort, home, winter]"


In [28]:
dataset['train']['quote'][:4]

['“Be yourself; everyone else is already taken.”',
 "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.”",
 "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”",
 '“So many books, so little time.”']

In [29]:
dataset['train']['author'][:4]

['Oscar Wilde', 'Marilyn Monroe', 'Albert Einstein', 'Frank Zappa']

In [30]:
dataset['train'][:4]

{'quote': ['“Be yourself; everyone else is already taken.”',
  "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.”",
  "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.”",
  '“So many books, so little time.”'],
 'author': ['Oscar Wilde', 'Marilyn Monroe', 'Albert Einstein', 'Frank Zappa'],
 'tags': [['be-yourself',
   'gilbert-perreira',
   'honesty',
   'inspirational',
   'misattributed-oscar-wilde',
   'quote-investigator'],
  ['best', 'life', 'love', 'mistakes', 'out-of-control', 'truth', 'worst'],
  ['human-nature',
   'humor',
   'infinity',
   'philosophy',
   'science',
   'stupidity',
   'universe'],
  ['books', 'humor']]}

In [31]:
str(dataset['train']['tags'][0])

"['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']"

In [32]:
def merge(row):
    """Add a 'prediction' field of the form '<quote> ->: <str(tags)>' to the row.

    The row is modified in place and also returned, so the function can be
    passed directly to ``Dataset.map``.
    """
    quote_text = row['quote']
    tags_text = str(row['tags'])
    row['prediction'] = quote_text + ' ->: ' + tags_text
    return row

dataset['train'] = dataset['train'].map(merge)

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [34]:
dataset['train']['prediction'][:5]

["“Be yourself; everyone else is already taken.” ->: ['be-yourself', 'gilbert-perreira', 'honesty', 'inspirational', 'misattributed-oscar-wilde', 'quote-investigator']",
 "“I'm selfish, impatient and a little insecure. I make mistakes, I am out of control and at times hard to handle. But if you can't handle me at my worst, then you sure as hell don't deserve me at my best.” ->: ['best', 'life', 'love', 'mistakes', 'out-of-control', 'truth', 'worst']",
 "“Two things are infinite: the universe and human stupidity; and I'm not sure about the universe.” ->: ['human-nature', 'humor', 'infinity', 'philosophy', 'science', 'stupidity', 'universe']",
 "“So many books, so little time.” ->: ['books', 'humor']",
 "“A room without books is like a body without a soul.” ->: ['books', 'simile', 'soul']"]

In [35]:
dataset['train'][4]

{'quote': '“A room without books is like a body without a soul.”',
 'author': 'Marcus Tullius Cicero',
 'tags': ['books', 'simile', 'soul'],
 'prediction': "“A room without books is like a body without a soul.” ->: ['books', 'simile', 'soul']"}

In [43]:
tokenizer

BloomTokenizerFast(name_or_path='bigscience/bloom-7b1', vocab_size=250680, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [44]:
tokenizer_res = tokenizer(dataset['train']['prediction'][:4])
tokenizer_res

{'input_ids': [[1502, 17143, 33218, 30, 39839, 4384, 632, 11226, 15713, 17, 982, 11953, 29, 24629, 2765, 17731, 3240, 15407, 10, 15, 83077, 354, 26624, 31683, 71421, 10, 15, 756, 19218, 56452, 10, 15, 756, 71538, 3383, 10, 15, 29412, 290, 96783, 11914, 43555, 5231, 16728, 51464, 10, 15, 756, 67091, 15595, 51261, 2623, 3166], [1502, 10203, 239002, 15, 136192, 1049, 530, 267, 10512, 3131, 133716, 17, 473, 5219, 120496, 15, 473, 912, 1800, 461, 5048, 530, 919, 11866, 12587, 427, 21053, 17, 7702, 1320, 1152, 11229, 21053, 1074, 919, 2670, 69583, 15, 3816, 1152, 11097, 661, 62798, 5926, 158808, 1074, 919, 2670, 7733, 17, 982, 11953, 29, 24629, 42415, 10, 15, 34037, 6266, 10, 15, 756, 150243, 10, 15, 29412, 617, 23427, 10, 15, 756, 1199, 40404, 49359, 10, 15, 756, 454, 10607, 10, 15, 63281, 153698, 3166], [1502, 35417, 11217, 1306, 61759, 29, 368, 71300, 530, 7384, 78851, 131856, 530, 6782, 1130, 11097, 3638, 368, 71300, 17, 982, 11953, 29, 24629, 62524, 9317, 3864, 10, 15, 756, 28498, 21623

## tokenize

In [52]:
# Tokenize the merged 'prediction' strings in batches; the tokenizer's outputs
# are stored as additional columns of the dataset.
dataset = dataset.map(lambda samples: tokenizer(samples['prediction']), batched=True)

Map:   0%|          | 0/2508 [00:00<?, ? examples/s]

In [54]:
# Only 'input_ids' and 'attention_mask' are used in the following steps.
dataset

DatasetDict({
    train: Dataset({
        features: ['quote', 'author', 'tags', 'prediction', 'input_ids', 'attention_mask'],
        num_rows: 2508
    })
})

## training

In [55]:
from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [56]:
# Hyper-parameters for a short fine-tuning run.
# Per device, each optimizer step accumulates 4 micro-batches of 4 sequences
# (4 * 4 = 16 sequences per step); max_steps caps the run at 200 steps.
training_args = TrainingArguments(
    output_dir='outputs',
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    warmup_steps=100,
    max_steps=200,
    fp16=True,
    logging_steps=1,
    report_to='none',
)

# mlm=False: collate for causal language modelling rather than masked LM.
clm_collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset['train'],
    data_collator=clm_collator,
)

# NOTE(review): the cache is presumably switched off because it is not useful
# during training with gradient checkpointing — confirm, and consider
# re-enabling it before inference.
model.config.use_cache = False
trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
max_steps is given, it will override any value given in num_train_epochs
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss
1,3.4328
2,3.4033
3,2.8681
4,3.4131
5,2.8875
6,3.3041
7,3.4173
8,3.1861
9,3.1572
10,3.4196


TrainOutput(global_step=200, training_loss=2.310559794306755, metrics={'train_runtime': 779.8424, 'train_samples_per_second': 4.103, 'train_steps_per_second': 0.256, 'total_flos': 1.3312956044279808e+16, 'train_loss': 2.310559794306755, 'epoch': 1.2759170653907497})

In [61]:
trainer.data_collator

DataCollatorForLanguageModeling(tokenizer=BloomTokenizerFast(name_or_path='bigscience/bloom-7b1', vocab_size=250680, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=False),  added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	3: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, mlm=False, mlm_probability=0.15, pad_to_multiple_of=None, tf_experimental_compile=False, return_tensors='pt')

## inference

In [69]:
# Tokenize the prompt (same "<quote> ->: " layout as the training text) and
# place the tensors on the model's device so generate() does not mix devices.
batch = tokenizer("“Training models with PEFT and LoRa is cool” ->: ", return_tensors='pt').to(model.device)

# torch.cuda.amp.autocast() is deprecated; torch.autocast(device_type='cuda')
# is the supported way to open the same mixed-precision context.
with torch.autocast(device_type='cuda'):
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

  with torch.cuda.amp.autocast():




 “Training models with PEFT and LoRa is cool” ->:  [', ', ', ', ', ', ', ', ', ', ', ', ', ', ', ‘, ‘, ‘, ‘, ‘, ‘, ‘, ‘, ‘, ‘,


In [70]:
# Tokenize the prompt (same "<quote> ->: " layout as the training text) and
# place the tensors on the model's device so generate() does not mix devices.
batch = tokenizer("“An important paradigm of natural language processing consists of large-scale pre-training on general domain data and adaptation to particular tasks or domains.” ->: ", return_tensors='pt').to(model.device)

# torch.cuda.amp.autocast() is deprecated; torch.autocast(device_type='cuda')
# is the supported way to open the same mixed-precision context.
with torch.autocast(device_type='cuda'):
    output_tokens = model.generate(**batch, max_new_tokens=50)

print('\n\n', tokenizer.decode(output_tokens[0], skip_special_tokens=True))

  with torch.cuda.amp.autocast():




 “An important paradigm of natural language processing consists of large-scale pre-training on general domain data and adaptation to particular tasks or domains.” ->:  ['. domain ont ont ont ont ont ont ont ont ont ont’ ont ont ont’ ont ont’ ont ont’ ont’ ont’ ont’ ont’ ont’ ont’ ont’ ont’ ont’ ont’ ont’ ont’ ont’
