# **Preparation**

In [1]:
!pip install bitsandbytes==0.43.0
!pip install datasets==2.10.1
!pip install transformers==4.38.2
!pip install peft==0.9.0
!pip install sentencepiece==0.1.99
!pip install -U accelerate==0.28.0
!pip install colorama==0.4.6

Collecting bitsandbytes==0.43.0
  Downloading bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl.metadata (1.8 kB)
Downloading bitsandbytes-0.43.0-py3-none-manylinux_2_24_x86_64.whl (102.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m102.2/102.2 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: bitsandbytes
Successfully installed bitsandbytes-0.43.0
Collecting datasets==2.10.1
  Downloading datasets-2.10.1-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.7,>=0.3.0 (from datasets==2.10.1)
  Downloading dill-0.3.6-py3-none-any.whl.metadata (9.8 kB)
Collecting xxhash (from datasets==2.10.1)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets==2.10.1)
  Downloading multiprocess-0.70.17-py310-none-any.whl.metadata (7.2 kB)
Collecting responses<0.19 (from datasets==2.10.1)
  Downloading responses-0.18.0-py3-none-any.whl.metadata (29 kB)
INFO

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from transformers.generation import GenerationConfig
from transformers import BitsAndBytesConfig
import transformers

from datasets import Dataset
import json
import pandas as pd
import torch
import os

from peft import PeftModel
from peft import (
    prepare_model_for_int8_training,
    LoraConfig,
    get_peft_model
)



In [4]:
model_name="MediaTek-Research/Breeze-7B-Instruct-v0_1"
cache_dir='/content/drive/MyDrive/Breeze'

# **load the pre_trained model**
包括模型参数，分词器，设定解码策略

In [5]:
# quantify
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_use_triton=True
)

# load model parameter
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    cache_dir=cache_dir,
    quantization_config=quantization_config

)
# load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    cache_dir=cache_dir,
    quantization_config=quantization_config
)

# set decoding stategy
# random sampling
max_len=128
generation_config = GenerationConfig(
    max_length=max_len,
    do_sample=True,
    temperature=0.9,
    top_p=0.5,
    no_repeat_ngram_size=3,
    pad_token_id=2,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


ImportError: Using `bitsandbytes` 8-bit quantization requires Accelerate: `pip install accelerate` and the latest version of bitsandbytes: `pip install -i https://pypi.org/simple/ bitsandbytes`

In [None]:
instruction="我会给你一首诗的前两句，然后你续写两句"
input="窗前明月光，疑是地上霜"
prompt="""
[INST] <<SYS>>
You are a helpful assistant and good at writing Tang poem.
<</SYS>>
{instruction}
{input}
[/INST]
"""
input_text=prompt.format(instruction=instruction,input=input)
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.cuda()
print(input_ids)
print("-"*20)
response=model.generate(
    input_ids=input_ids,
    max_length=max_len,
    do_sample=generation_config.do_sample,
    temperature=generation_config.temperature,
    num_beams=generation_config.num_beams,
    top_p=generation_config.top_p,
    no_repeat_ngram_size=generation_config.no_repeat_ngram_size,
    pad_token_id=generation_config.pad_token_id
)
print(response)
print("-"*20)
# 解码生成的输出
generated_text = tokenizer.decode(response[0], skip_special_tokens=True)

# 打印生成的文本
print(generated_text)
print("-"*20)

tensor([[    1, 28705,    13, 28792, 16289, 28793,  2087, 18741,  4060,    13,
          1976,   460,   264, 10865, 13892,   304,  1179,   438,  3653,   320,
           602, 16067, 28723,    13, 28789,   700, 18741,  4060,    13, 29242,
         29179, 29709, 29383, 47223,   235,   178,   154, 28914, 29087, 29745,
         30347, 28924, 51540, 29383, 30199, 29503, 29745, 30347,    13, 30171,
         29087, 29381, 49533, 28924, 33982, 28971, 45312, 35525,    13, 28792,
         28748, 16289, 28793,    13]], device='cuda:0')
--------------------


The attention mask is not set and cannot be inferred from input because pad token is same as eos token.As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


tensor([[    1, 28705,    13, 28792, 16289, 28793,  2087, 18741,  4060,    13,
          1976,   460,   264, 10865, 13892,   304,  1179,   438,  3653,   320,
           602, 16067, 28723,    13, 28789,   700, 18741,  4060,    13, 29242,
         29179, 29709, 29383, 47223,   235,   178,   154, 28914, 29087, 29745,
         30347, 28924, 51540, 29383, 30199, 29503, 29745, 30347,    13, 30171,
         29087, 29381, 49533, 28924, 33982, 28971, 45312, 35525,    13, 28792,
         28748, 16289, 28793,    13, 30171,   233,   170,   158, 29376, 29395,
           233,   184,   136, 28924, 29783, 29065, 30370, 29870, 31835, 29085,
           236,   181,   159,     2]], device='cuda:0')
--------------------

[INST] <<SYS>>
You are a helpful assistant and good at writing Tang poem.
<</SYS>>
我会给你一首诗的前两句，然后你续写两句
窗前明月光，疑是地上霜
[/INST]
窗槛月色浅，步出院门冰点鲜
--------------------


# finetuning
使用唐诗数据集https://github.com/CheeEn-Yu/GenAI-Hw5

## Fix Random Seeds
There may be some randomness involved in the fine-tuning process. We fix random seeds to make the result reproducible.

In [None]:
seed = 42
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

## load datasets

In [None]:
cd /content/drive/MyDrive

/content/drive/MyDrive


In [None]:
!git clone https://github.com/CheeEn-Yu/GenAI-Hw5

Cloning into 'GenAI-Hw5'...
remote: Enumerating objects: 38, done.[K
remote: Counting objects: 100% (38/38), done.[K
remote: Compressing objects: 100% (29/29), done.[K
remote: Total 38 (delta 15), reused 26 (delta 7), pack-reused 0 (from 0)[K
Receiving objects: 100% (38/38), 3.68 MiB | 6.33 MiB/s, done.
Resolving deltas: 100% (15/15), done.


In [None]:
def generate_training_data(data_point):
    """
    (1) Goal:
        - This function is used to transform a data point (input and output texts) to tokens that our model can read

    (2) Arguments:
        - data_point: dict, with field "instruction", "input", and "output" which are all str

    (3) Returns:
        - a dict with model's input tokens, attention mask that make our model causal, and corresponding output targets

    (3) Example:
        - If you construct a dict, data_point_1, with field "instruction", "input", and "output" which are all str, you can use the function like this:
            formulate_article(data_point_1)

    """
    # construct full input prompt
    prompt = f"""\
[INST] <<SYS>>
You are a helpful assistant and good at writing Tang poem. 你是一個樂於助人的助手且擅長寫唐詩。
<</SYS>>

{data_point["instruction"]}
{data_point["input"]}
[/INST]"""
    # count the number of input tokens
    len_user_prompt_tokens = (
        len(
            tokenizer(
                prompt,
                truncation=True,
                max_length=CUTOFF_LEN + 1,
                padding="max_length",
            )["input_ids"]
        ) - 1
    )
    # transform input prompt into tokens
    full_tokens = tokenizer(
        prompt + " " + data_point["output"] + "</s>",
        truncation=True,
        max_length=CUTOFF_LEN + 1,
        padding="max_length",
    )["input_ids"][:-1]
    return {
        "input_ids": full_tokens,
        "labels": [-100] * len_user_prompt_tokens
        + full_tokens[len_user_prompt_tokens:],
        "attention_mask": [1] * (len(full_tokens)),
    }

In [None]:
CUTOFF_LEN = 256  # 設定文本截斷的最大長度
data_file='/content/drive/MyDrive/GenAI-Hw5/Tang_training_data.json'
with open(data_file,'r',encoding='utf-8') as f:
  data_json=json.load(f)
data = Dataset.from_pandas(pd.DataFrame(data_json[:256]))
train_data=data.shuffle().map(generate_training_data)

Map:   0%|          | 0/256 [00:00<?, ? examples/s]

## finetune model

In [None]:
check=False
checkpoint=os.path.join(cache_dir,'exp')
logging_steps = 20  # 定義訓練過程中每隔多少步驟輸出一次訓練誌
save_steps = 65  # 定義訓練過程中每隔多少步驟保存一次模型
save_total_limit = 3  # 控制最多保留幾個模型checkpoint
report_to = None  # 設定上報實驗指標的目標，預設為無
num_epoch=1
LEARNING_RATE=3e-4
MICRO_BATCH_SIZE = 4  # 定義微批次的大小
BATCH_SIZE = 16  # 定義一個批次的大小
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE  # 計算每個微批次累積的梯度步數
CUTOFF_LEN = 256  # 設定文本截斷的最大長度
LORA_R = 8  # 設定LORA（Layer-wise Random Attention）的R值
LORA_ALPHA = 16  # 設定LORA的Alpha值
LORA_DROPOUT = 0.05  # 設定LORA的Dropout率
VAL_SET_SIZE = 0  # 設定驗證集的大小，預設為無
TARGET_MODULES = ["q_proj", "up_proj", "o_proj", "k_proj", "down_proj", "gate_proj", "v_proj"] # 設定目標模組，這些模組的權重將被保存為checkpoint
device_map = "auto"  # 設定設備映射，預設為"auto"
world_size = int(os.environ.get("WORLD_SIZE", 1))  # 獲取環境變數"WORLD_SIZE"的值，若未設定則預設為1
ddp = world_size != 1  # 根據world_size判斷是否使用分散式數據處理(DDP)，若world_size為1則不使用DDP
if ddp:
    device_map = {"": int(os.environ.get("LOCAL_RANK") or 0)}
    GRADIENT_ACCUMULATION_STEPS = GRADIENT_ACCUMULATION_STEPS // world_size

In [None]:

# load model from checkpoint
if check:
  model=PeftModel.from_pretrained(model,checkpoint)

# int8 training model
model=prepare_model_for_int8_training(model)

# lora training model
lora_config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model=get_peft_model(model,lora_config)
# 如果加载不成功，可能是包版本问题
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=None,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=MICRO_BATCH_SIZE,
        gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
        warmup_steps=50,
        num_train_epochs=num_epoch,
        learning_rate=LEARNING_RATE,
        fp16=True,  # 使用混合精度訓練
        logging_steps=logging_steps,
        save_strategy="steps",
        save_steps=save_steps,
        output_dir=checkpoint,
        save_total_limit=save_total_limit,
        ddp_find_unused_parameters=False if ddp else None,  # 是否使用 DDP，控制梯度更新策略
        report_to=report_to,
    ),
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
os.makedirs(checkpoint, exist_ok = True)
model.save_pretrained(checkpoint)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


Step,Training Loss




config.json:   0%|          | 0.00/618 [00:00<?, ?B/s]

# test finetuning model

In [None]:
# find all available checkpoints
ckpts = []
for ckpt in os.listdir(checkpoint):
    if (ckpt.startswith("checkpoint-")):
        ckpts.append(ckpt)

# list all the checkpoints
ckpts = sorted(ckpts, key = lambda ckpt: int(ckpt.split("-")[-1]))
print("all available checkpoints:")
print(" id: checkpoint name")
for (i, ckpt) in enumerate(ckpts):
    print(f"{i:>3}: {ckpt}")

all available checkpoints:
 id: checkpoint name


In [None]:
model = PeftModel.from_pretrained(model, checkname, device_map={'': 0})

In [None]:
CUTOFF_LEN = 256  # 設定文本截斷的最大長度
data_file='/content/drive/MyDrive/GenAI-Hw5/Tang_testing_data.json'
with open(data_file,'r',encoding='utf-8') as f:
  test_datas=json.load(f)


prompt="""
[INST] <<SYS>>
You are a helpful assistant and good at writing Tang poem.
<</SYS>>
{instruction}
{input}
[/INST]
"""

for (i,test_data) in enumerate(test_datas):

  input_text=prompt.format(instruction=test_data['instruction'],input=test_data['input'])
  input_ids = tokenizer(input_text, return_tensors="pt").input_ids.cuda()

  response=model.generate(
      input_ids=input_ids,
      max_length=max_len,
      do_sample=generation_config.do_sample,
      temperature=generation_config.temperature,
      num_beams=generation_config.num_beams,
      top_p=generation_config.top_p,
      no_repeat_ngram_size=generation_config.no_repeat_ngram_size,
      pad_token_id=generation_config.pad_token_id
  )

  # 解码生成的输出
  generated_text = tokenizer.decode(response[0], skip_special_tokens=True)

  # 打印生成的文本
  print(i,generated_text)


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


0 
[INST] <<SYS>>
You are a helpful assistant and good at writing Tang poem.
<</SYS>>
以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。
雪霽銀妝素，桔高映瓊枝。
[/INST]
[Inst] 玉堂一簾簾，金簾一簾開。

[SYS] 紫雲輕掩下，金光微映邊。
鳳冠鳳冠出，鳳裳鳳裳回。
一窗一窗明，一窗以一開。
1 
[INST] <<SYS>>
You are a helpful assistant and good at writing Tang poem.
<</SYS>>
以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。
夫子何爲者？栖栖一代中。
[/INST]
道心自無惑，而物亦何如。
自能不以物，而能以心。
心之所以定，乃在無欲之。
無欲則無欲，無欲是無欲。
欲之所欲求，欲之所以求
2 
[INST] <<SYS>>
You are a helpful assistant and good at writing Tang poem.
<</SYS>>
以下是一首唐詩的第一句話，請用你的知識判斷並完成整首詩。
飛蓋去芳園，蘭橈遊翠渚。
[/INST]
一池碧色水，百枝紅姿華。玉香散風中，金露滴日下。欲見雙鳳姿，欲聞雙鳳歌。誰家有此園，莫待百花花。
</SYS>
