In [1]:
import os

os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
os.environ["WANDB_DISABLED"] = "true"   

In [24]:
import sys
import json
import os.path as osp
import transformers
from typing import Union, List

import fire
import torch
from datasets import load_dataset

from peft import (
    LoraConfig,
    get_peft_model,
    get_peft_model_state_dict,
    prepare_model_for_kbit_training,
    set_peft_model_state_dict,
)
from transformers import LlamaForCausalLM, LlamaTokenizer, BitsAndBytesConfig, GenerationConfig

In [3]:
if torch.cuda.is_available():
    device_count = torch.cuda.device_count()
    print("Number of GPUs:", device_count)
    
    for i in range(device_count):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("CUDA is not available.")

Number of GPUs: 2
GPU 0: NVIDIA A40
GPU 1: NVIDIA A40


In [4]:
base_model = "../pretrain/nlp/llama/llama-7b/"
device_map = "auto"
config = BitsAndBytesConfig(
    load_in_8bit=True
)

model = LlamaForCausalLM.from_pretrained(
    base_model,
    quantization_config=config,
    torch_dtype=torch.float16,
    device_map=device_map,
)

model = prepare_model_for_kbit_training(model)

# 打印模型的结构
# for name, module in model.named_modules():
#     print(name)

Loading checkpoint shards:   0%|          | 0/33 [00:00<?, ?it/s]

In [5]:
tokenizer = LlamaTokenizer.from_pretrained(base_model)

tokenizer.pad_token_id = (
    0  # unk. we want this to be different from the eos token
)
tokenizer.padding_side = "left"

You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [6]:
train_data_path = './RTE/train.json'
test_data_path = './RTE/dev.json'

data_files = {
    'train': train_data_path,
    'test': test_data_path
}
data = load_dataset('json', data_files=data_files)

print(data)
print(data['train'])
print(data['train'][0])


Using custom data configuration default-1acfae2c9ea1e8fc
Reusing dataset json (/home/1004chr/.cache/huggingface/datasets/json/default-1acfae2c9ea1e8fc/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['index', 'instruction', 'input', 'output'],
        num_rows: 2490
    })
    test: Dataset({
        features: ['index', 'instruction', 'input', 'output'],
        num_rows: 277
    })
})
Dataset({
    features: ['index', 'instruction', 'input', 'output'],
    num_rows: 2490
})
{'index': '0', 'instruction': 'No Weapons of Mass Destruction Found in Iraq Yet.', 'input': 'Weapons of Mass Destruction Found in Iraq.', 'output': '0'}


In [10]:
# 加载模板文件
path_to_template_file = './RTE/template.json'
with open(path_to_template_file, 'r') as file:
    template = json.load(file)

def generate_prompt(data_point):
    instruction = data_point['instruction']
    input_text = data_point.get('input', None)
    label = data_point.get('output', None)

    if input_text:
        res = template["prompt_input"].format(
            instruction=instruction, input=input_text
        )
    else:
        res = template["prompt_no_input"].format(
            instruction=instruction
        )
    if label:
        res = f"{res}{label}"
    
    return res

prompt = generate_prompt(data['train'][0])
print(prompt)

### Premise:
No Weapons of Mass Destruction Found in Iraq Yet.

### Hypothesis:
Weapons of Mass Destruction Found in Iraq.

Does the hypothesis entail the premise?

### Response:
0


In [28]:
CUTOFF_LEN = 256
def tokenize(prompt, add_eos_token=True):
    result = tokenizer(
        prompt,
        truncation=True,
        max_length=CUTOFF_LEN,
        padding=True,  
        return_tensors=None,
    )
    if (
        result["input_ids"][-1] != tokenizer.eos_token_id
        and len(result["input_ids"]) < CUTOFF_LEN
        and add_eos_token
    ):
        result["input_ids"].append(tokenizer.eos_token_id)
        result["attention_mask"].append(1)

    result["labels"] = result["input_ids"].copy()

    return result

tokenize_prompt = tokenize(prompt)
print(tokenize_prompt.keys())
print(tokenize_prompt)

dict_keys(['input_ids', 'attention_mask', 'labels'])
{'input_ids': [0, 835, 6097, 895, 29901, 13, 3782, 1334, 481, 787, 310, 7360, 15435, 4080, 7460, 297, 21375, 29939, 15175, 29889, 13, 13, 2277, 29937, 28984, 720, 6656, 29901, 13, 4806, 481, 787, 310, 7360, 15435, 4080, 7460, 297, 21375, 29939, 29889, 13, 13, 25125, 278, 20051, 875, 737, 278, 5188, 895, 29973, 13, 13, 2277, 29937, 13291, 29901, 13, 29900, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [0, 835, 6097, 895, 29901, 13, 3782, 1334, 481, 787, 310, 7360, 15435, 4080, 7460, 297, 21375, 29939, 15175, 29889, 13, 13, 2277, 29937, 28984, 720, 6656, 29901, 13, 4806, 481, 787, 310, 7360, 15435, 4080, 7460, 297, 21375, 29939, 29889, 13, 13, 25125, 278, 20051, 875, 737, 278, 5188, 895, 29973, 13, 13, 2277, 29937, 13291, 29901, 13, 29900, 0]}


In [12]:
def generate_and_tokenize_prompt(data_point):
    full_prompt = generate_prompt(data_point)
    tokenized_full_prompt = tokenize(full_prompt)
    return tokenized_full_prompt

tokenize_data = generate_and_tokenize_prompt(data['train'][0])
print(tokenize_data.keys())
print(tokenize_data)

dict_keys(['input_ids', 'attention_mask', 'labels'])
{'input_ids': [0, 835, 6097, 895, 29901, 13, 3782, 1334, 481, 787, 310, 7360, 15435, 4080, 7460, 297, 21375, 29939, 15175, 29889, 13, 13, 2277, 29937, 28984, 720, 6656, 29901, 13, 4806, 481, 787, 310, 7360, 15435, 4080, 7460, 297, 21375, 29939, 29889, 13, 13, 25125, 278, 20051, 875, 737, 278, 5188, 895, 29973, 13, 13, 2277, 29937, 13291, 29901, 13, 29900, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], 'labels': [0, 835, 6097, 895, 29901, 13, 3782, 1334, 481, 787, 310, 7360, 15435, 4080, 7460, 297, 21375, 29939, 15175, 29889, 13, 13, 2277, 29937, 28984, 720, 6656, 29901, 13, 4806, 481, 787, 310, 7360, 15435, 4080, 7460, 297, 21375, 29939, 29889, 13, 13, 25125, 278, 20051, 875, 737, 278, 5188, 895, 29973, 13, 13, 2277, 29937, 13291, 29901, 13, 29900, 0]}


In [13]:
train_val = data["train"].train_test_split(
    test_size=200, shuffle=True, seed=42
)
train_data = (
    train_val["train"].map(generate_and_tokenize_prompt)
)
val_data = (
    train_val["test"].map(generate_and_tokenize_prompt)
)

# 输出第一条数据
print("First data point in train_data:", train_data[0])
print("First data point in val_data:", val_data[0])

Loading cached split indices for dataset at /home/1004chr/.cache/huggingface/datasets/json/default-1acfae2c9ea1e8fc/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-b8befd04bd68df81.arrow and /home/1004chr/.cache/huggingface/datasets/json/default-1acfae2c9ea1e8fc/0.0.0/c90812beea906fcffe0d5e3bb9eba909a80a998b5f88e9f8acbd320aa91acfde/cache-ec25260d6346e191.arrow


  0%|          | 0/2290 [00:00<?, ?ex/s]

  0%|          | 0/200 [00:00<?, ?ex/s]

First data point in train_data: {'index': '776', 'instruction': '20th Century spokesman Rick Dinon said that between the time it applied for the rate increase and changed its mind, the estimated quake losses had ballooned by several hundred million dollars.', 'input': 'Rick Dinon is the senior vice president of 20th Century Insurance Co.', 'output': '0', 'input_ids': [0, 835, 6097, 895, 29901, 13, 29906, 29900, 386, 24027, 805, 23195, 1171, 24218, 15651, 265, 1497, 393, 1546, 278, 931, 372, 7436, 363, 278, 6554, 7910, 322, 3939, 967, 3458, 29892, 278, 15899, 439, 1296, 28495, 750, 6411, 417, 22367, 491, 3196, 6893, 7284, 17208, 29889, 13, 13, 2277, 29937, 28984, 720, 6656, 29901, 13, 29934, 860, 15651, 265, 338, 278, 16336, 11289, 6673, 310, 29871, 29906, 29900, 386, 24027, 512, 7610, 749, 3189, 29889, 13, 13, 25125, 278, 20051, 875, 737, 278, 5188, 895, 29973, 13, 13, 2277, 29937, 13291, 29901, 13, 29900, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [30]:
# test model infer.
model.eval()
infer_tokenizer = LlamaTokenizer.from_pretrained(base_model)

data_point = data['train'][0]
prompt = generate_prompt(data_point)
print(prompt)

inputs = tokenizer(prompt, return_tensors="pt")
input_ids = inputs["input_ids"].to(model.device)
# print(inputs)
# print(input_ids)

generation_config = GenerationConfig(
    temperature=0.1,
    top_p=0.75,
    top_k=40,
    num_beams=4,
)

with torch.no_grad():
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=128,
    )
s = generation_output.sequences[0]
output = tokenizer.decode(s)


### Premise:
No Weapons of Mass Destruction Found in Iraq Yet.

### Hypothesis:
Weapons of Mass Destruction Found in Iraq.

Does the hypothesis entail the premise?

### Response:
0




<unk>### Premise:
No Weapons of Mass Destruction Found in Iraq Yet.

### Hypothesis:
Weapons of Mass Destruction Found in Iraq.

Does the hypothesis entail the premise?

### Response:
0

### Hypothesis:
Weapons of Mass Destruction Found in Iraq.

Does the hypothesis entail the premise?

### Response:
0

### Hypothesis:
Weapons of Mass Destruction Found in Iraq.

Does the hypothesis entail the premise?

### Response:
0

### Hypothesis:
Weapons of Mass Destruction Found in Iraq.

Does the hypothesis entail the premise?

### Response:
0

### Hypothesis:


In [None]:
output.split(template["response_split"])[1].strip()

In [11]:
LORA_R = 8
LORA_ALPHA = 16
LORA_DROPOUT= 0.05
LORA_TARGET_MODULES = [
    "q_proj",
    "v_proj",
]

BATCH_SIZE = 128
MICRO_BATCH_SIZE = 4
GRADIENT_ACCUMULATION_STEPS = BATCH_SIZE // MICRO_BATCH_SIZE
LEARNING_RATE = 3e-4
TRAIN_STEPS = 300
OUTPUT_DIR = "experiments"

In [12]:
config = LoraConfig(
    r=LORA_R,
    lora_alpha=LORA_ALPHA,
    target_modules=LORA_TARGET_MODULES,
    lora_dropout=LORA_DROPOUT,
    bias="none",
    task_type="CAUSAL_LM",
)
model = get_peft_model(model, config)

model.print_trainable_parameters()  # Be more transparent about the % of trainable params.

trainable params: 4,194,304 || all params: 6,742,609,920 || trainable%: 0.0622


In [13]:
training_arguments = transformers.TrainingArguments(
     per_device_train_batch_size=MICRO_BATCH_SIZE,
     gradient_accumulation_steps=GRADIENT_ACCUMULATION_STEPS,
     warmup_steps=100,
     max_steps=TRAIN_STEPS,
     learning_rate=LEARNING_RATE,
     fp16=True,
     logging_steps=10,
     optim="adamw_torch",
     evaluation_strategy="steps",
     save_strategy="steps",
     eval_steps=50,
     save_steps=50,
     output_dir=OUTPUT_DIR,
     save_total_limit=3,
     load_best_model_at_end=True,
     report_to="none"
 )

In [14]:
data_collator = transformers.DataCollatorForSeq2Seq(
    tokenizer, pad_to_multiple_of=8, return_tensors="pt", padding=True
)

# 映射生成和标记化函数
tokenized_data = [generate_and_tokenize_prompt(data['train'][i]) for i in list(range(10))]
collated_batch = data_collator(tokenized_data)

# 输出处理后的数据
print("Collated input_ids:", collated_batch['input_ids'])
print("Collated attention_mask:", collated_batch['attention_mask'])
print("Collated labels:", collated_batch['labels'])

Collated input_ids: tensor([[    0,     0,     0,  ..., 29901,    13,     0],
        [    0,     0,     0,  ..., 29901,    13,     0],
        [    0,     0,     0,  ..., 29901,    13,     0],
        ...,
        [    0,     0,     0,  ..., 29901,    13,     0],
        [    0,     0,     0,  ..., 29901,    13,     0],
        [    0,     0,     0,  ..., 29901,    13,     0]])
Collated attention_mask: tensor([[0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        ...,
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]])
Collated labels: tensor([[ -100,  -100,  -100,  ..., 29901,    13,     0],
        [ -100,  -100,  -100,  ..., 29901,    13,     0],
        [ -100,  -100,  -100,  ..., 29901,    13,     0],
        ...,
        [ -100,  -100,  -100,  ..., 29901,    13,     0],
        [ -100,  -100,  -100,  ..., 29901,    13,     0],
        [ -100,  -100,  -100,  ..., 29901,    13,     0

In [15]:
trainer = transformers.Trainer(
    model=model,
    train_dataset=train_data,
    eval_dataset=val_data,
    args=training_arguments,
    data_collator=data_collator
)
model.config.use_cache = False
old_state_dict = model.state_dict

# These lines need to be commented out, torch and peft libraries are incompatible
# model.state_dict = (
#     lambda self, *_, **__: get_peft_model_state_dict(
#         self, old_state_dict()
#     )
# ).__get__(model, type(model))

# model = torch.compile(model)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [16]:
trainer.train()



Step,Training Loss,Validation Loss
50,1.5725,1.537473
100,1.4116,1.445972
150,1.3613,1.43329
200,1.3247,1.431099
250,1.2878,1.434796
300,1.2741,1.437509




SafetensorError: Error while deserializing header: InvalidHeaderDeserialization