In [15]:
from peft import LoraConfig, get_peft_model
import torch
from torch.optim import SGD
import yaml
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

from data.math10k import Math10k
from models.llama import MODEL_TYPES
from utility.initialize import initialize

tokenizer = None 

with open("./conf/model/llama.yaml", 'r') as f:
    config = yaml.safe_load(f)

print(config)

# 命令行参数
args = {}
args["batch_size"] = 256
args["threads"] = 0
args["model_type"] = "llama"
args["model_name"] = "meta-llama/Meta-Llama-3-8B"
args["learning_rate"] = 2.0e-4
args["epochs"] = 3

# 初始化
# index_num = random.randint(1, 2000)
index_num = 42
initialize(index_num)

# 检测 CUDA
print('Cuda:', torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 加载数据集
args["storage_size"] = 9919
dataset = Math10k(args["batch_size"], args["threads"])

# 使用模型 llama3-8B
model_name = config['model_name']
tokenizer = AutoTokenizer.from_pretrained(model_name)
# 设置 pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 加载模型
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.bfloat16,
    # quantization_config=bnb_config,
    device_map="auto"
)

peft_config = LoraConfig(
    r=config['lora_r'],
    lora_alpha=config['lora_alpha'],
    lora_dropout=config['lora_dropout'],
    target_modules=config['lora_target_modules'],
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

print(model.parameters())

# 配置不同的优化器
base_optimizer = SGD(model.parameters(), lr=args["learning_rate"], momentum=0.9)

for epoch in range(args["epochs"]):
    model.train()
    start_time = time.time()

    for batch in dataset.train:
        inputs, targets, index = batch[0].to(device), batch[1].to(device), batch[2].to(device)

{'model_name': 'NousResearch/Meta-Llama-3-8B', 'use_flash_attention_2': True, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'lora_target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'], 'train_dataset_name': 'meta-math/Math-10k', 'eval_dataset_name': 'gsm8k', 'eval_dataset_config': 'main', 'prompt_template': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{response}<|eot_id|>', 'max_seq_length': 1024, 'output_dir': './results/llama3_math_AsyFlat', 'num_train_epochs': 2, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 8, 'learning_rate': 0.0002, 'warmup_ratio': 0.03, 'lr_scheduler_type': 'cosine', 'logging_steps': 10, 'bf16': True, 'seed': 42}
Cuda: True


Loading checkpoint shards: 100%|██████████| 4/4 [00:00<00:00, 3992.67it/s]


ValueError: You are trying to offload the whole model to the disk. Please use the `disk_offload` function instead.

In [8]:
from peft import LoraConfig, get_peft_model
import torch
from torch.optim import SGD
import yaml
import time
from transformers import AutoModelForCausalLM, AutoTokenizer
from tqdm import tqdm

from data.math10k import Math10k
from models.llama import MODEL_TYPES
from utility.initialize import initialize

In [13]:
tokenizer = None 
with open("./conf/model/llama.yaml", 'r') as f:
        config = yaml.safe_load(f)

print(config)

# 命令行参数
args = {}
args["batch_size"] = 256
args["threads"] = 0
args["model_type"] = "llama"
args["model_name"] = "meta-llama/Meta-Llama-3-8B"
args["learning_rate"] = 2.0e-4
args["epochs"] = 3

# 初始化
# index_num = random.randint(1, 2000)
index_num = 42
initialize(index_num)

# 检测 CUDA
print('Cuda:', torch.cuda.is_available())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# 加载数据集
args["storage_size"] = 9919
dataset = Math10k(args["batch_size"], args["threads"])

# 使用模型 llama3-8B
model_name = config['model_name']
tokenizer = AutoTokenizer.from_pretrained(model_name)
# 设置 pad token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 加载模型
from transformers import BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map="auto",
)


peft_config = LoraConfig(
    r=config['lora_r'],
    lora_alpha=config['lora_alpha'],
    lora_dropout=config['lora_dropout'],
    target_modules=config['lora_target_modules'],
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()

print(model.parameters())

{'model_name': 'NousResearch/Meta-Llama-3-8B', 'use_flash_attention_2': True, 'lora_r': 16, 'lora_alpha': 32, 'lora_dropout': 0.05, 'lora_target_modules': ['q_proj', 'k_proj', 'v_proj', 'o_proj', 'gate_proj', 'up_proj', 'down_proj'], 'train_dataset_name': 'meta-math/Math-10k', 'eval_dataset_name': 'gsm8k', 'eval_dataset_config': 'main', 'prompt_template': '<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{instruction}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n{response}<|eot_id|>', 'max_seq_length': 1024, 'output_dir': './results/llama3_math_AsyFlat', 'num_train_epochs': 2, 'per_device_train_batch_size': 2, 'per_device_eval_batch_size': 4, 'gradient_accumulation_steps': 8, 'learning_rate': 0.0002, 'warmup_ratio': 0.03, 'lr_scheduler_type': 'cosine', 'logging_steps': 10, 'bf16': True, 'seed': 42}
Cuda: True


PackageNotFoundError: No package metadata was found for bitsandbytes

In [None]:
base_optimizer = SGD(model.parameters(), lr=args["learning_rate"], momentum=0.9)

for epoch in range(args["epochs"]):
    model.train()
    start_time = time.time()

    for batch in tqdm(dataset.train):
        print(batch)

  0%|          | 0/39 [00:00<?, ?it/s]


KeyError: 'question'