In [None]:
# pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -U --user datasets accelerate peft trl tensorboard bitsandbytes langchain sentencepiece transformers

In [None]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th
import transformers

from pprint import pp
from datasets import (load_dataset, load_from_disk, Dataset)
from transformers import (AutoTokenizer, 
                          BitsAndBytesConfig,
                          AutoModel, 
                          AutoModelForCausalLM, 
                          AutoModelForSequenceClassification,
                          DataCollatorWithPadding, 
                          DataCollatorForLanguageModeling,
                          DataCollatorForSeq2Seq, 
                          DataCollatorForTokenClassification,
                          TrainingArguments, Trainer)
from peft import (LoraConfig, get_peft_model, PeftModel, TaskType, get_peft_model_state_dict)

In [4]:
# Select the compute device: CUDA when a GPU is visible, otherwise the CPU.
device = th.device("cuda") if th.cuda.is_available() else th.device("cpu")
# Number of CUDA devices visible to this process (variable name kept as-is,
# including its spelling, because it is a notebook-level global).
devive_cnt = th.cuda.device_count()

# Report the runtime environment, one line per item.
for report_line in (
    f"device = {device}; devive_cnt = {devive_cnt}",
    f"torch version = {th.__version__}",
    f"cuda version = {th.version.cuda}",
    f"transformers version = {transformers.__version__}",
):
    print(report_line)

device = cuda; devive_cnt = 1
torch version = 2.5.1+cu121
cuda version = 12.1
transformers version = 4.49.0


In [5]:
# Fixed locations on the author's machine.
path_project = "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
path_model = "F:/LLM"

# The data and output folders are siblings of the project folder, i.e. they
# live directly under the project's parent directory.
_project_parent = os.path.dirname(path_project)
path_data, path_output = (
    os.path.join(_project_parent, folder) for folder in ("data", "output")
)

## step-1: 载入数据源

In [6]:
# Location of the parquet shard relative to `path_data`; the loading cell
# joins the two to build the full file path.
filename = "HuggingFaceFW/fineweb/000_00000.parquet"

In [8]:
# Build the full path to the local parquet shard, then load it through the
# generic "parquet" builder, requesting the "all" split.
parquet_file = os.path.join(path_data, filename)
dataset = load_dataset(path="parquet", data_files=parquet_file, split="all")

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:
# Keep only a small sample so the rest of the notebook runs quickly. The
# sample size is clamped to the number of available rows so that a shard with
# fewer than 200 rows does not make select() fail on out-of-range indices.
sample_size = min(200, len(dataset))
dataset = dataset.select(range(sample_size))

# 80/20 train/eval split with a fixed seed for reproducibility.
# NOTE: `dataset` is rebound to the split result here, so re-running this cell
# on its own would operate on the split result rather than on the loaded data;
# re-run the loading cell first.
dataset = dataset.train_test_split(test_size=0.2, shuffle=True, seed=0)
train_dataset, eval_dataset = dataset["train"], dataset["test"]

In [None]:
# Display the first record of the training split as a quick sanity check.
train_dataset[0]

{'text': 'Well, this 2020 spring sure snuck up on us. Down here in the southwest, the temps are getting into the 70s already in March, so we are looking at some DIY projects. We have decided that an DIY outdoor kitchen is what my father-in-law’s backyard needs. He’s super excited and we started talking and discussing the build and what we would want in the structure.\nWe have a few hundred square feet to work with, so we have some different shapes and configurations available to us. There is a gorgeous mountain view to our east, so we definitely want to keep that on our mind when we design the layout.\nFirst and foremost, my father-in-law loves to cook on charcoal, so we are headed in that direction with the grill. We picked out a Char-Broil brand structure that we’ll modify to fit the space. That’s a real cool way to build and very economical, so we’ll be getting into that in some future posts. Secondly, we are looking to match the exterior of the outdoor fireplace that we built last 

## step-2: tokenizer

In [11]:
# Model identifier; it is joined with `path_model` to locate the local
# checkpoint directory when loading the tokenizer and the base model.
checkpoint = "Qwen/Qwen2.5-3B-Instruct"

In [12]:
# Resolve the on-disk checkpoint folder and load its tokenizer from local
# files only (no download is attempted).
tokenizer_dir = os.path.join(path_model, checkpoint)
tokenizer = AutoTokenizer.from_pretrained(
    tokenizer_dir,
    cache_dir=path_model,
    local_files_only=True,
    force_download=False,
)

In [13]:
# Print the pad token, the EOS token and the padding side, one per line.
for attr_name in ("pad_token", "eos_token", "padding_side"):
    print(getattr(tokenizer, attr_name))

<|endoftext|>
<|im_end|>
right


## step-3: 配置量化参数

In [14]:
# QLoRA-style quantization settings: 4-bit NF4 weights with double
# quantization, computing in bfloat16.
bnb_options = {
    "load_in_4bit": True,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,
    "bnb_4bit_compute_dtype": th.bfloat16,
}
config_bnb = BitsAndBytesConfig(**bnb_options)

## step-4: 载入基模

In [15]:
# Local checkpoint folder of the base model.
model_dir = os.path.join(path_model, checkpoint)

# Loading options: local files only, automatic device placement, bfloat16
# dtype and the 4-bit quantization config defined in `config_bnb`.
# Optionally add attn_implementation="flash_attention_2" (or "sdpa") here.
load_options = dict(
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map="auto",
    low_cpu_mem_usage=True,
    torch_dtype=th.bfloat16,
    quantization_config=config_bnb,
)

base_model = AutoModelForCausalLM.from_pretrained(model_dir, **load_options)

Sliding Window Attention is enabled but not implemented for `sdpa`; unexpected results may be encountered.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Prepare the model for memory-efficient fine-tuning: turn on gradient
# checkpointing, make the inputs require gradients, and switch off the
# generation cache in the model config.
base_model.gradient_checkpointing_enable()
base_model.enable_input_require_grads()
base_model.config.use_cache = False

# With more than one visible GPU, mark the model as parallelizable and
# model-parallel.
if th.cuda.device_count() >= 2:
    for parallel_flag in ("is_parallelizable", "model_parallel"):
        setattr(base_model, parallel_flag, True)

In [None]:
# Report current GPU memory usage after loading the model.
# `memory_allocated` is the memory occupied by live tensors; `memory_reserved`
# is the memory held by the caching allocator. `memory_reserved` replaces the
# deprecated `torch.cuda.memory_cached`. The variable name `cached_memory` is
# kept so other cells that reference it keep working.
allocated_memory = th.cuda.memory_allocated()
cached_memory = th.cuda.memory_reserved()
print(f"已分配的GPU内存：{allocated_memory / 1024**3:.2f}G, 已缓存的GPU内存：{cached_memory / 1024**3:.2f}G")

In [None]:
# Compare the tokenizer's vocabulary size with the number of rows in the
# model's input embedding matrix; grow the embeddings only when the tokenizer
# has more entries than the model can currently embed.
embedding_size = base_model.get_input_embeddings().weight.size(0)
tokenizer_size = len(tokenizer)
if embedding_size < tokenizer_size:
    base_model.resize_token_embeddings(tokenizer_size)