In [1]:
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments, DataCollatorForLanguageModeling
from datasets import load_dataset
import torch 
from itertools import chain
from config import CHAT_MODEL

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Identifier of the base checkpoint passed to `from_pretrained` for both the tokenizer and the model.
Model_name = "Qwen/Qwen2.5-1.5B"
# Directory used as the Trainer output dir and as the target of the final model/tokenizer save.
OUTPUT_DIR = "./Qwen2.5-1.5B-finetuned-stage1"

In [None]:
# Load the tokenizer that belongs to the base checkpoint.
tokenizer = AutoTokenizer.from_pretrained(Model_name, trust_remote_code=True)

# Reuse the EOS token for padding when the checkpoint does not define a pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Load the base causal-LM weights in bfloat16; `device_map="auto"` leaves device
# placement to the library.
# `dtype` is used instead of `torch_dtype`, which this environment reports as
# deprecated ("`torch_dtype` is deprecated! Use `dtype` instead!").
model = AutoModelForCausalLM.from_pretrained(
    Model_name,
    trust_remote_code=True,
    dtype=torch.bfloat16,
    device_map="auto",
)

`torch_dtype` is deprecated! Use `dtype` instead!


In [None]:
from peft import LoraConfig, get_peft_model, TaskType

# LoRA adapter settings: rank-16 adapters with alpha 32 and 5% dropout, no bias
# training, attached to the attention and MLP projection layers listed below.
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

# Wrap the base model with the adapters and report how many parameters are trainable.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 18,464,768 || all params: 1,562,179,072 || trainable%: 1.1820


In [6]:
# Read the stage-1 corpus from a local JSON Lines file and expose it as the "train" split.
corpus_files = {"train": "./legal_corpus_stage1.jsonl"}
dataset = load_dataset("json", data_files=corpus_files)


Generating train split: 199 examples [00:00, 882.30 examples/s]


In [None]:
# Number of tokens per training block (used when the token stream is chunked).
Max_Length = 2048


def tokenizer_fn(examples):
    """Tokenize a batch of documents, appending the EOS token to each one.

    The EOS token is added at the end of every text so that separate legal
    documents stay delimited once their tokens are concatenated. No truncation
    is requested here.

    Args:
        examples: Batch mapping whose "text" entry is a list of strings.

    Returns:
        The tokenizer output for the EOS-terminated texts.
    """
    texts = []
    for text in examples["text"]:
        texts.append(text + tokenizer.eos_token)
    return tokenizer(texts)

In [None]:
# Tokenize the corpus in batches using 4 worker processes.
# The raw "text" column is removed afterwards to save memory.
tokenizer_ds = dataset.map(
    tokenizer_fn,
    remove_columns=["text"],
    num_proc=4,
    batched=True,
)

Map (num_proc=4):  75%|███████▌  | 150/199 [00:03<00:00, 54.43 examples/s]Token indices sequence length is longer than the specified maximum sequence length for this model (142777 > 131072). Running this sequence through the model will result in indexing errors
Map (num_proc=4): 100%|██████████| 199/199 [00:06<00:00, 31.72 examples/s]


In [None]:
def group_texts(examples, block_size=None):
    """Concatenate a batch of tokenized documents and cut it into fixed-size blocks.

    Every column of the batch is flattened into one long sequence and then
    sliced into consecutive chunks of ``block_size`` items. The tail that does
    not fill a whole block is dropped. If the whole batch is shorter than one
    block, it is kept as a single shorter chunk.

    Args:
        examples: Batch mapping of column name -> list of per-document lists.
            Must contain an "input_ids" column.
        block_size: Chunk length in tokens. Defaults to the notebook-level
            ``Max_Length`` when omitted.

    Returns:
        A dict with the same columns, each a list of chunks, plus a "labels"
        column holding an independent copy of the "input_ids" chunks.

    Raises:
        ValueError: If ``block_size`` is not a positive integer.
    """
    if block_size is None:
        block_size = Max_Length
    if block_size <= 0:
        raise ValueError("block_size must be a positive integer")

    # Flatten each column: all tokens of all documents, in order.
    concatenated = {
        key: list(chain.from_iterable(examples[key])) for key in examples.keys()
    }

    # Measure the token stream itself rather than whichever column happens to come first.
    total_length = len(concatenated["input_ids"])

    # Drop the trailing remainder that does not fill a whole block.
    if total_length >= block_size:
        total_length = (total_length // block_size) * block_size

    # Slice every column into consecutive blocks.
    result = {
        key: [
            tokens[start : start + block_size]
            for start in range(0, total_length, block_size)
        ]
        for key, tokens in concatenated.items()
    }

    # For causal LM the labels are the inputs themselves; copy each chunk so
    # labels and input_ids do not alias the same list objects.
    result["labels"] = [list(chunk) for chunk in result["input_ids"]]
    return result

In [None]:
# Regroup the tokenized documents into fixed-size training blocks, batch by batch,
# using 4 worker processes.
lm_datasets = tokenizer_ds.map(group_texts, num_proc=4, batched=True)

Map (num_proc=4): 100%|██████████| 199/199 [00:03<00:00, 63.25 examples/s]


In [None]:
# Collator configured for causal language modeling (mlm=False disables masked-LM).
# NOTE(review): this collator presumably rebuilds `labels` from `input_ids` and masks
# pad-token positions; if the pad token was aliased to EOS earlier, the EOS separators
# may be excluded from the loss — confirm against the collator documentation.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [None]:
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    # Batching: 1 sequence per device x 8 accumulation steps = effective batch of 8 per device.
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    # Schedule: a single pass over the data (1-2 epochs is considered enough here).
    num_train_epochs=1,
    # Optimizer and learning-rate schedule.
    optim="adamw_torch",
    learning_rate=2e-4,
    weight_decay=0.01,
    lr_scheduler_type="cosine",
    warmup_ratio=0.03,
    # Precision: train in bfloat16, not float16.
    bf16=True,
    fp16=False,
    # Logging and checkpointing.
    logging_steps=50,
    save_steps=1000,
    save_total_limit=2,
    report_to="none",
)

In [None]:
# Assemble the Trainer from the LoRA-wrapped model, the training arguments,
# the chunked "train" split and the causal-LM collator.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=lm_datasets["train"],
)

The model is already on multiple devices. Skipping the move to device specified in `args`.


In [None]:
# Run the fine-tuning loop with the settings configured in `training_args`.
trainer.train()

In [None]:
# Persist the trained model and the tokenizer side by side in OUTPUT_DIR.
# NOTE(review): the model is PEFT-wrapped, so this presumably writes only the LoRA
# adapter weights rather than a merged full model — confirm before loading downstream.
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
