In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, BitsAndBytesConfig
from peft import LoraConfig, get_peft_model, TaskType
from datasets import load_dataset
from transformers import DataCollatorForLanguageModeling
import torch

In [6]:
model_name = "tiiuae/falcon-rw-1b"

In [7]:
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token

In [8]:
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

In [9]:
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=bnb_config
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
You are calling `save_pretrained` to a 4-bit converted model, but your `bitsandbytes` version doesn't support it. If you want to save 4-bit models, make sure to have `bitsandbytes>=0.41.3` installed.


In [None]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
    target_modules=["query_key_value", "dense"]
)

In [11]:
model = get_peft_model(model, lora_config)

In [14]:
dataset = load_dataset("json", data_files="../data/legal_qa.json", split="train")

Generating train split: 14543 examples [00:00, 92565.59 examples/s]


In [15]:
def format_example(example):
    return f"### Question:\n{example['instruction']}\n\n### Answer:\n{example['output']}"

In [16]:
dataset = dataset.map(lambda x: {"text": format_example(x)})

Map: 100%|██████████| 14543/14543 [00:00<00:00, 15188.07 examples/s]


In [17]:
tokenized = dataset.map(lambda x: tokenizer(x["text"], truncation=True, padding="max_length", max_length=512), batched=True)
tokenized.set_format("torch", columns=["input_ids", "attention_mask"])

Map: 100%|██████████| 14543/14543 [00:02<00:00, 5156.04 examples/s]


In [25]:
training_args = TrainingArguments(
    output_dir="finetune/qlora-legalease",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    num_train_epochs=2,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    report_to="none"
)
collator = DataCollatorForLanguageModeling(tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=tokenized,
    data_collator=collator
)

trainer.train()

  0%|          | 0/908 [00:29<?, ?it/s]

  0%|          | 0/1818 [07:58<?, ?it/s]            

{'loss': 1.9439, 'grad_norm': 0.988476574420929, 'learning_rate': 4.9862486248624866e-05, 'epoch': 0.01}



  0%|          | 0/1818 [18:31<?, ?it/s]            

{'loss': 1.988, 'grad_norm': 2.2013208866119385, 'learning_rate': 4.972497249724973e-05, 'epoch': 0.01}


KeyboardInterrupt: 

In [None]:
model.save_pretrained("finetune/qlora-legalease")
tokenizer.save_pretrained("finetune/qlora-legalease")