In [1]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import (
    LoraConfig,
    PeftModel,
    prepare_model_for_kbit_training,
    get_peft_model,
)
import os, torch, wandb
from datasets import load_dataset
from trl import SFTTrainer, setup_chat_format

from huggingface_hub import login
from datasets import *

import os
import glob 
import json
import bitsandbytes as bnb

hf_token = os.getenv("HUGGINGFACE_API_KEY")
login(token=hf_token)

# wandb.login(key=wb_token)
# run = wandb.init(
#     project='Fine-tune Llama 3.2 on Customer Support Dataset', 
#     job_type="training", 
#     anonymous="allow"
# )

# # Set torch dtype and attention implementation
# if torch.cuda.get_device_capability()[0] >= 8:
#     torch_dtype = torch.bfloat16
#     attn_implementation = "flash_attention_2"
# else:
torch_dtype = torch.float16
attn_implementation = "eager"

# QLoRA config
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch_dtype,
    bnb_4bit_use_double_quant=True,
)
# Load model
base_model = "unsloth/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    quantization_config=bnb_config,
    device_map="auto",
    attn_implementation=attn_implementation
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(
    base_model, 
    trust_remote_code=True,
    padding=True,
    padding_side="right",
    truncation=True
    )

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)

# LoRA config
peft_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)
#model, tokenizer = setup_chat_format(model, tokenizer)
model = get_peft_model(model, peft_config) 

new_model = "llama-3.2-1b-it-finetuned"

# prep the dataset
def load_json_data(folder_path, key):
    data = []
    for file_path in glob.glob(folder_path + '/*.json'):
        with open(file_path, 'r') as file:
            json_data = json.load(file)
            data.extend(json_data[key])
    return data

raw_pages = load_json_data("../data_prep/data", key="ocr_results")
cleaned_pages = load_json_data("../data_prep/data", key="cleaned_pages")

test_size = 0.2
train_size = int(len(cleaned_pages) * (1 - test_size))
train_raw_pages, test_raw_pages = raw_pages[:train_size], raw_pages[train_size:]
train_cleaned_pages, test_cleaned_pages = cleaned_pages[:train_size], cleaned_pages[train_size:]

dataset = DatasetDict({
    'train': Dataset.from_dict({"raw_pages": train_raw_pages[:len(train_cleaned_pages)], "cleaned_pages": train_cleaned_pages}),
    'test': Dataset.from_dict({"raw_pages": test_raw_pages[:len(test_cleaned_pages)], "cleaned_pages": test_cleaned_pages})
})

#dataset = Dataset.from_dict({"raw_pages": train_raw_pages[:len(train_cleaned_pages)], "cleaned_pages": train_cleaned_pages})

base_model = "unsloth/Llama-3.2-1B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(base_model, trust_remote_code=True)




In [8]:
instruction = """
Du bist ein Experte für Textbereinigung. Deine Aufgabe ist es, einen Eingabetext zu bereinigen, der aus einem PDF-Dokument extrahiert wurde. Der Inhalt ist immer nur von einer einzelnen Seite, es sollte also nicht zu viel Text auf einmal sein. Es ist sehr wichtig, dass keine Daten und Informationen verloren gehen und dass die Originaltexte in keiner Weise verändert werden!
Antworte ausschließlich in Deutsch und keiner anderen Sprache.

Du hast die folgenden Aufgaben:
- Entferne alle seltsamen Textteile und Sonderzeichen.
- Entferne alle unnötigen Leerzeichen und Zeilenumbrüche.
- Organisiere die Formatierung.
- Korrektur von Rechtschreibfehlern.
- Handling von Formatierungsfehlern.

Gib nur den bereinigten und formatierten Text zurück und nichts anderes! Füge keinen eigenen Text hinzu! Achte auf Vollständigkeit, es darf kein Inhalt verloren gehen und es muss alles 100 % vollständig sein!
"""

def format_chat_template(row):
    
    row_json = [{"role": "system", "content": instruction},
               {"role": "user", "content": row["raw_pages"]},
               {"role": "assistant", "content": row["cleaned_pages"]}]
    
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False, padding=True, truncation=True)
    return row

dataset = dataset.map(
    format_chat_template
)

print(dataset)

Map:   0%|          | 0/1101 [00:00<?, ? examples/s]

Map:   0%|          | 0/276 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['raw_pages', 'cleaned_pages', 'text'],
        num_rows: 1101
    })
    test: Dataset({
        features: ['raw_pages', 'cleaned_pages', 'text'],
        num_rows: 276
    })
})


In [14]:
#Hyperparamter
training_arguments = TrainingArguments(
    output_dir=new_model,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=2,
    optim="paged_adamw_32bit",
    num_train_epochs=1,
    eval_strategy="steps",
    eval_steps=0.2,
    logging_steps=1,
    warmup_steps=10,
    logging_strategy="steps",
    learning_rate=2e-4,
    fp16=False,
    bf16=False,
    group_by_length=True,
    report_to=None,
    remove_unused_columns=False
)

# Setting sft parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    peft_config=peft_config,
    max_seq_length= 512,
    dataset_text_field="text",
    tokenizer=tokenizer,
    args=training_arguments,
    packing=True,
)

trainer.train()
#wandb.finish()

# Save the fine-tuned model
trainer.model.save_pretrained(new_model)
#trainer.model.push_to_hub(new_model, use_temp_dir=False)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Generating train split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]



  0%|          | 0/3169 [00:00<?, ?it/s]

{'loss': 3.1817, 'grad_norm': 2.1253957748413086, 'learning_rate': 2e-05, 'epoch': 0.0}
{'loss': 2.8507, 'grad_norm': 2.068655252456665, 'learning_rate': 4e-05, 'epoch': 0.0}
{'loss': 2.9772, 'grad_norm': 1.954174518585205, 'learning_rate': 6e-05, 'epoch': 0.0}
{'loss': 2.4482, 'grad_norm': 1.8914507627487183, 'learning_rate': 8e-05, 'epoch': 0.0}
{'loss': 3.1032, 'grad_norm': 2.1943624019622803, 'learning_rate': 0.0001, 'epoch': 0.0}
{'loss': 3.1245, 'grad_norm': 1.888848900794983, 'learning_rate': 0.00012, 'epoch': 0.0}
{'loss': 3.0004, 'grad_norm': 1.9238353967666626, 'learning_rate': 0.00014, 'epoch': 0.0}
{'loss': 2.7938, 'grad_norm': 2.0145702362060547, 'learning_rate': 0.00016, 'epoch': 0.0}
{'loss': 2.8444, 'grad_norm': 2.233994722366333, 'learning_rate': 0.00018, 'epoch': 0.0}
{'loss': 2.8166, 'grad_norm': 1.8144105672836304, 'learning_rate': 0.0002, 'epoch': 0.0}
{'loss': 2.2182, 'grad_norm': 1.825373888015747, 'learning_rate': 0.00019993668882557772, 'epoch': 0.0}
{'loss': 2

KeyboardInterrupt: 

In [2]:
dataset = load_dataset("ruslanmv/ai-medical-chatbot", split="all")
dataset = dataset.shuffle(seed=65).select(range(1000)) # Only use 1000 samples for quick demo
print(dataset)

def format_chat_template(row):
    row_json = [{"role": "user", "content": row["Patient"]},
               {"role": "assistant", "content": row["Doctor"]}]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

dataset = dataset.map(
    format_chat_template,
    num_proc=4,
)

dataset['text'][3]

README.md:   0%|          | 0.00/863 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


dialogues.parquet:   0%|          | 0.00/142M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/256916 [00:00<?, ? examples/s]

Dataset({
    features: ['Description', 'Patient', 'Doctor'],
    num_rows: 1000
})


Map (num_proc=4):   0%|          | 0/1000 [00:00<?, ? examples/s]

NameError: name 'tokenizer' is not defined