In [None]:
import warnings

from unsloth import FastLanguageModel

# Silence every Python warning for the rest of the session.
# Note that this also hides deprecation and copy-semantics warnings from later cells.
warnings.filterwarnings("ignore")

# Settings forwarded unchanged to FastLanguageModel.from_pretrained below;
# max_seq_length is reused later when the trainer is built.
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# Load the base checkpoint and its tokenizer; both names are rebound by later cells.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Qwen2.5-14B-Instruct",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

```
<|im_start|>system
You are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>
<|im_start|>user
Hello!<|im_end|>
<|im_start|>assistant
Hey there! How are you?<|im_end|>
<|im_start|>user
I'm great thanks!<|im_end|>
```

<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nHow do astronomers determine the original wavelength of light emitted by a celestial body at rest, which is necessary for measuring its speed using the Doppler effect?<|im_end|>\n<|im_start|>assistant\nAstronomers make use of the unique spectral fingerprints of elements found in stars. These elements emit and absorb light at specific, known wavelengths, forming an absorption spectrum. By analyzing the light received from distant stars and comparing it to the laboratory-measured spectra of these elements, astronomers can identify the shifts in these wavelengths due to the Doppler effect. The observed shift tells them the extent to which the light has been redshifted or blueshifted, thereby allowing them to calculate the speed of the star along the line of sight relative to Earth.<|im_end|>\n

In [None]:
# Wrap the loaded base model with LoRA adapters; `model` is rebound to the wrapped model.
# NOTE(review): target_modules lists "embed_tokens" and "lm_head" in addition to the
# attention/MLP projection layers — confirm this is intended for this task, as it
# presumably increases the number of trainable parameters and memory use considerably.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj", "embed_tokens", "lm_head"],
    lora_alpha=16,
    lora_dropout=0,  # Supports any, but = 0 is optimized
    bias="none",  # Supports any, but = "none" is optimized

    use_gradient_checkpointing="unsloth",  # True or "unsloth" for very long context
    random_state=3407,  # Fixed seed passed to the adapter setup
    use_rslora=False,  # We support rank stabilized LoRA
    loftq_config=None,  # And LoftQ
)

In [None]:
import pandas as pd
import datasets

# Load the labelled examples and keep only the two columns used for training.
# The whole pipeline is a single chain that produces a fresh frame: no in-place
# mutation of a column-selected frame (which can raise SettingWithCopy warnings),
# and the index is renumbered so it stays contiguous after rows with missing
# values have been dropped.
df = (
    pd.read_excel("../dataset/company_accounting_item_10_groups.xlsx")
    .loc[:, ["item_name", "account_name"]]
    .dropna()
    .reset_index(drop=True)
)

In [None]:
def create_conversation(row):
    """Build a two-turn chat example (user prompt, assistant answer) from one row.

    Args:
        row: Mapping-like object (for example a pandas Series) that supports
            ``row["item_name"]`` and ``row["account_name"]``.

    Returns:
        list[dict]: ``[{"role": "user", ...}, {"role": "assistant", ...}]`` where
        the assistant content is a JSON object holding both values as strings.
    """
    import json  # local import so this cell does not depend on another cell's imports

    item_name = row["item_name"]
    account_name = row["account_name"]

    user_input = (
        f'Please provide the "account_name" for the following "item_name" in JSON format:\n\n'
        f'"item_name": "{item_name}"'
    )

    # Serialize with json.dumps instead of hand-built f-string JSON so that quotes,
    # backslashes and newlines inside the values are escaped and the training target
    # is always valid JSON. str() keeps non-string cells quoted as before, and
    # ensure_ascii=False leaves umlauts and other non-ASCII characters readable.
    # For values without special characters the output is byte-identical to the
    # previous f-string form.
    assistant_input = json.dumps(
        {"item_name": str(item_name), "account_name": str(account_name)},
        ensure_ascii=False,
    )

    conversation = [
        {"role": "user", "content": user_input},
        {"role": "assistant", "content": assistant_input}
    ]

    return conversation

In [None]:
# Call create_conversation once per row (axis=1) and store the returned
# [user, assistant] message list in a new "text" column.
df["text"] = df.apply(create_conversation, axis=1)

In [None]:
# Convert the conversations into a Hugging Face Dataset. preserve_index=False keeps
# the pandas index out of the dataset: after dropna() the index has gaps and would
# otherwise be carried along as an extra "__index_level_0__" column.
dataset = datasets.Dataset.from_pandas(df[["text"]], preserve_index=False)

In [None]:
# System prompt handed to get_chat_template as `system_message`.
# Because the text sits between the triple quotes on its own line, the resulting
# string starts and ends with a newline character.
system_msg = """
You are an AI language model trained to map various \"item_name\"s to their corresponding \"account_name\"s for accounting purposes. Your goal is to assist users by providing accurate account classifications based on the item names they provide.\n\n**Instructions:**\n\n- When given an \"item_name\", return the corresponding \"account_name\" in a JSON object.\n\n- The JSON object must include both the \"item_name\" provided by the user and the correct \"account_name\".\n\n- Always output the result strictly in JSON format without additional text or explanations.\n\n- If you are unsure of the correct \"account_name\" for a given \"item_name\", set the \"account_name\" value to \"Unknown\".\n\n- Use the knowledge you have been trained on to make the most accurate mappings possible.\n\n- Here is an example of the expected JSON format:\n\n{\"item_name\": \"Support Januar\", \"account_name\": \"Aufwendungen für Lizenzen, Konzessionen\"}
"""

In [None]:
from unsloth.chat_templates import get_chat_template

# Rebind `tokenizer` to the one returned by get_chat_template, configured with the
# "qwen-2.5" chat template and the custom system prompt defined in `system_msg`.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="qwen-2.5",
    system_message=system_msg
)

def formatting_prompts_func(examples):
    """Render every conversation of a batch into one training string.

    Args:
        examples: Batch mapping whose "text" entry is a sequence of conversations;
            each one is passed to ``tokenizer.apply_chat_template``.

    Returns:
        dict: ``{"text": [...]}`` with one rendered string per conversation, each
        with its last four characters removed.
    """
    rendered = []
    for chat in examples["text"]:
        full_text = tokenizer.apply_chat_template(
            chat,
            tokenize=False,
            add_generation_prompt=False,
        )
        # NOTE(review): the slice drops the final four characters of the rendered
        # text. Confirm against a sample that this does not cut into the closing
        # "<|im_end|>" marker of the assistant turn.
        rendered.append(full_text[:-4])
    return {"text": rendered}

In [None]:
# Render all conversations in batches via formatting_prompts_func.
# Not idempotent: the "text" column (message lists) is overwritten with rendered
# strings, so re-running this cell feeds those strings back into the formatter.
dataset = dataset.map(formatting_prompts_func, batched=True)

In [None]:
# Display one rendered training example to check the prompt formatting by eye.
dataset[2]["text"]

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments, DataCollatorForSeq2Seq
from unsloth import is_bfloat16_supported

# Query bf16 support once; exactly one of fp16 / bf16 ends up enabled below.
use_bf16 = is_bfloat16_supported()

# Hyperparameters for the run.
# NOTE(review): warmup_steps (10) is larger than max_steps (5) and save_steps (50)
# is larger than max_steps too — this looks like a smoke-test configuration;
# confirm the values before a full training run.
training_args = TrainingArguments(
    # Bookkeeping
    output_dir="../test/outputs",
    report_to="none",
    seed=19,
    logging_steps=5,
    save_steps=50,
    save_total_limit=3,
    # load_best_model_at_end=True
    # Batching
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Run length
    # num_train_epochs=4,  # Set this for 1 full training run.
    max_steps=5,
    # Optimizer and schedule
    optim="adamw_8bit",
    learning_rate=1e-4,
    weight_decay=0.01,
    lr_scheduler_type="linear",
    warmup_steps=10,
    # Mixed precision
    fp16=not use_bf16,
    bf16=use_bf16,
)

# Supervised fine-tuning trainer over the rendered "text" column of `dataset`.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    data_collator=DataCollatorForSeq2Seq(tokenizer=tokenizer),
    dataset_num_proc=1,
    packing=False,  # Packing is off; the original note says it can speed up short sequences.
    args=training_args,
)

In [None]:
from unsloth.chat_templates import train_on_responses_only

# Rebind `trainer` to the one returned by train_on_responses_only.
# The two marker strings are the user / assistant role headers of the chat format
# shown in the template example earlier in this notebook.
# Presumably only the tokens after `response_part` contribute to the loss — verify
# against the Unsloth documentation.
trainer = train_on_responses_only(
    trainer,
    instruction_part="<|im_start|>user\n",
    response_part="<|im_start|>assistant\n",
)

In [None]:
# Run fine-tuning and keep the object returned by train() for later inspection.
trainer_stats = trainer.train(
    # resume_from_checkpoint=True  # disabled; presumably resumes from a saved checkpoint
)

In [None]:
# model.save_pretrained("lora_model") # Local saving
# tokenizer.save_pretrained("lora_model")

In [None]:
import warnings
from unsloth import FastLanguageModel

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")

# Same loading settings as for the base model at the top of the notebook.
max_seq_length = 2048  # Choose any! We auto support RoPE Scaling internally!
dtype = None  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True  # Use 4bit quantization to reduce memory usage. Can be False.

# Reload model and tokenizer from a saved training checkpoint; this rebinds the
# `model` and `tokenizer` names used by the training cells.
# NOTE(review): the trainer writes to output_dir="../test/outputs" with max_steps=5,
# whereas this path is "outputs/checkpoint-8684" — confirm the checkpoint location
# and that it comes from the intended training run.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="outputs/checkpoint-8684",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

FastLanguageModel.for_inference(model)  # Enable native 2x faster inference

In [None]:
import pandas as pd

# Read the reference workbook and keep its "account_name" column as a plain list.
df_account_name_1271 = pd.read_excel("../dataset/accounting_1271.xlsx")
account_name_list = df_account_name_1271.loc[:, "account_name"].to_list()

In [None]:
# sorted(account_name_list)

In [None]:
item_name = "Rüstzeit"

# Build the user turn with exactly the wording used for the training examples in
# create_conversation. Previously only the bare '"item_name": ...' line was sent,
# so the inference prompt differed from what the model was fine-tuned on.
user_input = (
    'Please provide the "account_name" for the following "item_name" in JSON format:\n\n'
    f'"item_name": "{item_name}"'
)

# Disabled experiment: restrict the answer to an explicit list of account names.
# account_name_input = '|'.join([f'"{account_name}"' for account_name in account_name_list])
#
# system_msg = f"""
# Given the following list of account names, please provide the correct account name for the following item name in JSON format. Output the result in JSON format without additional text or explanations. Choose the correct account name from the list below:
#
# ** Account Names: **
# {account_name_input}
#
# """

messages = [
    # {"role": "system", "content": system_msg},
    {"role": "user", "content": user_input},
]

# Tokenize the chat with the generation prompt appended and move the ids to the GPU.
inputs = tokenizer.apply_chat_template(
    messages,
    tokenize=True,
    add_generation_prompt=True,  # Must add for generation
    return_tensors="pt",
).to("cuda")

# NOTE(review): temperature=1.5 with min_p=0.1 are sampling settings; for a
# deterministic item -> account mapping, consider greedy decoding instead — confirm.
outputs = model.generate(input_ids=inputs, max_new_tokens=80, use_cache=True, temperature=1.5, min_p=0.1)

# The decoded text contains the prompt followed by the generated answer.
tokenizer.batch_decode(outputs)

In [None]:
# Write the model together with the tokenizer to the "merged_model" directory,
# using the "merged_16bit" save method.
model.save_pretrained_merged(
    "merged_model",
    tokenizer,
    save_method="merged_16bit",
)