# Train the Model !

In [23]:
from dataclasses import dataclass, field
from typing import Optional

import torch

from transformers import AutoTokenizer, HfArgumentParser, AutoModelForCausalLM, BitsAndBytesConfig
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from trl import SFTTrainer, SFTConfig, setup_chat_format
from accelerate import Accelerator

In [24]:
@dataclass
class ScriptArguments:
    """
    Arguments for the fine_tuning
    """
    base_model = "Qwen/Qwen2.5-3B-Instruct"
    fine_tuned_model = "Qwen2.5-3B-Instruct-software-model_completion_fine_tuned"
    merged_model = "Qwen/Qwen2.5-3B-Instruct-software-model_completion"
    dataset_name = "/home/ubuntu/dataset/one_elem_processed_4000"
    per_device_train_batch_size: Optional[int] = field(default=1)
    per_device_eval_batch_size: Optional[int] = field(default=1)
    gradient_accumulation_steps: Optional[int] = field(default=4)
    evaluation_strategy: Optional[str] = field(default="steps")
    evaluation_accumulation_steps: Optional[int] = field(default=5)
    learning_rate: Optional[float] = field(default=2e-4)
    max_grad_norm: Optional[float] = field(default=0.3)
    weight_decay: Optional[int] = field(default=0.001)
    lora_alpha= 64,
    lora_dropout =  0.5,
    lora_r = 32
    max_seq_length: Optional[int] = field(default=4100)
    fp16 = True
    bf16 = False
    gradient_checkpointing: Optional[bool] = field(
        default=True,
        metadata={"help": "Enables gradient checkpointing."},
    )
    use_flash_attention_2: Optional[bool] = field(
        default=False,
        metadata={"help": "Enables Flash Attention 2."},
    )
    optim: Optional[str] = field(
        default="paged_adamw_32bit",
        metadata={"help": "The optimizer to use."},
    )
    lr_scheduler_type: str = field(
        default="constant",
        metadata={"help": "Learning rate schedule. Constant a bit better than cosine, and has advantage for analysis"},
    )
    max_steps: int = field(default=100, metadata={"help": "How many optimizer update steps to take"}),
    epochs : int = field(default=1, metadata={"help": "How many epochs to train for"})
    warmup_ratio: float = field(default=0.03, metadata={"help": "Fraction of steps to do a warmup for"})
    save_steps: int = field(default=87, metadata={"help": "Save checkpoint every X updates steps."})
    logging_steps: int = field(default=87, metadata={"help": "Log every X updates steps."})
    output_dir: str = field(
        default="./qwen3b_instruct/results",
        metadata={"help": "The output directory where the model predictions and checkpoints will be written."},
    )
    logging_dir: str = field(
        default="./qwen3b_instruct/logs",
        metadata={"help": "The output directory where the logs will be written."},
    )
    eval_steps: int = field(default=87, metadata={"help": "How often to evaluate the model"})

parser = HfArgumentParser(ScriptArguments)
# Parse the arguments, ignoring unrecognized ones
script_args, remaining_args = parser.parse_args_into_dataclasses(return_remaining_strings=True)

In [25]:
access_token = "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"

In [26]:
#from huggingface_hub import login
#login()

In [27]:
# the quantization config
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)

In [28]:
# Load the model
model = AutoModelForCausalLM.from_pretrained(
    script_args.base_model,
    quantization_config=quantization_config,
    device_map ="auto",
    attn_implementation="eager"
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(script_args.base_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
import bitsandbytes as bnb

def find_all_linear_names(model):
    cls = bnb.nn.Linear4bit
    lora_module_names = set()
    for name, module in model.named_modules():
        if isinstance(module, cls):
            names = name.split('.')
            lora_module_names.add(names[0] if len(names) == 1 else names[-1])
    if 'lm_head' in lora_module_names:  # needed for 16 bit
        lora_module_names.remove('lm_head')
    return list(lora_module_names)

modules = find_all_linear_names(model)
print(modules)

['v_proj', 'down_proj', 'k_proj', 'up_proj', 'q_proj', 'gate_proj', 'o_proj']


In [None]:
#Lora config
lora_config = LoraConfig(
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM",
    target_modules=modules
)

In [None]:
# Prepare the model for kbit training
#model, tokenizer = setup_chat_format(model, tokenizer)

model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# load the dataset
abs_path = script_args.dataset_name
train_dataset_url = f"{abs_path}/train.jsonl"
test_dataset_url = f"{abs_path}/test.jsonl"
validation_dataset_url = f"{abs_path}/validation.jsonl"

data_files = {
    'train': train_dataset_url,
    'test': test_dataset_url,
    'validation': validation_dataset_url
}

dataset = load_dataset('json', data_files=data_files)

train_dataset = dataset['train']
test_dataset = dataset['test']
validation_dataset = dataset['validation']

In [None]:
# transform the data
instruction = "You are an AI assistant that specializes in UML model completion. Given an incomplete UML model represented in JSON format, output the missing portions of the model in JSON format."

def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": f'Here is the incomplete UML model:\n{row["input"]}'},
        {"role": "assistant", "content": row["output"]}
    ]
    row["text"] = tokenizer.apply_chat_template(row_json, tokenize=False)
    return row

trained_data = train_dataset.map(format_chat_template)
validation_data = validation_dataset.map(format_chat_template)
test_data = test_dataset.map(format_chat_template)

In [None]:
sft_config = SFTConfig(
    output_dir=script_args.output_dir,
    per_device_train_batch_size=script_args.per_device_train_batch_size,
    per_device_eval_batch_size=script_args.per_device_eval_batch_size,
    gradient_accumulation_steps=script_args.gradient_accumulation_steps,
    save_steps=script_args.save_steps,
    logging_steps=script_args.logging_steps,
    optim=script_args.optim,
    num_train_epochs=script_args.epochs,
    lr_scheduler_type=script_args.lr_scheduler_type,
    gradient_checkpointing=script_args.gradient_checkpointing,
    eval_strategy=script_args.evaluation_strategy,
    eval_steps=script_args.eval_steps,
    eval_accumulation_steps=script_args.evaluation_accumulation_steps,
    logging_dir=script_args.logging_dir,
    warmup_ratio=script_args.warmup_ratio,
    logging_strategy="steps",
    learning_rate=script_args.learning_rate,
    max_seq_length= script_args.max_seq_length,
    fp16=script_args.fp16,
    bf16=script_args.bf16,

)

In [None]:
#train
tokenizer.pad_token = tokenizer.eos_token
trainer = SFTTrainer(
    model=model,
    train_dataset=trained_data,
    eval_dataset=validation_data,
    tokenizer=tokenizer,
    args=sft_config,
    peft_config=lora_config,
    max_seq_length=script_args.max_seq_length,
    dataset_text_field="text"
    #compute_metrics=compute_metrics,
    #preprocess_logits_for_metrics=preprocess_logits_for_metrics
)


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


In [None]:
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Step,Training Loss,Validation Loss
87,0.1274,0.082275


  return fn(*args, **kwargs)


TrainOutput(global_step=145, training_loss=0.10631517870672817, metrics={'train_runtime': 2587.5888, 'train_samples_per_second': 0.224, 'train_steps_per_second': 0.056, 'total_flos': 2.6866241010180096e+16, 'train_loss': 0.10631517870672817, 'epoch': 1.0})

# Save the Model 

In [None]:
trainer.model.save_pretrained(script_args.fine_tuned_model)

In [None]:
# Reload tokenizer and model
tokenizer = AutoTokenizer.from_pretrained(script_args.base_model)

base_model_reload = AutoModelForCausalLM.from_pretrained(
    script_args.base_model,
    low_cpu_mem_usage=True,
    return_dict=True,
    torch_dtype=torch.float16,
    device_map="cpu",
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from peft import PeftModel

#base_model_reload, tokenizer = setup_chat_format(base_model_reload, tokenizer)
model = PeftModel.from_pretrained(base_model_reload, script_args.fine_tuned_model)

model = model.merge_and_unload()

In [None]:
model.save_pretrained(script_args.merged_model)
tokenizer.save_pretrained(script_args.merged_model)

('Qwen/Qwen2.5-3B-Instruct-software-model_completion/tokenizer_config.json',
 'Qwen/Qwen2.5-3B-Instruct-software-model_completion/special_tokens_map.json',
 'Qwen/Qwen2.5-3B-Instruct-software-model_completion/vocab.json',
 'Qwen/Qwen2.5-3B-Instruct-software-model_completion/merges.txt',
 'Qwen/Qwen2.5-3B-Instruct-software-model_completion/added_tokens.json',
 'Qwen/Qwen2.5-3B-Instruct-software-model_completion/tokenizer.json')

# Evaluate the model

In [29]:
from torch.utils.data import DataLoader
import evaluate

In [30]:
checkpoint = "/home/ubuntu/fine-tuning/Qwen/Qwen2.5-3B-Instruct-software-model_completion"

In [31]:
quantization_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
)
model = AutoModelForCausalLM.from_pretrained(
    checkpoint, device_map="auto", quantization_config=quantization_config
)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [33]:
def compute_length(row):
    # Tokenize the "output" text
    tokens_output = tokenizer(row['output'], truncation=False)  # En
    tokens_input= tokenizer(row['input'], truncation=False) 
    
    # Compute the length (number of tokens)
    return {
        'output_length': len(tokens_output['input_ids']),
        'input_length': len(tokens_input['input_ids'])
    }

In [34]:
test_dataset = test_dataset.map(compute_length)
max_length = test_dataset['output_length']
max_length_output = max(max_length)
max_length = test_dataset['input_length']
max_length_input = max(max_length)
print(max_length_input)
print(max_length_output)

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

3709
183


In [None]:
def format_chat_template(row):
    row_json = [
        {"role": "system", "content": instruction},
        {"role": "user", "content": f'Here is the incomplete UML model:\n{row["input"]}'}
    ]
    tokenized_input  = tokenizer.apply_chat_template(row_json, tokenize=True, padding=True, padding_side="right", add_generation_prompt=True ,max_length=4100)
    #print(len(tokenized_input))
    tokenized_output = tokenizer(row['output'], truncation=False)
    #print(len(tokenized_output['input_ids']))
    
    row['input_ids'] = tokenized_input
    row['labels'] = tokenized_output['input_ids']
    return row

In [36]:
# Apply the function to the dataset
test_dataset = test_dataset.map(format_chat_template)
test_dataset = test_dataset.remove_columns(["input", "output","output_length","input_length"])
test_dataset.set_format(type='torch', columns=['input_ids','labels'])

Map:   0%|          | 0/49 [00:00<?, ? examples/s]

2815
3461
3436
3085
1546
3221
3105
2393
2829
3101
1652
3495
2930
1701
2273
2157
1973
2953
2868
1876
3088
2536
1972
2836
2738
3565
2188
2307
3123
1829
3603
3101
3765
3414
3101
1712
1472
1810
1510
3492
1547
3219
3169
1109
2769
1771
2842
2887
3388




In [37]:
#print(encode)
def align_predictions_labels(generated_ids, labels, pad_token_id, max_length):
    """
    Aligns the lengths of predictions and labels by padding or truncating.

    Args:
        generated_ids (Tensor): Tensor of shape (batch_size, pred_seq_length)
        labels (Tensor): Tensor of shape (batch_size, label_seq_length)
        pad_token_id (int): The token ID used for padding.

    Returns:
        Tuple[Tensor, Tensor]: Aligned predictions and labels.
    """
    global true_predictions
    
    generated_text = tokenizer.decode(generated_ids[0], skip_special_tokens=False)
    predicted_text = generated_text.split("assistant\n")[1]
    predicted_tokens = tokenizer(predicted_text, return_tensors='pt', padding=True, truncation=True, max_length=max_length)['input_ids']
    
    batch_size, pred_len = predicted_tokens.size()
    _, label_len = labels.size()
    print("prediction length: ")
    print(pred_len)
    print("label length: ")
    print(label_len)

    if pred_len < label_len:
        # Pad predictions
        padding = torch.full((batch_size, label_len - pred_len), pad_token_id, dtype=predicted_tokens.dtype).to( predicted_tokens.device)
        true_predictions = torch.cat([predicted_tokens, padding], dim=1)
    elif pred_len > label_len:
        # Truncate predictions
        true_predictions =  predicted_tokens[:, :label_len]
    else:
        true_predictions = predicted_tokens
        
    
    batch_size, pred_len = true_predictions.size()
    print("true prediction length: ")
    print(pred_len)
    if  pred_len!= label_len:
        raise ValueError("The size of the predictions and labels should match.")

    return true_predictions, labels

In [38]:
eval_dataloader = DataLoader(test_dataset, batch_size=1)

In [40]:
device = 'cuda'
metric = evaluate.load("accuracy")
model.eval()
batch1 = next(iter(eval_dataloader))
for batch in eval_dataloader:
    input_ids = batch['input_ids'].to(device)
    labels = batch['labels'].to(device)
    with torch.no_grad():
         # Generate outputs
        generated_ids = model.generate(
            input_ids=input_ids,
            max_new_tokens=max_length_output,      # Ensure generation does not exceed allocated length
            pad_token_id=tokenizer.pad_token_id,
            do_sample=False                        # Use deterministic generation; set to True for diversity
        ) 
    aligned_preds, aligned_labels = align_predictions_labels(generated_ids, labels, tokenizer.pad_token_id, 4100)
    
    # Convert tensors to lists
    preds = aligned_preds.cpu().tolist()
    refs = aligned_labels.cpu().tolist()
    
    
    # Flatten the lists for overall token-level accuracy
    preds_flat = [token for seq in preds for token in seq]
    refs_flat = [token for seq in refs for token in seq]
    
    metric.add_batch(predictions=preds_flat, references=refs_flat)

metric.compute() 

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


prediction length: 
64
label length: 
45
true prediction length: 
45
prediction length: 
66
label length: 
47
true prediction length: 
47
prediction length: 
91
label length: 
45
true prediction length: 
45
prediction length: 
65
label length: 
51
true prediction length: 
51
prediction length: 
89
label length: 
103
true prediction length: 
103
prediction length: 
141
label length: 
45
true prediction length: 
45
prediction length: 
93
label length: 
165
true prediction length: 
165
prediction length: 
68
label length: 
70
true prediction length: 
70
prediction length: 
67
label length: 
45
true prediction length: 
45
prediction length: 
65
label length: 
165
true prediction length: 
165
prediction length: 
99
label length: 
98
true prediction length: 
98
prediction length: 
51
label length: 
53
true prediction length: 
53
prediction length: 
68
label length: 
183
true prediction length: 
183
prediction length: 
61
label length: 
101
true prediction length: 
101
prediction length: 
106

{'accuracy': 0.2488849241748439}

### Inference

In [None]:
test_data = dataset["test"]

messages = [{"role": "system", "content": instruction},
            {"role": "user", "content": "I bought the same item twice, cancel order {{Order Number}}"}]

prompt = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)

inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=150, num_return_sequences=1)

text = tokenizer.decode(outputs[0], skip_special_tokens=True)

print(text.split("assistant")[1])