In [2]:
%%capture
!pip install unsloth
# Also get the latest nightly Unsloth!
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir --no-deps git+https://github.com/unslothai/unsloth.git
!pip install pip3-autoremove
!pip-autoremove torch torchvision torchaudio -y
!pip install torch torchvision torchaudio xformers --index-url https://download.pytorch.org/whl/cu121


In [4]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Sets the maximum sequence length for input data.
dtype = None          # Specifies the data type for model weights. Setting to None enables automatic detection based on the hardware. Alternatives include float16 for GPUs like Tesla T4 or V100+
load_in_4bit = True  
# Enables 4-bit quantization to reduce memory usage, facilitating the loading of larger models without running out of memory (OOM). Setting to False would load the model in higher precision but with higher memory consumption.
#unsloth/Qwen2.5-Coder-0.5B
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Qwen2-1.5B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)


==((====))==  Unsloth 2025.1.6: Fast Qwen2 patching. Transformers: 4.48.1.
   \\   /|    GPU: Tesla P100-PCIE-16GB. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 6.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [5]:
# configures the model for Parameter-Efficient Fine-Tuning (PEFT) using LoRA (Low-Rank Adaptation) via Unsloth's get_peft_model method
# new W=W+ (a*b)
model = FastLanguageModel.get_peft_model(
    model,
    r = 64,  # The rank of the LoRA matrices. Higher values allow more capacity but consume more memory  ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj","down_proj"],  # Specifies which modules within the model to apply LoRA
    lora_alpha = 32, # scaling factor for LoRA. Balances the contribution of the LoRA layers to the model's output
    lora_dropout = 0.1,  
    bias = "none",
    use_gradient_checkpointing = "unsloth", # Enables gradient checkpointing to save memory during training. The
    random_state = 42,
    use_rslora = False,
    loftq_config = None,
)
     


Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.1.6 patched 28 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [6]:
import json

# Load the dataset
dataset_path = "/kaggle/input/datasetgenerated/dataset.json"

with open(dataset_path, "r") as f:
    dataset = json.load(f)

# Print a sample entry
print(json.dumps(dataset[0], indent=2))


{
  "input": "Find ID 3",
  "expected output": {
    "type": "bracket",
    "operationCode": "&&",
    "conditions": [
      {
        "type": "condition",
        "operationCode": "=",
        "operand": "ID",
        "value": 3
      }
    ]
  }
}


In [7]:
from datasets import Dataset

# Convert dataset into Alpaca-style format
formatted_data = [
    {
        "instruction": entry["input"],
        "input": "",  # Keep empty as instruction is self-contained
        "output": json.dumps(entry["expected output"], indent=2)
    }
    for entry in dataset
]

# Convert to Hugging Face Dataset format
hf_dataset = Dataset.from_list(formatted_data)

# Split into train and validation sets
hf_dataset = hf_dataset.train_test_split(test_size=0.1)

# Show sample formatted data
print(hf_dataset["train"][0])


{'instruction': 'Find books published before 2015', 'input': '', 'output': '{\n  "type": "bracket",\n  "operationCode": "&&",\n  "conditions": [\n    {\n      "type": "condition",\n      "operationCode": "<",\n      "operand": "publication year",\n      "value": 2015\n    }\n  ]\n}'}


In [8]:
from unsloth import to_sharegpt

dataset = to_sharegpt(
    hf_dataset["train"],
    merged_prompt="{instruction}",
    output_column_name="output",
    conversation_extension=3  # Extend conversations for longer contexts
)

print(dataset[0]["conversations"])


Merging columns:   0%|          | 0/152 [00:00<?, ? examples/s]

Converting to ShareGPT:   0%|          | 0/152 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/152 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/152 [00:00<?, ? examples/s]

Flattening the indices:   0%|          | 0/152 [00:00<?, ? examples/s]

Extending conversations:   0%|          | 0/152 [00:00<?, ? examples/s]

[{'from': 'human', 'value': "('Find books published before 2015',)"}, {'from': 'gpt', 'value': '{\n  "type": "bracket",\n  "operationCode": "&&",\n  "conditions": [\n    {\n      "type": "condition",\n      "operationCode": "<",\n      "operand": "publication year",\n      "value": 2015\n    }\n  ]\n}'}, {'from': 'human', 'value': "('Find experiments where (duration less than 48 and protocol type in Standard, Custom) and ((instrument calibration is true and reagent check is true) or supervisor approval not empty)',)"}, {'from': 'gpt', 'value': '{\n  "type": "bracket",\n  "operationCode": "&&",\n  "conditions": [\n    {\n      "type": "bracket",\n      "operationCode": "&&",\n      "conditions": [\n        {\n          "type": "condition",\n          "operationCode": "<",\n          "operand": "Duration",\n          "value": 48\n        },\n        {\n          "type": "condition",\n          "operationCode": "in",\n          "operand": "Protocol Type",\n          "value": [\n          

In [9]:
from unsloth import standardize_sharegpt
dataset = standardize_sharegpt(dataset)


Standardizing format:   0%|          | 0/152 [00:00<?, ? examples/s]

In [10]:
chat_template = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{INPUT}

### Response:
{OUTPUT}"""

from unsloth import apply_chat_template
dataset = apply_chat_template(
    dataset,
    tokenizer=tokenizer,
    chat_template=chat_template
)


Unsloth: We automatically added an EOS token to stop endless generations.


Map:   0%|          | 0/152 [00:00<?, ? examples/s]

In [11]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

training_args = TrainingArguments(
    per_device_train_batch_size=2,  # Adjust based on memory
    gradient_accumulation_steps=4,  # Reduces GPU memory load
    warmup_steps=5,  # Gradual learning rate warm-up
    max_steps=50,  # Number of total training steps
    learning_rate=2e-4,
    fp16=not is_bfloat16_supported(),  # Use FP16 unless BF16 is supported
    bf16=is_bfloat16_supported(),
    logging_steps=1,
    optim="adamw_8bit",  # Optimizer for low-memory setups
    weight_decay=0.01,
    lr_scheduler_type="linear",
    seed=3407,
    output_dir="outputs",
    report_to="none",  # Disable tracking (can use "wandb" for logging)
)


In [12]:
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=2048,
    dataset_num_proc=2,
    packing=False,
    args=training_args
)

trainer_stats = trainer.train()


Map (num_proc=2):   0%|          | 0/152 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 152 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 2 | Gradient Accumulation steps = 4
\        /    Total batch size = 8 | Total steps = 50
 "-____-"     Number of trainable parameters = 36,241,408


Step,Training Loss
1,1.0028
2,1.0525
3,1.0956
4,1.1105
5,1.1039
6,0.9071
7,0.9301
8,0.7784
9,0.6924
10,0.6476


In [15]:
model.save_pretrained("./fine-tuned-qwen2.5-coder")
tokenizer.save_pretrained("./fine-tuned-qwen2.5-coder")


('./fine-tuned-qwen2.5-coder/tokenizer_config.json',
 './fine-tuned-qwen2.5-coder/special_tokens_map.json',
 './fine-tuned-qwen2.5-coder/vocab.json',
 './fine-tuned-qwen2.5-coder/merges.txt',
 './fine-tuned-qwen2.5-coder/added_tokens.json',
 './fine-tuned-qwen2.5-coder/tokenizer.json')

In [16]:
from unsloth import FastLanguageModel
import torch

# Define the fine-tuned model path
fine_tuned_model_path = "./fine-tuned-qwen2.5-coder"

# Load the fine-tuned model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=fine_tuned_model_path,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,
)


==((====))==  Unsloth 2025.1.6: Fast Qwen2 patching. Transformers: 4.48.1.
   \\   /|    GPU: Tesla P100-PCIE-16GB. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 6.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [17]:
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")  # Move input to GPU if available
    outputs = model.generate(
        inputs["input_ids"],
        max_length=512,  # Limit output length
        temperature=0.1,  # Control randomness (lower = more deterministic)
        top_k=50,         # Limit sampling to top-k tokens
        top_p=0.9,        # Nucleus sampling
        do_sample=True
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [18]:
FastLanguageModel.for_inference(model)


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 1536, padding_idx=151646)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1536, out_features=1536, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=1536, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
         

In [19]:
from unsloth import FastLanguageModel
import torch

# Define the fine-tuned model path
fine_tuned_model_path = "./fine-tuned-qwen2.5-coder"

# Load the fine-tuned model and tokenizer
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=fine_tuned_model_path,
    max_seq_length=2048,
    dtype=None,
    load_in_4bit=True,  # Enables memory-efficient inference
)

# **Prepare the model for inference** (Required for Unsloth models)
FastLanguageModel.for_inference(model)


==((====))==  Unsloth 2025.1.6: Fast Qwen2 patching. Transformers: 4.48.1.
   \\   /|    GPU: Tesla P100-PCIE-16GB. Max memory: 15.888 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu121. CUDA: 6.0. CUDA Toolkit: 12.1. Triton: 3.1.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post1. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): Qwen2ForCausalLM(
      (model): Qwen2Model(
        (embed_tokens): Embedding(151936, 1536, padding_idx=151646)
        (layers): ModuleList(
          (0-27): 28 x Qwen2DecoderLayer(
            (self_attn): Qwen2Attention(
              (q_proj): lora.Linear4bit(
                (base_layer): Linear4bit(in_features=1536, out_features=1536, bias=True)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1536, out_features=64, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=64, out_features=1536, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
                (lora_magnitude_vector): ModuleDict()
              )
         

In [20]:
def generate_response(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")  # Move input to GPU if available
    
    # Ensure proper inference settings
    outputs = model.generate(
        inputs["input_ids"],
        max_new_tokens=512,  # Restrict output length
        temperature=0.1,  # Control randomness (lower = more deterministic)
        top_k=50,         # Limit sampling to top-k tokens
        top_p=0.95,        # Nucleus sampling
        do_sample=True
    )
    
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [21]:
test_query = "Validate barcode XYZ-987"
response = generate_response(test_query)
print("Model Output:", response)


The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Model Output: Validate barcode XYZ-987654321

### Instruction:

("Barcode XYZ-987654321",)

### Response:

{
  "type": "bracket",
  "operationCode": "&&",
  "conditions": [
    {
      "type": "condition",
      "operationCode": "=",
      "operand": "Barcode",
      "value": "XYZ-987654321"
    }
  ]
}


In [22]:
def generate_response(user_input):
    messages = [
        {"role": "user", "content": user_input}
    ]
    
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,  # Required for generation
        return_tensors="pt"
    ).to("cuda")

    outputs = model.generate(
        input_ids=inputs,
        max_new_tokens=512,  # Limit response length
        use_cache=True,
        temperature=0.2,  # Reduce randomness for structured output
        top_k=40,  # Filter unlikely tokens
        top_p=0.95,  # Prevent highly unlikely completions
        do_sample=False  # Ensure deterministic JSON output
    )

    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [23]:
test_query = "Validate barcode XYZ-987"
response = generate_response(test_query)
print("Model Output:", response)


Model Output: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Validate barcode XYZ-987

### Response:
{
  "type": "bracket",
  "operationCode": "&&",
  "conditions": [
    {
      "type": "condition",
      "operationCode": "=",
      "operand": "barcode",
      "value": "XYZ-987"
    }
  ]
}


In [24]:
test_query = "Find ID 3"
response = generate_response(test_query)
print("Model Output:", response)


Model Output: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Find ID 3

### Response:
{
  "type": "bracket",
  "operationCode": "&&",
  "conditions": [
    {
      "type": "condition",
      "operationCode": "=",
      "operand": "ID",
      "value": 3
    }
  ]
}


In [25]:
test_query = "Find specimens where (collection date after 2024-01-01 and storage location containing Building A) and ((processing method in Centrifugation, Filtration and volume greater than 50) or analysis notes containing urgent"
response = generate_response(test_query)
print("Model Output:", response)


Model Output: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
Find specimens where (collection date after 2024-01-01 and storage location containing Building A) and ((processing method in Centrifugation, Filtration and volume greater than 50) or analysis notes containing urgent

### Response:
{
  "type": "bracket",
  "operationCode": "&&",
  "conditions": [
    {
      "type": "condition",
      "operationCode": ">",
      "operand": "Collection Date",
      "value": "2024-01-01"
    },
    {
      "type": "condition",
      "operationCode": "contain",
      "operand": "Storage Location",
      "value": "Building A"
    },
    {
      "type": "bracket",
      "operationCode": "||",
      "conditions": [
        {
          "type": "condition",
          "operationCode": "in",
          "operand": "Processing Method",
          "value": [
            "Centrifugation",
            "Filtration"
          ]
       