### Distilled R1 LoRA Finetune for Discriminator

Imports

In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch  # Required for tensor operations and GPU usage
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

import copy
import logging
from dataclasses import dataclass, field
from typing import Dict, Optional, Sequence

import json
import numpy as np
import torch
import transformers

from utils.constants import TEMPLATE
from utils.train_utils import jload
from torch.utils.data import Dataset
from transformers import Trainer,TrainingArguments

import os

Define parameters

In [None]:
# Hugging Face hub ids of the candidate base models for the evaluator.
evaluator_names = [
    "TinyLlama/TinyLlama-1.1B-Chat-v1.0",
    "deepseek-ai/DeepSeek-R1-Distill-Qwen-1.5B",
    "stabilityai/stable-code-3b",
    "deepseek-ai/deepseek-coder-1.3b-base",
]

# Save name for each LoRA adapter: the repo part of the hub id (after the
# organisation prefix) followed by "_spider". Index-aligned with evaluator_names.
lora_model_names = [hub_id.split("/")[1] + "_spider" for hub_id in evaluator_names]

In [None]:
# --- Settings to edit by hand -------------------------------------------------
model_max_length = 300  # maximum sequence length passed to the tokenizer
data_path = "./data/spider_evaluator_train_cls_exec.json"  # training data file
model_indx = 2  # position in `evaluator_names` of the base model to fine-tune

# --- Values derived automatically from the settings above ----------------------
model_name = evaluator_names[model_indx]
model_savename = lora_model_names[model_indx]
print(f"evaluator_name: {model_name}")
print(f"model_savename: {model_savename}")

# Checkpoints go under <cwd>/checkpts/<model_savename>; the final adapter is
# written to the "model" sub-directory of that folder.
current_directory = os.getcwd()
train_savedatapath = os.path.join(current_directory, f"checkpts/{model_savename}")
model_savedatapath = os.path.join(current_directory, f"checkpts/{model_savename}/model")

Load quantized model

In [3]:
# bitsandbytes settings: load the weights in 8-bit, with the int8 threshold set
# to 6.0 and fp32 CPU offload enabled.
bnb_config = BitsAndBytesConfig(
    load_in_8bit=True,
    llm_int8_threshold=6.0,
    llm_int8_enable_fp32_cpu_offload=True,
)

# Load the selected base model with the quantization config, then prepare it
# for k-bit training (equivalent to the older prepare_model_for_int8_training).
model = prepare_model_for_kbit_training(
    AutoModelForCausalLM.from_pretrained(
        model_name,
        quantization_config=bnb_config,
        torch_dtype=torch.float16,  # dtype is set explicitly to float16
        device_map="auto",
    )
)


Load Tokenizer

In [4]:
# Options shared by both loading attempts below.
tokenizer_kwargs = dict(model_max_length=model_max_length, padding_side="right")

# First try the slow (non-fast) tokenizer; if that fails for this checkpoint,
# retry with the library's default choice. Only ordinary exceptions are caught,
# so Ctrl-C / kernel interrupts are not swallowed, and the reason for the
# fallback is logged instead of being silently discarded.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=False, **tokenizer_kwargs)
except Exception as slow_tokenizer_error:
    logging.warning(
        "Could not load tokenizer for %s with use_fast=False (%s); retrying with defaults.",
        model_name,
        slow_tokenizer_error,
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name, **tokenizer_kwargs)


# Setting `pad_token` to `eos_token` for open-end generation.
# Consequence: pad_token_id == eos_token_id from here on, so padding cannot be
# told apart from a real EOS by token id alone.
tokenizer.pad_token = tokenizer.eos_token

Demo prompting

----

In [5]:
# Define a prompt
prompt = """
Answer the following Yes/No question: Is the SQL correct given the utterance?

-- Utterance: How many different countries are all the swimmers from?
-- SQL:
SELECT COUNT(DISTINCT nationality) FROM swimmer;
-- Answer:
"""

# Tokenize the prompt and move the tensors to the device the model lives on,
# rather than assuming "cuda": the model was loaded with device_map="auto".
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Alternative for chat-tuned checkpoints (kept for reference, not executed):
#   messages = [{"role": "user", "content": prompt}]
#   text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
#   inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Cast any non-integer tensors to float16 and leave integer tensors (such as
# 'input_ids') untouched. The result is a plain dict keyed like the encoding.
inputs = {
    key: value if value.dtype in (torch.long, torch.int) else value.to(torch.float16)
    for key, value in inputs.items()
}

# Sample five continuations; gradients are not needed for inference.
with torch.no_grad():
    output = model.generate(**inputs, max_length=300, do_sample=True, temperature=0.6, num_return_sequences=5)

print(output.shape)  # Output shape: (batch_size, sequence_length)

  attn_output = torch.nn.functional.scaled_dot_product_attention(


torch.Size([5, 109])


In [6]:
# Decode the first sampled sequence (prompt + continuation).
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

# Show only the newly generated part. The prompt is removed by token count
# instead of by len(prompt) characters, because decoding is not guaranteed to
# reproduce the prompt string character-for-character (special tokens are
# skipped and whitespace may be normalised), which would shift a character slice.
prompt_token_count = inputs["input_ids"].shape[1]
completion_text = tokenizer.decode(output[0][prompt_token_count:], skip_special_tokens=True)
print(completion_text)  # Print only the generated text

Yes, the SQL is correct given the utterance. The COUNT(DISTINCT nationality) FROM swimmer is a proper SELECT statement that counts the number of distinct values in the nationality column of the swimmer table.


----

Define LoRA

In [7]:
# Names of the sub-modules that receive LoRA adapters.
target_modules = ["q_proj", "v_proj"]

# LoRA hyper-parameters: rank 8, alpha 16, dropout 0.05, no bias training,
# configured for a causal language-modelling task.
config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    r=8,
    lora_alpha=16,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapters and report how many parameters train.
model = get_peft_model(model, config)
model.print_trainable_parameters()

trainable params: 1,126,400 || all params: 1,101,174,784 || trainable%: 0.1023


Load supervised training data

In [8]:
# Label value used for positions that must not contribute to the loss
# (-100 is the default `ignore_index` of PyTorch's cross-entropy loss).
IGNORE_INDEX = -100


def preprocess(
    sources: Sequence[str],
    targets: Sequence[str],
    tokenizer: transformers.PreTrainedTokenizer,
    max_source_length: int = 1600 - 300,
    max_target_length: int = 300,
) -> Dict:
    """Tokenize prompt/answer pairs into causal-LM training examples.

    Each example is the source tokens followed by the target tokens. The labels
    mirror the input ids, except that every source position is replaced by
    IGNORE_INDEX so that only the target tokens are supervised.

    Args:
        sources: Prompt strings, one per example.
        targets: Answer strings, index-aligned with `sources`.
        tokenizer: Tokenizer used for both sides; called without adding
            special tokens.
        max_source_length: Truncation length for each source. Defaults to the
            previously hard-coded 1600 - 300.
        max_target_length: Truncation length for each target. Defaults to the
            previously hard-coded 300.

    Returns:
        A dict with `input_ids` and `labels`, each a list of 1-D tensors of
        matching length (one pair per example, unpadded).

    Raises:
        ValueError: If `sources` and `targets` differ in length. Previously the
            extra items were silently dropped by `zip`.
    """
    if len(sources) != len(targets):
        raise ValueError(
            f"sources and targets must have the same length, got {len(sources)} and {len(targets)}"
        )

    # Tokenize each side separately so the source/target boundary is known.
    source_token_lists = tokenizer(
        sources,
        max_length=max_source_length,
        truncation=True,
        add_special_tokens=False,
    )["input_ids"]
    target_token_lists = tokenizer(
        targets,
        max_length=max_target_length,
        truncation=True,
        add_special_tokens=False,
    )["input_ids"]

    # Build the input and labels for causal LM
    input_ids = []
    labels = []
    for source_tokens, target_tokens in zip(source_token_lists, target_token_lists):
        input_ids.append(torch.tensor(source_tokens + target_tokens))
        # List concatenation already produces a fresh list, so no copy is needed.
        labels.append(torch.tensor([IGNORE_INDEX] * len(source_tokens) + target_tokens))

    return dict(input_ids=input_ids, labels=labels)

class SupervisedDataset(Dataset):
    """In-memory dataset of tokenized (prompt, answer) pairs for fine-tuning.

    The file at `data_path` is read with `jload`; each record must provide a
    "src" (prompt) and a "tgt" (answer) entry. The tokenizer's EOS token is
    appended to every answer before tokenization.
    """

    def __init__(self, data_path: str, tokenizer: transformers.PreTrainedTokenizer):
        super().__init__()

        logging.warning("Loading data...")
        records = jload(data_path)

        logging.warning("Formatting inputs...")
        sources = [record["src"] for record in records]
        targets = []
        for record in records:
            # Terminate each answer with EOS so it becomes part of the target.
            targets.append(f"{record['tgt']}{tokenizer.eos_token}")

        logging.warning("Tokenizing inputs... This may take some time...")
        tokenized = preprocess(sources, targets, tokenizer)

        # Parallel lists: one tensor of input ids and one of labels per example.
        self.input_ids = tokenized["input_ids"]
        self.labels = tokenized["labels"]

    def __len__(self):
        """Number of examples."""
        return len(self.input_ids)

    def __getitem__(self, i) -> Dict[str, torch.Tensor]:
        """Return example `i` as a dict with `input_ids` and `labels`."""
        return {"input_ids": self.input_ids[i], "labels": self.labels[i]}


@dataclass
class DataCollatorForSupervisedDataset(object):
    """Collate examples for supervised fine-tuning.

    Right-pads `input_ids` with the tokenizer's pad token id and `labels` with
    IGNORE_INDEX, and returns a boolean attention mask marking the real
    (non-padding) positions of each sequence.
    """

    # Only `pad_token_id` is read from the tokenizer.
    tokenizer: transformers.PreTrainedTokenizer

    def __call__(self, instances: Sequence[Dict]) -> Dict[str, torch.Tensor]:
        """Pad a list of examples into batch tensors.

        Args:
            instances: Examples, each a dict with 1-D `input_ids` and `labels`
                tensors.

        Returns:
            Dict with `input_ids`, `labels` and `attention_mask`, each of shape
            (batch, longest sequence in the batch).
        """
        input_ids = [instance["input_ids"] for instance in instances]
        labels = [instance["labels"] for instance in instances]

        # Record the true lengths before padding.
        sequence_lengths = [len(ids) for ids in input_ids]

        input_ids = torch.nn.utils.rnn.pad_sequence(
            input_ids, batch_first=True, padding_value=self.tokenizer.pad_token_id
        )
        labels = torch.nn.utils.rnn.pad_sequence(labels, batch_first=True, padding_value=IGNORE_INDEX)

        # Build the mask from the lengths rather than from
        # `input_ids != pad_token_id`: when the pad token is the same as the
        # EOS token, a token-id comparison would also mask out every genuine
        # EOS token inside the sequences.
        positions = torch.arange(input_ids.size(1), device=input_ids.device)
        lengths = torch.tensor(sequence_lengths, device=input_ids.device)
        attention_mask = positions.unsqueeze(0) < lengths.unsqueeze(1)

        return dict(
            input_ids=input_ids,
            labels=labels,
            attention_mask=attention_mask,
        )


def make_supervised_data_module(tokenizer: transformers.PreTrainedTokenizer, data_path, dev_data_path=None) -> Dict:
    """Assemble the datasets and collator used for supervised fine-tuning.

    Args:
        tokenizer: Tokenizer shared by the datasets and the collator.
        data_path: Path of the training data file.
        dev_data_path: Optional path of a development data file. When it is
            falsy no evaluation dataset is built.

    Returns:
        Dict with keys `train_dataset`, `eval_dataset` (None without a dev
        path) and `data_collator`.
    """
    train_dataset = SupervisedDataset(tokenizer=tokenizer, data_path=data_path)
    dev_dataset = (
        SupervisedDataset(tokenizer=tokenizer, data_path=dev_data_path)
        if dev_data_path
        else None
    )
    return {
        "train_dataset": train_dataset,
        "eval_dataset": dev_dataset,
        "data_collator": DataCollatorForSupervisedDataset(tokenizer=tokenizer),
    }



In [9]:
# Build the training dataset and collator from the configured data file.
# No dev path is given, so the module's eval_dataset entry is None.
data_module = make_supervised_data_module(
    tokenizer=tokenizer,
    data_path=data_path,
)



Train

In [10]:
training_args = TrainingArguments(
    # Output / checkpointing
    output_dir=train_savedatapath,
    overwrite_output_dir=True,
    save_strategy='steps',
    save_steps=1000,
    save_total_limit=1,
    push_to_hub=False,
    # Schedule: 2 sequences per device x 32 accumulation steps per update
    num_train_epochs=1,
    per_device_train_batch_size=2,  # 1 in code, 128 in paper
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=32,
    # Optimizer and learning-rate schedule
    optim="adamw_torch",
    learning_rate=1e-5,
    warmup_ratio=0.03,
    lr_scheduler_type='cosine',
    # Training only; evaluation is switched off
    do_train=True,
    do_eval=False,
    eval_strategy='no',
    # Precision
    # NOTE(review): the base model is loaded with torch_dtype=torch.float16
    # while training runs in bf16 - confirm this combination is intended.
    bf16=True,  # Enables bfloat16 precision (for NVIDIA Ampere+ GPUs)
    tf32=True,  # Enables TF32 mode (for NVIDIA Ampere+ GPUs)
    # Disabled options, kept for reference:
    #   load_best_model_at_end=True,
    #   metric_for_best_model='loss',
    #   greater_is_better=False
)

# Ensure TF32 is enabled globally for better performance
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True

In [11]:
# Turn off the generation cache on the model config before training.
model.config.use_cache = False

# NOTE(review): the saved output of this cell echoes the Trainer(...) line,
# which is how Python displays a warning's source line - presumably a
# deprecation warning for one of these arguments; confirm against the
# installed transformers version.
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    **data_module,
)

# Train, persist the trainer state, then write the model to its own folder.
trainer.train()
trainer.save_state()
model.save_pretrained(model_savedatapath)

  trainer = Trainer(model=model, tokenizer=tokenizer,args=training_args, **data_module)


Step,Training Loss
500,0.5293
