In [1]:
from enum import Enum
from functools import partial
import pandas as pd
import torch
import json

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, BitsAndBytesConfig, set_seed
from datasets import load_dataset
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM
from peft import get_peft_model, IA3Config, TaskType

# Single seed value for the run; handed to transformers' `set_seed` so that
# repeated executions of the notebook start from the same RNG state.
seed = 42
set_seed(seed)

In [3]:
# Hub identifiers: the base checkpoint that gets fine-tuned and the dataset
# that is loaded with `load_dataset` in the next cell.
model_name = "Locutusque/TinyMistral-248M"
dataset_name = "wikisql"

In [4]:
# WikiSQL is distributed through a loading script. Without an explicit opt-in,
# `datasets` emits the warning seen in this cell's recorded output and states
# that the flag becomes mandatory in the next major release, so opt in here.
dataset = load_dataset(dataset_name, trust_remote_code=True)
dataset

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


Downloading builder script:   0%|          | 0.00/6.57k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.80k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/26.2M [00:00<?, ?B/s]

Generating test split:   0%|          | 0/15878 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/8421 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/56355 [00:00<?, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 15878
    })
    validation: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 8421
    })
    train: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 56355
    })
})

In [6]:
def preprocess(sample):
  """Render one WikiSQL record as a single prompt + completion string.

  Args:
    sample: mapping that provides ``sample["question"]``,
      ``sample["table"]["header"]``, ``sample["table"]["id"]`` and
      ``sample["sql"]["human_readable"]``.

  Returns:
    A dict with the single key ``"content"``: the table id, column names and
    natural-language question, followed by ``SQL Query:`` and the target SQL
    terminated with the literal ``</s>`` marker.
  """
  column_names = sample["table"]["header"]
  table_id = sample["table"]["id"]
  natural_query = sample["question"]
  # Only the placeholder in the FROM clause stands for the table name. A
  # blanket replace("table", ...) would also rewrite column names or condition
  # values that merely contain the word "table" (e.g. "Position in table"),
  # corrupting the training target. Substitute the first " FROM table" only.
  sql_query = sample["sql"]["human_readable"].replace(" FROM table", f" FROM {table_id}", 1)
  content = f"Table: {table_id}\n Columns: {column_names}\n Natural Query: {natural_query}\n SQL Query: {sql_query}</s>"
  return {"content": content}

In [7]:
# Turn every record of every split into a single "content" string and drop
# the raw columns (their names are taken from the train split).
dataset = dataset.map(function=preprocess, batched=False, remove_columns=dataset["train"].column_names)

Map:   0%|          | 0/15878 [00:00<?, ? examples/s]

Map:   0%|          | 0/8421 [00:00<?, ? examples/s]

Map:   0%|          | 0/56355 [00:00<?, ? examples/s]

In [8]:
# IA3 adapter configuration for causal language modelling. The key/value
# projections and the MLP down-projection are targeted; the down-projection
# is additionally declared as a feed-forward module.
peft_config = IA3Config(
    task_type=TaskType.CAUSAL_LM,
    target_modules=["k_proj", "v_proj", "down_proj"],
    feedforward_modules=["down_proj"],
)

In [9]:
# Marker that precedes the SQL answer in each training string (it is the same
# literal that `preprocess` writes in front of the query).
response_template = "SQL Query:"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# If the checkpoint ships without a padding token, fall back to token id 0.
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = 0

# Completion-only collator keyed on the response marker above.
collator = DataCollatorForCompletionOnlyLM(response_template=response_template, tokenizer=tokenizer)

# Base causal LM that will later be handed to the trainer.
model = AutoModelForCausalLM.from_pretrained(model_name)

tokenizer_config.json:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/67.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


config.json:   0%|          | 0.00/562 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/992M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]

In [10]:
# cast non-trainable params in bf16
for p in model.parameters():
  if not p.requires_grad:
    p.data = p.to(torch.float16)

In [14]:
# --- Where results are written -------------------------------------------
output_dir = "mistral_sql_instruct"

# --- Batching and sequence length ----------------------------------------
per_device_train_batch_size = 8
per_device_eval_batch_size = 8
gradient_accumulation_steps = 4
max_seq_length = 512

# --- Optimisation schedule -----------------------------------------------
num_train_epochs = 15
learning_rate = 5e-4
lr_scheduler_type = "cosine"
warmup_ratio = 0.1
max_grad_norm = 1.0

# --- Logging ---------------------------------------------------------------
logging_steps = 5

In [15]:
# Trainer configuration, assembled from the hyperparameter cell above.
# Keyword arguments are grouped by concern; their order has no effect.
training_arguments = TrainingArguments(
    # output / Hub
    output_dir=output_dir,
    save_strategy="no",
    push_to_hub=True,
    hub_private_repo=True,
    # batching
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=per_device_eval_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    # optimisation
    num_train_epochs=num_train_epochs,
    learning_rate=learning_rate,
    lr_scheduler_type=lr_scheduler_type,
    warmup_ratio=warmup_ratio,
    weight_decay=0.1,
    max_grad_norm=max_grad_norm,
    # precision / memory
    fp16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # evaluation / logging
    evaluation_strategy="epoch",
    logging_steps=logging_steps,
    report_to=["tensorboard", "wandb"],
)



In [13]:
# Interactive Hugging Face Hub login widget. The training arguments set
# push_to_hub=True, so credentials are presumably required before the trainer
# is created — keep this cell ahead of the SFTTrainer cell.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [16]:
# Supervised fine-tuning trainer: the base model plus the IA3 adapter config,
# fed with the "content" strings and the completion-only collator.
# NOTE(review): training uses the "validation" split and evaluation uses the
# "test" split; the much larger "train" split is not used here. Confirm this
# is intentional (e.g. to keep the run short).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    peft_config=peft_config,
    args=training_arguments,
    train_dataset=dataset["validation"],
    eval_dataset=dataset["test"],
    dataset_text_field="content",
    data_collator=collator,
    max_seq_length=max_seq_length,
    packing=False,
)

Map:   0%|          | 0/8421 [00:00<?, ? examples/s]

Map:   0%|          | 0/15878 [00:00<?, ? examples/s]

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.


In [17]:
# Report how many parameters are trainable after the trainer wrapped the
# model, then display the wrapped module tree as the cell output.
trainer.model.print_trainable_parameters()
trainer.model

trainable params: 55,296 || all params: 248,079,360 || trainable%: 0.0223


PeftModelForCausalLM(
  (base_model): IA3Model(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32005, 1024)
        (layers): ModuleList(
          (0-11): 12 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
              (k_proj): Linear(
                (base_layer): Linear(in_features=1024, out_features=256, bias=False)
                (ia3_l): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 256x1 (GPU 0)])
              )
              (v_proj): Linear(
                (base_layer): Linear(in_features=1024, out_features=256, bias=False)
                (ia3_l): ParameterDict(  (default): Parameter containing: [torch.cuda.FloatTensor of size 256x1 (GPU 0)])
              )
              (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
              (rotary_emb): MistralRotaryEmbedding()
   

In [18]:
# Run fine-tuning, then persist the result into `output_dir`.
# NOTE(review): with push_to_hub=True in the training arguments, save_model()
# also appears to upload to the Hub (see the upload progress bars in this
# cell's output) — confirm that publishing at this point is intended.
trainer.train()
trainer.save_model()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...


Epoch,Training Loss,Validation Loss
0,3.9072,3.954828
1,3.0428,2.992874
2,2.6577,2.700804
4,2.393,2.373924
5,2.2391,2.280632
6,2.177,2.212597
8,2.0566,2.128338
9,2.2033,2.104685
10,2.0711,2.089388
12,1.9439,2.075672




adapter_model.safetensors:   0%|          | 0.00/225k [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

events.out.tfevents.1717326638.33cd504b2df6.208.0:   0%|          | 0.00/176k [00:00<?, ?B/s]

training_args.bin:   0%|          | 0.00/4.73k [00:00<?, ?B/s]

In [19]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
import torch
import random

In [20]:
dataset_name = "wikisql"
def preprocess(sample):
    """Render one WikiSQL record as a single prompt + completion string.

    Args:
        sample: mapping that provides ``sample["question"]``,
            ``sample["table"]["header"]``, ``sample["table"]["id"]`` and
            ``sample["sql"]["human_readable"]``.

    Returns:
        A dict with the single key ``"content"``: the table id, column names
        and natural-language question, followed by ``SQL Query:`` and the
        target SQL terminated with the literal ``</s>`` marker.
    """
    column_names = sample["table"]["header"]
    table_id = sample["table"]["id"]
    natural_query = sample["question"]
    # Only the placeholder in the FROM clause stands for the table name. A
    # blanket replace("table", ...) would also rewrite column names or
    # condition values that merely contain the word "table", so substitute
    # the first " FROM table" only.
    sql_query = sample["sql"]["human_readable"].replace(" FROM table", f" FROM {table_id}", 1)
    content = f"Table: {table_id}\n Columns: {column_names}\n Natural Query: {natural_query}\n SQL Query: {sql_query}</s>"
    return {"content": content}

# WikiSQL is loaded through a dataset script. Opt in explicitly so the load
# does not depend on the deprecation path that produced the
# `trust_remote_code` warning in this cell's recorded output.
dataset = load_dataset(dataset_name, trust_remote_code=True)
# Rebuild the same "content" strings that were used for fine-tuning and drop
# the raw columns (their names are taken from the train split).
dataset = dataset.map(
    preprocess,
    batched=False,
    remove_columns=dataset["train"].column_names
)

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this dataset from the next major release of `datasets`.


In [21]:
# Adapter repository on the Hub and the device used for inference.
peft_model_id = "Kamran1367/mistral_sql_instruct"
device = "cuda"

# The adapter config records which base checkpoint it was trained on; load
# that base model, then attach the adapter weights on top of it.
config = PeftConfig.from_pretrained(peft_model_id)
model = AutoModelForCausalLM.from_pretrained(config.base_model_name_or_path)
tokenizer = AutoTokenizer.from_pretrained(peft_model_id)
model = PeftModel.from_pretrained(model, peft_model_id)

# Half precision for inference, placed on `device`. `device` was previously
# defined but unused while the placement was hard-coded via .cuda().
model.to(torch.float16)
model.to(device)
model.eval()

adapter_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.01k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/639 [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


adapter_model.safetensors:   0%|          | 0.00/225k [00:00<?, ?B/s]

PeftModelForCausalLM(
  (base_model): IA3Model(
    (model): MistralForCausalLM(
      (model): MistralModel(
        (embed_tokens): Embedding(32005, 1024)
        (layers): ModuleList(
          (0-11): 12 x MistralDecoderLayer(
            (self_attn): MistralAttention(
              (q_proj): Linear(in_features=1024, out_features=1024, bias=False)
              (k_proj): Linear(
                (base_layer): Linear(in_features=1024, out_features=256, bias=False)
                (ia3_l): ParameterDict(  (default): Parameter containing: [torch.cuda.HalfTensor of size 256x1 (GPU 0)])
              )
              (v_proj): Linear(
                (base_layer): Linear(in_features=1024, out_features=256, bias=False)
                (ia3_l): ParameterDict(  (default): Parameter containing: [torch.cuda.HalfTensor of size 256x1 (GPU 0)])
              )
              (o_proj): Linear(in_features=1024, out_features=1024, bias=False)
              (rotary_emb): MistralRotaryEmbedding()
     

In [None]:
# Qualitative check: draw a few random examples from one split, let the model
# complete the SQL after the "SQL Query:" marker, and print the prediction
# next to the reference.
split = "test"
length = len(dataset[split])
for i in range(10):
    # random.randint(0, length) includes `length` itself and could index one
    # past the end; randrange excludes the upper bound.
    index = random.randrange(length)
    content = dataset[split][index]["content"]
    # Everything up to the marker is the prompt; everything after it is the
    # reference SQL (still carrying the literal "</s>" written by preprocess).
    prompt, _, reference = content.partition("SQL Query:")
    text = f'{prompt}SQL Query:'
    inputs = tokenizer(text, return_tensors="pt")
    # Follow the model's device instead of hard-coding "cuda".
    inputs = {k: v.to(model.device) for k, v in inputs.items()}
    # The model was cast to float16 when it was loaded, so autocast uses the
    # same dtype rather than bfloat16.
    with torch.autocast(dtype=torch.float16, device_type="cuda"):
        outputs = model.generate(**inputs,
                                 max_new_tokens=128,
                                 eos_token_id=tokenizer.eos_token_id)
    predicted = tokenizer.decode(outputs[0]).split("SQL Query:")[-1].strip()
    expected = reference.strip()

    print(f"{text=}\n\n{predicted=}\n\n{expected=}")