In [2]:
import pandas as pd
import transformers
import torch
from datasets import load_dataset

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the "Salesforce/wikisql" dataset and print a summary of its splits.
# NOTE(review): trust_remote_code=True permits code shipped with the dataset
# repository to execute locally -- confirm the source is trusted.
dataset = load_dataset("Salesforce/wikisql", trust_remote_code=True)
print(dataset)

DatasetDict({
    test: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 15878
    })
    validation: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 8421
    })
    train: Dataset({
        features: ['phase', 'question', 'table', 'sql'],
        num_rows: 56355
    })
})


In [4]:
# Fixed seed so the sampled subsets are the same on every Restart & Run All.
SEED = 42

# Training subset: 3000 random rows of the train split.
dataset_train = dataset['train'].shuffle(seed=SEED).select(range(3000))
# dataset_val = dataset['validation'].shuffle(seed=SEED).select(range(1000))
# Evaluation subset: drawn from the held-out test split. Sampling it from
# 'train' (as before) could return rows that are also in dataset_train, which
# would leak training examples into the evaluation.
dataset_test = dataset['test'].shuffle(seed=SEED).select(range(1000))

In [13]:
# Inspect one raw example (question, table and SQL annotation) before formatting.
dataset_train[0]

{'phase': 1,
 'question': 'How many teams have a combination classification of Alejandro Valverde and a Points classification of Alessandro Petacchi?',
 'table': {'header': ['Stage',
   'Winner',
   'General classification',
   'Points classification',
   'Mountains classification',
   'Combination classification',
   'Team classification'],
  'page_title': '2003 Vuelta a España',
  'page_id': '',
  'types': ['real', 'text', 'text', 'text', 'text', 'text', 'text'],
  'id': '1-15059783-1',
  'section_title': 'Jersey Progress',
  'caption': 'Jersey Progress',
  'rows': [['1',
    'ONCE-Eroski',
    'Igor González de Galdeano',
    'Igor González de Galdeano',
    'Jan Hruška',
    'Igor González de Galdeano',
    'ONCE-Eroski'],
   ['2',
    'Luis Pérez',
    'Joaquim Rodríguez',
    'Joaquim Rodríguez',
    'Luis Pérez',
    'Luis Pérez',
    'ONCE-Eroski'],
   ['3',
    'Alessandro Petacchi',
    'Joaquim Rodríguez',
    'Joaquim Rodríguez',
    'Luis Pérez',
    'Luis Pérez',
    'ONC

## Mistral 7b

## Preprocessing

In [6]:
import json

In [7]:
# Format data: turn each WikiSQL row into a system/user/assistant conversation.
system_message = """You are a natural language to sql query translator model. Users will ask you a question in English and you will generate a SQL query based on the table provided: {table}"""


def format_data(dataset):
    """Convert a single WikiSQL row into a chat-style training example.

    Args:
        dataset: One row (mapping) with the keys ``"table"``, ``"question"``
            and ``"sql"``; ``"sql"`` must itself contain ``"human_readable"``.

    Returns:
        ``{"messages": [...]}`` holding three messages: the system prompt with
        the table serialised as indented JSON, the user's question, and the
        human-readable SQL as the assistant reply. Returns ``None`` (after
        printing which key was missing) when a required key is absent.
    """
    try:
        # Serialise the whole table so it can be embedded in the system prompt.
        table_str = json.dumps(dataset["table"], indent=4)
        return {
            "messages": [
                {"role": "system", "content": system_message.format(table=table_str)},
                {"role": "user", "content": dataset["question"]},
                {"role": "assistant", "content": dataset["sql"]["human_readable"]},
            ]
        }
    except KeyError as e:
        # f-string so the offending key is actually reported; the previous
        # plain string printed the literal text "{e}".
        print(f"Missing key in dataset: {e}")
        return None

# Run format_data over every row of each subset; the saved DataFrame preview
# further down shows the resulting "messages" column alongside the originals.
train_data = dataset_train.map(format_data)
# val_data = dataset_val.map(format_data)
test_data = dataset_test.map(format_data)

Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map: 100%|██████████| 3000/3000 [00:02<00:00, 1356.46 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1184.76 examples/s]


In [9]:
# Preview one formatted example. The .map(format_data) calls were already run
# in the previous cell, so they are not repeated here.
train_data[0]

{'phase': 1,
 'question': 'What is the enrollment at the institution in New London, Connecticut? ',
 'table': {'header': ['Institution',
   'Location',
   'Nickname',
   'Founded',
   'Founding Religious Affiliation',
   'Enrollment',
   'Joined'],
  'page_title': 'New England Small College Athletic Conference',
  'page_id': '',
  'types': ['text', 'text', 'text', 'real', 'text', 'real', 'real'],
  'id': '1-261931-2',
  'section_title': 'Current members',
  'caption': 'Current members',
  'rows': [['Amherst College',
    'Amherst, Massachusetts',
    'Lord Jeffs',
    '1821',
    'Congregationalist',
    '1817',
    '1971'],
   ['Bates College',
    'Lewiston, Maine',
    'Bobcats',
    '1855',
    'Free Will Baptist',
    '1769',
    '1971'],
   ['Bowdoin College',
    'Brunswick, Maine',
    'Polar Bears',
    '1794',
    'Congregationalist',
    '1777',
    '1971'],
   ['Colby College',
    'Waterville, Maine',
    'White Mules',
    '1813',
    'Northern Baptist',
    '1838',
    '

In [10]:
# Build pandas DataFrames from the formatted train and test data and preview
# the first rows of the test frame.
df = pd.DataFrame(train_data)
# df2 = pd.DataFrame(val_data)
df3 = pd.DataFrame(test_data)
df3.head()

Unnamed: 0,phase,question,table,sql,messages
0,2,Which water has green wood?,"{'header': ['Element', 'Wood', 'Fire', 'Earth'...",{'human_readable': 'SELECT Water FROM table WH...,[{'content': 'You are a natural language to sq...
1,2,Who won the most recent favorite rap/hip-hop n...,"{'header': ['Year', 'Association', 'Category',...",{'human_readable': 'SELECT MAX Year FROM table...,[{'content': 'You are a natural language to sq...
2,1,List the highest number of assists when zaza p...,"{'header': ['Game', 'Date', 'Team', 'Score', '...",{'human_readable': 'SELECT COUNT High assists ...,[{'content': 'You are a natural language to sq...
3,1,Did the round 8 race get reported,"{'header': ['Rd', 'Race', 'Pole Position', 'Fa...",{'human_readable': 'SELECT Report FROM table W...,[{'content': 'You are a natural language to sq...
4,2,What is the number of population values having...,"{'header': ['Year', 'Population', 'Five Year %...",{'human_readable': 'SELECT COUNT Population FR...,[{'content': 'You are a natural language to sq...


In [None]:
# Keep only the "messages" column of each frame, as plain Python lists.
# NOTE: this rebinds train_data / test_data, which previously held the
# results of .map(format_data), to lists.
train_data = df["messages"].to_list()
# val_data = df2["messages"].to_list()
test_data = df3["messages"].to_list()

In [26]:
# Sanity check: number of training conversations.
# NOTE(review): the saved output (10000) does not match the 3000 rows selected
# for dataset_train above -- it looks stale; re-run on a fresh kernel.
len(train_data)

10000

In [None]:
import os

from huggingface_hub import login

# Never commit access tokens to a notebook. The token is read from the
# HF_TOKEN environment variable; when it is unset, token=None makes login()
# prompt for it instead.
# SECURITY: the token that used to be hard-coded in this cell has been exposed
# and must be revoked in the Hugging Face account settings.
login(token=os.environ.get("HF_TOKEN"))


In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AdamW, BitsAndBytesConfig, get_scheduler, DataCollatorWithPadding, AutoModelForSeq2SeqLM
from peft import prepare_model_for_kbit_training, LoraConfig, get_peft_model
import torch
from accelerate import Accelerator, init_empty_weights, infer_auto_device_map

# Accelerator instance for device placement / distributed training.
# NOTE: the variable name shadows the `accelerate` package; it is kept
# unchanged because other cells refer to it by this name.
accelerate = Accelerator()

checkpoint = "mistralai/Mistral-7B-Instruct-v0.3"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Reuse the unknown token as the padding token.
tokenizer.pad_token = tokenizer.unk_token
tokenizer.pad_token_id = tokenizer.unk_token_id

# Pad on the right. The tokenizer attribute is `padding_side`; the previous
# assignment to `pad_padding_side` only created an unused attribute.
tokenizer.padding_side = "right"

# 4-bit NF4 quantisation with double quantisation and bfloat16 compute.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

# Map the whole model (root module "") to device 0.
device_map = {"": 0}

# Load the quantised checkpoint directly. from_pretrained must not be wrapped
# in init_empty_weights(): that context manager is meant for building a
# weightless skeleton (e.g. from a config), not for loading real weights.
model = AutoModelForCausalLM.from_pretrained(
    checkpoint,
    quantization_config=bnb_config,
    device_map=device_map,
)
# The LoRA adapters are attached in a later cell via
# get_peft_model(model, lora_config). The former call
# get_peft_model(model, bnb_config) passed a quantisation config where a PEFT
# config is required and has been removed.



In [None]:
# Gradient checkpointing: switch it on for the model, then pass the model
# through PEFT's prepare_model_for_kbit_training helper.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)

In [None]:
# parameters
def print_trainable_parameters(model):
    """
    Report how much of the model is trainable.

    Totals the element counts of every parameter yielded by
    ``model.named_parameters()`` and of the subset with ``requires_grad`` set,
    then prints both totals and the trainable share as a percentage.
    """
    tensors = [tensor for _, tensor in model.named_parameters()]
    all_param = sum(tensor.numel() for tensor in tensors)
    trainable_params = sum(tensor.numel() for tensor in tensors if tensor.requires_grad)
    print(
        f"trainable params: {trainable_params} || all params: {all_param} || trainable%: {100 * trainable_params / all_param}"
    )

In [None]:
# LoRA config: adapter settings handed to get_peft_model below.
# NOTE(review): r=256 with target_modules="all-linear" is a large adapter for
# a 7B model, and lora_alpha (128) is smaller than r -- confirm this is
# intentional.
lora_config = LoraConfig(
    r = 256,
    lora_alpha=128,
    target_modules="all-linear",
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)

# Wrap the model with the adapters and report the trainable-parameter share.
model = get_peft_model(model, lora_config)
print_trainable_parameters(model)

In [None]:
# Print the key of every entry in the model's state dict, one per line.
# NOTE(review): for a full-size model this produces very long output --
# consider clearing it before sharing the notebook.
layers = model.state_dict().keys()
for name in layers:
    print(name)

In [None]:
def tokenize_function(dataset):
    """Tokenise a list of chat conversations into padded training tensors.

    Args:
        dataset: A list of conversations, each a list of
            ``{"role": ..., "content": ...}`` messages, rendered with the
            module-level ``tokenizer``'s chat template.

    Returns:
        The encoding returned by ``apply_chat_template`` (PyTorch tensors,
        padded to the longest conversation and truncated to 512 tokens) with
        an added ``"labels"`` entry: a copy of ``input_ids`` in which padding
        positions are set to -100.
    """
    encoding = tokenizer.apply_chat_template(
        dataset,
        tokenize=True,
        padding=True,
        truncation=True,
        max_length=512,
        # The conversations already end with the assistant's answer, so no
        # generation prompt should be appended for training.
        add_generation_prompt=False,
        return_tensors="pt",
        return_dict=True,
    )
    labels = encoding["input_ids"].clone()
    # -100 is the index ignored by the cross-entropy loss; without this the
    # model would also be trained to predict the padding tokens.
    labels[encoding["attention_mask"] == 0] = -100
    encoding["labels"] = labels
    return encoding


In [None]:
# Tokenise both sets of conversations.
# NOTE: train_data / test_data are rebound again here, from lists of messages
# to the encodings returned by tokenize_function.
train_data = tokenize_function(train_data)
#val_data = tokenize_function(val_data)
test_data = tokenize_function(test_data)


In [None]:
# The base class must be PyTorch's Dataset; the notebook only imported
# load_dataset from `datasets`, so `Dataset` was previously an undefined name.
from torch.utils.data import Dataset


class TokenizedDataset(Dataset):
    """Map-style dataset over a dict of equally long, index-aligned sequences.

    ``tokenized_data`` maps field names (it must include ``"input_ids"``) to
    indexable containers; item ``i`` is the dict of every field's ``i``-th
    entry.
    """

    def __init__(self, tokenized_data):
        self.tokenized_data = tokenized_data

    def __len__(self):
        # The number of examples is taken from the "input_ids" field.
        return len(self.tokenized_data['input_ids'])

    def __getitem__(self, idx):
        # Slice the same position out of every field.
        return {key: val[idx] for key, val in self.tokenized_data.items()}


In [None]:
# Wrap the tokenised encodings so individual examples can be indexed.
train_dataset = TokenizedDataset(train_data)
#val_dataset = TokenizedDataset(val_data)
test_dataset = TokenizedDataset(test_data)


## Training

In [None]:
# Evaluation
# raw_datasets = load_dataset("glue", "mrpc")
# raw_datasets["train"][0]


{'sentence1': 'Amrozi accused his brother , whom he called " the witness " , of deliberately distorting his evidence .',
 'sentence2': 'Referring to him as only " the witness " , Amrozi accused his brother of deliberately distorting his evidence .',
 'label': 1,
 'idx': 0}

In [None]:
from tqdm.auto import tqdm
from torch.cuda.amp import GradScaler, autocast
from accelerate import Accelerator



from torch.utils.data import DataLoader

# Batches of tokenised conversations, reshuffled every epoch. This loader was
# previously never created, so the cell failed with a NameError.
train_batch_size = 4  # lower this if the GPU runs out of memory
train_dataloader = DataLoader(train_dataset, batch_size=train_batch_size, shuffle=True)

# Use PyTorch's AdamW; the transformers re-export is deprecated.
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 3

# A single Accelerator instance is used for both prepare() and backward().
accelerator = Accelerator()
train_dataloader, model, optimizer = accelerator.prepare(train_dataloader, model, optimizer)

# One scheduler step per optimizer step: batches per epoch times epochs.
# (len(train_data) is the length of the tokenised encoding object, not the
# number of batches, so it must not be used here.)
num_train_steps = len(train_dataloader) * num_epochs

# get_scheduler has no gradient_accumulation_steps parameter; passing it
# raised a TypeError. No gradient accumulation is performed in this loop.
lr_scheduler = get_scheduler(
    "linear",
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_train_steps,
)

progress_bar = tqdm(range(num_train_steps))

model.train()
tr_loss = []  # per-step training loss, kept for later inspection/plotting
for epoch in range(num_epochs):
    for batch in train_dataloader:
        outputs = model(**batch)
        loss = outputs.loss

        # clear gradients left over from the previous step
        optimizer.zero_grad()
        # compute gradients
        accelerator.backward(loss)
        # update weights
        optimizer.step()
        # update lr
        lr_scheduler.step()
        progress_bar.update(1)

        tr_loss.append(loss.item())
    # Reports the loss of the last batch of the epoch.
    print(f"Epoch {epoch+1}/{num_epochs}, {loss.item()}")
progress_bar.close()

# Save the weights of the unwrapped model (previously referenced the
# undefined name `m`).
torch.save(accelerator.unwrap_model(model).state_dict(), 'natural_sql.pt')

       

In [None]:
# test model
# model_predict = "natural_langauge_to_sql"

# model = AutoModelForCausalLM.from_pretrained(model_predict, torch_dtype=torch.float16)
