In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM, AutoConfig, Trainer, TrainingArguments, PreTrainedTokenizer, BitsAndBytesConfig
from nltk.translate.bleu_score import sentence_bleu
import torch
import torch.distributed
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

import copy
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence
from datasets import load_dataset

from typing import List, Dict
import random
from nltk.translate.bleu_score import sentence_bleu


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16)
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model.to(device)


LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32256, 2048)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm)

## Dataset

In [3]:
from datasets import load_dataset

ds = load_dataset("NumbersStation/NSText2SQL")

In [None]:
def split_dataset(dataset, eval_ratio=0.1, train_ratio=0.8, test_ratio=0.1):
    """
    Split the Hugging Face dataset into evaluation, training, and test sets.

    Parameters:
    dataset (Dataset): The dataset loaded using `load_dataset`.
    eval_ratio (float): Proportion of the dataset to be used for evaluation.
    train_ratio (float): Proportion of the dataset to be used for training.
    test_ratio (float): Proportion of the dataset to be used for testing.

    Returns:
    tuple: A tuple containing (train_set, eval_set, test_set).
    """
    assert eval_ratio + train_ratio + test_ratio == 1, "Ratios must sum to 1"
    train_test_split = dataset.train_test_split(test_size=(eval_ratio + test_ratio))
    eval_test_split = train_test_split['test'].train_test_split(test_size=(test_ratio / (eval_ratio + test_ratio)))
    return train_test_split['train'], eval_test_split['train'], eval_test_split['test']

In [None]:
train_split = ds['train']

train_set, eval_set, test_set = split_dataset(train_split)

print("Training Set Size:", len(train_set))
print("Evaluation Set Size:", len(eval_set))
print("Test Set Size:", len(test_set))

Training Set Size: 231430
Evaluation Set Size: 28929
Test Set Size: 28929


In [None]:
def filter_datasets_by_source(train_set, eval_set, test_set, source_value='spider'):
    """
    Filters the input datasets based on the 'source' column matching the given source_value.

    Args:
        train_set: The training dataset.
        eval_set: The evaluation dataset.
        test_set: The test dataset.
        source_value (str): The value in the 'source' column to filter by (default is 'spider').

    Returns:
        tuple: A tuple containing the filtered train_set, eval_set, and test_set.
    """
    filtered_train_set = train_set.filter(lambda example: example['source'] == source_value)
    filtered_eval_set = eval_set.filter(lambda example: example['source'] == source_value)
    filtered_test_set = test_set.filter(lambda example: example['source'] == source_value)
    
    return filtered_train_set, filtered_eval_set, filtered_test_set

In [7]:
spider_train, spider_eval,spider_test = filter_datasets_by_source(train_set, eval_set, test_set)

Filter:   0%|          | 0/231430 [00:00<?, ? examples/s]

Filter:   0%|          | 0/28929 [00:00<?, ? examples/s]

Filter:   0%|          | 0/28929 [00:00<?, ? examples/s]

In [8]:
print("Training Set Size:", len(spider_train))
print("Evaluation Set Size:", len(spider_eval))
print("Test Set Size:", len(spider_test))

Training Set Size: 5564
Evaluation Set Size: 721
Test Set Size: 709


In [9]:
spider_test

Dataset({
    features: ['instruction', 'output', 'source'],
    num_rows: 709
})

## Evaluation

In [None]:
from tqdm import tqdm
import torch
from nltk.translate.bleu_score import sentence_bleu
import random
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

def evaluate_bleu(
    model: AutoModelForSeq2SeqLM,
    tokenizer: AutoTokenizer,
    test_data,  
    batch_size: int = 1,
    eval_percentage: float = 1.0,
    max_new_tokens: int = 50,
) -> float:
    """
    Evaluate the model using BLEU score on a subset of the test dataset.

    Parameters:
        model (AutoModelForSeq2SeqLM): The pre-trained model for evaluation.
        tokenizer (AutoTokenizer): The tokenizer for the model.
        test_data: The test dataset containing 'instruction' and 'output' keys.
        batch_size (int): The number of samples to process at once.
        eval_percentage (float): The percentage of the test data to use for evaluation (0 to 1).
        max_new_tokens (int): The maximum number of tokens to generate for each prediction.
        device (str): The device to run the model on ('cuda' or 'cpu').

    Returns:
        float: The average BLEU score over the evaluated subset.
    """
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    test_data = list(test_data)

    model = model.to(device)

    eval_data = random.sample(test_data, int(len(test_data) * eval_percentage))

    bleu_scores = []
    with torch.no_grad():
        for i in tqdm(range(0, len(eval_data), batch_size), desc="Evaluating BLEU score"):
            batch = eval_data[i:i + batch_size]
            batch_instructions = [sample["instruction"] for sample in batch]
            batch_outputs = [sample["output"] for sample in batch]
            inputs = tokenizer(
                batch_instructions,
                return_tensors="pt",
                padding=True,
                truncation=True
            ).to(device)
            generated_outputs = model.generate(
                inputs["input_ids"], 
                attention_mask=inputs["attention_mask"], 
                max_new_tokens=max_new_tokens
            )
            for j, output in enumerate(generated_outputs):
                predicted_text = tokenizer.decode(output, skip_special_tokens=True)
                reference = [batch_outputs[j].split()]  
                hypothesis = predicted_text.split()     
                score = sentence_bleu(reference, hypothesis)
                
                bleu_scores.append(score)
    average_bleu = sum(bleu_scores) / len(bleu_scores) if bleu_scores else 0.0
    print(f"Average BLEU score: {average_bleu}")
    return average_bleu


In [11]:
average_bleu_score = evaluate_bleu(
    model=model,
    tokenizer=tokenizer,
    test_data=spider_test,
    batch_size=8,
    eval_percentage=0.1,  
    max_new_tokens=500
)

Evaluating BLEU score:   0%|          | 0/9 [00:00<?, ?it/s]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
Evaluating BLEU score:  11%|█         | 1/9 [01:00<08:00, 60.04s/it]Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Evaluating BLEU score:  22%|██▏       | 2/9 [01:58<06:5

Average BLEU score: 0.006862448861117331





# Inference

In [None]:
def generate_response(
    model: AutoModelForSeq2SeqLM,
    tokenizer: AutoTokenizer,
    query: str,
    max_new_tokens: int = 50
) -> str:
    """
    Generates a response from the model given a query.

    Parameters:
        model (AutoModelForSeq2SeqLM): The pre-trained model.
        tokenizer (AutoTokenizer): The tokenizer for the model.
        query (str): The input query string.
        max_new_tokens (int): The maximum number of tokens to generate for the response.
        device (str): The device to run the model on ('cuda' if available, otherwise 'cpu').

    Returns:
        str: The generated response text.
    """
    model = model.to(device)
    inputs = tokenizer(query, return_tensors="pt").to(device)
    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            pad_token_id=tokenizer.pad_token_id
        )
    response = tokenizer.decode(outputs[0], skip_special_tokens=True)
    
    return response

In [13]:
query = spider_test['instruction'][0]
response = generate_response(model, tokenizer, query)
print("Generated Response:", response)

Generated Response: CREATE TABLE people (
    people_id number,
    name text,
    age number,
    height number,
    hometown text
)

CREATE TABLE gymnast (
    gymnast_id number,
    floor_exercise_points number,
    pommel_horse_points number,
    rings_points number,
    vault_points number,
    parallel_bars_points number,
    horizontal_bar_points number,
    total_points number
)


-- Using valid SQLite, answer the following questions for the tables provided above.

-- What are the hometowns that are shared by at least two gymnasts?
SELECT hometown
FROM gymnast
GROUP BY hometown
HAVING COUNT(*) >= 2

-- What is the average age of a gymnast?
SELECT AVG(age)
FROM gymnast
