In [1]:
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, AutoModelForCausalLM, AutoConfig, Trainer, TrainingArguments, PreTrainedTokenizer, BitsAndBytesConfig
from nltk.translate.bleu_score import sentence_bleu
import torch
import torch.distributed
from peft import LoraConfig, PeftModel, get_peft_model, prepare_model_for_kbit_training

import copy
from dataclasses import dataclass, field
from typing import Optional, Dict, Sequence
from datasets import load_dataset

from typing import List, Dict
import random
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm

# Use the GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Presumably the Hub id of the base checkpoint; not referenced by the cells shown here.
MODEL_NAME = "deepseek-ai/deepseek-coder-1.3b-instruct"
# Not referenced by the cells shown here; presumably where fine-tuning output was written.
OUTPUT_DIR = "./fine_tuned_model/"  
# Local directory the model, config and tokenizer are loaded from below.
merged_model = './model/'
#fine_tuned_model_path = 'fine_tuned_model'

In [3]:
# Sanity check: show which local directory the model will be loaded from.
print(merged_model)

./model/


In [4]:
# Load the config, the causal-LM weights and the tokenizer from the local
# `merged_model` directory.
config = AutoConfig.from_pretrained(merged_model)
model = AutoModelForCausalLM.from_pretrained(
    merged_model,
    config=config,
    # NOTE(review): trust_remote_code allows Python code shipped with the
    # checkpoint to run; confirm the directory contents are trusted.
    trust_remote_code=True,
    device_map="auto",
    torch_dtype=torch.bfloat16
)
tokenizer = AutoTokenizer.from_pretrained(merged_model)
# NOTE(review): the weights were already placed via device_map="auto", so this
# explicit move looks redundant -- confirm. As the last expression of the cell
# it also makes the notebook display the module tree.
model.to(device)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

LlamaForCausalLM(
  (model): LlamaModel(
    (embed_tokens): Embedding(32256, 2048)
    (layers): ModuleList(
      (0-23): 24 x LlamaDecoderLayer(
        (self_attn): LlamaSdpaAttention(
          (q_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (v_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2048, bias=False)
          (rotary_emb): LlamaRotaryEmbedding()
        )
        (mlp): LlamaMLP(
          (gate_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (up_proj): Linear(in_features=2048, out_features=5504, bias=False)
          (down_proj): Linear(in_features=5504, out_features=2048, bias=False)
          (act_fn): SiLU()
        )
        (input_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
        (post_attention_layernorm): LlamaRMSNorm((2048,), eps=1e-06)
      )
    )
    (norm)

## Dataset

In [5]:
# Load the NSText2SQL dataset; only its "train" split is used by the cells below.
ds = load_dataset("NumbersStation/NSText2SQL")

In [6]:
def split_dataset(dataset, eval_ratio=0.1, train_ratio=0.8, test_ratio=0.1, seed=None):
    """
    Split a Hugging Face dataset into training, evaluation, and test sets.

    Parameters:
    dataset (Dataset): The dataset (a single split) to partition; must provide
        `train_test_split`.
    eval_ratio (float): Proportion of the dataset to be used for evaluation.
    train_ratio (float): Proportion of the dataset to be used for training.
    test_ratio (float): Proportion of the dataset to be used for testing.
    seed (int | None): Forwarded to both `train_test_split` calls so the
        partition can be reproduced. `None` (the default) keeps the previous
        unseeded behaviour.

    Returns:
    tuple: A tuple containing (train_set, eval_set, test_set).

    Raises:
    AssertionError: If the three ratios do not sum to 1 (within a small
        floating-point tolerance).
    """
    # Local import keeps the function usable even if the defining cell is run
    # on its own.
    import math

    # Compare with a tolerance: exact equality rejects valid inputs such as
    # (0.1, 0.7, 0.2), whose floating-point sum is 0.9999999999999999.
    ratio_sum = eval_ratio + train_ratio + test_ratio
    assert math.isclose(ratio_sum, 1.0, rel_tol=0.0, abs_tol=1e-9), "Ratios must sum to 1"

    # First, split the dataset into train and a held-out remainder (eval + test).
    holdout_ratio = eval_ratio + test_ratio
    train_test_split = dataset.train_test_split(test_size=holdout_ratio, seed=seed)

    # Then divide the remainder into eval and test, keeping their relative sizes.
    eval_test_split = train_test_split['test'].train_test_split(
        test_size=test_ratio / holdout_ratio,
        seed=seed,
    )

    # Return the splits
    return train_test_split['train'], eval_test_split['train'], eval_test_split['test']

In [7]:
# Partition the "train" split of the loaded dataset into three subsets.
train_split = ds["train"]
train_set, eval_set, test_set = split_dataset(train_split)

# Report how many rows ended up in each subset.
for label, subset in (
    ("Training Set Size:", train_set),
    ("Evaluation Set Size:", eval_set),
    ("Test Set Size:", test_set),
):
    print(label, len(subset))

Training Set Size: 231430
Evaluation Set Size: 28929
Test Set Size: 28929


## Evaluation

In [None]:
import random
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from tqdm import tqdm
import re
from typing import List


def evaluate_bleu(
    model: AutoModelForSeq2SeqLM,
    tokenizer: AutoTokenizer,
    test_data,
    eval_percentage: float = 1.0,
    device: str = "cpu",
    batch_size: int = 16,
    max_new_tokens: int = 50,
    seed: Optional[int] = None,
) -> float:
    """
    Evaluate the model using BLEU score on a subset of the test dataset with batch processing.

    Each sample is turned into the prompt "<instruction>\\n\\nResponse: ", the
    model continues it greedily, and the text after the marker (cut at the
    first ';') is compared with the reference using smoothed sentence-level
    BLEU on lower-cased, whitespace-split tokens.

    Parameters:
        model (AutoModelForSeq2SeqLM): The pre-trained model for evaluation. Any
            model exposing `generate` works; decoder-only models are left-padded
            for batching.
        tokenizer (AutoTokenizer): The tokenizer for the model. If it has no pad
            token, its pad token id is set to the EOS token id (this change is
            kept on the tokenizer after the call).
        test_data: The test dataset containing 'instruction' and 'output' keys.
        eval_percentage (float): The percentage of the test data to use for evaluation (0 to 1).
        device (str): The device to run the model on ('cuda' or 'cpu').
        batch_size (int): Number of samples to process in a single batch.
        max_new_tokens (int): Maximum number of tokens generated per sample.
        seed (Optional[int]): Seed for choosing the evaluated subset. `None`
            (the default) samples from the global `random` state as before.

    Returns:
        float: The average BLEU score over the evaluated subset.

    Raises:
        ValueError: If `eval_percentage` is outside (0, 1], `batch_size` is not
            positive, or `test_data` is empty.
        RuntimeError: Re-raised from `model.generate` after logging the batch index.
    """
    if not (0 < eval_percentage <= 1):
        raise ValueError("eval_percentage must be between 0 (exclusive) and 1 (inclusive).")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer.")

    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    test_data = list(test_data)
    if not test_data:
        raise ValueError("The test_data dataset is empty.")
    eval_size = int(len(test_data) * eval_percentage)
    # A dedicated generator makes the subset reproducible without touching the
    # global random state.
    sampler = random.Random(seed) if seed is not None else random
    eval_data = sampler.sample(test_data, eval_size)

    model.eval()
    model = model.to(device)

    # Decoder-only models continue from the last position of every row, so
    # batched prompts must be padded on the left; right padding makes shorter
    # prompts generate after pad tokens. The caller's setting is restored below.
    is_encoder_decoder = getattr(getattr(model, "config", None), "is_encoder_decoder", False)
    restore_padding_side = None
    if not is_encoder_decoder and getattr(tokenizer, "padding_side", "left") != "left":
        restore_padding_side = tokenizer.padding_side
        tokenizer.padding_side = "left"

    smoothing_function = SmoothingFunction().method1
    response_marker = "Response:"

    running_sum = 0.0
    num_scored = 0

    batches = [
        eval_data[i:i + batch_size]
        for i in range(0, len(eval_data), batch_size)
    ]

    try:
        with torch.no_grad():
            with tqdm(total=len(batches), desc="Evaluating BLEU score", unit="batch") as pbar:
                for batch_idx, batch in enumerate(batches, 1):
                    instructions = [sample["instruction"] for sample in batch]
                    reference_outputs = [sample["output"] for sample in batch]

                    prompts = [f"{instruction}\n\n{response_marker} " for instruction in instructions]

                    inputs = tokenizer(
                        prompts,
                        return_tensors="pt",
                        padding=True,
                        truncation=True
                    ).to(device)
                    try:
                        generated_outputs = model.generate(
                            inputs["input_ids"],
                            attention_mask=inputs["attention_mask"],
                            max_new_tokens=max_new_tokens,
                            eos_token_id=tokenizer.eos_token_id,
                            # Explicit pad id, matching generate_response.
                            pad_token_id=tokenizer.pad_token_id,
                            do_sample=False,
                            num_beams=1,
                        )
                    except RuntimeError as e:
                        print(f"Error during generation in batch {batch_idx}: {e}")
                        raise

                    generated_texts = [
                        tokenizer.decode(output, skip_special_tokens=True)
                        for output in generated_outputs
                    ]

                    for generated_text, reference_output, instruction in zip(generated_texts, reference_outputs, instructions):
                        if response_marker in generated_text:
                            # Keep only what follows the (last) response marker.
                            predicted_text = generated_text.split(response_marker)[-1].strip()
                        elif generated_text.startswith(instruction):
                            predicted_text = generated_text[len(instruction):].strip()
                        else:
                            predicted_text = re.split(r'--|#|;', generated_text)[0].strip()
                        # Score only the first statement.
                        predicted_text = re.split(r';', predicted_text)[0].strip()
                        reference = [reference_output.lower().split()]
                        hypothesis = predicted_text.lower().split()

                        score = sentence_bleu(reference, hypothesis, smoothing_function=smoothing_function)
                        running_sum += score
                        num_scored += 1

                    current_average = running_sum / num_scored

                    pbar.set_postfix({
                        "Average BLEU": f"{current_average * 100:.2f}%"
                    })

                    pbar.update(1)
    finally:
        if restore_padding_side is not None:
            tokenizer.padding_side = restore_padding_side

    average_bleu = running_sum / eval_size if eval_size > 0 else 0.0
    print(f"Average BLEU score: {average_bleu * 100:.2f}%")
    return average_bleu


In [None]:
# Score every row of the held-out test split in batches of five prompts.
model.eval()
average_bleu = evaluate_bleu(
    model=model,
    tokenizer=tokenizer,
    test_data=test_set,
    eval_percentage=1,
    device=device,
    batch_size=5,
    max_new_tokens=50,
)

print(f"Final Average BLEU score: {average_bleu * 100:.2f}%")

## Inference

In [None]:
import torch
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import re

def generate_response(
    model: "AutoModelForSeq2SeqLM",
    tokenizer: "AutoTokenizer",
    query: str,
    max_new_tokens: int = 250,
    device: str = "cpu",
    response_marker: str = "Response:",
    clean_after_marker: bool = True
) -> str:
    """
    Generates a clean response from the model given a query.

    The prompt sent to the model is "<query>\\n\\n<response_marker> ". The
    decoded output is reduced to the text after the marker; when the marker is
    not present, an echoed prompt or query prefix is removed if there is one,
    otherwise the whole decoded text is kept.

    Parameters:
        model (AutoModelForSeq2SeqLM): The pre-trained model. Any model exposing
            `generate` works; it is moved to `device` and put in eval mode.
        tokenizer (AutoTokenizer): The tokenizer for the model. If it has no pad
            token, its pad token id is set to the EOS token id.
        query (str): The input query string.
        max_new_tokens (int): The maximum number of tokens to generate for the response.
        device (str): The device to run the model on ('cuda' or 'cpu').
        response_marker (str): A specific marker indicating where the response starts.
        clean_after_marker (bool): Whether to cut the response at the first
            '--', '#' or ';'.

    Returns:
        str: The cleaned generated response text.
    """
    # Same fallback evaluate_bleu applies: padding=True below needs a pad token.
    if tokenizer.pad_token_id is None:
        tokenizer.pad_token_id = tokenizer.eos_token_id

    model = model.to(device)
    # Inference only: make sure dropout and similar layers are disabled.
    model.eval()

    prompt = f"{query}\n\n{response_marker} "

    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        padding=True,
        truncation=True
    ).to(device)

    with torch.no_grad():
        outputs = model.generate(
            inputs["input_ids"],
            attention_mask=inputs["attention_mask"],
            max_new_tokens=max_new_tokens,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if response_marker and response_marker in generated_text:
        response = generated_text.split(response_marker)[-1].strip()
    elif generated_text.startswith(prompt):
        # Only reachable with an empty marker; strip the echoed prompt.
        response = generated_text[len(prompt):].strip()
    elif generated_text.startswith(query):
        response = generated_text[len(query):].strip()
    else:
        # The prompt was not echoed, so there is nothing to strip. Slicing
        # len(prompt) characters here would throw away part of the answer.
        response = generated_text.strip()

    if clean_after_marker:
        response = re.split(r'--|#|;', response)[0].strip()

    return response


In [40]:
random_quesion = 13774
# Fetch the single row once rather than indexing into whole columns.
sample = test_set[random_quesion]
query = sample['instruction']
# Pass the notebook's device explicitly: generate_response defaults to "cpu"
# and would otherwise move the model off the GPU.
response = generate_response(model, tokenizer, query, device=device)
print("expected output:", sample['output'])
print("=============================================================")
print("Generated Response:", response)

expected output: SELECT power_output FROM table_name_88 WHERE wheel_arrangement = "b-b" AND build_date = "1952"
Generated Response: SELECT power_output 
FROM table_name_88
WHERE wheel_arrangement = 'b-b wheel arrangement' AND build_date = '1952'


In [43]:
# Display the raw prompt text of the sample used above.
random_quesion = 13774
test_set[random_quesion]['instruction']

'CREATE TABLE table_name_88 (\n    power_output VARCHAR,\n    wheel_arrangement VARCHAR,\n    build_date VARCHAR\n)\n\n\n-- Using valid SQLite, answer the following questions for the tables provided above.\n\n-- What is the power of b-b wheel arrangement, built in 1952?\n'

In [63]:
# Hand-written table description used as the schema part of the prompt.
schema = '''
admissions
    patient_id	INT
    admission_date	DATE
    discharge_date	DATE
    diagnosis	TEXT
    primary key attending_doctor_id	INT
doctors
    doctor_id	INT
    first_name	TEXT
    last_name	TEXT
    specialty	TEXT
'''
# Natural-language request appended after the schema.
query = ''' 
We need a breakdown for the total amount of admissions each doctor has started each year. Show the doctor_id, doctor_full_name, specialty, year, total_admissions for that year.
'''

# Pass the notebook's device explicitly: generate_response defaults to "cpu"
# and would otherwise move the model off the GPU.
response = generate_response(model, tokenizer, schema + query, device=device)
print("Generated Response:", response)

Generated Response: To solve this problem, we can use a SQL query to join the two tables on the doctor_id and then group by the year and doctor_id.

Here is the SQL query:

```sql
SELECT 
    d.doctor_id,
    CONCAT(d.first_name, ' ', d.last_name) AS doctor_full_name,
    d.specialty,
    YEAR(a.admission_date) AS year,
    COUNT(a.patient_id) AS total_admissions
FROM 
    doctors d
JOIN 
    admissions a ON d.doctor_id = a.attending_doctor_id
GROUP BY 
    d.doctor_id,
    YEAR(a.admission_date)
ORDER BY 
    d.doctor_id,
    YEAR(a.admission_date)
