In [25]:
input_file_path = "prepare_data/dev_input.csv"
model_name = "xiyansql_7b_16_en_prompt"
dialect = "sqlite"
output_file_path = f"inference_data/{model_name}_inf.txt"

In [26]:
from langchain_core.prompts import PromptTemplate

common_prompt_template = PromptTemplate.from_template(
    """You are an expert in {dialect}. Your job is to read and understand the following [Database Schema] description, along with any [Reference Information], and then use your knowledge of {dialect} to generate an SQL statement that answers the [User Question].

[User Question]
{user_question}

[Database Schema]
{schema}

[Reference Information]
None

ONLY OUTPUT THE SQL STATEMENT, NO OTHER TEXT.
"""
)
xiyan_en_prompt_template = PromptTemplate.from_template(
    """You are an expert in {dialect}. Your job is to read and understand the following [Database Schema] description, along with any [Reference Information], and then use your knowledge of {dialect} to generate an SQL statement that answers the [User Question].

[User Question]
{user_question}

[Database Schema]
{schema}

[Reference Information]
None

[User Question]
{user_question}
```sql
"""
)

In [27]:
model_config = {
    "llama3.2:3b": {
        "model_name": "llama3.2:3b",
        "model_config_init": {
            "temperature": 0.0,
        },
        "prompt_template": common_prompt_template,
    },
    "xiyansql_7b_16_en_prompt": {
        "model_name": "hf.co/mradermacher/XiYanSQL-QwenCoder-7B-2504-GGUF:F16",
        "model_config_init": {
            "temperature": 0.0,
        },
        "prompt_template": xiyan_en_prompt_template,
    },
}

In [28]:
from langchain_ollama import ChatOllama


llm = ChatOllama(
    model=model_config[model_name]["model_name"],
    **model_config[model_name]["model_config_init"]
)
llm.invoke("Hello world")

AIMessage(content='Hello! How can I assist you today?', additional_kwargs={}, response_metadata={'model': 'hf.co/mradermacher/XiYanSQL-QwenCoder-7B-2504-GGUF:F16', 'created_at': '2025-06-23T04:06:14.599069095Z', 'done': True, 'done_reason': 'stop', 'total_duration': 244400250, 'load_duration': 20855821, 'prompt_eval_count': 10, 'prompt_eval_duration': 30703308, 'eval_count': 10, 'eval_duration': 191560419, 'model_name': 'hf.co/mradermacher/XiYanSQL-QwenCoder-7B-2504-GGUF:F16'}, id='run--3d024e6d-fa13-4bed-9deb-6e4e95c55047-0', usage_metadata={'input_tokens': 10, 'output_tokens': 10, 'total_tokens': 20})

In [29]:
prompt_template = model_config[model_name]["prompt_template"]


def generate_sql(user_question: str, schema_list: list[str], dialect: str):
    message = prompt_template.invoke(
        {
            "user_question": user_question,
            "schema": "\n".join(schema_list),
            "dialect": dialect,
        }
    )
    message = llm.invoke(message).content
    # format the message to be a valid sql statement
    message = (
        message.strip()
        .replace("```sql", "")
        .replace("```", "")
        .replace("\n", "")
        .replace("\t", "")
    )
    print(f"Generated answer for question: {user_question[:20]}...")
    return message

In [30]:
generate_sql(
    "How many singers do we have?",
    [
        'CREATE TABLE "singer" (\n"Singer_ID" int,\n"Name" text,\n"Country" text,\n"Song_Name" text,\n"Song_release_year" text,\n"Age" int,\n"Is_male" bool,\nPRIMARY KEY ("Singer_ID")\n)'
    ],
    dialect,
)

Generated answer for question: How many singers do ...


'SELECT count(*) FROM singer'

In [31]:
import pandas as pd

df = pd.read_csv(input_file_path)
df

Unnamed: 0,question_number,question,db_id,tables,schemas
0,0,How many singers do we have?,concert_singer,['singer'],"['CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""..."
1,1,What is the total number of singers?,concert_singer,['singer'],"['CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""..."
2,2,"Show name, country, age for all singers ordere...",concert_singer,['singer'],"['CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""..."
3,3,"What are the names, countries, and ages for ev...",concert_singer,['singer'],"['CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""..."
4,4,"What is the average, minimum, and maximum age ...",concert_singer,['singer'],"['CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""..."
...,...,...,...,...,...
1029,1029,What are the citizenships that are shared by s...,singer,['singer'],"['CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""..."
1030,1030,How many available features are there in total?,real_estate_properties,['Other_Available_Features'],['CREATE TABLE `Other_Available_Features` (\n`...
1031,1031,What is the feature type name of feature AirCon?,real_estate_properties,"['Other_Available_Features', 'Ref_Feature_Types']",['CREATE TABLE `Other_Available_Features` (\n`...
1032,1032,Show the property type descriptions of propert...,real_estate_properties,"['Properties', 'Ref_Property_Types']",['CREATE TABLE `Properties` (\n`property_id` I...


In [32]:
inference_df = df.assign(
    answer=lambda df_: df_.apply(
        lambda row: generate_sql(row.question, row.schemas, dialect), axis=1
    )
)
inference_df

Generated answer for question: How many singers do ...
Generated answer for question: What is the total nu...
Generated answer for question: Show name, country, ...
Generated answer for question: What are the names, ...
Generated answer for question: What is the average,...
Generated answer for question: What is the average,...
Generated answer for question: Show the name and th...
Generated answer for question: What are the names a...
Generated answer for question: What are all distinc...
Generated answer for question: What are  the differ...
Generated answer for question: Show all countries a...
Generated answer for question: How many singers are...
Generated answer for question: List all song names ...
Generated answer for question: What are all the son...
Generated answer for question: Show location and na...
Generated answer for question: What are the locatio...
Generated answer for question: What is the maximum ...
Generated answer for question: What is the average ...
Generated 

Unnamed: 0,question_number,question,db_id,tables,schemas,answer
0,0,How many singers do we have?,concert_singer,['singer'],"['CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""...",SELECT count(*) FROM singer
1,1,What is the total number of singers?,concert_singer,['singer'],"['CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""...",SELECT count(*) FROM singer
2,2,"Show name, country, age for all singers ordere...",concert_singer,['singer'],"['CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""...","SELECT name , country , age FROM singer ORDER ..."
3,3,"What are the names, countries, and ages for ev...",concert_singer,['singer'],"['CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""...","SELECT Name , Country , Age FROM singer ORDER ..."
4,4,"What is the average, minimum, and maximum age ...",concert_singer,['singer'],"['CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""...","SELECT avg(age) , min(age) , max(age) FROM sin..."
...,...,...,...,...,...,...
1029,1029,What are the citizenships that are shared by s...,singer,['singer'],"['CREATE TABLE ""singer"" (\n""Singer_ID"" int,\n""...",SELECT Citizenship FROM singer WHERE Birth_Yea...
1030,1030,How many available features are there in total?,real_estate_properties,['Other_Available_Features'],['CREATE TABLE `Other_Available_Features` (\n`...,SELECT count(*) FROM Available_Features
1031,1031,What is the feature type name of feature AirCon?,real_estate_properties,"['Other_Available_Features', 'Ref_Feature_Types']",['CREATE TABLE `Other_Available_Features` (\n`...,SELECT T1.feature_type_name FROM Ref_Feature_T...
1032,1032,Show the property type descriptions of propert...,real_estate_properties,"['Properties', 'Ref_Property_Types']",['CREATE TABLE `Properties` (\n`property_id` I...,SELECT T1.property_type_description FROM Ref_P...


In [33]:
with open(output_file_path, "w+") as f:
    answer_list = inference_df["answer"].tolist()
    f.write("\n".join(answer_list))