In [1]:
import os
import json
import sqlite3
import logging
from pipeline.schema_generation import schema_generation
from pipeline.schema_preprocess import schema_preprocess
from pipeline.data_generation import data_generation
from pipeline.temp_generation import temp_generation
from pipeline.ques_generation import ques_generation
from pipeline.ques_revision import ques_revision

In [2]:
# Configure the root logger once for the whole notebook: DEBUG level, with each
# record rendered as "<timestamp> - <message>".
logging.basicConfig(
    format='%(asctime)s - %(message)s',
    level=logging.DEBUG,
)

In [3]:
# Locations of the two JSON configuration files, relative to the working directory.
pipeline_config_path, engine_config_path = './pipeline_configs.json', './engine_configs.json'

# Root directories. Only `pipeline_config_path` is read by the cells shown in this
# notebook; the remaining names are kept for code that may use them elsewhere.
template_root = ''
output_root, result_root = '../output/', '../result/'

In [4]:
# Load the pipeline configuration. Every stage below reads its own settings
# from `pipeline_configs`, so this cell must succeed before anything else runs.
try:
    with open(pipeline_config_path, 'r') as file:
        pipeline_configs = json.load(file)
except Exception as e:
    logging.error(f'Load pipeline configurations failed: {e}')
    # Re-raise instead of swallowing the error: otherwise `pipeline_configs` is
    # either undefined (NameError in a later cell) or, in a long-lived kernel,
    # silently left at a stale value from a previous successful load.
    raise
else:
    logging.info('Load pipeline configurations successfully.')
    logging.debug(pipeline_configs)

2024-08-19 01:01:07,919 - Load pipeline configurations successfully.
2024-08-19 01:01:07,920 - {'schema_generation': {'engine': 'deepseek-coder', 'temperature': 0.5, 'params_root': '../params/', 'template_path': '../templates/schema_generation.txt', 'output_path': '../output/schema_generation/', 'result_path': '../result/schema_generation/'}, 'schema_preprocess': {'input_path': '../result/', 'result_path': '../result/schema_preprocess/'}, 'data_generation': {'engine': 'deepseek-coder', 'temperature': 0.5, 'input_path': '../result/', 'template_path': '../templates/data_generation.txt', 'output_path': '../output/data_generation/', 'result_path': '../result/data_generation/', 'epoch': 1, 'row_num': 3, 'example_value_num': 3}, 'temp_generation': {'engine': 'deepseek-coder', 'temperature': 0.5, 'input_path': '../result/', 'template_path': '../templates/temp_generation.txt', 'evidence_path': '../params/', 'output_path': '../output/temp_generation/', 'result_path': '../result/temp_generation/

## Schema Generation

In [11]:
# Settings for the schema-generation stage, taken from the loaded pipeline configuration.
config_schema_generation = pipeline_configs['schema_generation']

Run on every database that has a `.json` parameter file in `params_root`

In [None]:
# Batch run: each `.json` file in `params_root` names one database (file name
# without extension) for which a schema is generated.
db_params_dict = {}  # not read by any cell shown in this notebook
for filename in os.listdir(config_schema_generation['params_root']):
    if not filename.endswith('.json'):
        continue  # skip anything that is not a parameter file
    db_name = os.path.splitext(filename)[0]
    try:
        schema_generation(db_name, config_schema_generation)
    except Exception as e:
        # Log and carry on so one failing database does not stop the batch.
        logging.error(f'[Schema Generation]{db_name} failed: {e}')
    else:
        logging.info(f'[Schema Generation]{db_name} success')

Run on one specific database

In [None]:
# Single-database run, currently disabled: uncomment both lines below and set
# db_name to the database whose schema should be generated.
# db_name = 'test'
# schema_generation(db_name, config_schema_generation)

## Schema Preprocess

In [5]:
# Settings for the schema-preprocess stage, taken from the loaded pipeline configuration.
config_schema_preprocess = pipeline_configs['schema_preprocess']

In [6]:
# Batch run: preprocess the schema of every database that exists as a `.sqlite`
# file in the `schema_generation` subdirectory of `input_path`.
db_params_dict = {}  # not read by any cell shown in this notebook
for filename in os.listdir(os.path.join(config_schema_preprocess['input_path'], 'schema_generation')):
    if not filename.endswith('.sqlite'):
        continue  # only SQLite database files identify a database
    db_name = os.path.splitext(filename)[0]
    try:
        schema_preprocess(db_name, config_schema_preprocess)
    except Exception as e:
        # Log and carry on so one failing database does not stop the batch.
        logging.error(f'[Schema Preprocess]{db_name} failed: {e}')
    else:
        logging.info(f'[Schema Preprocess]{db_name} success')

2024-08-18 01:57:12,262 - Starting schema preprocess
2024-08-18 01:57:12,263 - Schema got
2024-08-18 01:57:12,264 - {'frpm': {'description': '', 'attributes': {'CDSCode': {'description': '', 'type': 'TEXT'}, 'Academic Year': {'description': '', 'type': 'TEXT'}, 'County Code': {'description': '', 'type': 'TEXT'}, 'District Code': {'description': '', 'type': 'INTEGER'}, 'School Code': {'description': '', 'type': 'TEXT'}, 'County Name': {'description': '', 'type': 'TEXT'}, 'District Name': {'description': '', 'type': 'TEXT'}, 'School Name': {'description': '', 'type': 'TEXT'}, 'District Type': {'description': '', 'type': 'TEXT'}, 'School Type': {'description': '', 'type': 'TEXT'}, 'Educational Option Type': {'description': '', 'type': 'TEXT'}, 'NSLP Provision Status': {'description': '', 'type': 'TEXT'}, 'Charter School (Y/N)': {'description': '', 'type': 'INTEGER'}, 'Charter School Number': {'description': '', 'type': 'TEXT'}, 'Charter Funding Type': {'description': '', 'type': 'TEXT'}, 

In [7]:
# Preprocess the schema of one named database only, without scanning the input
# directory. Errors are not caught here, so a failure surfaces as a traceback.
db_name = 'california_schools'
schema_preprocess(db_name, config_schema_preprocess)

2024-08-18 01:55:20,601 - Starting schema preprocess
2024-08-18 01:55:20,602 - Schema got
2024-08-18 01:55:20,603 - {'frpm': {'description': '', 'attributes': {'CDSCode': {'description': '', 'type': 'TEXT'}, 'Academic Year': {'description': '', 'type': 'TEXT'}, 'County Code': {'description': '', 'type': 'TEXT'}, 'District Code': {'description': '', 'type': 'INTEGER'}, 'School Code': {'description': '', 'type': 'TEXT'}, 'County Name': {'description': '', 'type': 'TEXT'}, 'District Name': {'description': '', 'type': 'TEXT'}, 'School Name': {'description': '', 'type': 'TEXT'}, 'District Type': {'description': '', 'type': 'TEXT'}, 'School Type': {'description': '', 'type': 'TEXT'}, 'Educational Option Type': {'description': '', 'type': 'TEXT'}, 'NSLP Provision Status': {'description': '', 'type': 'TEXT'}, 'Charter School (Y/N)': {'description': '', 'type': 'INTEGER'}, 'Charter School Number': {'description': '', 'type': 'TEXT'}, 'Charter Funding Type': {'description': '', 'type': 'TEXT'}, 

## Data Generation

In [5]:
# Settings for the data-generation stage, taken from the loaded pipeline configuration.
config_data_generation = pipeline_configs['data_generation']

In [6]:
# Batch run: generate data for every database that exists as a `.sqlite` file
# in the `schema_generation` subdirectory of `input_path`.
db_dict = {}  # not read by any cell shown in this notebook
for filename in os.listdir(os.path.join(config_data_generation['input_path'], 'schema_generation')):
    if not filename.endswith('.sqlite'):
        continue  # only SQLite database files identify a database
    db_name = os.path.splitext(filename)[0]
    try:
        data_generation(db_name, config_data_generation)
    except Exception as e:
        # Log and carry on so one failing database does not stop the batch.
        logging.error(f'[Data Generation]{db_name} failed: {e}')
    else:
        logging.info(f'[Data Generation]{db_name} success')

2024-08-18 00:05:58,619 - Starting database data generation
2024-08-18 00:05:58,621 - california_schools SCHEMA:
{'frpm': {'description': '', 'attributes': {'CDSCode': {'description': '', 'type': 'TEXT'}, 'Academic Year': {'description': '', 'type': 'TEXT'}, 'County Code': {'description': '', 'type': 'TEXT'}, 'District Code': {'description': '', 'type': 'INTEGER'}, 'School Code': {'description': '', 'type': 'TEXT'}, 'County Name': {'description': '', 'type': 'TEXT'}, 'District Name': {'description': '', 'type': 'TEXT'}, 'School Name': {'description': '', 'type': 'TEXT'}, 'District Type': {'description': '', 'type': 'TEXT'}, 'School Type': {'description': '', 'type': 'TEXT'}, 'Educational Option Type': {'description': '', 'type': 'TEXT'}, 'NSLP Provision Status': {'description': '', 'type': 'TEXT'}, 'Charter School (Y/N)': {'description': '', 'type': 'INTEGER'}, 'Charter School Number': {'description': '', 'type': 'TEXT'}, 'Charter Funding Type': {'description': '', 'type': 'TEXT'}, 'IR

In [6]:
# Generate data for one named database only, without scanning the input
# directory. Errors are not caught here, so a failure surfaces as a traceback.
db_name = 'california_schools'
data_generation(db_name, config_data_generation)

2024-08-18 00:06:26,823 - Starting database data generation
2024-08-18 00:06:26,825 - california_schools SCHEMA:
{'frpm': {'description': '', 'attributes': {'CDSCode': {'description': '', 'type': 'TEXT'}, 'Academic Year': {'description': '', 'type': 'TEXT'}, 'County Code': {'description': '', 'type': 'TEXT'}, 'District Code': {'description': '', 'type': 'INTEGER'}, 'School Code': {'description': '', 'type': 'TEXT'}, 'County Name': {'description': '', 'type': 'TEXT'}, 'District Name': {'description': '', 'type': 'TEXT'}, 'School Name': {'description': '', 'type': 'TEXT'}, 'District Type': {'description': '', 'type': 'TEXT'}, 'School Type': {'description': '', 'type': 'TEXT'}, 'Educational Option Type': {'description': '', 'type': 'TEXT'}, 'NSLP Provision Status': {'description': '', 'type': 'TEXT'}, 'Charter School (Y/N)': {'description': '', 'type': 'INTEGER'}, 'Charter School Number': {'description': '', 'type': 'TEXT'}, 'Charter Funding Type': {'description': '', 'type': 'TEXT'}, 'IR

## Template Generation

In [5]:
# Settings for the template-generation stage, taken from the loaded pipeline configuration.
config_temp_generation = pipeline_configs['temp_generation']

In [None]:
# Batch run: generate question/SQL templates for every database that exists as
# a `.sqlite` file in the `schema_generation` subdirectory of `input_path`.
db_dict = {}  # not read by any cell shown in this notebook
for filename in os.listdir(os.path.join(config_temp_generation['input_path'], 'schema_generation')):
    if filename.endswith('.sqlite'):
        db_name = os.path.splitext(filename)[0]
        try:
            temp_generation(db_name, config_temp_generation)
            # Report success per database, as the schema/data batch cells do,
            # so the log shows which databases completed.
            logging.info(f'[Temp Generation]{db_name} success')
        except Exception as e:
            # Log and carry on so one failing database does not stop the batch.
            logging.error(f'[Temp Generation]{db_name} failed: {e}')

In [6]:
# Generate templates for one named database only, without scanning the input
# directory. Errors are not caught here, so a failure surfaces as a traceback.
db_name = 'california_schools'
temp_generation(db_name, config_temp_generation)

2024-08-18 23:29:23,330 - Starting template generation
2024-08-18 23:29:23,332 - california_schools SCHEMA loaded
2024-08-18 23:29:23,335 - california_schools SCHEMA PROMPT loaded
2024-08-18 23:29:23,335 - Template found
2024-08-18 23:29:23,336 - Evidence found
2024-08-18 23:29:23,339 - Human: 
As a professor teaching database courses, generate 5 question templates with corresponding SQL answer templates for examination.

Requirements of question templates:

1, As realistic as possible

2, Evidence must be used when students answer the questions. In the other words, questions should be unsolvable without evidence information.

3, Use `` around table / column names to prevent reserved word conflicts

4, (Optional) You can use `{TABLE}` to indicate any tables, `{COLUMN}` to indicate any columns.
For example, you can have 'find all entries of {COLUMN} from table {TABLE}.' in question and 'SELECT `{COLUMN}` FROM `{TABLE}`' in SQL answer.
If you want to use this, ensure that any table / col

## Question Generation

In [7]:
# Settings for the question-generation stage, taken from the loaded pipeline configuration.
config_ques_generation = pipeline_configs['ques_generation']

In [None]:
# Batch run: generate questions for every database that exists as a `.sqlite`
# file in the `schema_generation` subdirectory of `input_path`.
db_dict = {}  # not read by any cell shown in this notebook
for filename in os.listdir(os.path.join(config_ques_generation['input_path'], 'schema_generation')):
    if filename.endswith('.sqlite'):
        db_name = os.path.splitext(filename)[0]
        try:
            ques_generation(db_name, config_ques_generation)
            # Report success per database, as the schema/data batch cells do,
            # so the log shows which databases completed.
            logging.info(f'[Ques Generation]{db_name} success')
        except Exception as e:
            # Log and carry on so one failing database does not stop the batch.
            logging.error(f'[Ques Generation]{db_name} failed: {e}')

In [8]:
# Generate questions for one named database only, without scanning the input
# directory. Errors are not caught here, so a failure surfaces as a traceback.
db_name = 'california_schools'
ques_generation(db_name, config_ques_generation)

2024-08-19 00:30:15,422 - Starting question generation
2024-08-19 00:30:15,423 - california_schools SCHEMA loaded
2024-08-19 00:30:15,424 - california_schools SCHEMA PROMPT loaded
2024-08-19 00:30:15,425 - Template found
2024-08-19 00:30:15,426 - Find question templates in generated_templates.json
2024-08-19 00:30:15,426 - Find 15 question templates
2024-08-19 00:30:15,860 - 1 executable with result.
2024-08-19 00:30:15,860 - 1 executable with result.
2024-08-19 00:30:16,579 - 1 executable with result.
2024-08-19 00:30:16,707 - 3 executable with result.
2024-08-19 00:30:16,721 - 3 executable with result.
2024-08-19 00:30:16,856 - Generate 9 new questions totally.


## Question Revision

In [5]:
# Settings for the question-revision stage, taken from the loaded pipeline configuration.
config_ques_revision = pipeline_configs['ques_revision']

In [None]:
# Batch run: revise generated questions for every database that exists as a
# `.sqlite` file in the `schema_generation` subdirectory of `input_path`.
#
# Databases are discovered in `schema_generation`, exactly like every other
# batch cell in this notebook. The previous code listed the `ques_revision`
# subdirectory instead and filtered it for `.sqlite` files, which does not
# match where the other stages look for database files.
# NOTE(review): confirm that `ques_revision` expects the same database names
# as the earlier stages.
db_dict = {}  # not read by any cell shown in this notebook
for filename in os.listdir(os.path.join(config_ques_revision['input_path'], 'schema_generation')):
    if filename.endswith('.sqlite'):
        db_name = os.path.splitext(filename)[0]
        try:
            ques_revision(db_name, config_ques_revision)
            # Report success per database, as the schema/data batch cells do.
            logging.info(f'[Ques revision]{db_name} success')
        except Exception as e:
            # Log and carry on so one failing database does not stop the batch.
            logging.error(f'[Ques revision]{db_name} failed: {e}')

In [6]:
# Revise questions for one named database only, without scanning the input
# directory. Errors are not caught here, so a failure surfaces as a traceback.
db_name = 'california_schools'
ques_revision(db_name, config_ques_revision)

2024-08-19 01:01:14,610 - Template found
2024-08-19 01:01:14,610 - Human: 
Given a question-SQL pair, you should:
1, Fill placeholders in the question to correctly describe what the SQL answer does.
2, Rewrite the question to improve its readability

Question: Retrieve the CDSCode and school name of the school with the highest SAT average score in reading.
Evidence: The average SAT score in reading is stored in the `AvgScrRead` column of the `satscores` table.
SQL answer: SELECT `cds`, `sname`
FROM `satscores`
ORDER BY `AvgScrRead` DESC
LIMIT 1;

Return rewroted question directly. NO ANY OTHER OUTPUT

2024-08-19 01:01:14,612 - Got LLM config
2024-08-19 01:01:14,613 - send prmopt to deepseek-coder
2024-08-19 01:01:14,619 - Starting new HTTPS connection (1): api.deepseek.com:443
2024-08-19 01:01:14,856 - https://api.deepseek.com:443 "POST /chat/completions HTTP/11" 200 None
2024-08-19 01:01:15,886 - AI: 
Find the CDS code and name of the school with the highest average SAT reading score.