Parse SQL Schema

In [1]:
import sqlparse
import re

def extract_tables_from_query(query):
    parsed = sqlparse.parse(query)[0]
    tables = []
    for token in parsed.tokens:
        if token.ttype is None and token.get_real_name():
            tables.append(token.get_real_name())
    return list(set(tables))

def parse_schema_file(schema_file_path):
    with open(schema_file_path, 'r') as f:
        schema_sql = f.read()

    tables = {}
    for statement in sqlparse.parse(schema_sql):
        if statement.get_type() == 'CREATE':
            table_name = str(statement.tokens[4])
            columns = []
            inside_parens = str(statement).split('(', 1)[1].rsplit(')', 1)[0]
            for line in inside_parens.split(','):
                col = line.strip().split()[0]
                if col.upper() not in ['PRIMARY', 'FOREIGN', 'CONSTRAINT']:
                    columns.append(col)
            tables[table_name] = columns
    return tables

# Example usage
if __name__ == "__main__":
    query = "SELECT name, email FROM customers WHERE customer_id IN (SELECT customer_id FROM orders WHERE amount > 100);"
    schema = parse_schema_file("sample_schema.sql")
    print("Parsed Schema:", schema)
    print("Tables in Query:", extract_tables_from_query(query))


Parsed Schema: {' ': ['user_id', 'education_id', 'graduation_date', 'degree', "'Bachelor'", "'Master'", "'PhD'", "'Diploma')", 'specialization', 'createdAt', 'updatedAt'], 'TABLE': ['project_id', 'user_id', 'hours', 'user_role', "'Member')", 'user_contribution', 'createdAt', 'updatedAt']}
Tables in Query: ['customers']


SQL to Text

In [2]:
import sqlparse

def explain_sql(query):
    parsed = sqlparse.parse(query)[0]
    tokens = [token for token in parsed.tokens if not token.is_whitespace]

    explanation = []

    for i, token in enumerate(tokens):
        if token.ttype is None and token.get_real_name():
            continue
        if token.ttype is sqlparse.tokens.DML and token.value.upper() == "SELECT":
            explanation.append("This query retrieves")
            columns = str(tokens[i+1]).strip()
            explanation.append(columns)
        elif token.ttype is sqlparse.tokens.Keyword and token.value.upper() == "FROM":
            table = str(tokens[i+1]).strip()
            explanation.append(f"from the '{table}' table")
        elif token.ttype is sqlparse.tokens.Keyword and token.value.upper() == "WHERE":
            condition = str(tokens[i+1]).strip()
            explanation.append(f"where the condition '{condition}' is met")

    return " ".join(explanation)

# Example
if __name__ == "__main__":
    query = """
    SELECT 
    U.full_name AS Full_name, 
    U.email AS Email, 
    UE.degree AS Degree, 
    E.university_name AS University, 
    YEAR(UE.graduation_date) AS Graduation_year, 
    UE.specialization AS Specialization
FROM User U
JOIN User_Education UE ON U.id = UE.user_id
JOIN Education E ON UE.education_id = E.id
WHERE UE.specialization LIKE 'DSAI%';
    """
    print(explain_sql(query))


This query retrieves U.full_name AS Full_name, 
    U.email AS Email, 
    UE.degree AS Degree, 
    E.university_name AS University, 
    YEAR(UE.graduation_date) AS Graduation_year, 
    UE.specialization AS Specialization from the 'User' table


In [3]:
import sqlparse
from transformers import pipeline

class SQLExplainer:
    def __init__(self, use_llm=False):
        self.use_llm = use_llm
        if use_llm:
            self.generator = pipeline("text2text-generation", model="t5-small")

    def template_explain(self, query):
        parsed = sqlparse.parse(query)[0]
        tokens = [token for token in parsed.tokens if not token.is_whitespace]

        explanation = []

        for i, token in enumerate(tokens):
            if token.ttype is None and token.get_real_name():
                continue
            if token.ttype is sqlparse.tokens.DML and token.value.upper() == "SELECT":
                explanation.append("This query retrieves")
                columns = str(tokens[i+1]).strip().replace("\n", " ")
                explanation.append(columns)
            elif token.ttype is sqlparse.tokens.Keyword and token.value.upper() == "FROM":
                table = str(tokens[i+1]).strip()
                explanation.append(f"from the '{table}' table")
            elif token.ttype is sqlparse.tokens.Keyword and token.value.upper() == "WHERE":
                condition = str(tokens[i+1]).strip().replace("\n", " ")
                explanation.append(f"where the condition '{condition}' is met")

        return " ".join(explanation)

    def llm_explain(self, query):
        prompt = f"Explain this SQL query in simple English:\n{query}"
        result = self.generator(prompt, max_length=100, do_sample=False)[0]['generated_text']
        return result

    def explain(self, query):
        if self.use_llm:
            return self.llm_explain(query)
        else:
            return self.template_explain(query)


  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import pipeline

generator = pipeline("text2text-generation", model="t5-small")

def llm_explain(query):
    prompt = f"Explain this SQL query in simple English:\n{query}"
    result = generator(prompt, max_length=100, do_sample=False)[0]['generated_text']
    return result


Device set to use cpu


In [5]:
from explain_sql import SQLExplainer

# Sample Query
query = """
SELECT 
    U.full_name AS Full_name, 
    U.email AS Email, 
    UE.degree AS Degree, 
    E.university_name AS University, 
    YEAR(UE.graduation_date) AS Graduation_year, 
    UE.specialization AS Specialization
FROM User U
JOIN User_Education UE ON U.id = UE.user_id
JOIN Education E ON UE.education_id = E.id
WHERE UE.specialization LIKE 'DSAI%';
"""

if __name__ == "__main__":
    print("=== Template Explanation ===")
    explainer = SQLExplainer(use_llm=False)
    print(explainer.explain(query))

    print("\n=== LLM Explanation ===")
    llm_explainer = SQLExplainer(use_llm=True)
    print(llm_explainer.explain(query))


=== Template Explanation ===
This query retrieves U.full_name AS Full_name,      U.email AS Email,      UE.degree AS Degree,      E.university_name AS University,      YEAR(UE.graduation_date) AS Graduation_year,      UE.specialization AS Specialization from the 'User' table

=== LLM Explanation ===


Device set to use cpu


SELECT U.full_name AS Full_name, U.email AS Email, UE.degree AS Degree, E.university_name AS University, YEAR(UE.graduation_date) AS Graduation_year, UE.specialization AS Specialization FROM User U JOIN User_Education UE ON U.id = UE.user_id JOIN User_Education UE ON


In [6]:
from rag_engine import RAGEngine

sql_query = """
SELECT 
    U.full_name AS Full_name, 
    U.email AS Email, 
    UE.degree AS Degree, 
    E.university_name AS University, 
    YEAR(UE.graduation_date) AS Graduation_year, 
    UE.specialization AS Specialization
FROM User U
JOIN User_Education UE ON U.id = UE.user_id
JOIN Education E ON UE.education_id = E.id
WHERE UE.specialization LIKE 'DSAI%';
"""

engine = RAGEngine("sample_schema.sql", use_llm=True)
print(engine.generate_documentation(sql_query))


Device set to use cpu


: SQL Query: SELECT U.full_name AS Full_name, U.email AS Email, UE.degree AS Degree, E.university_name AS University, YEAR(UE.graduation_date) AS Graduation_year, UE.specialization AS Specialization FROM User U JOIN User_Education UE ON U.id = UE.user_id JOIN Education E ON UE.education_id = E.id


In [8]:
rag = RAGEngine("sample_schema.sql", use_llm=True)
rag.generate_documentation(sql_query)
rag.answer_nl_question("How does the linkedNetwork Expert  work?")


Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (2728 > 512). Running this sequence through the model will result in indexing errors


'KEY (user_id) REFERENCES User (id) ON DELETE CASCADE ON UPDATE CASCADE, FOREIGN KEY (user_id) REFERENCES Project (id) ON DELETE CASCADE ON UPDATE CASCADE, FOREIGN KEY (user_id) REFERENCES Group (id) ON DELETE CASCADE, FOREIGN KEY (user_'