# SQL Generation 

#### Generates SQL for table creation, sample queries and ETL Processing 

The quote from Good Will Hunting (1997) is:

***"My boy's wicked smart."*** – Morgan O'Mally (played by Casey Affleck)

In [25]:
first_intallation = False 
if first_intallation: 
    !pip install --upgrade bottleneck
    !pip install pipreqs
# pipreqs /path/to/your/project --force    
# !pip install numpy==1.24.3
# !pip install --upgrade numpy 

In [26]:
import os
import schedule
from datetime import datetime
import pandas as pd
import numpy as np
import khutilities.file_manager as fm 
import khutilities.quick_logger as ql 
import khutilities.talking_code as tc 
from docx import Document
from docx.shared import Inches
from IPython.display import Video
import time
import story_board as sb 
from IPython.display import Markdown, display, Image
print(f"Libraries Imported successfully on {datetime.now().date()} at {datetime.now().time()}") 

Libraries Imported successfully on 2025-04-07 at 14:04:30.546422


#### Required Setup Step 0 - Intitiate Configuration Settings and name the overall solution

In [27]:
import configparser 
config = configparser.ConfigParser()
cfg = config.read('config.ini')  
solution_name = 'sql_generation'

#### Required Setup Step 0 - Intitiate Logging and debugging 

In [28]:
import logging # built in python library that does not need to be installed 
import khutilities.file_manager as fm 
import khutilities.quick_logger as ql 

global start_stime 
start_time = ql.set_start_time()
logging = ql.create_logger_start(solution_name) 
ql.pvlog('info',f"Process started {solution_name} on Date:{datetime.now().strftime('%m-%d-%Y')} at Time:{datetime.now().strftime('%I:%M:%S %p')} ")

Process started sql_generation on Date:04-07-2025 at Time:02:04:38 PM 


In [29]:
# Create a new Document
report_date_stamp = datetime.now().date()
report_time_stamp = datetime.now().time()
data_story_doc = Document()
data_story_doc.add_heading(f"Data Science Story Board - {solution_name}", level=1)
data_story_doc.add_heading(f"Processed on : {report_date_stamp} at {report_time_stamp}", level=3)

<docx.text.paragraph.Paragraph at 0x24a31292b50>

In [30]:
definition = '''

## SQL Generation 

1. **Create Table**  
2. **SQL Select**  
3. **ETL Process**  

''' 
# Write the solution definitions out to the solution_description.md file
file_name = "generate_sql.md"
with open(file_name, 'w', encoding='utf-8') as f:
    f.write(definition)  # Write the template to the readme.md file

# Display the definition as formatted Markdown in the notebook
display(Markdown(definition))



## SQL Generation 

1. **Create Table**  
2. **SQL Select**  
3. **ETL Process**  



In [31]:
definition = '''



''' 
sb.outmd(definition)







In [32]:
definition = '''
## SQL Generation 

🔹 SQL Generation will generate various forms of SQL based upon the datas descriptive statistics:

1. ✅ **Create Table** → Adds a **table_name** for the table or dataset.
2. ✅ **SQL Select** → Adds a **column_name** for current column, how pandas named the raw data.
3. ✅ **ETL Process** → Adds a column for the **pandas.dtype**, how pandas inferred the raw data.


''' 
sb.outmd(definition)


## SQL Generation 

🔹 SQL Generation will generate various forms of SQL based upon the datas descriptive statistics

1. ✅ **Create Table** → Adds a **table_name** for the table or dataset.
2. ✅ **SQL Select** → Adds a **column_name** for current column, how pandas named the raw data.
3. ✅ **ETL Process** → Adds a column for the **pandas.dtype**, how pandas inferred the raw data.




In [80]:
getting_schema_data = True
if getting_schema_data: 
    
    df = pd.read_excel("schema_statistics.xlsx")    # Read the CSV file into a pandas DataFrame
    print(f'The data contains {df.shape[0]} rows and {df.shape[1]} columns of schema data' )
    print(f"The schema contains {df['Table_Name'].nunique()} tables")
    print(f"The schema contains {df['Column_Name'].count()} column names")    
    df_schema_data = df  


The data contains 94 rows and 38 columns of schema data
The schema contains 6 tables
The schema contains 94 column names


In [67]:
df_schema_data.columns

Index(['Table_Name', 'Column_Name', 'Column_Number', 'Pandas_Data_Type',
       'SQL_Data_Type', 'Likely_Primary_Key', 'Likely_Foreign_Key',
       'Likely_Categorical', 'count', 'mean', 'std', 'min', '25%', '50%',
       '75%', 'max', 'S_Count', 'S_Missing_Values', 'S_Unique_Values',
       'S_Most_Frequent', 'S_Mean', 'S_Standard_Deviation', 'S_Variance',
       'S_Coefficient_of_Variation', 'S_Skewness', 'S_Kurtosis',
       'Q_1_Lower_Quartile', 'Q_2_Median', 'Q_3_Upper_Quartile',
       'Q_4_Top_Quartile_Spread', 'P_10_Percentile', 'P_90_Percentile',
       'S_Interquartile_Range', 'S_Range', 'S_Minimum_Value',
       'S_Maximum_Value', 'Inferred_Column_Description', 'Pandas_Data_Type.1'],
      dtype='object')

In [82]:
def generate_sql_select_all(df_schema, table_name):
    """ SQL select all detail from table
    """

    table_columns = df_schema[df_schema["Table_Name"] == table_name]["Column_Name"] # Filter schema metadata for the specified table
    if table_columns.empty:    # Check if the table exists in the schema
        return f"-- No columns found for table: {table_name}"

    column_list = ",  ".join([f"{col}" for col in table_columns])  # Format column names safely
    sql_query = f"SELECT  {column_list}\nFROM {table_name};"   # Generate the SELECT statement  

    return sql_query 

In [83]:
def generate_sql_count_all(df_schema, table_name):
    """ generate_sql_count_all """ 
    return f"SELECT count(*) FROM {table_name};"

In [84]:
df_schema_data[['Table_Name', 'Column_Name', 'Column_Number', 'Pandas_Data_Type','S_Most_Frequent']].head()

Unnamed: 0,Table_Name,Column_Name,Column_Number,Pandas_Data_Type,S_Most_Frequent
0,Diabetes,Age,1,int64,22.0
1,Diabetes,BMI,2,int64,32.0
2,Diabetes,BloodPressure,3,int64,70.0
3,Diabetes,DiabetesPedigreeFunction,4,int64,0.254
4,Diabetes,Glucose,5,int64,99.0


In [85]:
df_schema_data.columns

Index(['Table_Name', 'Column_Name', 'Column_Number', 'Pandas_Data_Type',
       'SQL_Data_Type', 'Likely_Primary_Key', 'Likely_Foreign_Key',
       'Likely_Categorical', 'count', 'mean', 'std', 'min', '25%', '50%',
       '75%', 'max', 'S_Count', 'S_Missing_Values', 'S_Unique_Values',
       'S_Most_Frequent', 'S_Mean', 'S_Standard_Deviation', 'S_Variance',
       'S_Coefficient_of_Variation', 'S_Skewness', 'S_Kurtosis',
       'Q_1_Lower_Quartile', 'Q_2_Median', 'Q_3_Upper_Quartile',
       'Q_4_Top_Quartile_Spread', 'P_10_Percentile', 'P_90_Percentile',
       'S_Interquartile_Range', 'S_Range', 'S_Minimum_Value',
       'S_Maximum_Value', 'Inferred_Column_Description', 'Pandas_Data_Type.1'],
      dtype='object')

In [86]:
def generate_sql_select_all_filter(df_schema, table_name, column_name):
    """ generate_sql_select_all_filter
    """
    # Filter schema metadata for the specified table
    table_columns = df_schema[df_schema["Table_Name"] == table_name]["Column_Name"]

    # Check if the table exists in the schema
    if table_columns.empty:
        return f"-- No columns found for table: {table_name}"

    # Generate the SELECT statement
    column_list = ",  ".join([f"{col}" for col in table_columns])  # Format column names safely
    sql_query = f"SELECT  {column_list}\nFROM {table_name} " 
    sql_query += f"WHERE {column_name} = \n"    

    return sql_query 

In [87]:
def generate_sql_select_count_all_filter(df_schema, table_name, column_name, column_value):
    """
    """
    sql_query = f"SELECT Count(*) \nFROM {table_name} " 
    sql_query += f"WHERE {column_name} = '{column_value}'\n"    

    return sql_query 

In [88]:
import random
def generate_nlp_select_all(df_schema, table_name):
    """
    Generates a SQL Server SELECT statement for a given table based on df_schema.

    Parameters:
    df_schema (pd.DataFrame): The schema metadata DataFrame.
    table_name (str): The name of the table for which to generate the SELECT statement.

    Returns:
    str: The generated natural language question assoicated with the SQL SELECT statement.
    """
    # Generate the NLP question
    question_random = random.randint(1, 9)
    
    if question_random == 1:
        nlp_question = f"Can you show me all the details from the {table_name} table?"
    elif question_random == 2:
        nlp_question = f"What information is stored in the {table_name} table?"
    elif question_random == 3:
        nlp_question = f"Please provide every detail from the {table_name} table."
    elif question_random == 4:
        nlp_question = f"What do you know about {table_name}?"   
    elif question_random == 5:
        nlp_question = f"Can you list everything in the {table_name} table for me?"     
    elif question_random == 6:
        nlp_question = f"What are all the entries stored in the {table_name} table?"    
    elif question_random == 7:
        nlp_question = f"provide examples of {table_name} ?"           
    elif question_random == 8:
        nlp_question = f"show me examples of {table_name} ?"  
    elif question_random == 9:
        nlp_question = f"I need to see all the content of the {table_name}. Could you show that?"          

    return nlp_question 

In [89]:
# Example Usage
table_name = "Diabetes"  # Replace with your desired table name
sql_statement = generate_sql_select_all(df_schema_data, table_name)

print(sql_statement)   # Print SQL statement

SELECT  Age,  BMI,  BloodPressure,  DiabetesPedigreeFunction,  Glucose,  Insulin,  Outcome,  Pregnancies,  SkinThickness
FROM Diabetes;


In [90]:
df_schema_data.columns

Index(['Table_Name', 'Column_Name', 'Column_Number', 'Pandas_Data_Type',
       'SQL_Data_Type', 'Likely_Primary_Key', 'Likely_Foreign_Key',
       'Likely_Categorical', 'count', 'mean', 'std', 'min', '25%', '50%',
       '75%', 'max', 'S_Count', 'S_Missing_Values', 'S_Unique_Values',
       'S_Most_Frequent', 'S_Mean', 'S_Standard_Deviation', 'S_Variance',
       'S_Coefficient_of_Variation', 'S_Skewness', 'S_Kurtosis',
       'Q_1_Lower_Quartile', 'Q_2_Median', 'Q_3_Upper_Quartile',
       'Q_4_Top_Quartile_Spread', 'P_10_Percentile', 'P_90_Percentile',
       'S_Interquartile_Range', 'S_Range', 'S_Minimum_Value',
       'S_Maximum_Value', 'Inferred_Column_Description', 'Pandas_Data_Type.1'],
      dtype='object')

In [91]:
def generate_sql_select_all(df_schema, table_name):
    return f"SELECT * FROM {table_name};"

def generate_nlp_select_all(df_schema, table_name):
    return f"Show all records from {table_name}."

def generate_sql_select_all_explicit(df_schema, table_name):
    table_columns = df_schema[df_schema["Table_Name"] == table_name]["Column_Name"]
    column_list = ",  ".join([f"{col}" for col in table_columns])  # Format column names safely    
    return f"SELECT {column_list} FROM {table_name};"

def generate_nlp_select_all_explicit(schema, table_name):
    return f"Show detail for all records from {table_name}."

In [92]:
# Initialize the DataFrame for storing training questions
df_training_questions = pd.DataFrame(columns=['Table_Name', 'SQL_Type', 'SQL_Query', 'NLP_Query'])

# Get unique list of tables from schema data
list_of_tables = df_schema_data.Table_Name.unique()

# Generate SQL and NLP queries for each table
for table_number, table_name in enumerate(list_of_tables):
    print(f"Table#{table_number} - Name:{table_name}")
    
    select_all_sql = generate_sql_select_all(df_schema_data, table_name)
    print(f"Select All SQL: {select_all_sql}")
    
    select_all_nlp = generate_nlp_select_all(df_schema_data, table_name)
    print(f"Select All NLP: {select_all_nlp}")
 
    new_row = pd.DataFrame({
        'Table_Name': [table_name], 
        'SQL_Type': ['select all'], 
        'SQL_Query': [select_all_sql], 
        'NLP_Query': [select_all_nlp]
    })
    df_training_questions = pd.concat([df_training_questions, new_row], ignore_index=True)

Table#0 - Name:Diabetes
Select All SQL: SELECT * FROM Diabetes;
Select All NLP: Show all records from Diabetes.
Table#1 - Name:Health_Care
Select All SQL: SELECT * FROM Health_Care;
Select All NLP: Show all records from Health_Care.
Table#2 - Name:Penguin
Select All SQL: SELECT * FROM Penguin;
Select All NLP: Show all records from Penguin.
Table#3 - Name:Titanic
Select All SQL: SELECT * FROM Titanic;
Select All NLP: Show all records from Titanic.
Table#4 - Name:Personnel
Select All SQL: SELECT * FROM Personnel;
Select All NLP: Show all records from Personnel.
Table#5 - Name:Network_Provider_Patient
Select All SQL: SELECT * FROM Network_Provider_Patient;
Select All NLP: Show all records from Network_Provider_Patient.


In [93]:
df_training_questions.shape

(6, 4)

In [111]:
df_titanic_columns = df_schema_data[df_schema_data["Table_Name"] == 'Titanic']
df_titanic_columns.head(100)                          

Unnamed: 0,Table_Name,Column_Name,Column_Number,Pandas_Data_Type,SQL_Data_Type,Likely_Primary_Key,Likely_Foreign_Key,Likely_Categorical,count,mean,...,Q_3_Upper_Quartile,Q_4_Top_Quartile_Spread,P_10_Percentile,P_90_Percentile,S_Interquartile_Range,S_Range,S_Minimum_Value,S_Maximum_Value,Inferred_Column_Description,Pandas_Data_Type.1
33,Titanic,Age,1,int64,BIGINT,False,False,False,1046,29.881138,...,39.0,41.0,14.0,50.0,18.0,79.83,0.17,80.0,Table Titanic Column Age of type BIGINT and ma...,int64
34,Titanic,Age_wiki,2,float64,FLOAT,False,False,False,1302,29.415829,...,37.75,36.25,15.0,48.0,16.75,73.83,0.17,74.0,Table Titanic Column Age_wiki of type FLOAT an...,float64
35,Titanic,Boarded,3,int64,BIGINT,False,False,True,1304,,...,,,,,,,Belfast,,Table Titanic Column Boarded of type BIGINT an...,int64
36,Titanic,Body,4,object,VARCHAR(255),False,False,False,130,,...,,,,,,,101MB,,Table Titanic Column Body of type VARCHAR(255)...,object
37,Titanic,Cabin,5,object,VARCHAR(255),False,False,False,295,,...,,,,,,,A10,,Table Titanic Column Cabin of type VARCHAR(255...,object
38,Titanic,Class,6,float64,FLOAT,False,False,True,1304,2.291411,...,3.0,0.0,1.0,3.0,1.25,2.0,1,3.0,Table Titanic Column Class of type FLOAT and m...,float64
39,Titanic,Destination,7,int64,BIGINT,False,False,False,1304,,...,,,,,,,"Aberdeen, South Dakota, US",,Table Titanic Column Destination of type BIGIN...,int64
40,Titanic,Embarked,8,int64,BIGINT,False,False,True,1307,,...,,,,,,,C,,Table Titanic Column Embarked of type BIGINT a...,int64
41,Titanic,Fare,9,object,VARCHAR(255),False,False,False,1308,33.295479,...,31.275,481.0542,7.5675,78.05082,23.3792,512.3292,0,512.3292,Table Titanic Column Fare of type VARCHAR(255)...,object
42,Titanic,Hometown,10,float64,FLOAT,False,False,False,1304,,...,,,,,,,"Abbeyleix, Laois, Ireland[note 1]",,Table Titanic Column Hometown of type FLOAT an...,float64


In [None]:
df_schema_data.columns

In [108]:
df_schema_data.head(100)

Unnamed: 0,Table_Name,Column_Name,Column_Number,Pandas_Data_Type,SQL_Data_Type,Likely_Primary_Key,Likely_Foreign_Key,Likely_Categorical,count,mean,...,Q_3_Upper_Quartile,Q_4_Top_Quartile_Spread,P_10_Percentile,P_90_Percentile,S_Interquartile_Range,S_Range,S_Minimum_Value,S_Maximum_Value,Inferred_Column_Description,Pandas_Data_Type.1
0,Diabetes,Age,1,int64,BIGINT,False,False,False,768,3.324089e+01,...,4.100000e+01,4.000000e+01,2.200000e+01,5.100000e+01,1.700000e+01,6.000000e+01,21,8.100000e+01,Table Diabetes Column Age of type BIGINT and m...,int64
1,Diabetes,BMI,2,int64,BIGINT,False,False,False,768,3.199258e+01,...,3.660000e+01,3.050000e+01,2.360000e+01,4.150000e+01,9.300000e+00,6.710000e+01,0,6.710000e+01,Table Diabetes Column BMI of type BIGINT and m...,int64
2,Diabetes,BloodPressure,3,int64,BIGINT,False,False,False,768,6.910547e+01,...,8.000000e+01,4.200000e+01,5.400000e+01,8.800000e+01,1.800000e+01,1.220000e+02,0,1.220000e+02,Table Diabetes Column BloodPressure of type BI...,int64
3,Diabetes,DiabetesPedigreeFunction,4,int64,BIGINT,False,True,False,768,4.718763e-01,...,6.262500e-01,1.793750e+00,1.650000e-01,8.786000e-01,3.825000e-01,2.342000e+00,0.078,2.420000e+00,Table Diabetes Column DiabetesPedigreeFunction...,int64
4,Diabetes,Glucose,5,int64,BIGINT,False,False,False,768,1.208945e+02,...,1.402500e+02,5.875000e+01,8.500000e+01,1.670000e+02,4.125000e+01,1.990000e+02,0,1.990000e+02,Table Diabetes Column Glucose of type BIGINT a...,int64
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
89,Network_Provider_Patient,RACE,28,object,VARCHAR(255),False,False,True,967,,...,,,,,,,American Indian or Alaska Native,,Table Network_Provider_Patient Column RACE of ...,object
90,Network_Provider_Patient,SPONSORING_PROVIDER_NAME,29,object,VARCHAR(255),False,False,False,211,,...,,,,,,,,,Table Network_Provider_Patient Column SPONSORI...,object
91,Network_Provider_Patient,SPONSORING_PROVIDER_NPI,30,int64,BIGINT,False,False,False,205,1.563360e+09,...,1.720088e+09,2.726689e+08,1.229265e+09,1.873633e+09,2.523934e+08,9.894445e+08,1003312455,1.992757e+09,Table Network_Provider_Patient Column SPONSORI...,int64
92,Network_Provider_Patient,STATE,31,object,VARCHAR(255),False,False,True,992,,...,,,,,,,FL,,Table Network_Provider_Patient Column STATE of...,object


In [95]:
# Generate SQL query for each column in each table based on the most frequent value
for index, row in df_schema_data.iterrows():
    table_name = row['Table_Name']
    column_name = row['Column_Name']
    most_frequent_value = row['S_Most_Frequent']
    
    # Check if the most frequent value is a string and needs to be quoted
    if isinstance(most_frequent_value, str):
        most_frequent_value = f"'{most_frequent_value}'"
    
    # Generate SQL query
    sql_query = f"SELECT COUNT(*) FROM {table_name} WHERE {column_name} = {most_frequent_value};"
    nlp_query = f"HOW MANY RECORDS FROM {table_name} WHERE {column_name} is {most_frequent_value};"
    
    new_row = pd.DataFrame({
        'Table_Name': [table_name], 
        'SQL_Type': ['select count of filter'], 
        'SQL_Query': [sql_query], 
        'NLP_Query': [nlp_query]
    })
    df_training_questions = pd.concat([df_training_questions, new_row], ignore_index=True)

In [96]:
df_training_questions.shape

(100, 4)

In [102]:
# Generate SQL query for each column in each table based on the most frequent value
for index, row in df_schema_data.iterrows():
    table_name = row['Table_Name']
    column_name = row['Column_Name']
    likely_categorical = row['Likely_Categorical']
    
    if table_name == 'Titanic' : 
        nlp_column = 'passengers'
        nlp_from_column = 'PassengerId' 
    
    
    if likely_categorical and table_name == 'Titanic' : 
        categories = df_titanic[column_name].unique()
        
        if len(categories) < 10: 
            for category_value in categories:  
    
                if isinstance(category_value, str):
                    category_value = f"'{category_value}'"
                

                # Generate SQL query
                sql_query = f"SELECT COUNT(distinct {nlp_from_column}) FROM {table_name} WHERE {column_name} = {category_value};"
                nlp_query = f"How many {nlp_column} where {column_name} is {category_value};"
    
    new_row = pd.DataFrame({
        'Table_Name': [table_name], 
        'SQL_Type': ['select count of category'], 
        'SQL_Query': [sql_query], 
        'NLP_Query': [nlp_query]
    })
    df_training_questions = pd.concat([df_training_questions, new_row], ignore_index=True)

In [103]:
df_training_questions.shape

(299, 4)

In [106]:
df_training_questions.tail(50)

Unnamed: 0,Table_Name,SQL_Type,SQL_Query,NLP_Query
249,Titanic,select count of category,SELECT COUNT(*) FROM Titanic WHERE Embarked = ...,HOW MANY RECORDS FROM Titanic WHERE Embarked i...
250,Titanic,select count of category,SELECT COUNT(*) FROM Titanic WHERE Embarked = ...,HOW MANY RECORDS FROM Titanic WHERE Embarked i...
251,Titanic,select count of category,SELECT COUNT(*) FROM Titanic WHERE Parch = 9;,HOW MANY RECORDS FROM Titanic WHERE Parch is 9;
252,Titanic,select count of category,SELECT COUNT(*) FROM Titanic WHERE Parch = 9;,HOW MANY RECORDS FROM Titanic WHERE Parch is 9;
253,Titanic,select count of category,SELECT COUNT(*) FROM Titanic WHERE Pclass = 2;,HOW MANY RECORDS FROM Titanic WHERE Pclass is 2;
254,Titanic,select count of category,SELECT COUNT(*) FROM Titanic WHERE Sex = 'fema...,HOW MANY RECORDS FROM Titanic WHERE Sex is 'fe...
255,Titanic,select count of category,SELECT COUNT(*) FROM Titanic WHERE SibSp = 8;,HOW MANY RECORDS FROM Titanic WHERE SibSp is 8;
256,Titanic,select count of category,SELECT COUNT(*) FROM Titanic WHERE Survived = ...,HOW MANY RECORDS FROM Titanic WHERE Survived i...
257,Titanic,select count of category,SELECT COUNT(*) FROM Titanic WHERE Survived = ...,HOW MANY RECORDS FROM Titanic WHERE Survived i...
258,Titanic,select count of category,SELECT COUNT(*) FROM Titanic WHERE Survived = ...,HOW MANY RECORDS FROM Titanic WHERE Survived i...


In [73]:
df_schema_data.head(10)

Unnamed: 0,Table_Name,Column_Name,Column_Number,Pandas_Data_Type,SQL_Data_Type,Likely_Primary_Key,Likely_Foreign_Key,Likely_Categorical,count,mean,...,Q_3_Upper_Quartile,Q_4_Top_Quartile_Spread,P_10_Percentile,P_90_Percentile,S_Interquartile_Range,S_Range,S_Minimum_Value,S_Maximum_Value,Inferred_Column_Description,Pandas_Data_Type.1
0,Diabetes,Age,1,int64,BIGINT,False,False,False,768,33.240885,...,41.0,40.0,22.0,51.0,17.0,60.0,21,81.0,Table Diabetes Column Age of type BIGINT and m...,int64
1,Diabetes,BMI,2,int64,BIGINT,False,False,False,768,31.992578,...,36.6,30.5,23.6,41.5,9.3,67.1,0,67.1,Table Diabetes Column BMI of type BIGINT and m...,int64
2,Diabetes,BloodPressure,3,int64,BIGINT,False,False,False,768,69.105469,...,80.0,42.0,54.0,88.0,18.0,122.0,0,122.0,Table Diabetes Column BloodPressure of type BI...,int64
3,Diabetes,DiabetesPedigreeFunction,4,int64,BIGINT,False,True,False,768,0.471876,...,0.62625,1.79375,0.165,0.8786,0.3825,2.342,0.078,2.42,Table Diabetes Column DiabetesPedigreeFunction...,int64
4,Diabetes,Glucose,5,int64,BIGINT,False,False,False,768,120.894531,...,140.25,58.75,85.0,167.0,41.25,199.0,0,199.0,Table Diabetes Column Glucose of type BIGINT a...,int64
5,Diabetes,Insulin,6,float64,FLOAT,False,False,False,768,79.799479,...,127.25,718.75,0.0,210.0,127.25,846.0,0,846.0,Table Diabetes Column Insulin of type FLOAT an...,float64
6,Diabetes,Outcome,7,float64,FLOAT,False,False,True,768,0.348958,...,1.0,0.0,0.0,1.0,1.0,1.0,0,1.0,Table Diabetes Column Outcome of type FLOAT an...,float64
7,Diabetes,Pregnancies,8,int64,BIGINT,False,False,True,768,3.845052,...,6.0,11.0,0.0,9.0,5.0,17.0,0,17.0,Table Diabetes Column Pregnancies of type BIGI...,int64
8,Diabetes,SkinThickness,9,int64,BIGINT,False,False,False,768,20.536458,...,32.0,67.0,0.0,40.0,32.0,99.0,0,99.0,Table Diabetes Column SkinThickness of type BI...,int64
9,Health_Care,Admission Type,1,object,VARCHAR(255),False,False,True,55500,,...,,,,,,,Elective,,Table Health_Care Column Admission Type of typ...,object


In [70]:
df_training_questions.head(100)

Unnamed: 0,Table_Name,SQL_Type,SQL_Query,NLP_Query
0,Diabetes,select all,SELECT * FROM Diabetes;,Show all records from Diabetes.
1,Health_Care,select all,SELECT * FROM Health_Care;,Show all records from Health_Care.
2,Penguin,select all,SELECT * FROM Penguin;,Show all records from Penguin.
3,Titanic,select all,SELECT * FROM Titanic;,Show all records from Titanic.
4,Personnel,select all,SELECT * FROM Personnel;,Show all records from Personnel.
...,...,...,...,...
95,Network_Provider_Patient,select count of filter,SELECT COUNT(*) FROM Network_Provider_Patient ...,HOW MANY RECORDS FROM Network_Provider_Patient...
96,Network_Provider_Patient,select count of filter,SELECT COUNT(*) FROM Network_Provider_Patient ...,HOW MANY RECORDS FROM Network_Provider_Patient...
97,Network_Provider_Patient,select count of filter,SELECT COUNT(*) FROM Network_Provider_Patient ...,HOW MANY RECORDS FROM Network_Provider_Patient...
98,Network_Provider_Patient,select count of filter,SELECT COUNT(*) FROM Network_Provider_Patient ...,HOW MANY RECORDS FROM Network_Provider_Patient...


In [75]:
df_titanic = pd.read_csv('titanic.csv')

In [76]:
df_titanic.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked,WikiId,Name_wiki,Age_wiki,Hometown,Boarded,Destination,Lifeboat,Body,Class
0,1,0.0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,...,S,691.0,"Braund, Mr. Owen Harris",22.0,"Bridgerule, Devon, England",Southampton,"Qu'Appelle Valley, Saskatchewan, Canada",,,3.0
1,2,1.0,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,...,C,90.0,"Cumings, Mrs. Florence Briggs (née Thayer)",35.0,"New York, New York, US",Cherbourg,"New York, New York, US",4,,1.0
2,3,1.0,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,...,S,865.0,"Heikkinen, Miss Laina",26.0,"Jyväskylä, Finland",Southampton,New York City,14?,,3.0
3,4,1.0,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,...,S,127.0,"Futrelle, Mrs. Lily May (née Peel)",35.0,"Scituate, Massachusetts, US",Southampton,"Scituate, Massachusetts, US",D,,1.0
4,5,0.0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,...,S,627.0,"Allen, Mr. William Henry",35.0,"Birmingham, West Midlands, England",Southampton,New York City,,,3.0


In [74]:
def add_category_filters(table_name)

# Initialize an empty DataFrame to store SQL and NLP queries
df_training_questions = pd.DataFrame(columns=['Table_Name', 'SQL_Type', 'SQL_Query', 'NLP_Query'])

# Process each row in schema data to generate queries
for index, row in df_schema_data.iterrows():
    if row['Likely_Categorical'] == True:  # Adjusted for Boolean True
        table_name = row['Table_Name']
        column_name = row['Column_Name']
        num_unique = unique_counts[table_name].get(column_name, 0)

        if num_unique <= 10:
            # Construct the SQL query
            sql_query = f"SELECT COUNT(*) FROM {table_name} WHERE {column_name} = '{row['S_Most_Frequent']}'"
            nlp_query = f"How many records from {table_name} where {column_name} = '{row['S_Most_Frequent']}'?"

            # Append to the DataFrame
            new_row = pd.DataFrame({
                'Table_Name': [table_name], 
                'SQL_Type': ['select count of filter'], 
                'SQL_Query': [sql_query], 
                'NLP_Query': [nlp_query]
            })
            df_training_questions = pd.concat([df_training_questions, new_row], ignore_index=True)

# Display the resulting DataFrame
print(df_training_questions)


  Table_Name                SQL_Type  \
0     table1  select count of filter   
1     table2  select count of filter   
2     table2  select count of filter   

                                           SQL_Query  \
0  SELECT COUNT(*) FROM table1 WHERE category1 = ...   
1  SELECT COUNT(*) FROM table2 WHERE category1 = ...   
2  SELECT COUNT(*) FROM table2 WHERE category2 = 'B'   

                                           NLP_Query  
0  How many records from table1 where category1 =...  
1  How many records from table2 where category1 =...  
2  How many records from table2 where category2 =...  


In [None]:
import pandas as pd

# Sample schema data
df_schema_data = pd.DataFrame({
    'Table_Name': ['table1', 'table1', 'table2', 'table2'],
    'Column_Name': ['category1', 'category2', 'category1', 'category2'],
    'Likely_Categorical': [True, False, True, True],  # Now using Boolean True/False
    'S_Most_Frequent': [10, 'A', 20, 'B']  # Assuming these are the most frequent values for demonstration
})

# Sample actual data unique counts (simulated here, replace with actual data fetching logic)
unique_counts = {
    'table1': {'category1': 8, 'category2': 15},
    'table2': {'category1': 10, 'category2': 9}
}

# Initialize an empty DataFrame to store SQL and NLP queries
df_training_questions = pd.DataFrame(columns=['Table_Name', 'SQL_Type', 'SQL_Query', 'NLP_Query'])

# Process each row in schema data to generate queries
for index, row in df_schema_data.iterrows():
    if row['Likely_Categorical'] == True:  # Adjusted for Boolean True
        table_name = row['Table_Name']
        column_name = row['Column_Name']
        num_unique = unique_counts[table_name].get(column_name, 0)

        if num_unique <= 10:
            # Construct the SQL query
            sql_query = f"SELECT COUNT(*) FROM {table_name} WHERE {column_name} = '{row['S_Most_Frequent']}'"
            nlp_query = f"How many records from {table_name} where {column_name} = '{row['S_Most_Frequent']}'?"

            # Append to the DataFrame
            new_row = pd.DataFrame({
                'Table_Name': [table_name], 
                'SQL_Type': ['select count of filter'], 
                'SQL_Query': [sql_query], 
                'NLP_Query': [nlp_query]
            })
            df_training_questions = pd.concat([df_training_questions, new_row], ignore_index=True)

# Display the resulting DataFrame
print(df_training_questions)


In [43]:
df_training_questions = pd.dataframe([['Table_Name','SQL_Type','SQL_Query','NLP_Query']])
list_of_tables = df_schema_data.Table_Name.unique()
for table_number, table_name in enumerate(list_of_tables):
    print(f"Table#{table_number} - Name:{table_name}  ")
    select_all = generate_sql_select_all(df_schema_data, table_name)
    print(f"Select All SQL : {select_all} \n")
    select_all_nlp = generate_nlp_select_all(df_schema_data, table_name)
    df_training_questions.loc = table_name, "select all", select_all, select_all_nlp
    print(f"Select All NLP : {select_all_nlp} \n ")  
    
    
    select_count_all = generate_sql_count_all(df_schema_data, table_name)  
    print(f"Select Count All : {select_count_all} \n ")
    select_all = generate_sql_count_all(df_schema_data, table_name)      

Table#0 - Name:Diabetes  
Select All SQL : SELECT    Age,  BMI,  BloodPressure,  DiabetesPedigreeFunction,  Glucose,  Insulin,  Outcome,  Pregnancies,  SkinThickness
FROM Diabetes; 
 
Select All NLP : What information is stored in the Diabetes table? 
 
Select Count All : SELECT count(*) FROM Diabetes; 
 
Table#1 - Name:Health_Care  
Select All SQL : SELECT    Admission Type,  Age,  Billing Amount,  Blood Type,  Date of Admission,  Discharge Date,  Doctor,  Gender,  Hospital,  Insurance Provider,  Medical Condition,  Medication,  Name,  Room Number,  Test Results
FROM Health_Care; 
 
Select All NLP : Please provide every detail from the Health_Care table. 
 
Select Count All : SELECT count(*) FROM Health_Care; 
 
Table#2 - Name:Penguin  
Select All SQL : SELECT    Unnamed: 0,  bill_depth_mm,  bill_length_mm,  body_mass_g,  flipper_length_mm,  island,  sex,  species,  year
FROM Penguin; 
 
Select All NLP : Can you list everything in the Penguin table for me? 
 
Select Count All : SELECT

In [21]:
df_schema_data = pd.read_excel("schema_statistics.xlsx")
print(f"read schema data and found statistics on {df_schema_data.Table_Name.nunique()} tables")

read schema data and found statistics on 6 tables


In [22]:
table = df_schema_data.Table_Name.unique()
df_schema_data.head(2)

Unnamed: 0,Table_Name,Column_Name,Column_Number,Pandas_Data_Type,SQL_Data_Type,Likely_Primary_Key,Likely_Foreign_Key,Likely_Categorical,count,mean,...,Q_3_Upper_Quartile,Q_4_Top_Quartile_Spread,P_10_Percentile,P_90_Percentile,S_Interquartile_Range,S_Range,S_Minimum_Value,S_Maximum_Value,Inferred_Column_Description,Pandas_Data_Type.1
0,Diabetes,Age,1,int64,BIGINT,False,False,False,768,33.240885,...,41.0,40.0,22.0,51.0,17.0,60.0,21,81.0,Table Diabetes Column Age of type BIGINT and m...,int64
1,Diabetes,BMI,2,int64,BIGINT,False,False,False,768,31.992578,...,36.6,30.5,23.6,41.5,9.3,67.1,0,67.1,Table Diabetes Column BMI of type BIGINT and m...,int64


In [None]:
import os
report_date_stamp = datetime.now().date()
word_file_path = os.path.join(f"{solution_name}_{report_date_stamp}_storyboard.docx")
data_story_doc.save(word_file_path)    
ql.pvlog('info',f"Storyboard Created : {solution_name} ")  

## Step 0 - Process End - display log

In [None]:
# Calculate and classify the process performance 
status = ql.calculate_process_performance(solution_name, start_time) 
print(ql.append_log_file(solution_name))  

#### https://github.com/JoeEberle/ -- josepheberle@outlook.com