In [1]:
from dotenv import load_dotenv
import os
load_dotenv()  # Loads variables from .env
api_key = os.environ.get("GOOGLE_API_KEY")
# Report only whether the key was found: cell output is saved together with the
# notebook, so printing the value would leak the secret to anyone who can read
# the file.
print("Loaded API Key:", "[REDACTED]" if api_key else None)

Loaded API Key: [REDACTED]


In [10]:
import re
from urllib.parse import quote_plus

import pandas as pd
from langchain.prompts import PromptTemplate
from langchain_community.utilities import SQLDatabase
from langchain_google_genai import GoogleGenerativeAI
from sqlalchemy import create_engine

# ✅ Clean SQL helper
def clean_sql_query(query: str) -> str:
    """Strip surrounding whitespace and Markdown code fences from a SQL string.

    Handles an opening fence with or without a language tag (```` ``` ````,
    ```` ```sql ````, ```` ```SQL ````, ```` ```mysql ````) and a closing
    ```` ``` ```` fence. Text without fences is returned stripped but
    otherwise unchanged.

    Args:
        query: Raw text expected to contain a single SQL statement.

    Returns:
        The statement with the fences and outer whitespace removed.
    """
    query = query.strip()
    if query.startswith("```"):
        query = query[3:]
        # Drop an optional language tag right after the opening fence. The
        # comparison is case-insensitive and "mysql" is tried before "sql" so
        # the longer tag is matched first.
        lowered = query.lower()
        for tag in ("mysql", "sql"):
            if lowered.startswith(tag):
                query = query[len(tag):]
                break
        query = query.strip()
    if query.endswith("```"):
        query = query[:-3].strip()
    return query

# ✅ Connect to MySQL
# Connection settings are read from the environment (load_dotenv() in the first
# cell loads them from .env) so that no password is stored in the notebook.
# Only the password is mandatory; the other settings have non-secret defaults.
db_user = os.environ.get("MYSQL_USER", "root")
db_password = os.environ.get("MYSQL_PASSWORD")
if db_password is None:
    raise RuntimeError(
        "MYSQL_PASSWORD is not set; add it to your .env file or environment."
    )
db_host = os.environ.get("MYSQL_HOST", "localhost")
db_port = os.environ.get("MYSQL_PORT", "3306")
db_name = os.environ.get("MYSQL_DATABASE", "new_journey_2025")

# quote_plus escapes characters such as '@', ':' or '/' in the credentials that
# would otherwise be misread as URL delimiters.
db_uri = (
    f"mysql+pymysql://{quote_plus(db_user)}:{quote_plus(db_password)}"
    f"@{db_host}:{db_port}/{db_name}"
)
db = SQLDatabase.from_uri(
    database_uri=db_uri,
    include_tables=["shop"]
)
engine = create_engine(db_uri)

# ✅ Create LLM
# temperature=0 keeps SQL generation as repeatable as the model allows, so
# re-running the notebook should produce the same query for the same question.
llm = GoogleGenerativeAI(model="gemini-2.0-flash", temperature=0)

# ✅ Custom prompt with one few-shot example. The example computes the
# per-article average price in a derived table and joins it back to `shop`.
# Two placeholders are filled in when the prompt is formatted:
#   {table_info} - schema text for the tables the model may use
#   {input}      - the user's natural-language question
template = """
You are an expert in converting natural language questions to SQL queries for MySQL.

Rules:
- Do not use markdown formatting (no ```sql ... ```).
- Only return raw SQL query.
- Use only the provided tables: {table_info}.

Example:
Question: Find the dealers who offer a price higher than the average price of their articles.
SQL Query: SELECT T1.dealer FROM shop AS T1 JOIN (SELECT article, AVG(price) AS avg_price FROM shop GROUP BY article) AS T2 ON T1.article = T2.article WHERE T1.price > T2.avg_price;

Question: {input}
SQL Query:
"""
# Wrap the text in a PromptTemplate; it is later formatted with `input` and
# `table_info` keyword arguments.
prompt = PromptTemplate.from_template(template)

# Leading keyword of a statement, skipping whitespace and opening parentheses
# so that "(SELECT ...) UNION (SELECT ...)" is recognised as a SELECT.
_LEADING_SQL_KEYWORD = re.compile(r"[\s(]*([A-Za-z]+)")

# Statement types that return rows and are expected to be read-only.
_READ_ONLY_SQL_KEYWORDS = frozenset({"select", "with", "show", "describe", "desc"})


def _is_read_only_sql(sql: str) -> bool:
    """Return True when ``sql`` begins with one of the read-only keywords."""
    match = _LEADING_SQL_KEYWORD.match(sql)
    return match is not None and match.group(1).lower() in _READ_ONLY_SQL_KEYWORDS


# ✅ Main function to handle the entire process
def process_query_to_dataframe(user_query: str) -> None:
    """Turn a natural-language question into SQL, run it, and print the result.

    Steps: format the prompt with the question and the table schema, ask the
    LLM for a SQL statement, strip Markdown fences from the reply, then execute
    the statement with pandas and print the resulting rows.

    The generated SQL is model output and therefore untrusted. It is executed
    only when it starts with a read-only keyword (SELECT, WITH, SHOW,
    DESCRIBE/DESC). This is a best-effort guard, not a substitute for
    connecting with a least-privilege, read-only database account.

    Args:
        user_query: The question to answer, in natural language.

    Returns:
        None. The generated SQL, the result table, and any refusal or error
        message are written to stdout.
    """
    # 1. Manually format the prompt
    full_prompt = prompt.format(
        input=user_query,
        table_info=db.get_table_info()
    )

    # 2. Invoke the LLM to get the raw SQL query
    raw_sql_query = llm.invoke(full_prompt)

    # 3. Clean the query before attempting to run it
    cleaned_sql_query = clean_sql_query(raw_sql_query)

    print("Generated and cleaned SQL query:", cleaned_sql_query)

    # 4. Refuse anything that is empty or does not look like a read-only query
    if not _is_read_only_sql(cleaned_sql_query):
        print(
            "Refusing to execute: the generated statement is empty "
            "or is not a read-only query."
        )
        return

    try:
        # 5. Use pandas.read_sql_query to run the cleaned query and get a DataFrame
        df = pd.read_sql_query(cleaned_sql_query, engine)

        print("AI SQL Answer:")
        print(df.to_string(index=False))

    except Exception as e:
        # Deliberately broad: a bad generated query should be reported, not
        # abort the loop over the remaining questions.
        print(f"An error occurred during query execution: {e}")
        # Print the problematic query for debugging
        print(f"Problematic Query: '{cleaned_sql_query}'")

# ✅ Run Queries
# Natural-language questions to send through the text-to-SQL pipeline.
queries = [
    "For each dealer, find how many articles they sell below the average price of that article."
]

for q in queries:
    # Echo the question after a blank line, then print its generated SQL and answer.
    print(f"\nUser: {q}")
    process_query_to_dataframe(q)


User: For each dealer, find how many articles they sell below the average price of that article.
Generated and cleaned SQL query: SELECT dealer, count(*) FROM shop AS T1 JOIN (SELECT article, avg(price) AS avg_price FROM shop GROUP BY article) AS T2 ON T1.article = T2.article WHERE T1.price < T2.avg_price GROUP BY dealer
AI SQL Answer:
dealer  count(*)
     A         1
     B         1
     D         1
