In [239]:
# 1.detect the language of the question
# 2.get approval of the detected language from the user
# 3.get the question from the user as input
# 4.translate the question to english
# 5.Improve the prompt for detecting the language.
# 6.analyse the question

# Imports

In [240]:
from langgraph.graph import StateGraph, END, START
from typing import TypedDict, List, Annotated, Literal, Optional, Union
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
import operator
from langchain_ollama import ChatOllama
from IPython.display import Image, display
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import JsonOutputParser
import json
from pprint import pprint
import questionary


# States

In [241]:
class LanguageState(TypedDict):
    """Result of language detection, stored under the ``language`` state key.

    The field names match the JSON keys requested in ``prompt_detect_language``.
    """
    primary_language: str  # main language detected for the text
    confidence_score: Union[int, float]  # the prompt asks for a 0-100 score
    alternatives: List[str]  # other candidate languages proposed by the model
    features: List[str]  # linguistic cues that supported the detection

class AppState(TypedDict):
    """State schema handed to ``StateGraph`` in ``create_workflow``.

    Declares only the keys written by ``get_question`` and ``detect_language``.
    """
    question: str  # raw text typed by the user
    language: LanguageState  # parsed output of the language-detection chain

class States(TypedDict):
    """Wider state used in the type hints of the approval-check, translation and analysis nodes.

    NOTE(review): ``create_workflow`` builds the graph from ``AppState``, not
    from this class -- confirm which schema the graph should use before the
    remaining nodes are wired in.
    """
    messages: List[str]  # message history; NOTE(review): plain list with no reducer attached, so nothing accumulates automatically
    question: str  # Original user query
    language: LanguageState  # language-detection result
    approve_language: str  # routing value returned by is_language_approved (expected "YES" or "NO")
    translation: str  # English translation written by translate_question
        
    question_type: Literal["CLEAR", "AMBIGUOUS", "CLARIFICATION_NEEDED"]  # verdict from the analysis prompt
    required_tables: List[str]  # e.g., ['users', 'orders']
    required_columns: List[str] # e.g., ['users.name', 'orders.total_amount']
    ambiguity: Optional[str]    # Description of the ambiguity found
    clarification_questions: List[str] # Questions to ask the user
    # A structured interpretation of the question (e.g., a JSON object)
    analyzed_query_intent: Optional[dict]
    

# NOTE(review): not referenced by any cell visible in this notebook -- confirm whether it is still needed.
static_approve_language = "NO"

# Prompts

### Detect Language

In [242]:
# Prompt template for language detection, rendered by `detect_language` through
# `ChatPromptTemplate.from_template`. `{user_input}` is the only placeholder; the
# doubled braces around the JSON example are escapes for literal braces.
# NOTE(review): in the run recorded at the end of this notebook, a Spanish question
# was labelled "English" with confidence 100 and the reply echoed the English
# feature list below -- the Persian/English priority may be biasing the model; confirm.
prompt_detect_language = """

You are a linguist specializing in Persian (Farsi), Arabic, English, and Spanish languages.

# LANGUAGE PRIORITY:
Prioritize detection for these core languages: Persian (Farsi), English.
If the text clearly matches one of these, it should be the primary detection.

# INSTRUCTIONS:
1. **Primary Language**: Detect the main language, prioritizing Persian/English
2. **Multiple Languages**: If mixed content, identify the dominant language as primary
3. **Alternatives**: Provide 2-3 alternative language possibilities (prioritizing the core languages when plausible)
4. **Features**: List specific linguistic characteristics that helped identification:
   - **Persian**: Right-to-left script, specific characters (پ, چ, گ, ژ, ک), Persian-specific words
   - **English**: Latin script, articles "the/a", common English function words
5. **Confidence**: Score from 0-100% based on text length and clarity
6. **Special Content**: Handle emojis, codes, and mixed content appropriately

# CUSTOMER CONTEXT:
Our primary users are Persian or English speakers discussing business analytics.

# OUTPUT FORMAT:
You MUST output a valid JSON object with the following structure:
{{
  "primary_language": "",
  "confidence_score": 0,
  "alternatives": [],
  "features": []
}}

# TEXT TO ANALYZE:
{user_input}

"""

### Determine Approval

In [243]:
# System prompt that asks the model to classify a free-text confirmation as
# exactly "YES" or "NO". A bare "maybe" is treated as NO throughout, so the
# indicator lists and the ambiguous-case rules no longer contradict each other.
prompt_approval = """
Analyze the user's response to determine if it means YES or NO.

CRITICAL RULES:
1. Respond with ONLY "YES" or "NO" in uppercase letters
2. No additional text, explanations, or punctuation
3. Consider context, tone, and common expressions
4. For ambiguous responses, choose the most probable interpretation

YES INDICATORS (respond with YES):
- Direct affirmatives: yes, yeah, yep, yup, sure, absolutely, definitely, certainly
- Agreement: okay, alright, fine, agreed, of course, by all means
- Positive confirmation: correct, right, exactly, that's right, I agree
- Enthusiastic: absolutely!, definitely!, without a doubt!, certainly!
- Implied yes: "I think so", "probably", "I guess", "why not"

NO INDICATORS (respond with NO):
- Direct negatives: no, nope, nah, never, not at all, absolutely not
- Refusal: I can't, I won't, I don't think so, I'd rather not
- Negative confirmation: incorrect, wrong, that's not right, I disagree
- Hesitation: not really, not exactly, sort of but not really
- Avoidance: maybe later, another time, I'm not sure

AMBIGUOUS CASES:
- "I don't know" → NO (unless context suggests otherwise)
- "Maybe" → NO (a hedge is not a confirmation)
- Sarcasm: Interpret literal meaning despite tone

Respond with only YES or NO:
"""

### Translate to English

In [244]:
# System prompt used by `translate_question`; the user's question is sent as
# the accompanying human message. The opening delimiter is exactly three
# quotes so the prompt text no longer begins with a stray '"' character.
prompt_translate_to_english = """
You are a professional translator. Translate the following text to English.

CRITICAL REQUIREMENTS:
1. Preserve the original meaning, context, and intent exactly
2. Maintain appropriate tone (formal, informal, professional, casual)
3. Handle idioms and cultural references appropriately
4. Respond with ONLY the English translation, no additional text
5. Ensure natural, fluent English output

TRANSLATION GUIDELINES:
- For questions: Keep the interrogative form
- For commands: Maintain imperative tone  
- For statements: Preserve declarative nature
- Handle politeness markers: "please", "thank you", honorifics
- Translate proper names phonetically when appropriate
- Convert currency, measurements, dates to standard formats

SPECIAL CASES:
- Technical terms: Use standard English equivalents
- Slang: Find appropriate English equivalents
- Humor/sarcasm: Preserve the intended effect
- Ambiguous text: Choose most probable meaning

Provide only the English translation.
"""

### Database Schema

In [245]:
# Markdown description of the database schema, injected into
# `prompt_question_analyze` as the `schema` variable by `analyze_question`.
# Each table is listed exactly once: the repeated `playlist` and
# `playlist_track` entries that used to follow `film_category` were removed,
# since they gave the model two differing definitions of the same tables.
prompt_database_schema = """
**Table: artist**
*   `artist_id` (Primary Key)
*   `name`
*   *Description: Musical artists and bands.*

**Table: album**
*   `album_id` (Primary Key)
*   `title`
*   `artist_id` (Foreign Key to `artist.artist_id`)
*   *Description: Albums associated with a specific artist.*

**Table: media_type**
*   `media_type_id` (Primary Key)
*   `name`
*   *Description: Format of the media (e.g., MPEG, Protected AAC).*

**Table: genre**
*   `genre_id` (Primary Key)
*   `name`
*   *Description: Musical genre of a track.*

**Table: track**
*   `track_id` (Primary Key)
*   `name`
*   `album_id` (Foreign Key to `album.album_id`)
*   `media_type_id` (Foreign Key to `media_type.media_type_id`)
*   `genre_id` (Foreign Key to `genre.genre_id`)
*   `composer`
*   `milliseconds` (Duration)
*   `bytes` (Size)
*   `unit_price`
*   *Description: An individual song or musical piece.*

**Table: playlist**
*   `playlist_id` (Primary Key)
*   `name`
*   *Description: A curated collection of tracks.*

**Table: playlist_track**
*   `playlist_id` (Foreign Key to `playlist.playlist_id`, part of Primary Key)
*   `track_id` (Foreign Key to `track.track_id`, part of Primary Key)
*   *Description: Junction table linking playlists to their constituent tracks.*

**Table: employee**
*   `employee_id` (Primary Key)
*   `last_name`
*   `first_name`
*   `title`
*   `reports_to` (Foreign Key to `employee.employee_id`, self-referencing)
*   `birth_date`
*   `hire_date`
*   `address`
*   `city`
*   `state`
*   `country`
*   `postal_code`
*   `phone`
*   `fax`
*   `email`
*   *Description: Company employees, including sales support agents.*

**Table: customer**
*   `customer_id` (Primary Key)
*   `first_name`
*   `last_name`
*   `company`
*   `address`
*   `city`
*   `state`
*   `country`
*   `postal_code`
*   `phone`
*   `fax`
*   `email`
*   `support_rep_id` (Foreign Key to `employee.employee_id`)
*   *Description: Customers who purchase music.*

**Table: invoice**
*   `invoice_id` (Primary Key)
*   `customer_id` (Foreign Key to `customer.customer_id`)
*   `invoice_date`
*   `billing_address`
*   `billing_city`
*   `billing_state`
*   `billing_country`
*   `billing_postal_code`
*   `total`
*   *Description: A sales transaction, containing one or more line items.*

**Table: invoice_line**
*   `invoice_line_id` (Primary Key)
*   `invoice_id` (Foreign Key to `invoice.invoice_id`)
*   `track_id` (Foreign Key to `track.track_id`)
*   `unit_price`
*   `quantity`
*   *Description: Line items detailing the tracks purchased within an invoice.*

**Table: actor**
*   `actor_id` (Primary Key)
*   `first_name`
*   `last_name`
*   *Description: The names of all performers who appear in the films.*

**Table: category**
*   `category_id` (Primary Key)
*   `name`
*   *Description: The `category` table classifies films for organization and filtering.*

**Table: film**
*   `film_id` (Primary Key)
*   `title`
*   `description`
*   `release_year`
*   `rental_duration` (Stores the number of days a movie can be rented before it must be returned.)
*   `rental_rate` (the cost per time period (e.g., daily) for a customer to rent a specific film.)
*   `length` (Duration)
*   `replacement_cost` (the amount the business would pay to buy a new copy of the film if the rented one is lost or damaged.)
*   `rating`
*   `special_features` (bonus materials like trailers, commentaries, or deleted scenes available for a film.)
*   *Description: Stores the core catalog of movies available for rental, including their details, cost, and rating.*

**Table: film_actor**
*   `actor_id` (Foreign Key to `actor.actor_id`, part of Primary Key)
*   `film_id` (Foreign Key to `film.film_id`, part of Primary Key)
*   *Description: Connects actors to films, resolving the many-to-many relationship between them.*

**Table: film_category**
*   `film_id` (Foreign Key to `film.film_id`, part of Primary Key)
*   `category_id` (Foreign Key to `category.category_id`, part of Primary Key)
*   *Description: links films to their categories.*
"""

### Analyze The Question

In [246]:
# Prompt template for question analysis, rendered by `analyze_question` through
# `ChatPromptTemplate.from_template` with the variables `schema` and `question`.
# Doubled braces are escapes for literal braces in the JSON example.
# The example is kept free of `//` comments and has a comma after every member,
# because the model is told to emit valid JSON and the reply is parsed as JSON.
prompt_question_analyze = """
You are an expert at analyzing user questions against a database schema.
Your goal is to understand the user's intent, map it to the database, and identify any ambiguity.

# DATABASE SCHEMA:
{schema}

# INSTRUCTIONS:
1.  Analyze the user's question carefully.
2.  Identify the key entities and metrics they are asking about.
3.  Map these entities to the relevant TABLES and COLUMNS in the schema provided.
4.  **CRITICAL: Determine if the question is CLEAR, AMBIGUOUS, or requires CLARIFICATION.**
    - **CLEAR:** The question has a single, unambiguous interpretation based on the schema. (e.g., "What is the total sales for the Marketing department in January 2024?")
    - **AMBIGUOUS/CLARIFICATION_NEEDED:** The question lacks specific details, uses vague terms, or could refer to multiple things. (e.g., "Show me sales." -> Ambiguous: time period? "Who are the best employees?" -> Ambiguous: defined by sales? tenure? manager review?)

5.  If the question is AMBIGUOUS, you MUST:
    a.  Describe the ambiguity in the 'ambiguity' field.
    b.  Propose 1-3 specific, concise clarification questions to ask the user in the 'clarification_questions' field.

6.  If the question is CLEAR, set 'ambiguity' to null and 'clarification_questions' to an empty list.

# OUTPUT FORMAT:
You MUST output a valid JSON object with the following structure:
{{
    "question_type": "CLEAR" | "AMBIGUOUS" | "CLARIFICATION_NEEDED",
    "required_tables": ["table1", "table2"],
    "required_columns": ["table1.column1", "table2.column2"],
    "ambiguity": "Description of the ambiguity, or null if clear.",
    "clarification_questions": ["Question 1?", "Question 2?"],
    "analyzed_query_intent": {{
        "metric": "What is being measured (e.g., total sales, count of employees)",
        "dimensions": "How it's being broken down (e.g., by department, by month)",
        "filters": "What conditions are applied (e.g., date range, department name)"
    }}
}}

# USER QUESTION TO ANALYZE:
{question}
"""

# Models

In [247]:
# Chat model instance used by the nodes defined in this notebook.
llm = ChatOllama(
    model="qwen3:0.6b",
    reasoning=False,
    temperature=0.1,
)

# Nodes

### Get Question

In [248]:
def get_question(state: AppState) -> AppState:
    """Prompt the user for a question and record it in the state.

    Reads one line from standard input, echoes it back, stores it under
    ``state["question"]`` and returns the same state object it was given.
    """
    print("⚡️ Getting the question")
    asked = input("How can I help you?")
    print(f"📡 Question: {asked}")

    state.update(question=asked)
    return state

### Detect Language

In [249]:
def detect_language(state: AppState) -> AppState:
    """Detect the language of the question and store the result in the state.

    Renders ``prompt_detect_language`` with the user's question, sends it to
    ``llm`` and parses the reply as JSON. On success the parsed object is
    stored under ``state["language"]`` and pretty-printed.

    Any exception raised while building or running the chain is reported and
    swallowed (best effort); in that case ``state["language"]`` is left as it
    was, so callers must not assume the key has been set.

    Returns:
        The same state object that was passed in.
    """
    print("⚡️ Detecting Language...")

    try:
        prompt_template = ChatPromptTemplate.from_template(prompt_detect_language)
        chain = prompt_template | llm | JsonOutputParser()
        detected = chain.invoke({"user_input": state["question"]})
    except Exception as e:
        print(f"⛔️ Language Detection Failed:\n {e}")
    else:
        state["language"] = detected
        # Print the label as plain text and pretty-print only the parsed object:
        # pprint() on an f-string shows a quoted, line-wrapped string repr.
        print("📡 Language Detected:")
        pprint(detected)

    return state
        

### Approve Language

In [250]:
def approve_language(state: AppState) -> AppState:
    """Ask the user to confirm the detected language and record the verdict.

    The free-text answer is classified by ``llm`` using ``prompt_approval``.
    The reply is normalised so that ``state["approve_language"]`` is always
    exactly ``"YES"`` or ``"NO"``; anything other than a clean "YES" counts
    as "NO".

    NOTE(review): ``AppState`` does not declare ``approve_language`` -- confirm
    the graph's state schema carries this key before wiring the node in.

    Returns:
        The same state object that was passed in.
    """
    print("⚡️ Approving Detected Language...")

    user_input = input("Do you confirm detected language?")

    chat = [
        SystemMessage(content=prompt_approval),
        HumanMessage(content=user_input),
    ]
    reply = llm.invoke(chat).content

    # Tolerate case differences, surrounding whitespace and stray punctuation.
    verdict = str(reply).strip().strip(".!\"'").upper()
    approve = "YES" if verdict == "YES" else "NO"

    print(f"📡 Approve Language:\n{approve}")

    state["approve_language"] = approve

    return state
    
    

In [251]:
def is_language_approved(state: States) -> str:
    """Return the user's approval verdict for the detected language.

    Reads ``state["approve_language"]``, logs it and returns it unchanged.
    The commented-out conditional edge in ``create_workflow`` maps "YES" and
    "NO" to the next node.

    Raises:
        KeyError: if ``approve_language`` has not been written to the state.
    """
    approved = state['approve_language']

    print(f"💡 Is Language Approved: {approved}")

    return approved

### Translate Question

In [252]:
def translate_question(state: States) -> States:
    """Translate the user's question into English via the chat model.

    Sends ``prompt_translate_to_english`` as the system message and
    ``state["question"]`` as the human message, prints the reply text, stores
    it under ``state["translation"]`` and returns the same state object.
    """
    print("⚡️ Translating Question...")

    system_msg = SystemMessage(content=prompt_translate_to_english)
    user_msg = HumanMessage(content=state["question"])
    english_text = llm.invoke([system_msg, user_msg]).content

    print(f"📡 Translated Question is:\n{english_text}")

    state["translation"] = english_text
    return state

### Analyze Question

In [253]:
def analyze_question(state: States) -> States:
    """Run the schema-aware analysis prompt on the translated question.

    Renders ``prompt_question_analyze`` with ``prompt_database_schema`` and
    ``state["translation"]``, parses the model reply as JSON, prints it and
    copies each analysis field into the state. A field missing from the
    parsed reply raises ``KeyError``.
    """
    print("⚡️ Analyzing Question...")

    english_question = state["translation"]

    chain = (
        ChatPromptTemplate.from_template(prompt_question_analyze)
        | llm
        | JsonOutputParser()
    )
    analysis = chain.invoke(
        {"schema": prompt_database_schema, "question": english_question}
    )

    print(f"📡 Question Analyzed:\n{analysis}")

    # Copy the analysis fields into the state, in a fixed order.
    analysis_fields = (
        "question_type",
        "required_tables",
        "required_columns",
        "ambiguity",
        "clarification_questions",
        "analyzed_query_intent",
    )
    for field in analysis_fields:
        state[field] = analysis[field]

    return state



# Workflow

In [254]:
def create_workflow():
    """Build and compile the graph: START -> get_question -> detect_language -> END.

    Nodes are registered under their function names and the state schema is
    ``AppState``.

    TODO: the remaining stages are not wired in yet. The intended routing is
    detect_language -> approve_language, then a conditional edge driven by
    ``is_language_approved`` ("YES" -> translate_question, "NO" ->
    detect_language), followed by translate_question -> analyze_question.
    """
    graph = StateGraph(AppState)

    # Register the active nodes, keyed by function name.
    for node in (get_question, detect_language):
        graph.add_node(node.__name__, node)

    # Chain the stages linearly from START to END.
    route = [START, get_question.__name__, detect_language.__name__, END]
    for source, target in zip(route, route[1:]):
        graph.add_edge(source, target)

    return graph.compile()

# Initiation

In [255]:

# Build the compiled graph.
agent = create_workflow()

# Start from an empty state; the first node (get_question) writes "question".
initiate_state = {}

# Run the graph; as the last expression of the cell, the returned state is displayed.
agent.invoke(initiate_state)

⚡️ Getting the question
📡 Question: Hola. ¿Cómo puedo ayudarte?
⚡️ Detecting Language...
('📡 Language Detected:\n'
 "{'primary_language': 'English', 'confidence_score': 100, 'alternatives': [], "
 '\'features\': [\'Latin script\', "Articles \'the/a\'", \'Common English '
 "function words']}")


{'question': 'Hola. ¿Cómo puedo ayudarte?',
 'language': {'primary_language': 'English',
  'confidence_score': 100,
  'alternatives': [],
  'features': ['Latin script',
   "Articles 'the/a'",
   'Common English function words']}}