In [None]:
# Run the tests.test_expressions module with the standard-library unittest runner.
!python -m unittest tests.test_expressions


In [2]:
# Run the tests.test_parser module with the standard-library unittest runner.
# NOTE(review): the recorded output shows one failure (F) and then an interrupt (^C) — re-run to completion.
!python -m unittest tests.test_parser

...............F^C


In [1]:
# List every regular file ending in .py under the current directory, sorted by path.
!find . -type f -name "*.py" | sort

./1.py
./2.py
./3.py
./__init__.py
./m3.py
./maiaas22222n.py
./main.py
./src/ast/ast_builder.py
./src/ast/ast_processor.py
./src/ast/expression_ast.py
./src/ast/expression_optimizer.py
./src/ast/__init__.py
./src/ast/match_recognize_ast.py
./src/ast/pattern_ast.py
./src/ast/pattern_optimizer.py
./src/ast/validator.py
./src/ast/visitor.py
./src/evaluator/evaluation_engine.py
./src/grammar/__init__.py
./src/grammar/TrinoLexer.py
./src/grammar/TrinoParserListener.py
./src/grammar/TrinoParser.py
./src/grammar/TrinoParserVisitor.py
./src/__init__.py
./src/parser/antlr_parser.py
./src/parser/config.py
./src/parser/expression_parser.py
./src/parser/__init__.py
./src/parser/match_recognize_parser.py
./src/parser/parser_util.py
./src/parser/parse_tree.py
./src/parser/pattern_parser.py
./src/parser/sql_parser.py
./src/parser/symbol_table.py
./src/parser/tokenizer.py
./src/parser/unified_parser.py
./src/semantic/semantic_analyzer.py
./src/semantic/type_system.py
./src/validator/function_validator

In [12]:
from src.parser.match_recognize_extractor import parse_full_query

# Demo cell: parse one complete MATCH_RECOGNIZE statement and print the resulting AST.
if __name__ == "__main__":
    # Sample statement covering PARTITION BY, ORDER BY, MEASURES, a quantified
    # PATTERN, a SUBSET variable (U) and three DEFINE conditions.
    sample_query = """
    SELECT * FROM orders MATCH_RECOGNIZE(
         PARTITION BY custkey
         ORDER BY orderdate
         MEASURES
                  A.totalprice AS starting_price,
                  LAST(B.totalprice) AS bottom_price,
                  LAST(U.totalprice) AS top_price
         ONE ROW PER MATCH
         AFTER MATCH SKIP PAST LAST ROW
         PATTERN (A* B+ C? D+)
         SUBSET U = (C, D)
         DEFINE
                  B AS totalprice < PREV(totalprice),
                  C AS totalprice > PREV(totalprice) AND totalprice <= A.totalprice,
                  D AS totalprice > PREV(totalprice)
    )
    """
    # Parse the whole statement (SELECT, FROM and MATCH_RECOGNIZE) and show the AST's printed form.
    query_ast = parse_full_query(sample_query)
    print(query_ast)


DEBUG:src.parser.match_recognize_extractor:Full statement text: SELECT * FROM orders MATCH_RECOGNIZE( PARTITION BY custkey ORDER BY orderdate MEASURES A.totalprice AS starting_price, LAST(B.totalprice) AS bottom_price, LAST(U.totalprice) AS top_price ONE ROW PER MATCH AFTER MATCH SKIP PAST LAST ROW PATTERN (A* B+ C? D+) SUBSET U = (C, D) DEFINE B AS totalprice < PREV(totalprice), C AS totalprice > PREV(totalprice) AND totalprice <= A.totalprice, D AS totalprice > PREV(totalprice) );
DEBUG:src.parser.match_recognize_extractor:Extracted SELECT clause via robust splitting: SelectClause(items=[SelectItem(expression=*, metadata={})])
DEBUG:src.parser.match_recognize_extractor:Extracted FROM clause via regex: FromClause(table=orders)
DEBUG:src.parser.match_recognize_extractor:Visiting PatternRecognition context
DEBUG:src.parser.match_recognize_extractor:Extracted PARTITION BY: PartitionByClause(columns=['custkey'])
DEBUG:src.parser.match_recognize_extractor:Extracted ORDER BY: OrderByClause(

FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=*, metadata={})]),
  from_clause=FromClause(table=orders),
  match_recognize=MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(columns=['orderdate']),
  measures=MeasuresClause(measures=[Measure(expression=A.totalprice, alias=starting_price, metadata={}), Measure(expression=LAST(B.totalprice), alias=bottom_price, metadata={}), Measure(expression=LAST(U.totalprice), alias=top_price, metadata={})]),
  rows_per_match=RowsPerMatchClause(value=ONE ROW PER MATCH),
  after_match_skip=AfterMatchSkipClause(value=SKIP PAST LAST ROW),
  pattern=PatternClause(pattern=A* B+ C? D+, metadata={'variables': ['A', 'B', 'C', 'D']}),
  subset=[SubsetClause(subset_text=U=(C,D))],
  define=DefineClause(definitions=[Define(variable=B, condition=totalprice<PREV(totalprice)), Define(variable=C, condition=totalprice>PREV(totalprice)ANDtotalprice<=A.totalprice), Define(variable=D, condition

In [None]:
import pandas as pd
from src.parser.match_recognize_extractor import parse_full_query, parse_match_recognize_query
from src.ast.ast_nodes import FullQueryAST
import logging
import re

# Module-level logger, named after the current module; used for debug messages by the helpers in this cell.
logger = logging.getLogger(__name__)

def enrich_ast_with_df_metadata(ast: FullQueryAST, df: pd.DataFrame) -> FullQueryAST:
    """
    Annotate SELECT items with facts about the DataFrame column they name.

    A SELECT item is enriched only when its expression, stripped of
    surrounding whitespace, is exactly a column label of ``df``.  The
    following keys are then written into the item's ``metadata`` mapping:

    - ``"dtype"``: the column dtype as a string
    - ``"null_count"``: number of null values
    - ``"unique_count"``: number of distinct values as reported by ``nunique()``
    - ``"stats"``: ``describe()`` as a dict (numeric columns only)

    The AST is modified in place; the same object is returned for convenience.
    """
    select_clause = ast.select_clause
    if not select_clause:
        return ast

    for select_item in select_clause.items:
        column_name = select_item.expression.strip()
        if column_name not in df.columns:
            # Expressions, aliases of computed values, "*" etc. are left untouched.
            continue

        info = select_item.metadata
        info["dtype"] = str(df.dtypes[column_name])
        info["null_count"] = int(df[column_name].isnull().sum())
        info["unique_count"] = int(df[column_name].nunique())
        if pd.api.types.is_numeric_dtype(df[column_name]):
            info["stats"] = df[column_name].describe().to_dict()

    return ast

def validate_ast_against_df(ast: FullQueryAST, df: pd.DataFrame):
    """
    Check that column references in the AST exist in the DataFrame.

    Three groups of references are checked, in this order:

    - every ORDER BY column of the MATCH_RECOGNIZE clause;
    - every SELECT item that is a bare identifier (``*``, function calls and
      other expressions are skipped);
    - the column argument of every COUNT / FIRST / LAST / PREV / NEXT call
      found in a measure expression.

    Measure arguments may be qualified with a pattern or subset variable
    (``LAST(B.totalprice)``).  Only the part after the dot is a column name,
    so the qualifier is dropped before the lookup.  Function names are
    matched case-insensitively, but column names keep their original case and
    are compared to the DataFrame labels exactly, as the ORDER BY and SELECT
    checks do.

    Args:
        ast: Parsed query; ``select_clause`` and ``match_recognize`` may be empty.
        df: DataFrame whose column labels define the valid column names.

    Raises:
        ValueError: On the first referenced column that is missing from ``df``.
            ``ValueError`` derives from ``Exception``, so handlers written for
            the previous bare ``Exception`` keep working.
    """
    df_columns = set(df.columns)
    identifier = r"[A-Za-z_][A-Za-z0-9_]*"
    simple_column_regex = re.compile(rf"^{identifier}$")
    # <func>( [variable.]column | [variable.]* [, offset] )
    # The leading \b keeps names such as DISCOUNT( from being read as COUNT(.
    function_call_regex = re.compile(
        rf"\b(COUNT|FIRST|LAST|PREV|NEXT)\s*\(\s*"
        rf"((?:{identifier}\.)?(?:\*|{identifier}))"
        rf"(?:\s*,\s*\d+)?\s*\)",
        re.IGNORECASE,
    )

    # Validate ORDER BY columns.
    if ast.match_recognize and ast.match_recognize.order_by:
        for col in ast.match_recognize.order_by.columns:
            if col not in df_columns:
                raise ValueError(f"ORDER BY column '{col}' not found in DataFrame columns {df_columns}")

    # Validate simple SELECT items.
    if ast.select_clause:
        for item in ast.select_clause.items:
            expression = item.expression.strip()
            if simple_column_regex.match(expression) and expression not in df_columns:
                raise ValueError(f"SELECT column '{expression}' not found in DataFrame columns {df_columns}")

    # Validate every recognised function call inside each measure expression.
    if ast.match_recognize and ast.match_recognize.measures:
        for measure in ast.match_recognize.measures.measures:
            for call in function_call_regex.finditer(measure.expression):
                func = call.group(1).upper()
                # "B.totalprice" -> "totalprice"; an unqualified argument is returned unchanged.
                col = call.group(2).rpartition(".")[2]
                if col != "*" and col not in df_columns:
                    raise ValueError(f"Measure function {func} references column '{col}' not in DataFrame columns {df_columns}")
    logger.debug("AST successfully validated against DataFrame schema.")

# Demo cell: parse a sample query, enrich its AST with DataFrame metadata, then validate it.
if __name__ == "__main__":
    # The SELECT list mixes bare columns, an aliased column, an aggregate and a
    # CASE expression, so both the "simple identifier" path and the skipped
    # "expression" path of the helpers above are exercised.
    sample_query = """
    SELECT id, name, salary AS sal, COUNT(*) AS total,
           CASE WHEN salary > 5000 THEN 'High' ELSE 'Low' END
    FROM orders MATCH_RECOGNIZE(
         PARTITION BY custkey
         ORDER BY orderdate
         MEASURES
                  A.totalprice AS starting_price,
                  LAST(B.totalprice) AS bottom_price,
                  LAST(U.totalprice) AS top_price
         ONE ROW PER MATCH
         AFTER MATCH SKIP PAST LAST ROW
         PATTERN (A B+ C+ D+)
         SUBSET U = (C, D)
         DEFINE
                  B AS totalprice < PREV(totalprice),
                  C AS totalprice > PREV(totalprice) AND totalprice <= A.totalprice,
                  D AS totalprice > PREV(totalprice)
    )
    """
    # Parse the query into an AST.
    full_ast = parse_full_query(sample_query)
    print("Extracted Full Query AST:")
    print(full_ast)
    
    # Create a sample DataFrame representing the "orders" table.
    # It contains every column the query references by bare name.
    data = {
        "id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
        "salary": [4000, 6000, 5000],
        "custkey": [101, 102, 101],
        "orderdate": pd.to_datetime(["2023-01-01", "2023-01-02", "2023-01-03"]),
        "totalprice": [100, 200, 150]
    }
    df = pd.DataFrame(data)
    
    # Enrich the AST with DataFrame metadata.
    # The helper mutates full_ast in place and returns it, so enriched_ast is the same object.
    enriched_ast = enrich_ast_with_df_metadata(full_ast, df)
    print("\nEnriched Full Query AST with DataFrame metadata:")
    print(enriched_ast)
    
    # Validate that the AST's column references exist in the DataFrame.
    # validate_ast_against_df raises on the first missing column, so reaching the print means it passed.
    validate_ast_against_df(enriched_ast, df)
    print("\nAST successfully validated against DataFrame schema.")


DEBUG:src.parser.match_recognize_extractor:Full statement text: SELECT id, name, salary AS sal, COUNT(*) AS total, CASE WHEN salary > 5000 THEN 'High' ELSE 'Low' END FROM orders MATCH_RECOGNIZE( PARTITION BY custkey ORDER BY orderdate MEASURES A.totalprice AS starting_price, LAST(B.totalprice) AS bottom_price, LAST(U.totalprice) AS top_price ONE ROW PER MATCH AFTER MATCH SKIP PAST LAST ROW PATTERN (A B+ C+ D+) SUBSET U = (C, D) DEFINE B AS totalprice < PREV(totalprice), C AS totalprice > PREV(totalprice) AND totalprice <= A.totalprice, D AS totalprice > PREV(totalprice) );
DEBUG:src.parser.match_recognize_extractor:Extracted SELECT clause via robust splitting: SelectClause(items=[SelectItem(expression=id, metadata={}), SelectItem(expression=name, metadata={}), SelectItem(expression=salary, alias=sal, metadata={}), SelectItem(expression=COUNT(*), alias=total, metadata={}), SelectItem(expression=CASE WHEN salary > 5000 THEN 'High' ELSE 'Low' END, metadata={})])
DEBUG:src.parser.match_rec

Extracted Full Query AST:
FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=id, metadata={}), SelectItem(expression=name, metadata={}), SelectItem(expression=salary, alias=sal, metadata={}), SelectItem(expression=COUNT(*), alias=total, metadata={}), SelectItem(expression=CASE WHEN salary > 5000 THEN 'High' ELSE 'Low' END, metadata={})]),
  from_clause=FromClause(table=orders),
  match_recognize=MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(columns=['orderdate']),
  measures=MeasuresClause(measures=[Measure(expression=A.totalprice, alias=starting_price, metadata={}), Measure(expression=LAST(B.totalprice), alias=bottom_price, metadata={}), Measure(expression=LAST(U.totalprice), alias=top_price, metadata={})]),
  rows_per_match=RowsPerMatchClause(value=ONE ROW PER MATCH),
  after_match_skip=AfterMatchSkipClause(value=SKIP PAST LAST ROW),
  pattern=PatternClause(pattern=AB+C+D+, metadata={'variables': {'B', 'D', 

In [None]:
from src.parser.match_recognize_extractor import parse_match_recognize_query
# Demo cell: extract only the MATCH_RECOGNIZE clause of a query and print it.
if __name__ == "__main__":
    sample_query = """
    SELECT * FROM orders MATCH_RECOGNIZE(
         PARTITION BY custkey
         ORDER BY orderdate
         MEASURES
                  A.totalprice AS starting_price,
                  LAST(B.totalprice) AS bottom_price,
                  LAST(U.totalprice) AS top_price
         ONE ROW PER MATCH
         AFTER MATCH SKIP PAST LAST ROW
         PATTERN (A B+ C+ D+)
         SUBSET U = (C, D)
         DEFINE
                  B AS totalprice < PREV(totalprice),
                  C AS totalprice > PREV(totalprice) AND totalprice <= A.totalprice,
                  D AS totalprice > PREV(totalprice)
    )
    """
    # NOTE(review): the recorded output lists the pattern variables as {'D', 'C', 'AB'},
    # i.e. "A B+" appears to be merged into "AB" — confirm against the extractor.
    components = parse_match_recognize_query(sample_query)

    print(components)

DEBUG:src.parser.match_recognize_extractor:Visiting PatternRecognition context
DEBUG:src.parser.match_recognize_extractor:Extracted PARTITION BY: PartitionByClause(columns=['custkey'])
DEBUG:src.parser.match_recognize_extractor:Extracted ORDER BY: OrderByClause(columns=['orderdate'])
DEBUG:src.parser.match_recognize_extractor:Extracted MEASURES: MeasuresClause(measures=[Measure(expression=A.totalprice, alias=starting_price, metadata={}), Measure(expression=LAST(B.totalprice), alias=bottom_price, metadata={}), Measure(expression=LAST(U.totalprice), alias=top_price, metadata={})])
DEBUG:src.parser.match_recognize_extractor:Extracted ROWS PER MATCH: RowsPerMatchClause(value=ONE ROW PER MATCH)
DEBUG:src.parser.match_recognize_extractor:Extracted AFTER MATCH SKIP: AfterMatchSkipClause(value=SKIP PAST LAST ROW)
DEBUG:src.parser.match_recognize_extractor:Extracted PATTERN: PatternClause(pattern=AB+C+D+, metadata={'variables': {'D', 'C', 'AB'}})
DEBUG:src.parser.match_recognize_extractor:Extra

MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(columns=['orderdate']),
  measures=MeasuresClause(measures=[Measure(expression=A.totalprice, alias=starting_price, metadata={}), Measure(expression=LAST(B.totalprice), alias=bottom_price, metadata={}), Measure(expression=LAST(U.totalprice), alias=top_price, metadata={})]),
  rows_per_match=RowsPerMatchClause(value=ONE ROW PER MATCH),
  after_match_skip=AfterMatchSkipClause(value=SKIP PAST LAST ROW),
  pattern=PatternClause(pattern=AB+C+D+, metadata={'variables': {'D', 'C', 'AB'}}),
  subset=[SubsetClause(subset_text=U=(C,D))],
  define=DefineClause(definitions=[Define(variable=B, condition=totalprice<PREV(totalprice)), Define(variable=C, condition=totalprice>PREV(totalprice)ANDtotalprice<=A.totalprice), Define(variable=D, condition=totalprice>PREV(totalprice))])
)


In [1]:
#!/usr/bin/env python3
# main.py

from typing import Dict, Any
import json
from src.parser.sql_parser import parse_sql_query
from src.parser.match_recognize_parser import MatchRecognizeParser

def process_query(sql_query: str) -> Dict[str, Any]:
    """
    Parse ``sql_query`` and print a diagnostic report of the result.

    The report has two parts:

    1. The output of ``parse_sql_query``: token texts, the parse tree as
       indented JSON, and the parser's error list.
    2. The MATCH_RECOGNIZE details returned by ``MatchRecognizeParser``,
       followed by the result of each individual extraction method.

    Values that ``json`` cannot serialise natively are rendered through their
    ``__dict__`` when they have one and through ``str()`` otherwise, so an
    unexpected object in the parse tree or in the MATCH_RECOGNIZE details no
    longer aborts the report with a ``TypeError``.

    Args:
        sql_query: SQL text to parse.

    Returns:
        The object returned by ``parse_sql_query``, unchanged.  This function
        reads its "tokens", "parse_tree" and "errors" entries.
    """
    def _json_fallback(obj: Any) -> Any:
        # Last-resort encoder for objects json.dumps does not know about.
        return obj.__dict__ if hasattr(obj, "__dict__") else str(obj)

    print("\n" + "=" * 80)
    print("PROCESSING QUERY:")
    print(sql_query)
    print("=" * 80 + "\n")

    # Step 1: Parse the SQL query using the SQL parser
    print("\n--- STEP 1: PARSING SQL QUERY ---")
    parse_result = parse_sql_query(sql_query)

    print("Parsing successful!")

    print("\n===== TOKENS RECOGNIZED =====")
    print([token.text for token in parse_result["tokens"]])

    print("\n===== PARSE TREE =====")
    # Print a formatted version of the parse tree
    print(json.dumps(parse_result["parse_tree"], default=_json_fallback, indent=2))

    print("\n===== ERRORS (If Any) =====")
    print(parse_result["errors"])

    # Step 2: Extract MATCH_RECOGNIZE details using the specialized parser
    print("\n--- STEP 2: EXTRACTING MATCH_RECOGNIZE DETAILS ---")
    mr_parser = MatchRecognizeParser()
    mr_details = mr_parser.parse_query(sql_query)

    print("\nMATCH_RECOGNIZE DETAILS:")
    print(json.dumps(mr_details, default=_json_fallback, indent=2))

    print("\n--- EXTRACTION METHODS ---")
    print("Pattern Variables:", mr_parser.get_pattern_variables(sql_query))
    print("Partition By Columns:", mr_parser.extract_partition_by(sql_query))
    print("Order By Columns:", mr_parser.extract_order_by(sql_query))
    print("Measures:", mr_parser.extract_measures(sql_query))
    print("Define Clauses:", mr_parser.extract_define_clauses(sql_query))

    return parse_result

# Demo entry point: run the diagnostic report on one hard-coded MATCH_RECOGNIZE query.
if __name__ == "__main__":
    # NOTE(review): the PATTERN names v, xxc and dc, which have no DEFINE entry
    # below — confirm this is intentional test input.
    test_query = """
    SELECT *
    FROM Orders
    MATCH_RECOGNIZE (
        PARTITION BY customer_id
        ORDER BY order_time
        MEASURES
            A.order_id AS start_order,
            B.order_id AS end_order,
            COUNT(*) AS order_count
        ONE ROW PER MATCH
        PATTERN (A B+ v xxc dc)
        DEFINE
            A AS A.amount > 100,
            B AS B.amount > A.amount
    );
    """
    
    # process_query prints its report and returns the parser result, kept here for inspection.
    result = process_query(test_query)



PROCESSING QUERY:

    SELECT *
    FROM Orders
    MATCH_RECOGNIZE (
        PARTITION BY customer_id
        ORDER BY order_time
        MEASURES
            A.order_id AS start_order,
            B.order_id AS end_order,
            COUNT(*) AS order_count
        ONE ROW PER MATCH
        PATTERN (A B+ v xxc dc)
        DEFINE
            A AS A.amount > 100,
            B AS B.amount > A.amount
    );
    


--- STEP 1: PARSING SQL QUERY ---
Parsing successful!

===== TOKENS RECOGNIZED =====
['\n    ', 'SELECT', ' ', '*', '\n    ', 'FROM', ' ', 'Orders', '\n    ', 'MATCH_RECOGNIZE', ' ', '(', '\n        ', 'PARTITION', ' ', 'BY', ' ', 'customer_id', '\n        ', 'ORDER', ' ', 'BY', ' ', 'order_time', '\n        ', 'MEASURES', '\n            ', 'A', '.', 'order_id', ' ', 'AS', ' ', 'start_order', ',', '\n            ', 'B', '.', 'order_id', ' ', 'AS', ' ', 'end_order', ',', '\n            ', 'COUNT', '(', '*', ')', ' ', 'AS', ' ', 'order_count', '\n        ', 'ONE', ' ', 'ROW', '

In [1]:
# main.py

from typing import Dict, Any
import json
from src.parser.sql_parser import parse_sql_query
def process_query(sql_query: str) -> Dict[str, Any]:
    """
    Parse ``sql_query`` and print the parser's results section by section.

    Printed sections, in order: token texts, parse tree, errors, and the
    "match_recognize" entry of the parse result.

    Args:
        sql_query: SQL text to parse.

    Returns:
        The object returned by ``parse_sql_query``, unchanged.
    """
    separator = "=" * 80
    print("\n" + separator)
    print("PROCESSING QUERY:")
    print(sql_query)
    print(separator + "\n")

    # Step 1: parse, then dump each part of the result under its own heading.
    print("\n--- STEP 1: PARSING SQL QUERY ---")
    parsed = parse_sql_query(sql_query)
    print("Parsing successful!")

    # Each value is looked up lazily so that a heading is always printed
    # before its entry is read from the parse result.
    sections = (
        ("\n===== TOKENS RECOGNIZED =====", lambda: [tok.text for tok in parsed["tokens"]]),
        ("\n===== PARSE TREE =====", lambda: parsed["parse_tree"]),
        ("\n===== ERRORS (If Any) =====", lambda: parsed["errors"]),
        ("\n--- STEP 2: PROCESSING PARSE TREE ---", lambda: parsed["match_recognize"]),
    )
    for heading, fetch in sections:
        print(heading)
        print(fetch())

    return parsed


# Demo entry point: run process_query on one hard-coded MATCH_RECOGNIZE query.
if __name__ == "__main__":
    # Example query
    # NOTE(review): the PATTERN names v, xxc and dc, which have no DEFINE entry
    # below — confirm this is intentional test input.
    test_query = """
    SELECT *
    FROM Orders
    MATCH_RECOGNIZE (
        PARTITION BY customer_id
        ORDER BY order_time
        MEASURES
            A.order_id AS start_order,
            B.order_id AS end_order,
            COUNT(*) AS order_count
        ONE ROW PER MATCH
        PATTERN (A B+ v xxc dc)
        DEFINE
            A AS A.amount > 100,
            B AS B.amount > A.amount
    );
    """
    
    # Process the query
    result = process_query(test_query)



PROCESSING QUERY:

    SELECT *
    FROM Orders
    MATCH_RECOGNIZE (
        PARTITION BY customer_id
        ORDER BY order_time
        MEASURES
            A.order_id AS start_order,
            B.order_id AS end_order,
            COUNT(*) AS order_count
        ONE ROW PER MATCH
        PATTERN (A B+ v xxc dc)
        DEFINE
            A AS A.amount > 100,
            B AS B.amount > A.amount
    );
    


--- STEP 1: PARSING SQL QUERY ---
Parsing successful!

===== TOKENS RECOGNIZED =====
['\n    ', 'SELECT', ' ', '*', '\n    ', 'FROM', ' ', 'Orders', '\n    ', 'MATCH_RECOGNIZE', ' ', '(', '\n        ', 'PARTITION', ' ', 'BY', ' ', 'customer_id', '\n        ', 'ORDER', ' ', 'BY', ' ', 'order_time', '\n        ', 'MEASURES', '\n            ', 'A', '.', 'order_id', ' ', 'AS', ' ', 'start_order', ',', '\n            ', 'B', '.', 'order_id', ' ', 'AS', ' ', 'end_order', ',', '\n            ', 'COUNT', '(', '*', ')', ' ', 'AS', ' ', 'order_count', '\n        ', 'ONE', ' ', 'ROW', '

In [7]:
# main.py

from typing import Dict, Any
import json
from src.parser.sql_parser import parse_sql_query
def process_query(sql_query: str) -> Dict[str, Any]:
    """
    Parse ``sql_query`` and print the parser's raw results.

    Only the parsing phase runs here: the parse tree, the parser object, the
    error list and the token texts are printed as returned by
    ``parse_sql_query``.

    Args:
        sql_query: SQL text to parse.

    Returns:
        The object returned by ``parse_sql_query``, so callers that store the
        result (as the demo entry point does) receive the parse data instead
        of ``None``.
    """
    print("\n" + "="*80)
    print("PROCESSING QUERY:")
    print(sql_query)
    print("="*80 + "\n")

    # Step 1: Parse SQL query
    print("\n--- STEP 1: PARSING SQL QUERY ---")
    parse_result = parse_sql_query(sql_query)
    print("Parse result:")

    print("Parsing successful!")
    print("Parse Tree:", parse_result["parse_tree"])
    print("Parser:", parse_result["parser"])
    print("Errors:", parse_result["errors"])
    print("Tokens:", [token.text for token in parse_result["tokens"]])

    return parse_result

    
# Demo entry point: run process_query on one hard-coded MATCH_RECOGNIZE query.
if __name__ == "__main__":
    # Example query
    test_query = """
    SELECT *
    FROM Orders
    MATCH_RECOGNIZE (
        PARTITION BY customer_id
        ORDER BY order_time
        MEASURES
            A.order_id AS start_order,
            B.order_id AS end_order,
            COUNT(*) AS order_count
        ONE ROW PER MATCH
        PATTERN (A B+)
        DEFINE
            A AS A.amount > 100,
            B AS B.amount > A.amount
    );
    """
    
    # Process the query
    # The return value of process_query is kept in `result` for later inspection.
    result = process_query(test_query)



PROCESSING QUERY:

    SELECT *
    FROM Orders
    MATCH_RECOGNIZE (
        PARTITION BY customer_id
        ORDER BY order_time
        MEASURES
            A.order_id AS start_order,
            B.order_id AS end_order,
            COUNT(*) AS order_count
        ONE ROW PER MATCH
        PATTERN (A B+)
        DEFINE
            A AS A.amount > 100,
            B AS B.amount > A.amount
    );
    


--- STEP 1: PARSING SQL QUERY ---
Parse result:
Parsing successful!
Parse Tree: []
Parser: <src.grammar.TrinoParser.TrinoParser object at 0x759a20e4a8d0>
Errors: []
Tokens: ['\n    ', 'SELECT', ' ', '*', '\n    ', 'FROM', ' ', 'Orders', '\n    ', 'MATCH_RECOGNIZE', ' ', '(', '\n        ', 'PARTITION', ' ', 'BY', ' ', 'customer_id', '\n        ', 'ORDER', ' ', 'BY', ' ', 'order_time', '\n        ', 'MEASURES', '\n            ', 'A', '.', 'order_id', ' ', 'AS', ' ', 'start_order', ',', '\n            ', 'B', '.', 'order_id', ' ', 'AS', ' ', 'end_order', ',', '\n            ', 'COUNT', '

In [None]:
# main.py

from typing import Dict, Any
import json
from src.parser.sql_parser import parse_sql_query
from src.ast.ast_processor import ASTProcessor

def process_query(sql_query: str) -> Dict[str, Any]:
    """
    Run a SQL query through the parser and the AST processor, printing progress.

    Args:
        sql_query: SQL text to process.

    Returns:
        A dict with a "status" key.  On success it is "success" and the dict
        also carries "ast", "pattern" and "warnings".  On failure it is
        "error", with "phase" set to "parsing", "ast_processing" or
        "processing" (any unexpected exception) and "errors" holding the
        collected messages.
    """
    import traceback

    def to_jsonable(obj):
        # json.dumps fallback: expose attributes when present, else the string form.
        if hasattr(obj, "__dict__"):
            return obj.__dict__
        return str(obj)

    def report_failure(heading, phase, problems):
        # Print the error list under its heading and build the error payload.
        print(heading)
        for problem in problems:
            print(f"  - {problem}")
        return {
            "status": "error",
            "phase": phase,
            "errors": problems
        }

    separator = "=" * 80
    print("\n" + separator)
    print("PROCESSING QUERY:")
    print(sql_query)
    print(separator + "\n")

    try:
        # Step 1: parse the SQL text; stop early if the parser reported errors.
        print("\n--- STEP 1: PARSING SQL QUERY ---")
        parsed = parse_sql_query(sql_query)
        if parsed.get("errors"):
            return report_failure("Parsing errors:", "parsing", parsed["errors"])
        print("Parsing successful!")

        # Step 2: turn the parse result into an AST plus pattern information.
        print("\n--- STEP 2: PROCESSING AST ---")
        processed = ASTProcessor().process_parse_tree(parsed)
        if processed.get("errors"):
            return report_failure("AST processing errors:", "ast_processing", processed["errors"])

        for heading, key in (
            ("AST structure:", "ast"),
            ("\nPattern information for automaton:", "pattern"),
        ):
            print(heading)
            print(json.dumps(processed[key], default=to_jsonable, indent=2))

        # The pattern information can now be passed to the automaton converter
        return {
            "status": "success",
            "ast": processed["ast"],
            "pattern": processed["pattern"],
            "warnings": processed.get("warnings", [])
        }

    except Exception as exc:
        # Any unexpected failure is reported with its traceback instead of propagating.
        failure = f"Unexpected error: {str(exc)}\n{traceback.format_exc()}"
        print(f"Error: {failure}")
        return {
            "status": "error",
            "phase": "processing",
            "errors": [failure]
        }

# Demo entry point: run the parse + AST pipeline on one hard-coded MATCH_RECOGNIZE query.
if __name__ == "__main__":
    # Example query
    test_query = """
    SELECT *
    FROM Orders
    MATCH_RECOGNIZE (
        PARTITION BY customer_id
        ORDER BY order_time
        MEASURES
            A.order_id AS start_order,
            B.order_id AS end_order,
            COUNT(*) AS order_count
        ONE ROW PER MATCH
        PATTERN (A B+)
        DEFINE
            A AS A.amount > 100,
            B AS B.amount > A.amount
    );
    """
    
    # Process the query
    # NOTE(review): the recorded output shows empty "partition_by" and "order_by"
    # lists although the query has both clauses — confirm against the AST processor.
    result = process_query(test_query)



PROCESSING QUERY:

    SELECT *
    FROM Orders
    MATCH_RECOGNIZE (
        PARTITION BY customer_id
        ORDER BY order_time
        MEASURES
            A.order_id AS start_order,
            B.order_id AS end_order,
            COUNT(*) AS order_count
        ONE ROW PER MATCH
        PATTERN (A B+)
        DEFINE
            A AS A.amount > 100,
            B AS B.amount > A.amount
    );
    


--- STEP 1: PARSING SQL QUERY ---
Parsing successful!

--- STEP 2: PROCESSING AST ---
AST structure:
{
  "type": "query",
  "match_recognize": [
    {
      "partition_by": [],
      "order_by": [],
      "measures": [
        {
          "expression": {
            "raw": "A.order_id",
            "ast": {
              "type": "pattern_variable_reference",
              "value": "A.order_id",
              "operator": null,
              "children": [],
              "pattern_variable": "A",
              "column": "order_id",
              "navigation_type": null,
              "of

In [None]:
# main.py

from typing import Dict, Any
import json
from src.parser.sql_parser import parse_sql_query
from src.ast.ast_processor import ASTProcessor

def process_query(sql_query: str) -> Dict[str, Any]:
    """
    Run a SQL query through the parser and the AST processor, printing progress.

    Args:
        sql_query: SQL text to process.

    Returns:
        A dict with a "status" key.  On success it is "success" and the dict
        also carries "ast", "pattern" and "warnings".  On failure it is
        "error", with "phase" set to "parsing", "ast_processing" or
        "processing" (any unexpected exception) and "errors" holding the
        collected messages.
    """
    import traceback

    def to_jsonable(obj):
        # json.dumps fallback: expose attributes when present, else the string form.
        if hasattr(obj, "__dict__"):
            return obj.__dict__
        return str(obj)

    def report_failure(heading, phase, problems):
        # Print the error list under its heading and build the error payload.
        print(heading)
        for problem in problems:
            print(f"  - {problem}")
        return {
            "status": "error",
            "phase": phase,
            "errors": problems
        }

    separator = "=" * 80
    print("\n" + separator)
    print("PROCESSING QUERY:")
    print(sql_query)
    print(separator + "\n")

    try:
        # Step 1: parse the SQL text; stop early if the parser reported errors.
        print("\n--- STEP 1: PARSING SQL QUERY ---")
        parsed = parse_sql_query(sql_query)
        if parsed.get("errors"):
            return report_failure("Parsing errors:", "parsing", parsed["errors"])
        print("Parsing successful!")

        # Step 2: turn the parse result into an AST plus pattern information.
        print("\n--- STEP 2: PROCESSING AST ---")
        processed = ASTProcessor().process_parse_tree(parsed)
        if processed.get("errors"):
            return report_failure("AST processing errors:", "ast_processing", processed["errors"])

        for heading, key in (
            ("AST structure:", "ast"),
            ("\nPattern information for automaton:", "pattern"),
        ):
            print(heading)
            print(json.dumps(processed[key], default=to_jsonable, indent=2))

        # The pattern information can now be passed to the automaton converter
        return {
            "status": "success",
            "ast": processed["ast"],
            "pattern": processed["pattern"],
            "warnings": processed.get("warnings", [])
        }

    except Exception as exc:
        # Any unexpected failure is reported with its traceback instead of propagating.
        failure = f"Unexpected error: {str(exc)}\n{traceback.format_exc()}"
        print(f"Error: {failure}")
        return {
            "status": "error",
            "phase": "processing",
            "errors": [failure]
        }

# Demo entry point: run the parse + AST pipeline on one hard-coded MATCH_RECOGNIZE query.
if __name__ == "__main__":
    # Example query
    test_query = """
    SELECT *
    FROM Orders
    MATCH_RECOGNIZE (
        PARTITION BY customer_id
        ORDER BY order_time
        MEASURES
            A.order_id AS start_order,
            B.order_id AS end_order,
            COUNT(*) AS order_count
        ONE ROW PER MATCH
        PATTERN (A B+)
        DEFINE
            A AS A.amount > 100,
            B AS B.amount > A.amount
    );
    """
    
    # Process the query
    # NOTE(review): the recorded output shows empty "partition_by" and "order_by"
    # lists although the query has both clauses — confirm against the AST processor.
    result = process_query(test_query)



PROCESSING QUERY:

    SELECT *
    FROM Orders
    MATCH_RECOGNIZE (
        PARTITION BY customer_id
        ORDER BY order_time
        MEASURES
            A.order_id AS start_order,
            B.order_id AS end_order,
            COUNT(*) AS order_count
        ONE ROW PER MATCH
        PATTERN (A B+)
        DEFINE
            A AS A.amount > 100,
            B AS B.amount > A.amount
    );
    


--- STEP 1: PARSING SQL QUERY ---
Parsing successful!

--- STEP 2: PROCESSING AST ---
AST structure:
{
  "type": "query",
  "match_recognize": [
    {
      "partition_by": [],
      "order_by": [],
      "measures": [
        {
          "expression": {
            "raw": "A.order_id",
            "ast": {
              "type": "pattern_variable_reference",
              "value": "A.order_id",
              "operator": null,
              "children": [],
              "pattern_variable": "A",
              "column": "order_id",
              "navigation_type": null,
              "of

In [11]:
# test_ast_builder.py

import json
import sys
import os

# Project imports; the project root is assumed to already be on sys.path (nothing is added to it here)
from src.parser.sql_parser import parse_sql_query
from src.ast.ast_builder import build_ast_from_parse_tree

def test_ast_builder():
    """Test the AST builder with a MATCH_RECOGNIZE query"""
    query = """
    SELECT *
    FROM Orders
    MATCH_RECOGNIZE (
        PARTITION BY customer_id
        ORDER BY order_time
        MEASURES
            A.order_id AS start_order,
            B.order_id AS end_order,
            COUNT(*) AS order_count
        ONE ROW PER MATCH
        PATTERN (A B+)
        DEFINE
            A AS A.amount > 100,
            B AS B.amount > A.amount
    );
    """
    
    print("\n--- PARSING QUERY ---")
    parse_result = parse_sql_query(query)
    
    if parse_result.get("errors"):
        print("Parsing failed:")
        for error in parse_result["errors"]:
            print(f"  - {error}")
        return
    
    print("Parsing succeeded!")
    
    print("\n--- BUILDING AST ---")
    ast_result = build_ast_from_parse_tree(parse_result)
    
    if ast_result.get("errors"):
        print("AST building failed:")
        for error in ast_result["errors"]:
            print(f"  - {error}")
        return
    
    print("AST building succeeded!")
    print("\nAST Structure:")
    print(json.dumps(ast_result["ast"], default=lambda o: o.__dict__ if hasattr(o, "__dict__") else str(o), indent=2))

# Run the smoke test directly when this cell/script is executed (no test runner involved).
if __name__ == "__main__":
    test_ast_builder()



--- PARSING QUERY ---
Parsing succeeded!

--- BUILDING AST ---
AST building succeeded!

AST Structure:
{
  "type": "query",
  "match_recognize": [
    {
      "partition_by": [],
      "order_by": [],
      "measures": [
        {
          "expression": {
            "raw": "A.order_id",
            "ast": {
              "type": "pattern_variable_reference",
              "value": "A.order_id",
              "operator": null,
              "children": [],
              "pattern_variable": "A",
              "column": "order_id",
              "navigation_type": null,
              "offset": 0,
              "count_star": false,
              "semantics": null,
              "line": 1,
              "column_pos": 1
            }
          },
          "alias": "start_order"
        },
        {
          "expression": {
            "raw": "B.order_id",
            "ast": {
              "type": "pattern_variable_reference",
              "value": "B.order_id",
              "operator

In [None]:
import sys
import pandas as pd
# Use an absolute import for match_recognize.
from transformations.match_recognize import match_recognize

# Minimal MATCH_RECOGNIZE query: a single-variable pattern (A) matching rows with salary > 1000.
query = """
    SELECT id, name FROM employees MATCH_RECOGNIZE (
        PARTITION BY department, region
        ORDER BY hire_date
        MEASURES salary AS avg_salary
        PATTERN (A)
        DEFINE A AS salary > 1000
    );
    """
    
# Four rows in one (department, region) partition; only Charlie (salary 900) fails the DEFINE condition.
# hire_date values are plain strings here, not datetimes.
data = [
        {"id": 1, "name": "Alice",   "department": "Sales", "region": "West", "hire_date": "2021-01-01", "salary": 1200},
        {"id": 2, "name": "Bob",     "department": "Sales", "region": "West", "hire_date": "2021-01-02", "salary": 1300},
        {"id": 3, "name": "Charlie", "department": "Sales", "region": "West", "hire_date": "2021-01-03", "salary": 900},
        {"id": 4, "name": "Diana",   "department": "Sales", "region": "West", "hire_date": "2021-01-04", "salary": 1100},
    ]
    
# Run the query against the sample rows; any failure is reduced to a one-line
# message (no traceback), so switch to re-raising when debugging.
try:
        output_df = match_recognize(query, pd.DataFrame(data))
        print("Match Recognize Output:")
        print(output_df)
except Exception as e:
        print(f"Error: {str(e)}")

ImportError: cannot import name 'AutomatonBuilder' from partially initialized module 'transformations.automaton.automaton_builder' (most likely due to a circular import) (/home/monierashraf/Desktop/llm/Match_recognize/project/transformations/automaton/automaton_builder.py)