In [None]:
# Run the expression unit tests through unittest's module runner.
# NOTE(review): the `find` listing recorded further down in this notebook shows
# no tests/test_expressions.py — confirm this module still exists.
!python -m unittest tests.test_expressions


In [2]:
# Run the parser unit tests (module tests.test_parser) through unittest.
!python -m unittest tests.test_parser

...............F^C


In [3]:
# List every regular file ending in .py under the current directory, sorted by path.
!find . -type f -name "*.py" | sort

./1.py
./2.py
./3.py
./__init__.py
./m3.py
./maiaas22222n.py
./main.py
./src/ast/ast_nodes.py
./src/ast/__init__.py
./src/executor/__init__.py
./src/grammar/__init__.py
./src/grammar/TrinoLexer.py
./src/grammar/TrinoParserListener.py
./src/grammar/TrinoParser.py
./src/grammar/TrinoParserVisitor.py
./src/__init__.py
./src/optimizer/__init__.py
./src/parser/__init__.py
./src/parser/match_recognize_extractor.py
./src/validator/__init__.py
./tests/__init__.py
./tests/test_ast.py
./tests/test_parser_edge_cases.py
./tests/test_parser.py
./tests/test_validator.py


In [None]:
from src.parser.match_recognize_extractor import parse_full_query

def run_test_queries():
    """Parse a fixed set of MATCH_RECOGNIZE queries and print each outcome.

    The cases vary the RUNNING / FINAL prefixes on measures, the whitespace
    around them, and whether a measure carries an alias. For every case a
    ``---`` separator and the case description are printed, followed by either
    the value returned by ``parse_full_query`` or an ``Error: ...`` line when
    parsing or printing raises. Nothing is returned.
    """
    cases = [
        # NOTE(review): the stderr recorded under this cell quotes
        # 'RUNNING A.totalprice AS', which is not the text of the first query
        # below — the saved output looks stale; re-run before relying on it.
        {
            "description": "Mixed semantics: RUNNING, FINAL, and default (no prefix)",
            "query": """
                SELECT * FROM orders MATCH_RECOGNIZE(
                    PARTITION BY custkey
                    ORDER BY orderdate
                    MEASURES
                              RUNNING LAST(A.totalprice) AS starting_price,
         FINAL LAST(B.totalprice) AS bottom_price,
         C.totalprice AS top_price
                    ONE ROW PER MATCH
                    PATTERN (A B+ C)
                    DEFINE
                        B AS totalprice < PREV(totalprice),
                        C AS totalprice > PREV(totalprice)
                )
            """
        },
        {
            "description": "Default semantics (no prefix in measures)",
            "query": """
                SELECT * FROM orders MATCH_RECOGNIZE(
                    PARTITION BY custkey
                    ORDER BY orderdate
                    MEASURES
                        A.totalprice AS starting_price,
                        LAST(B.totalprice) AS bottom_price
                    ONE ROW PER MATCH
                    PATTERN (A B+)
                    DEFINE
                        B AS totalprice < PREV(totalprice)
                )
            """
        },
        {
            "description": "Mixed with extra spaces and explicit prefixes",
            "query": """
                SELECT * FROM orders MATCH_RECOGNIZE(
                    PARTITION BY custkey
                    ORDER BY orderdate
                    MEASURES
                        FINAL   A.totalprice   AS   starting_price,
                        RUNNING  LAST(B.totalprice)   AS  bottom_price,
                        RUNNING NEXT(C.totalprice) AS top_price
                    ALL ROWS PER MATCH
                    PATTERN ( A B+ C+ )
                    DEFINE
                        B AS totalprice < PREV(totalprice),
                        C AS totalprice > PREV(totalprice)
                )
            """
        },
        {
            "description": "Measures without aliases",
            "query": """
                SELECT * FROM orders MATCH_RECOGNIZE(
                    PARTITION BY custkey
                    ORDER BY orderdate
                    MEASURES
                        FINAL A.totalprice,
                        RUNNING LAST(B.totalprice)
                    ONE ROW PER MATCH
                    PATTERN (A B+)
                    DEFINE
                        B AS totalprice < PREV(totalprice)
                )
            """
        }
    ]

    for case in cases:
        print(f"---\nTest: {case['description']}")
        try:
            # Parse and print inside the same guard so a failure in either
            # step is reported for this case only.
            print(parse_full_query(case['query']))
        except Exception as exc:
            # Best-effort run: show the problem and move on to the next case.
            print(f"Error: {exc}")

# Entry-point guard: run the checks only when this code executes as the main program.
if __name__ == "__main__":
    run_test_queries()


line 5:45 no viable alternative at input 'RUNNING A.totalprice AS'
line 5:32 mismatched input 'A' expecting {'AFTER', 'ALL', 'INITIAL', 'ONE', 'PATTERN', 'SEEK', ','}
line 6:30 missing ';' at 'LAST'
line 6:49 mismatched input 'AS' expecting ';'
line 7:37 mismatched input 'AS' expecting ';'
line 11:26 mismatched input 'AS' expecting ';'
line 11:58 mismatched input ',' expecting ';'
line 13:16 extraneous input ')' expecting ';'
DEBUG:src.parser.match_recognize_extractor:Full statement text: SELECT * FROM orders MATCH_RECOGNIZE( PARTITION BY custkey ORDER BY orderdate MEASURES RUNNING A.totalprice AS starting_price, FINAL LAST(B.totalprice) AS bottom_price, C.totalprice AS top_price ONE ROW PER MATCH PATTERN (A B+ C) DEFINE B AS totalprice < PREV(totalprice), C AS totalprice > PREV(totalprice) );
DEBUG:src.parser.match_recognize_extractor:Extracted SELECT clause via robust splitting: SelectClause(items=[SelectItem(expression=*, metadata={})])
DEBUG:src.parser.match_recognize_extractor:Ext

---
Test: Mixed semantics: RUNNING, FINAL, and default (no prefix)
FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=*, metadata={})]),
  from_clause=FromClause(table=orders),
  match_recognize=MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)]),
  measures=MeasuresClause(measures=[Measure(expression=<missing 'AS'>RUNNING, alias=None, metadata={'semantics': 'RUNNING'}, )]),
  rows_per_match=None,
  after_match_skip=None,
  pattern=None,
  subset=[],
  define=None
),
  metadata={}
)
---
Test: Default semantics (no prefix in measures)
FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=*, metadata={})]),
  from_clause=FromClause(table=orders),
  match_recognize=MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ord

In [1]:
from src.parser.match_recognize_extractor import parse_full_query
from src.ast.ast_nodes import FullQueryAST

# Test queries covering the MATCH_RECOGNIZE features exercised by this cell:
# rows-per-match modes, SUBSET, CLASSIFIER()/MATCH_NUMBER(), RUNNING/FINAL
# prefixes, an empty pattern, and navigation functions.
# Each entry is a dict with a display "name" and the SQL text under "query".
test_queries = [
    # 1. Basic ONE ROW PER MATCH with the quantified pattern (A B+ C); only B is defined
    {
        "name": "Basic One Row Per Match",
        "query": """
            SELECT * FROM orders MATCH_RECOGNIZE(
                PARTITION BY custkey
                ORDER BY orderdate
                MEASURES
                    A.totalprice AS starting_price,
                    LAST(B.totalprice) AS bottom_price
                ONE ROW PER MATCH
                PATTERN (A B+ C)
                DEFINE
                    B AS totalprice < PREV(totalprice)
            );
        """
    },
    # 2. ALL ROWS PER MATCH with SHOW EMPTY MATCHES, a SUBSET clause and its union variable U = (C, D)
    {
        "name": "All Rows Per Match - Show Empty Matches",
        "query": """
            SELECT * FROM orders MATCH_RECOGNIZE(
                PARTITION BY custkey
                ORDER BY orderdate
                MEASURES
                    A.totalprice AS starting_price,
                    LAST(B.totalprice) AS bottom_price,
                    LAST(U.totalprice) AS top_price
                ALL ROWS PER MATCH SHOW EMPTY MATCHES
                PATTERN (A | B | C | D)
                SUBSET U = (C, D)
                DEFINE
                    B AS totalprice < PREV(totalprice),
                    C AS totalprice > PREV(totalprice) AND totalprice <= A.totalprice,
                    D AS totalprice > PREV(totalprice)
            );
        """
    },
    # 3. ALL ROWS PER MATCH with OMIT EMPTY MATCHES
    {
        "name": "All Rows Per Match - Omit Empty Matches",
        "query": """
            SELECT * FROM transactions MATCH_RECOGNIZE(
                PARTITION BY account_id
                ORDER BY transaction_date
                MEASURES
                    A.amount AS start_amount,
                    C.amount AS final_amount
                ALL ROWS PER MATCH OMIT EMPTY MATCHES
                PATTERN (A | B | C)
                DEFINE
                    B AS amount < PREV(amount),
                    C AS amount > PREV(amount)
            );
        """
    },
    # 4. ALL ROWS PER MATCH WITH UNMATCHED ROWS over a plain concatenation pattern
    {
        "name": "All Rows Per Match - With Unmatched Rows",
        "query": """
            SELECT * FROM sales MATCH_RECOGNIZE(
                PARTITION BY region
                ORDER BY sale_date
                MEASURES
                    A.price AS initial_price,
                    D.price AS final_price
                ALL ROWS PER MATCH WITH UNMATCHED ROWS
                PATTERN (A B C D)
                DEFINE
                    B AS price < PREV(price),
                    C AS price > PREV(price),
                    D AS price > A.price
            );
        """
    },
    # 5. Nested pattern: an alternation inside a repeated group, (B C | D E)+,
    #    combined with WITH UNMATCHED ROWS (this query has no SUBSET / union variable)
    {
        "name": "Nested Pattern with Unmatched Rows",
        "query": """
            SELECT * FROM orders MATCH_RECOGNIZE(
                PARTITION BY custkey
                ORDER BY orderdate
                MEASURES
                    A.totalprice AS starting_price,
                    F.totalprice AS ending_price
                ALL ROWS PER MATCH WITH UNMATCHED ROWS
                PATTERN (A (B C | D E)+ F)
                DEFINE
                    B AS totalprice < PREV(totalprice),
                    C AS totalprice > PREV(totalprice),
                    D AS totalprice = PREV(totalprice),
                    E AS totalprice >= PREV(totalprice)
            );
        """
    },
    # 6. CLASSIFIER() and MATCH_NUMBER() used as measures
    {
        "name": "Classifier and Match Number Functions",
        "query": """
            SELECT * FROM orders MATCH_RECOGNIZE(
                PARTITION BY custkey
                ORDER BY orderdate
                MEASURES
                    CLASSIFIER() AS pattern_type,
                    MATCH_NUMBER() AS match_id,
                    A.totalprice AS starting_price,
                    LAST(B.totalprice) AS bottom_price
                ONE ROW PER MATCH
                PATTERN (A B+ C)
                DEFINE
                    B AS totalprice < PREV(totalprice)
            );
        """
    },
    # 7. Explicit RUNNING vs FINAL prefixes on the same navigation call in MEASURES
    {
        "name": "Running vs Final Semantics in Measures",
        "query": """
            SELECT * FROM orders MATCH_RECOGNIZE(
                PARTITION BY custkey
                ORDER BY orderdate
                MEASURES
                    RUNNING LAST(A.totalprice) AS running_last_price,
                    FINAL LAST(A.totalprice) AS final_last_price
                ONE ROW PER MATCH
                PATTERN (A B+ C)
                DEFINE
                    B AS totalprice < PREV(totalprice)
            );
        """
    },
    # 8. Aggregates with RUNNING vs FINAL prefixes (avg and count)
    {
        "name": "Running Avg vs Final Count Functions",
        "query": """
            SELECT * FROM orders MATCH_RECOGNIZE(
                PARTITION BY custkey
                ORDER BY orderdate
                MEASURES
                    RUNNING avg(A.totalprice) AS running_avg,
                    FINAL count(A.*) AS final_count
                ONE ROW PER MATCH
                PATTERN (A B+ C)
                DEFINE
                    B AS totalprice < PREV(totalprice)
            );
        """
    },
    # 9. Empty pattern, PATTERN (()), with no DEFINE clause.
    #    The measures still reference A, which is not a variable of this pattern,
    #    and despite the entry name the query uses ONE ROW PER MATCH rather than
    #    an unmatched-rows mode.
    {
        "name": "Empty Match and Unmatched Rows",
        "query": """
            SELECT * FROM orders MATCH_RECOGNIZE(
                PARTITION BY custkey
                ORDER BY orderdate
                MEASURES
                    A.totalprice AS starting_price,
                    count(A.*) AS match_count
                ONE ROW PER MATCH
                PATTERN (())
            );
        """
    },
    # 10. Physical navigation functions PREV() and NEXT() with an explicit offset argument
    {
        "name": "Physical Navigation Functions",
        "query": """
            SELECT * FROM orders MATCH_RECOGNIZE(
                PARTITION BY custkey
                ORDER BY orderdate
                MEASURES
                    PREV(A.totalprice, 2) AS prev_price,
                    NEXT(A.totalprice, 1) AS next_price
                ONE ROW PER MATCH
                PATTERN (A B+ C)
                DEFINE
                    B AS totalprice < PREV(totalprice)
            );
        """
    },
    # 11. Nested navigation: FIRST() wrapped in PREV(), both with offset arguments.
    #     NOTE(review): the debug output recorded for this cell shows this measure
    #     broken into three fragments ("PREV(FIRST(A.totalprice", "3)",
    #     "2) AS nested_nav_price") — apparently split at the commas inside the
    #     nested call; confirm in the extractor.
    {
        "name": "Nested Navigation Function",
        "query": """
            SELECT * FROM orders MATCH_RECOGNIZE(
                PARTITION BY custkey
                ORDER BY orderdate
                MEASURES
                    PREV(FIRST(A.totalprice, 3), 2) AS nested_nav_price
                ONE ROW PER MATCH
                PATTERN (A B+ C)
                DEFINE
                    B AS totalprice < PREV(totalprice)
            );
        """
    }
]

# Run all test cases and print the resulting AST.
# Each case is handled independently: a failure is printed as "Error: ..."
# and the loop carries on, so one bad query does not hide the remaining results.
for test in test_queries:
    print(f"🔹 Running Test: {test['name']}")
    try:
        query_ast = parse_full_query(test["query"])
        print(query_ast)
    except Exception as e:
        # Deliberately broad: anything raised while parsing or printing is shown inline.
        print(f"Error: {e}")
    # 80-character rule separating one case's output from the next.
    print("=" * 80)


DEBUG:src.parser.match_recognize_extractor:Full statement text: SELECT * FROM orders MATCH_RECOGNIZE( PARTITION BY custkey ORDER BY orderdate MEASURES A.totalprice AS starting_price, LAST(B.totalprice) AS bottom_price ONE ROW PER MATCH PATTERN (A B+ C) DEFINE B AS totalprice < PREV(totalprice) );
DEBUG:src.parser.match_recognize_extractor:Extracted SELECT clause via robust splitting: SelectClause(items=[SelectItem(expression=*, metadata={})])
DEBUG:src.parser.match_recognize_extractor:Extracted FROM clause via regex: FromClause(table=orders)
DEBUG:src.parser.match_recognize_extractor:Visiting PatternRecognition context
DEBUG:src.parser.match_recognize_extractor:Extracted PARTITION BY: PartitionByClause(columns=['custkey'])
DEBUG:src.parser.match_recognize_extractor:Extracted ORDER BY: OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)])
DEBUG:src.parser.match_recognize_extractor:Raw measure text: A.totalprice AS starting_price
DEBUG:src.parser.match

🔹 Running Test: Basic One Row Per Match
FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=*, metadata={})]),
  from_clause=FromClause(table=orders),
  match_recognize=MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)]),
  measures=MeasuresClause(measures=[Measure(expression=A.totalprice, alias=starting_price, metadata={'semantics': 'RUNNING'}, ), Measure(expression=LAST(B.totalprice), alias=bottom_price, metadata={'semantics': 'RUNNING'}, )]),
  rows_per_match=RowsPerMatchClause(mode=ONE ROW PER MATCH),
  after_match_skip=None,
  pattern=PatternClause(pattern=A B+ C, metadata={'variables': ['A', 'B', 'C']}),
  subset=[],
  define=DefineClause(definitions=[Define(variable=B, condition=totalprice<PREV(totalprice))])
),
  metadata={}
)
🔹 Running Test: All Rows Per Match - Show Empty Matches
FullQueryAST(
  select_clause=SelectClause(items=[Sel

DEBUG:src.parser.match_recognize_extractor:Extracted ORDER BY: OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)])
DEBUG:src.parser.match_recognize_extractor:Raw measure text: PREV(FIRST(A.totalprice
DEBUG:src.parser.match_recognize_extractor:Extracted measure: expr=PREV(FIRST(A.totalprice, alias=None, semantics=RUNNING
DEBUG:src.parser.match_recognize_extractor:Raw measure text: 3)
DEBUG:src.parser.match_recognize_extractor:Extracted measure: expr=3), alias=None, semantics=RUNNING
DEBUG:src.parser.match_recognize_extractor:Raw measure text: 2) AS nested_nav_price
DEBUG:src.parser.match_recognize_extractor:Extracted measure: expr=2), alias=nested_nav_price, semantics=RUNNING
DEBUG:src.parser.match_recognize_extractor:Extracted MEASURES: MeasuresClause(measures=[Measure(expression=PREV(FIRST(A.totalprice, alias=None, metadata={'semantics': 'RUNNING'}, ), Measure(expression=3), alias=None, metadata={'semantics': 'RUNNING'}, ), Measure(expression=2), 

FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=*, metadata={})]),
  from_clause=FromClause(table=orders),
  match_recognize=MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)]),
  measures=MeasuresClause(measures=[Measure(expression=PREV(FIRST(A.totalprice, alias=None, metadata={'semantics': 'RUNNING'}, ), Measure(expression=3), alias=None, metadata={'semantics': 'RUNNING'}, ), Measure(expression=2), alias=nested_nav_price, metadata={'semantics': 'RUNNING'}, )]),
  rows_per_match=RowsPerMatchClause(mode=ONE ROW PER MATCH),
  after_match_skip=None,
  pattern=PatternClause(pattern=A B+ C, metadata={'variables': ['A', 'B', 'C']}),
  subset=[],
  define=DefineClause(definitions=[Define(variable=B, condition=totalprice<PREV(totalprice))])
),
  metadata={}
)


In [1]:
from src.parser.match_recognize_extractor import parse_full_query

def run_test_queries():
    """Parse a fixed set of MATCH_RECOGNIZE queries and print each outcome.

    The cases vary the RUNNING / FINAL prefixes on measures, the whitespace
    around them, and whether a measure carries an alias. For every case a
    ``---`` separator and the case description are printed, followed by either
    the value returned by ``parse_full_query`` or an ``Error: ...`` line when
    parsing or printing raises. Nothing is returned.
    """
    cases = [
        # NOTE(review): the output recorded under this cell shows an empty
        # measures list (measures=[]) for the first case even though the query
        # declares three measures — worth checking in the extractor.
        {
            "description": "Mixed semantics: RUNNING, FINAL, and default (no prefix)",
            "query": """
                SELECT * FROM orders MATCH_RECOGNIZE(
                    PARTITION BY custkey
                    ORDER BY orderdate
                    MEASURES
                            RUNNING LAST(A.totalprice) AS starting_price,
         FINAL LAST(B.totalprice) AS bottom_price,
         C.totalprice AS top_price
                    ONE ROW PER MATCH
                    PATTERN (A B+ C)
                    DEFINE
                        B AS totalprice < PREV(totalprice),
                        C AS totalprice > PREV(totalprice)
                )
            """
        },
        {
            "description": "Default semantics (no prefix in measures)",
            "query": """
                SELECT * FROM orders MATCH_RECOGNIZE(
                    PARTITION BY custkey
                    ORDER BY orderdate
                    MEASURES
                        A.totalprice AS starting_price,
                        LAST(B.totalprice) AS bottom_price
                    ONE ROW PER MATCH
                    PATTERN (A B+)
                    DEFINE
                        B AS totalprice < PREV(totalprice)
                )
            """
        },
        {
            "description": "Mixed with extra spaces and explicit prefixes",
            "query": """
                SELECT * FROM orders MATCH_RECOGNIZE(
                    PARTITION BY custkey
                    ORDER BY orderdate
                    MEASURES
                        FINAL   A.totalprice   AS   starting_price,
                        RUNNING  LAST(B.totalprice)   AS  bottom_price,
                        RUNNING NEXT(C.totalprice) AS top_price
                    ALL ROWS PER MATCH
                    PATTERN ( A B+ C+ )
                    DEFINE
                        B AS totalprice < PREV(totalprice),
                        C AS totalprice > PREV(totalprice)
                )
            """
        },
        {
            "description": "Measures without aliases",
            "query": """
                SELECT * FROM orders MATCH_RECOGNIZE(
                    PARTITION BY custkey
                    ORDER BY orderdate
                    MEASURES
                        FINAL A.totalprice,
                        RUNNING LAST(B.totalprice)
                    ONE ROW PER MATCH
                    PATTERN (A B+)
                    DEFINE
                        B AS totalprice < PREV(totalprice)
                )
            """
        }
    ]

    for case in cases:
        print(f"---\nTest: {case['description']}")
        try:
            # Parse and print inside the same guard so a failure in either
            # step is reported for this case only.
            print(parse_full_query(case['query']))
        except Exception as exc:
            # Best-effort run: show the problem and move on to the next case.
            print(f"Error: {exc}")

# Entry-point guard: run the checks only when this code executes as the main program.
if __name__ == "__main__":
    run_test_queries()


DEBUG:src.parser.match_recognize_extractor:Full statement text: SELECT * FROM orders MATCH_RECOGNIZE( PARTITION BY custkey ORDER BY orderdate MEASURES RUNNING LAST(A.totalprice) AS starting_price, FINAL LAST(B.totalprice) AS bottom_price, C.totalprice AS top_price ONE ROW PER MATCH PATTERN (A B+ C) DEFINE B AS totalprice < PREV(totalprice), C AS totalprice > PREV(totalprice) );
DEBUG:src.parser.match_recognize_extractor:Extracted SELECT clause via robust splitting: SelectClause(items=[SelectItem(expression=*, metadata={})])
DEBUG:src.parser.match_recognize_extractor:Extracted FROM clause via regex: FromClause(table=orders)
DEBUG:src.parser.match_recognize_extractor:Visiting PatternRecognition context
DEBUG:src.parser.match_recognize_extractor:Extracted PARTITION BY: PartitionByClause(columns=['custkey'])
DEBUG:src.parser.match_recognize_extractor:Extracted ORDER BY: OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)])
DEBUG:src.parser.match_recogniz

---
Test: Mixed semantics: RUNNING, FINAL, and default (no prefix)
FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=*, metadata={})]),
  from_clause=FromClause(table=orders),
  match_recognize=MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)]),
  measures=MeasuresClause(measures=[]),
  rows_per_match=RowsPerMatchClause(mode=ONE ROW PER MATCH),
  after_match_skip=None,
  pattern=PatternClause(pattern=A B+ C, metadata={'variables': ['A', 'B', 'C']}),
  subset=[],
  define=DefineClause(definitions=[Define(variable=B, condition=totalprice<PREV(totalprice)), Define(variable=C, condition=totalprice>PREV(totalprice))])
),
  metadata={}
)
---
Test: Default semantics (no prefix in measures)
FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=*, metadata={})]),
  from_clause=FromClause(table=orders),
  match_recognize=MatchRecog

In [1]:
from src.parser.match_recognize_extractor import parse_full_query

# Single-query smoke test: parse one complete statement and print the result.
if __name__ == "__main__":
    # Query with an explicit select list (x, b), ALL ROWS PER MATCH OMIT EMPTY
    # MATCHES, an alternation pattern, a SUBSET definition (U = (C, D)) and
    # three DEFINE conditions. It also contains a whitespace-only line and a
    # trailing space, which are part of the text handed to the parser.
    sample_query = """
    SELECT x ,b FROM orders MATCH_RECOGNIZE(
         PARTITION BY custkey
         ORDER BY orderdate 
         MEASURES
                  A.totalprice AS starting_price,
                  LAST(B.totalprice) AS bottom_price,
                  LAST(U.totalprice) AS top_price
         ALL ROWS PER MATCH OMIT EMPTY MATCHES
         
         PATTERN ( A | B | C | D )
         SUBSET U = (C, D)
         DEFINE
                  B AS totalprice < PREV(totalprice),
                  C AS totalprice > PREV(totalprice) AND totalprice <= A.totalprice,
                  D AS totalprice > PREV(totalprice)
    )
    """
    # No error handling here: an exception from the parser surfaces as the cell's traceback.
    query_ast = parse_full_query(sample_query)
    print(query_ast)


DEBUG:src.parser.match_recognize_extractor:Full statement text: SELECT x ,b FROM orders MATCH_RECOGNIZE( PARTITION BY custkey ORDER BY orderdate MEASURES A.totalprice AS starting_price, LAST(B.totalprice) AS bottom_price, LAST(U.totalprice) AS top_price ALL ROWS PER MATCH OMIT EMPTY MATCHES PATTERN ( A | B | C | D ) SUBSET U = (C, D) DEFINE B AS totalprice < PREV(totalprice), C AS totalprice > PREV(totalprice) AND totalprice <= A.totalprice, D AS totalprice > PREV(totalprice) );
DEBUG:src.parser.match_recognize_extractor:Extracted SELECT clause via robust splitting: SelectClause(items=[SelectItem(expression=x, metadata={}), SelectItem(expression=b, metadata={})])
DEBUG:src.parser.match_recognize_extractor:Extracted FROM clause via regex: FromClause(table=orders)
DEBUG:src.parser.match_recognize_extractor:Visiting PatternRecognition context
DEBUG:src.parser.match_recognize_extractor:Extracted PARTITION BY: PartitionByClause(columns=['custkey'])
DEBUG:src.parser.match_recognize_extractor

FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=x, metadata={}), SelectItem(expression=b, metadata={})]),
  from_clause=FromClause(table=orders),
  match_recognize=MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)]),
  measures=MeasuresClause(measures=[Measure(expression=A.totalprice, alias=starting_price, metadata={'semantics': 'RUNNING'}, ), Measure(expression=LAST(B.totalprice), alias=bottom_price, metadata={'semantics': 'RUNNING'}, ), Measure(expression=LAST(U.totalprice), alias=top_price, metadata={'semantics': 'RUNNING'}, )]),
  rows_per_match=RowsPerMatchClause(mode=ALL ROWS PER MATCH, OMIT EMPTY MATCHES),
  after_match_skip=None,
  pattern=PatternClause(pattern=A | B | C | D, metadata={'variables': ['A', 'B', 'C', 'D']}),
  subset=[SubsetClause(subset_text=U=(C,D))],
  define=DefineClause(definitions=[Define(variable=B, condition=

In [1]:
from src.parser.match_recognize_extractor import parse_match_recognize_query
# Single-query smoke test using parse_match_recognize_query instead of parse_full_query.
# The output recorded for this cell prints a MatchRecognizeClause (no surrounding
# FullQueryAST with select/from clauses).
if __name__ == "__main__":
    # The SQL starts at column 0 inside the literal; it exercises CLASSIFIER()
    # and MATCH_NUMBER() alongside ordinary measures.
    sample_query = """
SELECT *
FROM orders
MATCH_RECOGNIZE(
  PARTITION BY custkey
  ORDER BY orderdate
  MEASURES
    CLASSIFIER() AS pattern_type,
    MATCH_NUMBER() AS match_id,
    A.totalprice AS starting_price,
    LAST(B.totalprice) AS bottom_price
  ONE ROW PER MATCH
  PATTERN (A B+ C)
  DEFINE
    B AS totalprice < PREV(totalprice)
);
    """
    # No error handling here: an exception from the parser surfaces as the cell's traceback.
    components = parse_match_recognize_query(sample_query)

    print(components)

DEBUG:src.parser.match_recognize_extractor:Visiting PatternRecognition context
DEBUG:src.parser.match_recognize_extractor:Extracted PARTITION BY: PartitionByClause(columns=['custkey'])
DEBUG:src.parser.match_recognize_extractor:Extracted ORDER BY: OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)])
DEBUG:src.parser.match_recognize_extractor:Detected CLASSIFIER function in measure: Measure(expression=CLASSIFIER(), alias=pattern_type, metadata={}, CLASSIFIER() detected)
DEBUG:src.parser.match_recognize_extractor:Detected MATCH_NUMBER function in measure: Measure(expression=MATCH_NUMBER(), alias=match_id, metadata={}, MATCH_NUMBER() detected)
DEBUG:src.parser.match_recognize_extractor:Extracted MEASURES: MeasuresClause(measures=[Measure(expression=CLASSIFIER(), alias=pattern_type, metadata={}, CLASSIFIER() detected), Measure(expression=MATCH_NUMBER(), alias=match_id, metadata={}, MATCH_NUMBER() detected), Measure(expression=A.totalprice, alias=startin

MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)]),
  measures=MeasuresClause(measures=[Measure(expression=CLASSIFIER(), alias=pattern_type, metadata={}, CLASSIFIER() detected), Measure(expression=MATCH_NUMBER(), alias=match_id, metadata={}, MATCH_NUMBER() detected), Measure(expression=A.totalprice, alias=starting_price, metadata={}, ), Measure(expression=LAST(B.totalprice), alias=bottom_price, metadata={}, )]),
  rows_per_match=RowsPerMatchClause(mode=ONE ROW PER MATCH),
  after_match_skip=None,
  pattern=PatternClause(pattern=A B+ C, metadata={'variables': ['A', 'B', 'C']}),
  subset=[],
  define=DefineClause(definitions=[Define(variable=B, condition=totalprice<PREV(totalprice))])
)


In [1]:
from src.parser.match_recognize_extractor import parse_full_query

# Three queries focused on the MEASURES clause: CLASSIFIER()/MATCH_NUMBER(),
# RUNNING vs FINAL on a navigation call, and RUNNING vs FINAL on aggregates.
if __name__ == "__main__":
    # Each entry is a dict with a display "name" and the SQL text under "query".
    # Several SQL lines end with a trailing space; that whitespace is part of
    # the text handed to the parser.
    test_queries = [
        {
            "name": "Classifier and Match Number Functions",
            "query": """
            SELECT * FROM orders 
            MATCH_RECOGNIZE(
                PARTITION BY custkey 
                ORDER BY orderdate 
                MEASURES
                    CLASSIFIER() AS pattern_type,
                    MATCH_NUMBER() AS match_id,
                    A.totalprice AS starting_price,
                    LAST(B.totalprice) AS bottom_price
                ONE ROW PER MATCH 
                PATTERN ( A B+ C ) 
                DEFINE 
                    B AS totalprice < PREV(totalprice)
            );
            """
        },
        {
            "name": "Running vs Final Semantics in MEASURES",
            "query": """
            SELECT * FROM orders 
            MATCH_RECOGNIZE(
                PARTITION BY custkey 
                ORDER BY orderdate 
                MEASURES
                    RUNNING LAST(A.totalprice) AS running_last_price,
                    FINAL LAST(A.totalprice) AS final_last_price
                ONE ROW PER MATCH 
                PATTERN ( A B+ C ) 
                DEFINE 
                    B AS totalprice < PREV(totalprice)
            );
            """
        },
        {
            "name": "Running Avg vs Final Count Functions",
            "query": """
            SELECT * FROM orders 
            MATCH_RECOGNIZE(
                PARTITION BY custkey 
                ORDER BY orderdate 
                MEASURES
                    RUNNING avg(A.totalprice) AS running_avg,
                    FINAL count(A.*) AS final_count
                ONE ROW PER MATCH 
                PATTERN ( A B+ C ) 
                DEFINE 
                    B AS totalprice < PREV(totalprice)
            );
            """
        }
    ]
    
    # Parse and print each case in order, with an 80-character rule after each.
    # There is no try/except in this loop: the first exception raised by
    # parse_full_query stops the remaining cases.
    for test in test_queries:
        print(f"🔹 Running Test: {test['name']}")
        query_ast = parse_full_query(test["query"])
        print(query_ast)
        print("=" * 80)


DEBUG:src.parser.match_recognize_extractor:Full statement text: SELECT * FROM orders MATCH_RECOGNIZE( PARTITION BY custkey ORDER BY orderdate MEASURES CLASSIFIER() AS pattern_type, MATCH_NUMBER() AS match_id, A.totalprice AS starting_price, LAST(B.totalprice) AS bottom_price ONE ROW PER MATCH PATTERN ( A B+ C ) DEFINE B AS totalprice < PREV(totalprice) );
DEBUG:src.parser.match_recognize_extractor:Extracted SELECT clause via robust splitting: SelectClause(items=[SelectItem(expression=*, metadata={})])
DEBUG:src.parser.match_recognize_extractor:Extracted FROM clause via regex: FromClause(table=orders)
DEBUG:src.parser.match_recognize_extractor:Visiting PatternRecognition context
DEBUG:src.parser.match_recognize_extractor:Extracted PARTITION BY: PartitionByClause(columns=['custkey'])
DEBUG:src.parser.match_recognize_extractor:Extracted ORDER BY: OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)])
DEBUG:src.parser.match_recognize_extractor:Extracted M

🔹 Running Test: Classifier and Match Number Functions
FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=*, metadata={})]),
  from_clause=FromClause(table=orders),
  match_recognize=MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)]),
  measures=MeasuresClause(measures=[Measure(expression=CLASSIFIER(), alias=pattern_type, metadata={'semantics': 'RUNNING'}, CLASSIFIER() detected), Measure(expression=MATCH_NUMBER(), alias=match_id, metadata={'semantics': 'RUNNING'}, MATCH_NUMBER() detected), Measure(expression=A.totalprice, alias=starting_price, metadata={'semantics': 'RUNNING'}, ), Measure(expression=LAST(B.totalprice), alias=bottom_price, metadata={'semantics': 'RUNNING'}, )]),
  rows_per_match=RowsPerMatchClause(mode=ONE ROW PER MATCH),
  after_match_skip=None,
  pattern=PatternClause(pattern=A B+ C, metadata={'variables': ['A', 'B', 'C']})

In [2]:
from src.parser.match_recognize_extractor import parse_full_query

# Demo driver: feed a set of MATCH_RECOGNIZE queries to parse_full_query and
# print whatever AST object it returns for each one.
if __name__ == "__main__":
    test_queries = [
        # Case 1: ALL ROWS PER MATCH OMIT EMPTY MATCHES with an alternation
        # pattern, a SUBSET clause and several DEFINE conditions.
        dict(
            name="All Rows Per Match with Omit Empty Matches",
            query="""
            SELECT * FROM orders MATCH_RECOGNIZE(
                 PARTITION BY custkey
                 ORDER BY orderdate 
                 MEASURES
                          A.totalprice AS starting_price,
                          LAST(B.totalprice) AS bottom_price,
                          LAST(U.totalprice) AS top_price
                 ALL ROWS PER MATCH OMIT EMPTY MATCHES
                 PATTERN ( A | B | C | D )
                 SUBSET U = (C, D)
                 DEFINE
                          B AS totalprice < PREV(totalprice),
                          C AS totalprice > PREV(totalprice) AND totalprice <= A.totalprice,
                          D AS totalprice > PREV(totalprice)
            )
            """,
        ),
        # Case 2: ONE ROW PER MATCH with a `+` quantifier in the pattern.
        dict(
            name="One Row Per Match with Quantifier",
            query="""
            SELECT * FROM orders MATCH_RECOGNIZE(
                 PARTITION BY custkey
                 ORDER BY orderdate 
                 MEASURES
                          A.totalprice AS starting_price,
                          LAST(B.totalprice) AS bottom_price
                 ONE ROW PER MATCH
                 PATTERN ( A B+ C )
                 DEFINE B AS totalprice < PREV(totalprice)
            )
            """,
        ),
        # Case 3: ALL ROWS PER MATCH WITH UNMATCHED ROWS and a grouped,
        # quantified alternation inside the pattern.
        dict(
            name="All Rows with Unmatched Rows and Nested Pattern",
            query="""
            SELECT * FROM orders MATCH_RECOGNIZE(
                 PARTITION BY custkey
                 ORDER BY orderdate 
                 MEASURES
                          A.totalprice AS starting_price,
                          F.totalprice AS ending_price
                 ALL ROWS PER MATCH WITH UNMATCHED ROWS
                 PATTERN ( A (B C | D E)+ F )
                 DEFINE 
                          B AS totalprice < PREV(totalprice),
                          C AS totalprice > PREV(totalprice),
                          D AS totalprice = PREV(totalprice),
                          E AS totalprice >= PREV(totalprice)
            )
            """,
        ),
        # Case 4: exclusion syntax `{- ... -}` in the pattern.
        dict(
            name="Exclusion Pattern",
            query="""
            SELECT * FROM transactions MATCH_RECOGNIZE(
                 PARTITION BY account_id
                 ORDER BY transaction_date
                 MEASURES
                          A.amount AS first_amount,
                          C.amount AS last_amount
                 ONE ROW PER MATCH
                 PATTERN ( {- A -} B C )
                 DEFINE 
                          B AS amount < PREV(amount),
                          C AS amount > PREV(amount)
            )
            """,
        ),
        # Case 5: DEFINE conditions combined with AND / OR.
        # NOTE(review): MEASURES reference `price` while DEFINE references
        # `totalprice` -- confirm this mix of column names is intentional.
        dict(
            name="Complex Conditions in DEFINE",
            query="""
            SELECT * FROM sales MATCH_RECOGNIZE(
                 PARTITION BY region
                 ORDER BY sale_date
                 MEASURES
                          A.price AS initial_price,
                          C.price AS final_price
                 ONE ROW PER MATCH
                 PATTERN ( A B C )
                 DEFINE 
                          B AS totalprice < PREV(totalprice) AND totalprice > 100,
                          C AS totalprice > PREV(totalprice) OR totalprice = 100
            )
            """,
        ),
        # Case 6: PERMUTE(...) in the pattern.
        dict(
            name="Pattern Permutation",
            query="""
            SELECT * FROM customers MATCH_RECOGNIZE(
                 PARTITION BY customer_id
                 ORDER BY transaction_date
                 MEASURES
                          A.amount AS first_spend,
                          C.amount AS final_spend
                 ONE ROW PER MATCH
                 PATTERN ( PERMUTE(A, B, C) )
                 DEFINE 
                          B AS amount < PREV(amount),
                          C AS amount > PREV(amount)
            )
            """,
        ),
        # Case 7: brace quantifiers -- {m,n}, {n} and {,n}.
        dict(
            name="Range Quantifiers",
            query="""
            SELECT * FROM stock_data MATCH_RECOGNIZE(
                 PARTITION BY stock_symbol
                 ORDER BY trade_date
                 MEASURES
                          A.price AS first_price,
                          C.price AS final_price
                 ONE ROW PER MATCH
                 PATTERN ( A{2,4} B{3} C{,5} )
                 DEFINE 
                          B AS price < PREV(price),
                          C AS price > PREV(price)
            )
            """,
        ),
    ]

    # Parse the cases in list order; each one prints a banner, the parsed AST
    # and an 80-character rule line.
    for test in test_queries:
        print(f"🔹 Running Test: {test['name']}")
        query_ast = parse_full_query(test["query"])
        print(query_ast, "=" * 80, sep="\n")


DEBUG:src.parser.match_recognize_extractor:Full statement text: SELECT * FROM orders MATCH_RECOGNIZE( PARTITION BY custkey ORDER BY orderdate MEASURES A.totalprice AS starting_price, LAST(B.totalprice) AS bottom_price, LAST(U.totalprice) AS top_price ALL ROWS PER MATCH OMIT EMPTY MATCHES PATTERN ( A | B | C | D ) SUBSET U = (C, D) DEFINE B AS totalprice < PREV(totalprice), C AS totalprice > PREV(totalprice) AND totalprice <= A.totalprice, D AS totalprice > PREV(totalprice) );
DEBUG:src.parser.match_recognize_extractor:Extracted SELECT clause via robust splitting: SelectClause(items=[SelectItem(expression=*, metadata={})])
DEBUG:src.parser.match_recognize_extractor:Extracted FROM clause via regex: FromClause(table=orders)
DEBUG:src.parser.match_recognize_extractor:Visiting PatternRecognition context
DEBUG:src.parser.match_recognize_extractor:Extracted PARTITION BY: PartitionByClause(columns=['custkey'])
DEBUG:src.parser.match_recognize_extractor:Extracted ORDER BY: OrderByClause(sort_it

🔹 Running Test: All Rows Per Match with Omit Empty Matches
FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=*, metadata={})]),
  from_clause=FromClause(table=orders),
  match_recognize=MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)]),
  measures=MeasuresClause(measures=[Measure(expression=A.totalprice, alias=starting_price, metadata={'semantics': 'RUNNING'}, ), Measure(expression=LAST(B.totalprice), alias=bottom_price, metadata={'semantics': 'RUNNING'}, ), Measure(expression=LAST(U.totalprice), alias=top_price, metadata={'semantics': 'RUNNING'}, )]),
  rows_per_match=RowsPerMatchClause(mode=ALL ROWS PER MATCH, OMIT EMPTY MATCHES),
  after_match_skip=None,
  pattern=PatternClause(pattern=A | B | C | D, metadata={'variables': ['A', 'B', 'C', 'D']}),
  subset=[SubsetClause(subset_text=U=(C,D))],
  define=DefineClause(definitions=[Define(va

In [1]:
from src.parser.match_recognize_extractor import parse_full_query

# Demo driver for the ROWS PER MATCH variants: each query is handed to
# parse_full_query and the returned AST object is printed.
if __name__ == "__main__":
    test_queries = [
        # Case 1: ONE ROW PER MATCH.
        dict(
            name="One Row Per Match (Default)",
            query="""
            SELECT * FROM orders MATCH_RECOGNIZE(
                 PARTITION BY custkey
                 ORDER BY orderdate
                 MEASURES
                      A.totalprice AS starting_price,
                      LAST(B.totalprice) AS bottom_price
                 ONE ROW PER MATCH
                 PATTERN ( A B+ C )
                 DEFINE B AS totalprice < PREV(totalprice)
            );
            """,
        ),
        # Case 2: ALL ROWS PER MATCH SHOW EMPTY MATCHES, with a SUBSET clause.
        dict(
            name="All Rows Per Match - Show Empty Matches",
            query="""
            SELECT * FROM orders MATCH_RECOGNIZE(
                 PARTITION BY custkey
                 ORDER BY orderdate
                 MEASURES
                      A.totalprice AS starting_price,
                      LAST(B.totalprice) AS bottom_price,
                      LAST(U.totalprice) AS top_price
                 ALL ROWS PER MATCH SHOW EMPTY MATCHES
                 PATTERN ( A | B | C | D )
                 SUBSET U = (C, D)
                 DEFINE
                      B AS totalprice < PREV(totalprice),
                      C AS totalprice > PREV(totalprice) AND totalprice <= A.totalprice,
                      D AS totalprice > PREV(totalprice)
            );
            """,
        ),
        # Case 3: ALL ROWS PER MATCH OMIT EMPTY MATCHES.
        dict(
            name="All Rows Per Match - Omit Empty Matches",
            query="""
            SELECT * FROM transactions MATCH_RECOGNIZE(
                 PARTITION BY account_id
                 ORDER BY transaction_date
                 MEASURES
                      A.amount AS start_amount,
                      C.amount AS final_amount
                 ALL ROWS PER MATCH OMIT EMPTY MATCHES
                 PATTERN ( A | B | C )
                 DEFINE
                      B AS amount < PREV(amount),
                      C AS amount > PREV(amount)
            );
            """,
        ),
        # Case 4: ALL ROWS PER MATCH WITH UNMATCHED ROWS.
        dict(
            name="All Rows Per Match - With Unmatched Rows",
            query="""
            SELECT * FROM sales MATCH_RECOGNIZE(
                 PARTITION BY region
                 ORDER BY sale_date
                 MEASURES
                      A.price AS initial_price,
                      D.price AS final_price
                 ALL ROWS PER MATCH WITH UNMATCHED ROWS
                 PATTERN ( A B C D )
                 DEFINE
                      B AS price < PREV(price),
                      C AS price > PREV(price),
                      D AS price > A.price
            );
            """,
        ),
        # Case 5: WITH UNMATCHED ROWS combined with a grouped, quantified
        # alternation inside the pattern.
        dict(
            name="Nested Pattern with Unmatched Rows",
            query="""
            SELECT * FROM orders MATCH_RECOGNIZE(
                 PARTITION BY custkey
                 ORDER BY orderdate
                 MEASURES
                      A.totalprice AS starting_price,
                      F.totalprice AS ending_price
                 ALL ROWS PER MATCH WITH UNMATCHED ROWS
                 PATTERN ( A (B C | D E)+ F )
                 DEFINE 
                      B AS totalprice < PREV(totalprice),
                      C AS totalprice > PREV(totalprice),
                      D AS totalprice = PREV(totalprice),
                      E AS totalprice >= PREV(totalprice)
            );
            """,
        ),
    ]

    # Parse the cases in list order; each one prints a banner, the parsed AST
    # and an 80-character rule line.
    for test in test_queries:
        print(f"🔹 Running Test: {test['name']}")
        query_ast = parse_full_query(test["query"])
        print(query_ast, "=" * 80, sep="\n")


DEBUG:src.parser.match_recognize_extractor:Full statement text: SELECT * FROM orders MATCH_RECOGNIZE( PARTITION BY custkey ORDER BY orderdate MEASURES A.totalprice AS starting_price, LAST(B.totalprice) AS bottom_price ONE ROW PER MATCH PATTERN ( A B+ C ) DEFINE B AS totalprice < PREV(totalprice) );
DEBUG:src.parser.match_recognize_extractor:Extracted SELECT clause via robust splitting: SelectClause(items=[SelectItem(expression=*, metadata={})])
DEBUG:src.parser.match_recognize_extractor:Extracted FROM clause via regex: FromClause(table=orders)
DEBUG:src.parser.match_recognize_extractor:Visiting PatternRecognition context
DEBUG:src.parser.match_recognize_extractor:Extracted PARTITION BY: PartitionByClause(columns=['custkey'])
DEBUG:src.parser.match_recognize_extractor:Extracted ORDER BY: OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)])
DEBUG:src.parser.match_recognize_extractor:Extracted MEASURES: MeasuresClause(measures=[Measure(expression=A.tot

🔹 Running Test: One Row Per Match (Default)
FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=*, metadata={})]),
  from_clause=FromClause(table=orders),
  match_recognize=MatchRecognizeClause(
  partition_by=PartitionByClause(columns=['custkey']),
  order_by=OrderByClause(sort_items=[SortItem(column=orderdate, ordering=ASC, nulls_ordering=None)]),
  measures=MeasuresClause(measures=[Measure(expression=A.totalprice, alias=starting_price, metadata={}), Measure(expression=LAST(B.totalprice), alias=bottom_price, metadata={})]),
  rows_per_match=RowsPerMatchClause(mode=ONE ROW PER MATCH),
  after_match_skip=None,
  pattern=PatternClause(pattern=A B+ C, metadata={'variables': ['A', 'B', 'C']}),
  subset=[],
  define=DefineClause(definitions=[Define(variable=B, condition=totalprice<PREV(totalprice))])
),
  metadata={}
)
🔹 Running Test: All Rows Per Match - Show Empty Matches
FullQueryAST(
  select_clause=SelectClause(items=[SelectItem(expression=*, metadata={})]),
  from

In [None]:
import sys
import pandas as pd
# Use an absolute import for match_recognize.
from transformations.match_recognize import match_recognize

# SQL handed to match_recognize: a one-variable pattern (A) over `employees`,
# partitioned by department and region and ordered by hire_date.
query = """
    SELECT id, name FROM employees MATCH_RECOGNIZE (
        PARTITION BY department, region
        ORDER BY hire_date
        MEASURES salary AS avg_salary
        PATTERN (A)
        DEFINE A AS salary > 1000
    );
    """

# Input rows are written as tuples and expanded into one dict per row; the
# column tuple fixes the key order of every record.
employee_columns = ("id", "name", "department", "region", "hire_date", "salary")
employee_rows = [
    (1, "Alice", "Sales", "West", "2021-01-01", 1200),
    (2, "Bob", "Sales", "West", "2021-01-02", 1300),
    (3, "Charlie", "Sales", "West", "2021-01-03", 900),
    (4, "Diana", "Sales", "West", "2021-01-04", 1100),
]
data = [dict(zip(employee_columns, row)) for row in employee_rows]

# Best-effort demo run: any exception raised while evaluating or printing the
# result is reduced to a one-line message instead of a traceback.
try:
    output_df = match_recognize(query, pd.DataFrame(data))
    print("Match Recognize Output:")
    print(output_df)
except Exception as e:
    print(f"Error: {str(e)}")

ImportError: cannot import name 'AutomatonBuilder' from partially initialized module 'transformations.automaton.automaton_builder' (most likely due to a circular import) (/home/monierashraf/Desktop/llm/Match_recognize/project/transformations/automaton/automaton_builder.py)