In [1]:
!python test_matcher.py



Running Pattern Matching Tests

Executing: Basic Pattern
DEBUG:sql_parser:Raw measures content: FIRST(price) AS start_price,
            LAST(price) AS end_price,
            COUNT(*) AS pattern_length
DEBUG:sql_parser:Added function measure: start_price = (FIRST, price)
DEBUG:sql_parser:Added function measure: end_price = (LAST, price)
DEBUG:sql_parser:Added COUNT(*) measure: pattern_length
DEBUG:sql_parser:All parsed measures: {'start_price': ('FIRST', 'price'), 'end_price': ('LAST', 'price'), 'pattern_length': ('COUNT', '*')}
DEBUG:sql_parser:Raw pattern: A+ B
DEBUG:sql_parser:Parsed pattern: A+ B
DEBUG:sql_parser:Raw define content: A AS price > PREV(price),
            B AS price < PREV(price)
DEBUG:sql_parser:Added condition for variable A: price > PREV(price)
DEBUG:sql_parser:Added condition for variable B: price < PREV(price)
DEBUG:sql_parser:Parsed conditions: {'A': 'price > PREV(price)', 'B': 'price < PREV(price)'}
DEBUG:sql_parser:Parsed parameters: MatchRecognizeParams(par

In [1]:
# Import necessary libraries
import pandas as pd
from execution_engine import run_match_recognize

# Sample input: five consecutive daily rows (price and volume) for each of
# two tickers, AAPL and GOOGL.
data = pd.DataFrame([
    {"ticker": "AAPL", "ts": "2024-01-01", "price": 100, "volume": 1000},
    {"ticker": "AAPL", "ts": "2024-01-02", "price": 95, "volume": 1100},
    {"ticker": "AAPL", "ts": "2024-01-03", "price": 90, "volume": 1200},
    {"ticker": "AAPL", "ts": "2024-01-04", "price": 92, "volume": 1300},
    {"ticker": "AAPL", "ts": "2024-01-05", "price": 98, "volume": 1400},
    {"ticker": "GOOGL", "ts": "2024-01-01", "price": 500, "volume": 2000},
    {"ticker": "GOOGL", "ts": "2024-01-02", "price": 480, "volume": 2100},
    {"ticker": "GOOGL", "ts": "2024-01-03", "price": 470, "volume": 2200},
    {"ticker": "GOOGL", "ts": "2024-01-04", "price": 485, "volume": 2300},
    {"ticker": "GOOGL", "ts": "2024-01-05", "price": 510, "volume": 2400}
])

# MATCH_RECOGNIZE query text handed to the engine as a plain string.
# Rows are partitioned by ticker and ordered by ts; the pattern is one or
# more falling rows (A), then a rising (B) or flat (C) row, then an optional
# further rising row (D). One summary row is requested per match.
query = """
SELECT *
FROM data
MATCH_RECOGNIZE (
    PARTITION BY ticker
    ORDER BY ts
    MEASURES
        FIRST(price) AS start_price,
        LAST(A.price) AS bottom_price,
        LAST(price) AS end_price,
        COUNT(*) AS pattern_length,
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num,
        RUNNING AVG(volume) AS avg_volume,
        FINAL MAX(volume) AS max_volume
    ONE ROW PER MATCH
    PATTERN (A+ (B | C) D?)
    SUBSET PRICE_TREND = (B, C)
    DEFINE
        A AS price < PREV(price),
        B AS price > PREV(price),
        C AS price = PREV(price),
        D AS price > PREV(price)
);
"""

# Execute the query against `data` and print the result.
# NOTE(review): any exception is reduced to a one-line message here, so the
# traceback is lost; drop the try/except while debugging the engine.
try:
    result = run_match_recognize(query, data)
    print("MATCH_RECOGNIZE Results:")
    print(result)
except Exception as e:
    print(f"Error running MATCH_RECOGNIZE: {str(e)}")


DEBUG:sql_parser:Raw measures content: FIRST(price) AS start_price,
        LAST(A.price) AS bottom_price,
        LAST(price) AS end_price,
        COUNT(*) AS pattern_length,
        CLASSIFIER() AS pattern_var,
        MATCH_NUMBER() AS match_num,
        RUNNING AVG(volume) AS avg_volume,
        FINAL MAX(volume) AS max_volume
DEBUG:sql_parser:Added function measure: start_price = (FIRST, price)
DEBUG:sql_parser:Added pattern variable measure: bottom_price = (LAST, A.price)
DEBUG:sql_parser:Added function measure: end_price = (LAST, price)
DEBUG:sql_parser:Added COUNT(*) measure: pattern_length
DEBUG:sql_parser:Added CLASSIFIER measure: pattern_var
DEBUG:sql_parser:Added MATCH_NUMBER measure: match_num
DEBUG:sql_parser:Added function measure: avg_volume = (RUNNING_AVG, volume)
DEBUG:sql_parser:Added function measure: max_volume = (FINAL_MAX, volume)
DEBUG:sql_parser:All parsed measures: {'start_price': ('FIRST', 'price'), 'bottom_price': ('LAST', 'A.price'), 'end_price': ('LAST', 

MATCH_RECOGNIZE Results:
  ticker  start_price  bottom_price  end_price  pattern_length pattern_var  \
0   AAPL          100            90         92               4           A   
1  GOOGL          500           470        485               4           A   

   match_num  avg_volume  max_volume  
0          2      1150.0      1300.0  
1          4      2150.0      2300.0  


In [6]:
!python -m transformations.match_recognize_ast


DEBUG:__main__:Entering MATCH_RECOGNIZE clause (enhanced AST builder)
DEBUG:__main__:Extracted PARTITION BY: ['custkey']
DEBUG:__main__:Extracted ORDER BY: ['orderdate']
DEBUG:__main__:Parsed expression 'FIRST(price)' into AST: ExpressionAST(type='navigation', value=None, operator=None, children=[], function_name=None, arguments=[ExpressionAST(type='identifier', value='price', operator=None, children=[], function_name=None, arguments=[], navigation_type=None, offset=None), ExpressionAST(type='literal', value='0', operator=None, children=[], function_name=None, arguments=[], navigation_type=None, offset=None)], navigation_type='FIRST', offset=None)
DEBUG:__main__:Parsed expression 'LAST(price)' into AST: ExpressionAST(type='navigation', value=None, operator=None, children=[], function_name=None, arguments=[ExpressionAST(type='identifier', value='price', operator=None, children=[], function_name=None, arguments=[], navigation_type=None, offset=None), ExpressionAST(type='literal', value='

In [4]:
!python -m unittest tests.test_enhancements


INFO:root:✓ Successfully detected circular reference
.DEBUG:transformations.match_recognize_pattern:Parsed pattern 'A | B | A' into AST: PatternAST(type='alternation', value=None, quantifier=None, quantifier_min=None, quantifier_max=None, children=[PatternAST(type='literal', value='A', quantifier=None, quantifier_min=None, quantifier_max=None, children=[], excluded=False), PatternAST(type='alternation', value=None, quantifier=None, quantifier_min=None, quantifier_max=None, children=[PatternAST(type='literal', value='B', quantifier=None, quantifier_min=None, quantifier_max=None, children=[], excluded=False), PatternAST(type='literal', value='A', quantifier=None, quantifier_min=None, quantifier_max=None, children=[], excluded=False)], excluded=False)], excluded=False)
INFO:root:✓ Successfully optimized 'A | B | A' to remove duplicates
.Entering MATCH_RECOGNIZE clause
DEBUG:transformations.match_recognize_parser:Entering MATCH_RECOGNIZE clause (enhanced AST builder)
DEBUG:transformations.

In [3]:
from antlr4 import *
from grammar.TrinoLexer import TrinoLexer
from grammar.TrinoParser import TrinoParser

def parse_input(input_text):
    """Parse SQL text with the generated Trino lexer and parser.

    Args:
        input_text: SQL source string to parse.

    Returns:
        A ``(tree, parser)`` pair: the parse tree returned by the
        ``statements`` rule, and the parser instance that produced it.
    """
    tokens = CommonTokenStream(TrinoLexer(InputStream(input_text)))
    parser = TrinoParser(tokens)

    # `statements` is used as the entry rule; switch to another rule here if
    # the grammar's start rule is different.
    return parser.statements(), parser

# Smoke test: parse a minimal SELECT and print its parse tree as a
# parenthesised string, with rule names resolved through `parser`.
if __name__ == "__main__":
    sample = "SELECT * FROM table_name;"  # Example SQL query
    tree, parser = parse_input(sample)
    print(tree.toStringTree(recog=parser))  # Print the parse tree


(statements (singleStatement (statement (rootQuery (query (queryNoWith (queryTerm (queryPrimary (querySpecification SELECT (selectItem *) FROM (relation (sampledRelation (patternRecognition (aliasedRelation (relationPrimary (qualifiedName (identifier table_name)))))))))))))) ;))


In [1]:
from antlr4 import *
from grammar.TrinoLexer import TrinoLexer
from grammar.TrinoParser import TrinoParser

def parse_input(input_text):
    """Run the Trino lexer and parser over ``input_text``.

    Args:
        input_text: SQL source string to parse.

    Returns:
        Tuple ``(tree, parser)`` where ``tree`` is the result of the
        ``statements`` rule and ``parser`` is the parser that built it.
    """
    char_stream = InputStream(input_text)
    token_stream = CommonTokenStream(TrinoLexer(char_stream))
    parser = TrinoParser(token_stream)

    # Entry rule: `statements`. Change this call if the grammar uses a
    # different start rule.
    parse_tree = parser.statements()
    return parse_tree, parser

# Smoke test: parse a minimal SELECT and print its parse tree as a
# parenthesised string, with rule names resolved through `parser`.
if __name__ == "__main__":
    sample = "SELECT * FROM table_name;"  # Example SQL query
    tree, parser = parse_input(sample)
    print(tree.toStringTree(recog=parser))  # Print the parse tree


(statements (singleStatement (statement (rootQuery (query (queryNoWith (queryTerm (queryPrimary (querySpecification SELECT (selectItem *) FROM (relation (sampledRelation (patternRecognition (aliasedRelation (relationPrimary (qualifiedName (identifier table_name)))))))))))))) ;))


In [5]:
from antlr4 import *
from grammar.TrinoLexer import *
from grammar.TrinoParser import *

In [3]:
from antlr4 import *
from grammar.TrinoLexer import TrinoLexer
from grammar.TrinoParser import TrinoParser

def parse_input(input_text):
    """Build a parse tree for ``input_text`` using the Trino grammar.

    Args:
        input_text: SQL source string to parse.

    Returns:
        ``(tree, parser)``: the tree produced by the ``statements`` rule,
        plus the parser itself so callers can pass it to tree-printing
        helpers.
    """
    lexer = TrinoLexer(InputStream(input_text))
    parser = TrinoParser(CommonTokenStream(lexer))

    # `statements` is the start rule used here; adjust it to match the
    # entry rule declared in TrinoParser.g4 if that differs.
    tree = parser.statements()

    result = (tree, parser)
    return result

# Parse a full MATCH_RECOGNIZE query (PARTITION BY / ORDER BY / MEASURES /
# PATTERN / SUBSET / DEFINE) and print the resulting parse tree.
# `sample`, `tree` and `parser` become notebook globals; cells that call
# parse_input(sample) without defining `sample` rely on this assignment.
if __name__ == "__main__":
    sample = """SELECT *
    FROM data
    MATCH_RECOGNIZE (
        PARTITION BY ticker
        ORDER BY ts
        MEASURES
            FIRST(price) AS start_price,
            LAST(A.price) AS bottom_price,
            LAST(price) AS end_price,
            COUNT(*) AS pattern_length,
            CLASSIFIER() AS pattern_var,
            MATCH_NUMBER() AS match_num,
            RUNNING AVG(volume) AS avg_volume,
            FINAL MAX(volume) AS max_volume
        ONE ROW PER MATCH
        PATTERN (A+ (B | C) D?)
        SUBSET PRICE_TREND = (B, C)
        DEFINE
            A AS price < PREV(price),
            B AS price > PREV(price),
            C AS price = PREV(price),
            D AS price > PREV(price)
    );"""
    
    tree, parser = parse_input(sample)  # parse_input returns (tree, parser)
    print(tree.toStringTree(recog=parser))  # `recog=parser` supplies the rule names


(statements (singleStatement (statement (rootQuery (query (queryNoWith (queryTerm (queryPrimary (querySpecification SELECT (selectItem *) FROM (relation (sampledRelation (patternRecognition (aliasedRelation (relationPrimary (qualifiedName (identifier (nonReserved data))))) MATCH_RECOGNIZE ( PARTITION BY (expression (booleanExpression (valueExpression (primaryExpression (identifier ticker))))) ORDER BY (sortItem (expression (booleanExpression (valueExpression (primaryExpression (identifier ts)))))) MEASURES (measureDefinition (expression (booleanExpression (valueExpression (primaryExpression (qualifiedName (identifier (nonReserved FIRST))) ( (expression (booleanExpression (valueExpression (primaryExpression (identifier price))))) ))))) AS (identifier start_price)) , (measureDefinition (expression (booleanExpression (valueExpression (primaryExpression (qualifiedName (identifier (nonReserved LAST))) ( (expression (booleanExpression (valueExpression (primaryExpression (primaryExpression (i

In [19]:
from antlr4.tree.Trees import Trees

# Re-parse `sample` and dump the tree through the Trees helper instead of
# the tree's own toStringTree method.
# NOTE(review): `sample` and `parse_input` are not defined in this cell;
# they must already exist in the kernel from another cell.
tree, parser = parse_input(sample)
print(Trees.toStringTree(tree, None, parser))


(statements (singleStatement (statement (rootQuery (query (queryNoWith (queryTerm (queryPrimary (querySpecification SELECT (selectItem *) FROM (relation (sampledRelation (patternRecognition (aliasedRelation (relationPrimary (qualifiedName (identifier (nonReserved data))))) MATCH_RECOGNIZE ( PARTITION BY (expression (booleanExpression (valueExpression (primaryExpression (identifier ticker))))) ORDER BY (sortItem (expression (booleanExpression (valueExpression (primaryExpression (identifier ts)))))) MEASURES (measureDefinition (expression (booleanExpression (valueExpression (primaryExpression (qualifiedName (identifier (nonReserved FIRST))) ( (expression (booleanExpression (valueExpression (primaryExpression (identifier price))))) ))))) AS (identifier start_price)) , (measureDefinition (expression (booleanExpression (valueExpression (primaryExpression (qualifiedName (identifier (nonReserved LAST))) ( (expression (booleanExpression (valueExpression (primaryExpression (primaryExpression (i

In [21]:
from grammar.TrinoParserListener import TrinoParserListener
from antlr4 import ParseTreeWalker

class SQLListener(TrinoParserListener):
    """Parse-tree listener that prints the text of each qualifiedName node.

    NOTE(review): the printed label says "Table or Column", but the recorded
    output of this notebook also lists function names (FIRST, LAST, PREV,
    ...), so the rule is not limited to tables and columns.
    """

    def enterQualifiedName(self, ctx):
        """Print the source text of the qualifiedName context being entered."""
        qualified_name = ctx.getText()
        print("Table or Column Detected:", qualified_name)

# Parse `sample` again and walk the tree with SQLListener, which prints one
# line per qualifiedName node encountered.
# NOTE(review): relies on `sample` and `parse_input` from another cell.
tree, parser = parse_input(sample)

walker = ParseTreeWalker()
listener = SQLListener()
walker.walk(listener, tree)


Table or Column Detected: data
Table or Column Detected: FIRST
Table or Column Detected: LAST
Table or Column Detected: LAST
Table or Column Detected: COUNT
Table or Column Detected: CLASSIFIER
Table or Column Detected: MATCH_NUMBER
Table or Column Detected: AVG
Table or Column Detected: MAX
Table or Column Detected: PREV
Table or Column Detected: PREV
Table or Column Detected: PREV
Table or Column Detected: PREV


In [22]:
class MatchRecognizeListener(TrinoParserListener):
    """Parse-tree listener that reports each patternRecognition node.

    The text comes from ``ctx.getText()``; in this notebook's recorded output
    it appears with the whitespace between tokens removed.
    """

    def enterPatternRecognition(self, ctx):
        """Print the source text of the patternRecognition context."""
        clause_text = ctx.getText()
        print("MATCH_RECOGNIZE Clause Detected:", clause_text)

# Parse `sample` and walk the tree with MatchRecognizeListener, which prints
# the text of every patternRecognition node it enters.
# NOTE(review): relies on `sample` and `parse_input` from another cell.
tree, parser = parse_input(sample)

walker = ParseTreeWalker()
listener = MatchRecognizeListener()
walker.walk(listener, tree)


MATCH_RECOGNIZE Clause Detected: dataMATCH_RECOGNIZE(PARTITIONBYtickerORDERBYtsMEASURESFIRST(price)ASstart_price,LAST(A.price)ASbottom_price,LAST(price)ASend_price,COUNT(*)ASpattern_length,CLASSIFIER()ASpattern_var,MATCH_NUMBER()ASmatch_num,RUNNINGAVG(volume)ASavg_volume,FINALMAX(volume)ASmax_volumeONEROWPERMATCHPATTERN(A+(B|C)D?)SUBSETPRICE_TREND=(B,C)DEFINEAASprice<PREV(price),BASprice>PREV(price),CASprice=PREV(price),DASprice>PREV(price))


In [23]:
!pip install anytree




In [24]:
from anytree import Node, RenderTree

def build_tree(node, parser, parent=None):
    """Mirror a parse (sub)tree as anytree ``Node`` objects.

    A node with children is labelled with its grammar rule name, looked up
    in ``parser.ruleNames``; a node without children is labelled with its
    own text.

    Args:
        node: Parse-tree node to convert.
        parser: Parser whose ``ruleNames`` list maps rule indexes to names.
        parent: anytree node to attach the new node to, or ``None`` for the
            root.

    Returns:
        The anytree ``Node`` created for ``node``, with all descendants
        already attached beneath it.
    """
    child_count = node.getChildCount()
    if child_count > 0:
        label = parser.ruleNames[node.getRuleIndex()]
    else:
        label = node.getText()

    tree_node = Node(label, parent=parent)
    for position in range(child_count):
        # Passing tree_node as `parent` attaches the child as it is created.
        build_tree(node.getChild(position), parser, tree_node)
    return tree_node

# Parse `sample`, convert the parse tree to an anytree structure, and print
# it one node per line.
# NOTE(review): relies on `sample` and `parse_input` from another cell.
tree, parser = parse_input(sample)
root = build_tree(tree, parser)

# Each rendered row unpacks to (pre, fill, node); only the prefix and the
# node's name are printed, `fill` is unused.
for pre, fill, node in RenderTree(root):
    print("%s%s" % (pre, node.name))


statements
└── singleStatement
    ├── statement
    │   └── rootQuery
    │       └── query
    │           └── queryNoWith
    │               └── queryTerm
    │                   └── queryPrimary
    │                       └── querySpecification
    │                           ├── SELECT
    │                           ├── selectItem
    │                           │   └── *
    │                           ├── FROM
    │                           └── relation
    │                               └── sampledRelation
    │                                   └── patternRecognition
    │                                       ├── aliasedRelation
    │                                       │   └── relationPrimary
    │                                       │       └── qualifiedName
    │                                       │           └── identifier
    │                                       │               └── nonReserved
    │                                       │                  

Print Indented Tree Format (More Readable)

In [20]:
def print_tree(node, parser, indent=0):
    """Print a parse (sub)tree as an indented outline, one node per line.

    A node with children is printed as its grammar rule name, looked up in
    ``parser.ruleNames``; a node without children is printed as its own
    text. Each nesting level adds two spaces of indentation.

    Args:
        node: Parse-tree node to print.
        parser: Parser whose ``ruleNames`` list maps rule indexes to names.
        indent: Nesting depth of ``node``; callers normally leave this at 0.
    """
    padding = "  " * indent
    child_count = node.getChildCount()

    # Childless node: print its text and stop descending.
    if child_count == 0:
        print(padding + node.getText())
        return

    print(padding + parser.ruleNames[node.getRuleIndex()])
    for position in range(child_count):
        print_tree(node.getChild(position), parser, indent + 1)

# Parse `sample` and print its parse tree as an indented outline.
# NOTE(review): relies on `sample` and `parse_input` from another cell.
tree, parser = parse_input(sample)
print_tree(tree, parser)


statements
  singleStatement
    statement
      rootQuery
        query
          queryNoWith
            queryTerm
              queryPrimary
                querySpecification
                  SELECT
                  selectItem
                    *
                  FROM
                  relation
                    sampledRelation
                      patternRecognition
                        aliasedRelation
                          relationPrimary
                            qualifiedName
                              identifier
                                nonReserved
                                  data
                        MATCH_RECOGNIZE
                        (
                        PARTITION
                        BY
                        expression
                          booleanExpression
                            valueExpression
                              primaryExpression
                                identifier
                             

In [25]:
from antlr4.error.ErrorListener import ErrorListener

class CustomErrorListener(ErrorListener):
    """Error listener that stores syntax errors in a list.

    Attributes:
        errors: Formatted messages, one per reported syntax error, in the
            order they were reported.
    """

    def __init__(self):
        super().__init__()
        self.errors = []

    def syntaxError(self, recognizer, offendingSymbol, line, column, msg, e):
        """Record one syntax error as ``"Syntax error at <line>:<column> - <msg>"``."""
        position = f"{line}:{column}"
        self.errors.append(f"Syntax error at {position} - {msg}")

    def has_errors(self):
        """Return True when at least one error has been recorded."""
        return bool(self.errors)

def parse_input(input_text):
    """Parse SQL text with the Trino grammar, collecting all syntax errors.

    A single ``CustomErrorListener`` replaces the existing listeners on both
    the lexer and the parser. Attaching it to the lexer as well means
    token-recognition errors are collected alongside grammar errors, so
    input with lexical errors is rejected instead of yielding a tree.

    Args:
        input_text: SQL source string to parse.

    Returns:
        ``(tree, parser)`` when no error was recorded, where ``tree`` is the
        result of the ``statements`` rule. ``(None, None)`` when any syntax
        error was recorded; the collected messages are printed first.
    """
    error_listener = CustomErrorListener()

    lexer = TrinoLexer(InputStream(input_text))
    # Tokens are produced while the parser runs, so the lexer's listener
    # must be in place before parser.statements() is called.
    lexer.removeErrorListeners()
    lexer.addErrorListener(error_listener)

    parser = TrinoParser(CommonTokenStream(lexer))
    parser.removeErrorListeners()
    parser.addErrorListener(error_listener)

    tree = parser.statements()  # Use correct entry rule

    if error_listener.has_errors():
        print("Errors detected in SQL:")
        for err in error_listener.errors:
            print(err)
        # Callers must check for None before using the tree or parser.
        return None, None

    return tree, parser


In [26]:
# Deliberately invalid SQL to exercise the error-collecting parse_input:
# on syntax errors it prints the messages and returns (None, None), so
# `tree` and `parser` are both None after this cell.
# NOTE(review): this rebinds the notebook-wide `sample`; cells that parse
# `sample` afterwards will see this invalid query.
sample = "SELECT FROM data WHERE price > 10"  # Missing column names
tree, parser = parse_input(sample)


Errors detected in SQL:
Syntax error at 1:7 - extraneous input 'FROM' expecting {'ABSENT', 'ADD', 'ADMIN', 'AFTER', 'ALL', 'ANALYZE', 'ANY', 'ARRAY', 'ASC', 'AT', 'AUTHORIZATION', 'BEGIN', 'BERNOULLI', 'BOTH', 'CALL', 'CALLED', 'CASCADE', 'CASE', 'CAST', 'CATALOG', 'CATALOGS', 'COLUMN', 'COLUMNS', 'COMMENT', 'COMMIT', 'COMMITTED', 'CONDITIONAL', 'COUNT', 'COPARTITION', 'CURRENT', 'CURRENT_CATALOG', 'CURRENT_DATE', 'CURRENT_PATH', 'CURRENT_SCHEMA', 'CURRENT_TIME', 'CURRENT_TIMESTAMP', 'CURRENT_USER', 'DATA', 'DATE', 'DAY', 'DECLARE', 'DEFAULT', 'DEFINE', 'DEFINER', 'DENY', 'DESC', 'DESCRIPTOR', 'DETERMINISTIC', 'DISTINCT', 'DISTRIBUTED', 'DO', 'DOUBLE', 'EMPTY', 'ELSEIF', 'ENCODING', 'ERROR', 'EXCLUDING', 'EXISTS', 'EXPLAIN', 'EXTRACT', 'FALSE', 'FETCH', 'FILTER', 'FINAL', 'FIRST', 'FOLLOWING', 'FORMAT', 'FUNCTION', 'FUNCTIONS', 'GRACE', 'GRANT', 'GRANTED', 'GRANTS', 'GRAPHVIZ', 'GROUPING', 'GROUPS', 'HOUR', 'IF', 'IGNORE', 'IMMEDIATE', 'INCLUDING', 'INITIAL', 'INPUT', 'INTERVAL', 'INVO