Test match_recognize

In [None]:
import pandas as pd
import numpy as np
from src.executor.match_recognize import match_recognize
from pandas.testing import assert_frame_equal
import pytest # Assuming pytest is used for running tests

# --- Test Data Setup ---

# Shared fixture: minute-by-minute prices for tickers "A" (ids 1-8) and
# "B" (ids 9-11). The trailing letters mark the pattern variable each row is
# expected to play in the V-shape (A B+ C+) tests.
_BASIC_FIELDS = ("id", "ticker", "price", "ts")
_BASIC_ROWS = (
    (1, "A", 10, "2023-01-01 09:00:00"),
    (2, "A", 11, "2023-01-01 09:01:00"),  # A
    (3, "A", 12, "2023-01-01 09:02:00"),  # A
    (4, "A", 11, "2023-01-01 09:03:00"),  # B
    (5, "A", 10, "2023-01-01 09:04:00"),  # B
    (6, "A", 11, "2023-01-01 09:05:00"),  # C
    (7, "A", 12, "2023-01-01 09:06:00"),  # C
    (8, "A", 13, "2023-01-01 09:07:00"),  # C
    (9, "B", 5, "2023-01-01 09:00:00"),
    (10, "B", 4, "2023-01-01 09:01:00"),  # B
    (11, "B", 6, "2023-01-01 09:02:00"),  # C
)
# One dict per row, keyed by field name, with the timestamp text parsed.
data_basic = [
    dict(zip(_BASIC_FIELDS, (row_id, symbol, price, pd.Timestamp(stamp))))
    for row_id, symbol, price, stamp in _BASIC_ROWS
]
df_basic = pd.DataFrame(data_basic)

# --- Test Cases ---

def test_one_row_per_match_basic():
    """Check ONE ROW PER MATCH output for a V-shaped (A B+ C+) price pattern.

    Runs the query against ``df_basic`` and compares the partition column
    plus the four measures with one expected summary row per ticker.
    """
    query = """
    SELECT * FROM df_basic MATCH_RECOGNIZE (
        PARTITION BY ticker
        ORDER BY ts
        MEASURES
            FIRST(A.price) AS start_price,
            LAST(B.price) AS bottom_price,
            LAST(C.price) AS end_price,
            MATCH_NUMBER() as match_num
        ONE ROW PER MATCH
        AFTER MATCH SKIP PAST LAST ROW
        PATTERN (A B+ C+)
        DEFINE
            A AS price > PREV(price, 1), -- Initial rise (or first row) - Adjusted for first row
            B AS price < PREV(price),
            C AS price > PREV(price)
    )
    """
    compared_cols = ['ticker', 'start_price', 'bottom_price', 'end_price', 'match_num']

    # Ticker A: ids 3(A), 4-5(B), 6-8(C).
    # Ticker B: ids 9(A), 10(B), 11(C). This row assumes A is accepted on the
    # partition's first row, where PREV(price) has no value.
    expected_df = pd.DataFrame(
        [
            ('A', 12, 10, 13, 1),
            ('B', 5, 4, 6, 1),
        ],
        columns=compared_cols,
    )
    # Pin the match counter to int64 so it lines up with pandas' default ints.
    expected_df['match_num'] = expected_df['match_num'].astype(np.int64)

    output_df = match_recognize(query, df_basic)

    print("\n--- test_one_row_per_match_basic ---")
    print("Output:")
    print(output_df)
    print("Expected:")
    print(expected_df)

    # Compare only the columns under test; the index is irrelevant.
    actual = output_df[compared_cols].reset_index(drop=True)
    wanted = expected_df.reset_index(drop=True)
    assert_frame_equal(actual, wanted, check_dtype=False)

def test_all_rows_per_match_basic():
    """Check ALL ROWS PER MATCH output for the V-shaped (A B+ C+) pattern.

    Every matched row of ``df_basic`` is expected back with its classifier,
    its match number and the FINAL sum of ``price`` over the whole match.
    """
    query = """
    SELECT * FROM df_basic MATCH_RECOGNIZE (
        PARTITION BY ticker
        ORDER BY ts
        MEASURES
            CLASSIFIER() AS var_matched,
            MATCH_NUMBER() as match_num,
            FINAL SUM(price) as total_match_price -- Example FINAL aggregate
        ALL ROWS PER MATCH -- Default is SHOW EMPTY MATCHES
        AFTER MATCH SKIP PAST LAST ROW
        PATTERN (A B+ C+)
        DEFINE
            A AS price > PREV(price, 1),
            B AS price < PREV(price),
            C AS price > PREV(price)
    )
    """
    expected_cols = ['id', 'ticker', 'price', 'ts', 'var_matched', 'match_num', 'total_match_price']
    expected_rows = [
        # Ticker A, match 1 (ids 3-8): 12 + 11 + 10 + 11 + 12 + 13 = 69
        (3, 'A', 12, pd.Timestamp('2023-01-01 09:02:00'), 'A', 1, 69),
        (4, 'A', 11, pd.Timestamp('2023-01-01 09:03:00'), 'B', 1, 69),
        (5, 'A', 10, pd.Timestamp('2023-01-01 09:04:00'), 'B', 1, 69),
        (6, 'A', 11, pd.Timestamp('2023-01-01 09:05:00'), 'C', 1, 69),
        (7, 'A', 12, pd.Timestamp('2023-01-01 09:06:00'), 'C', 1, 69),
        (8, 'A', 13, pd.Timestamp('2023-01-01 09:07:00'), 'C', 1, 69),
        # Ticker B, match 1 (ids 9-11): 5 + 4 + 6 = 15.
        # Assumes A is accepted on the partition's first row, where
        # PREV(price) has no value.
        (9, 'B', 5, pd.Timestamp('2023-01-01 09:00:00'), 'A', 1, 15),
        (10, 'B', 4, pd.Timestamp('2023-01-01 09:01:00'), 'B', 1, 15),
        (11, 'B', 6, pd.Timestamp('2023-01-01 09:02:00'), 'C', 1, 15),
    ]
    expected_df = pd.DataFrame(expected_rows, columns=expected_cols)
    expected_df['match_num'] = expected_df['match_num'].astype(np.int64)
    # The sum is compared as a float; dtype differences are ignored below anyway.
    expected_df['total_match_price'] = expected_df['total_match_price'].astype(np.float64)

    output_df = match_recognize(query, df_basic)
    print("\n--- test_all_rows_per_match_basic ---")
    print("Output:")
    print(output_df)
    print("Expected:")
    print(expected_df)

    # Project the output onto the expected column order before comparing.
    output_df_reordered = output_df[expected_cols]
    assert_frame_equal(
        output_df_reordered.reset_index(drop=True),
        expected_df.reset_index(drop=True),
        check_dtype=False,
    )


def test_empty_match_pattern():
    """Check that the empty pattern ``PATTERN (())`` yields one match per row.

    With AFTER MATCH SKIP TO NEXT ROW an empty match is expected to start at
    every input row, so each partition should produce as many output rows
    (numbered from 1) as it has input rows.
    """
    query = """
    SELECT * FROM df_basic MATCH_RECOGNIZE (
        PARTITION BY ticker
        ORDER BY ts
        MEASURES MATCH_NUMBER() as match_num
        ONE ROW PER MATCH
        AFTER MATCH SKIP TO NEXT ROW -- Important for empty matches
        PATTERN (())
        DEFINE A AS TRUE -- Define needed, but pattern is empty
    )
    """
    # df_basic holds 8 rows for ticker A (ids 1-8) and 3 for ticker B (ids 9-11).
    rows_per_ticker = {'A': 8, 'B': 3}
    expected_df = pd.DataFrame(
        [
            {'ticker': ticker, 'match_num': match_num}
            for ticker, row_count in rows_per_ticker.items()
            for match_num in range(1, row_count + 1)
        ]
    )
    expected_df['match_num'] = expected_df['match_num'].astype(np.int64)

    output_df = match_recognize(query, df_basic)
    print("\n--- test_empty_match_pattern ---")
    print("Output:")
    print(output_df)
    print("Expected:")
    print(expected_df)

    # Only the partition key and the match counter are under test.
    output_df_sel = output_df[['ticker', 'match_num']]
    assert_frame_equal(
        output_df_sel.reset_index(drop=True),
        expected_df.reset_index(drop=True),
        check_dtype=False,
    )

def test_empty_match_optional_pattern():
    """Check ``PATTERN (A?)``, which matches either one A row or nothing.

    A is defined as ``price > 11``. Rows that satisfy it should give a
    one-row match carrying that price; all other rows should give an empty
    match whose ``first_a_price`` is null.
    """
    query = """
    SELECT * FROM df_basic MATCH_RECOGNIZE (
        PARTITION BY ticker
        ORDER BY ts
        MEASURES MATCH_NUMBER() as match_num, FIRST(A.price) as first_a_price
        ONE ROW PER MATCH
        AFTER MATCH SKIP TO NEXT ROW
        PATTERN (A?)
        DEFINE A AS price > 11
    )
    """
    compared_cols = ['ticker', 'match_num', 'first_a_price']

    # Expected first_a_price per input row, in ts order; None marks an empty match.
    #   Ticker A prices: 10, 11, 12, 11, 10, 11, 12, 13 -> only 12, 12, 13 exceed 11
    #   Ticker B prices: 5, 4, 6                        -> none exceed 11
    first_a_by_ticker = {
        'A': [None, None, 12.0, None, None, None, 12.0, 13.0],
        'B': [None, None, None],
    }
    records = []
    for symbol, first_prices in first_a_by_ticker.items():
        # Match numbers restart at 1 inside every partition.
        for number, first_price in enumerate(first_prices, start=1):
            records.append(
                {'ticker': symbol, 'match_num': number, 'first_a_price': first_price}
            )
    expected_df = pd.DataFrame(records)
    expected_df['match_num'] = expected_df['match_num'].astype(np.int64)
    # Null measures become NaN once the column is float.
    expected_df['first_a_price'] = expected_df['first_a_price'].astype(float)

    output_df = match_recognize(query, df_basic)

    print("\n--- test_empty_match_optional_pattern ---")
    print("Output:")
    print(output_df)
    print("Expected:")
    print(expected_df)

    actual = output_df[compared_cols].reset_index(drop=True)
    wanted = expected_df.reset_index(drop=True)
    assert_frame_equal(actual, wanted, check_dtype=False)


def test_unmatched_rows():
    """Check ALL ROWS PER MATCH WITH UNMATCHED ROWS for ``PATTERN (A B)``.

    Matched rows are expected back with their classifier and match number;
    rows outside any match are expected back with both measures null.
    """
    query = """
    SELECT * FROM df_basic MATCH_RECOGNIZE (
        PARTITION BY ticker
        ORDER BY ts
        MEASURES CLASSIFIER() AS var_matched, MATCH_NUMBER() as match_num
        ALL ROWS PER MATCH WITH UNMATCHED ROWS
        PATTERN (A B) -- Pattern that won't match everything
        DEFINE
            A AS price < 11,
            B AS price > PREV(price)
    )
    """
    # A match covers consecutive rows only, so "A B" needs two adjacent rows.
    # Ticker A:
    #   id 1 (10): A, id 2 (11): B      -> match 1
    #   id 3 (12), id 4 (11)            -> unmatched (neither is < 11)
    #   id 5 (10): A, id 6 (11): B      -> match 2
    #   id 7 (12), id 8 (13)            -> unmatched
    # Ticker B:
    #   id 9 (5): qualifies as A, but id 10 (4) is not > 5, so no match
    #             starts here -> unmatched
    #   id 10 (4): A, id 11 (6): B      -> match 1
    expected_cols = ['id', 'ticker', 'price', 'ts', 'var_matched', 'match_num']
    expected_rows = [
        (1, 'A', 10, pd.Timestamp('2023-01-01 09:00:00'), 'A', 1),
        (2, 'A', 11, pd.Timestamp('2023-01-01 09:01:00'), 'B', 1),
        (3, 'A', 12, pd.Timestamp('2023-01-01 09:02:00'), None, None),
        (4, 'A', 11, pd.Timestamp('2023-01-01 09:03:00'), None, None),
        (5, 'A', 10, pd.Timestamp('2023-01-01 09:04:00'), 'A', 2),
        (6, 'A', 11, pd.Timestamp('2023-01-01 09:05:00'), 'B', 2),
        (7, 'A', 12, pd.Timestamp('2023-01-01 09:06:00'), None, None),
        (8, 'A', 13, pd.Timestamp('2023-01-01 09:07:00'), None, None),
        (9, 'B', 5, pd.Timestamp('2023-01-01 09:00:00'), None, None),
        (10, 'B', 4, pd.Timestamp('2023-01-01 09:01:00'), 'A', 1),
        (11, 'B', 6, pd.Timestamp('2023-01-01 09:02:00'), 'B', 1),
    ]
    expected_df = pd.DataFrame(expected_rows, columns=expected_cols)
    # Nullable integer dtype keeps the match numbers integral despite the nulls.
    expected_df['match_num'] = expected_df['match_num'].astype('Int64')

    output_df = match_recognize(query, df_basic)
    print("\n--- test_unmatched_rows ---")
    print("Output:")
    print(output_df)
    print("Expected:")
    print(expected_df)

    output_df_reordered = output_df[expected_cols]
    # Sort both frames so the position of unmatched rows in the output does
    # not affect the comparison.
    output_df_sorted = output_df_reordered.sort_values(by=['ticker', 'ts']).reset_index(drop=True)
    expected_df_sorted = expected_df.sort_values(by=['ticker', 'ts']).reset_index(drop=True)
    assert_frame_equal(output_df_sorted, expected_df_sorted, check_dtype=False)


def test_permute():
    """Check ``PATTERN (PERMUTE(A, B) C)``: A and B in either order, then C.

    The ``first_a`` / ``first_b`` measures are requested with FINAL semantics
    so that every row of a match carries the same whole-match value. Under
    the default RUNNING semantics of ALL ROWS PER MATCH, a row that precedes
    the first A (or B) row would see a null instead.
    """
    permute_data = [
        {"id": 1, "event": "X", "val": 1, "ts": 1},  # A
        {"id": 2, "event": "Y", "val": 2, "ts": 2},  # B
        {"id": 3, "event": "Z", "val": 3, "ts": 3},  # C
        {"id": 4, "event": "Y", "val": 4, "ts": 4},  # B
        {"id": 5, "event": "X", "val": 5, "ts": 5},  # A
        {"id": 6, "event": "Z", "val": 6, "ts": 6},  # C
    ]
    df_permute = pd.DataFrame(permute_data)
    query = """
    SELECT * FROM df_permute MATCH_RECOGNIZE (
        ORDER BY ts
        MEASURES
            CLASSIFIER() AS var_matched,
            MATCH_NUMBER() as match_num,
            FINAL FIRST(A.val) as first_a,
            FINAL FIRST(B.val) as first_b
        ALL ROWS PER MATCH
        PATTERN (PERMUTE(A, B) C) -- Match A then B OR B then A, followed by C
        DEFINE
            A AS event = 'X',
            B AS event = 'Y',
            C AS event = 'Z'
    )
    """
    expected_cols = ['id', 'event', 'val', 'ts', 'var_matched', 'match_num', 'first_a', 'first_b']
    expected_rows = [
        # Match 1: ids 1(A), 2(B), 3(C) -> order A, B
        (1, 'X', 1, 1, 'A', 1, 1, 2),
        (2, 'Y', 2, 2, 'B', 1, 1, 2),
        (3, 'Z', 3, 3, 'C', 1, 1, 2),
        # Match 2: ids 4(B), 5(A), 6(C) -> order B, A
        (4, 'Y', 4, 4, 'B', 2, 5, 4),
        (5, 'X', 5, 5, 'A', 2, 5, 4),
        (6, 'Z', 6, 6, 'C', 2, 5, 4),
    ]
    expected_df = pd.DataFrame(expected_rows, columns=expected_cols)
    for int_col in ('match_num', 'first_a', 'first_b'):
        expected_df[int_col] = expected_df[int_col].astype(np.int64)

    output_df = match_recognize(query, df_permute)
    print("\n--- test_permute ---")
    print("Output:")
    print(output_df)
    print("Expected:")
    print(expected_df)

    # Project the output onto the expected column order before comparing.
    output_df_reordered = output_df[expected_cols]
    assert_frame_equal(
        output_df_reordered.reset_index(drop=True),
        expected_df.reset_index(drop=True),
        check_dtype=False,
    )

def test_classifier_subset():
    """Check CLASSIFIER() with SUBSET (union) variables as its argument.

    The pattern is written with the primary variables A, B and C. DOWN and
    UP are union variables declared in SUBSET and are used only as
    arguments to CLASSIFIER(), never inside PATTERN.
    """
    query = """
    SELECT * FROM df_basic MATCH_RECOGNIZE (
        PARTITION BY ticker
        ORDER BY ts
        MEASURES
            CLASSIFIER() AS base_var,
            CLASSIFIER(DOWN) AS is_down, -- Should return 'B' if matched to B
            CLASSIFIER(UP) AS is_up,     -- Should return 'C' if matched to C
            MATCH_NUMBER() as match_num
        ALL ROWS PER MATCH
        PATTERN (A B+ C+)
        SUBSET DOWN = (B), UP = (C)
        DEFINE
            A AS price > PREV(price, 1),
            B AS price < PREV(price),
            C AS price > PREV(price)
    )
    """
    # NOTE(review): the expected values treat CLASSIFIER(<subset>) as null on
    # every row that is not itself mapped to that subset. Under RUNNING
    # semantics an engine may instead keep reporting 'B' for is_down on the
    # later C rows -- confirm against the implementation.
    # The ticker B match also assumes A is accepted on the partition's first
    # row, where PREV(price) has no value.
    expected_cols = ['id', 'ticker', 'price', 'ts', 'base_var', 'is_down', 'is_up', 'match_num']
    expected_rows = [
        # Ticker A, match 1: ids 3(A), 4-5(B), 6-8(C)
        (3, 'A', 12, pd.Timestamp('2023-01-01 09:02:00'), 'A', None, None, 1),
        (4, 'A', 11, pd.Timestamp('2023-01-01 09:03:00'), 'B', 'B', None, 1),
        (5, 'A', 10, pd.Timestamp('2023-01-01 09:04:00'), 'B', 'B', None, 1),
        (6, 'A', 11, pd.Timestamp('2023-01-01 09:05:00'), 'C', None, 'C', 1),
        (7, 'A', 12, pd.Timestamp('2023-01-01 09:06:00'), 'C', None, 'C', 1),
        (8, 'A', 13, pd.Timestamp('2023-01-01 09:07:00'), 'C', None, 'C', 1),
        # Ticker B, match 1: ids 9(A), 10(B), 11(C)
        (9, 'B', 5, pd.Timestamp('2023-01-01 09:00:00'), 'A', None, None, 1),
        (10, 'B', 4, pd.Timestamp('2023-01-01 09:01:00'), 'B', 'B', None, 1),
        (11, 'B', 6, pd.Timestamp('2023-01-01 09:02:00'), 'C', None, 'C', 1),
    ]
    expected_df = pd.DataFrame(expected_rows, columns=expected_cols)
    expected_df['match_num'] = expected_df['match_num'].astype(np.int64)
    # Keep the nullable classifier columns as plain object columns.
    expected_df['is_down'] = expected_df['is_down'].astype(object)
    expected_df['is_up'] = expected_df['is_up'].astype(object)

    output_df = match_recognize(query, df_basic)
    print("\n--- test_classifier_subset ---")
    print("Output:")
    print(output_df)
    print("Expected:")
    print(expected_df)

    # Project the output onto the expected column order before comparing.
    output_df_reordered = output_df[expected_cols]
    assert_frame_equal(
        output_df_reordered.reset_index(drop=True),
        expected_df.reset_index(drop=True),
        check_dtype=False,
    )

def test_anchors():
    """Tests start (^) and end ($) anchors."""
    anchor_data = [
        {"id": 1, "val": 10, "ts": 1}, # A
        {"id": 2, "val": 11, "ts": 2}, # B
        {"id": 3, "val": 12, "ts": 3}, # B
        {"id": 4, "val": 13, "ts": 4}, # C
    ]
    df_anchor = pd.DataFrame(anchor_data)

    # Test 1: Start anchor only (^ A B+)
    query1 = """
    SELECT * FROM df_anchor MATCH_RECOGNIZE (
        ORDER BY ts
        MEASURES MATCH_NUMBER() as mn, CLASSIFIER() as cls
        ALL ROWS PER MATCH
        PATTERN (^ A B+)
        DEFINE
            A AS val = 10,
            B AS val > 10
    )
    """
    # Match: Rows 1(A), 2(B), 3(B)
    expected_data1 = [
        {'id': 1, 'val': 10, 'ts': 1, 'mn': 1, 'cls': 'A'},
        {'id': 2, 'val': 11, 'ts': 2, 'mn': 1, 'cls': 'B'},
        {'id': 3, 'val': 12, 'ts': 3, 'mn': 1, 'cls': 'B'},
    ]
    expected_df1 = pd.DataFrame(expected_data1)
    expected_df1['mn'] = expected_df1['mn'].astype(np.int64)
    output_df1 = match_recognize(query1, df_anchor)
    print("\n--- test_anchors (Start) ---")
    print("Output:")
    print(output_df1)
    print("Expected:")
    print(expected_df1)
    assert_frame_equal(output_df1[['id', '


SyntaxError: incomplete input (1399769319.py, line 368)