In [1]:
import msgspec
import numpy as np

from matcher_py import Matcher, SimpleMatcher # type: ignore
from extension_types import MatchTableType, SimpleMatchType, MatchTable

In [2]:
# Shared MessagePack encoder; the cells below reuse it to serialize the
# configuration passed to Matcher and SimpleMatcher.
msgpack_encoder = msgspec.msgpack.Encoder()

In [3]:
# One "Simple" match table registered under the key "test". The same flag
# combination is applied to the word list and to the (empty) exemption list.
table_match_type = SimpleMatchType.MatchFanjian | SimpleMatchType.MatchDeleteNormalize
test_table = MatchTable(
    table_id=1,
    match_table_type=MatchTableType.Simple,
    simple_match_type=table_match_type,
    word_list=["蔔", "你好"],
    exemption_simple_match_type=table_match_type,
    exemption_word_list=[],
)

# Matcher is constructed from the msgpack-encoded {key: [tables]} mapping.
matcher = Matcher(msgpack_encoder.encode({"test": [test_table]}))

In [4]:
# "卜" is not in the word list verbatim ("蔔" is); the recorded result is True,
# presumably through the MatchFanjian normalization — confirm in the matcher_py docs.
matcher.is_match("卜")

True

In [5]:
# The query has a full-width comma between the two characters of "你好"; the
# recorded output still reports that word — presumably MatchDeleteNormalize
# removes such characters before matching (confirm in the matcher_py docs).
matcher.word_match("你，好")

{'test': '[{"table_id":1,"word":"你好"}]'}

In [6]:
# Same lookup, but the recorded output is a single JSON string rather than a
# dict (note the nested, escaped JSON for the per-key match list).
matcher.word_match_as_string("你好")

'{"test":"[{\\"table_id\\":1,\\"word\\":\\"你好\\"}]"}'

In [7]:
# Batch variant: the recorded output holds one JSON string per input text,
# with '{}' for the text that matches nothing.
batch_texts = ["你好", "你好", "你真棒"]
matcher.batch_word_match_as_string(batch_texts)

['{"test":"[{\\"table_id\\":1,\\"word\\":\\"你好\\"}]"}',
 '{"test":"[{\\"table_id\\":1,\\"word\\":\\"你好\\"}]"}',
 '{}']

In [8]:
# 10,000 copies of one text in an object-dtype array; the call returns a new
# array of JSON strings (all '{}' in the recorded output).
sample_text = "房东巴萨风景嘎哈快睡吧ndsac"
text_array = np.array([sample_text] * 10000, dtype=object)
matcher.numpy_word_match_as_string(text_array)

array(['{}', '{}', '{}', ..., '{}', '{}', '{}'], dtype=object)

In [9]:
# Rebuild the input array first: with inplace=True the recorded output shows
# text_array itself holding the JSON results after the call.
sample_text = "房东巴萨风景嘎哈快睡吧ndsac"
text_array = np.array([sample_text] * 10000, dtype=object)
matcher.numpy_word_match_as_string(text_array, inplace=True)
text_array

array(['{}', '{}', '{}', ..., '{}', '{}', '{}'], dtype=object)

In [10]:
# Word tables keyed by match type; each inner dict maps a word id to its word.
# Entry order is kept exactly as authored, since it determines the encoded payload.
combined_match_type = SimpleMatchType.MatchFanjian | SimpleMatchType.MatchDeleteNormalize
simple_word_tables = {
    combined_match_type: {
        1: "无,法,无,天",
        2: "xxx",
        3: "你好",
        6: r"It's /\/\y duty",
        4: "xxx,yyy",
    },
    SimpleMatchType.MatchFanjian: {
        4: "xxx,yyy",
    },
    SimpleMatchType.MatchNone: {
        5: "xxxxx,xxxxyyyyxxxxx",
    },
}

# SimpleMatcher is constructed from the msgpack-encoded mapping.
simple_matcher = SimpleMatcher(msgpack_encoder.encode(simple_word_tables))

In [11]:
# "xxx" is word id 2 in the first table; the recorded result is True.
simple_matcher.is_match("xxx")

True

In [12]:
# Raw string keeps the backslashes literal. The recorded output is a list of
# {'word_id', 'word'} dicts, here the single entry for word id 6.
simple_matcher.simple_process(r"It's /\/\y duty")

[{'word_id': 6, 'word': "It's /\\/\\y duty"}]

In [13]:
# Batch variant: the recorded output holds one result list per input text.
batch_texts = [r"It's /\/\y duty", "你好", "xxxxxxx"]
simple_matcher.batch_simple_process(batch_texts)

[[{'word_id': 6, 'word': "It's /\\/\\y duty"}],
 [{'word_id': 3, 'word': '你好'}],
 [{'word_id': 2, 'word': 'xxx'}]]

In [14]:
# 10,000 copies of a filler paragraph in an object-dtype array; the call
# returns a new array of result lists (all empty in the recorded output).
lorem_text = "Laborum eiusmod anim aliqua non veniam laboris officia dolor. Adipisicing sit est irure Lorem duis adipisicing exercitation. Cillum excepteur non anim ipsum eiusmod deserunt veniam. Nulla veniam sunt sint ad velit occaecat in deserunt nulla nisi excepteur. Cillum veniam Lorem aute eu. Nisi voluptate laboris quis sint pariatur ullamco minim pariatur officia non anim nisi nulla ipsum ad. Veniam pariatur ut occaecat ut veniam velit aliquip commodo culpa elit eu eiusmod."
text_array = np.array([lorem_text] * 10000, dtype=object)
simple_matcher.numpy_simple_process(text_array)

array([list([]), list([]), list([]), ..., list([]), list([]), list([])],
      dtype=object)

In [15]:
# Rebuild the input array first: with inplace=True the recorded output shows
# text_array itself holding the result lists after the call.
lorem_text = "Laborum eiusmod anim aliqua non veniam laboris officia dolor. Adipisicing sit est irure Lorem duis adipisicing exercitation. Cillum excepteur non anim ipsum eiusmod deserunt veniam. Nulla veniam sunt sint ad velit occaecat in deserunt nulla nisi excepteur. Cillum veniam Lorem aute eu. Nisi voluptate laboris quis sint pariatur ullamco minim pariatur officia non anim nisi nulla ipsum ad. Veniam pariatur ut occaecat ut veniam velit aliquip commodo culpa elit eu eiusmod."
text_array = np.array([lorem_text] * 10000, dtype=object)
simple_matcher.numpy_simple_process(text_array, inplace=True)
text_array

array([list([]), list([]), list([]), ..., list([]), list([]), list([])],
      dtype=object)