In [95]:
def bad_character_heuristic(pattern, text):
    """
    Return the Boyer-Moore bad-character shift for one alignment.

    The pattern and the text window are compared from right to left. At the
    first mismatch, the offending text character decides how far the pattern
    may be moved.

    Args:
        pattern (str): The pattern string.
        text (str): The text window aligned with the pattern; it is indexed
            at every pattern position, so it must be at least as long as
            the pattern.

    Returns:
        int: The number of positions to shift the pattern to the right, or
        0 when the window matches the pattern completely.
    """
    # Rightmost index of every pattern character (later indices overwrite
    # earlier ones).
    last_seen = {}
    for index, char in enumerate(pattern):
        last_seen[char] = index

    # Scan right to left and stop at the first mismatch.
    for pos in reversed(range(len(pattern))):
        window_char = text[pos]
        if pattern[pos] == window_char:
            continue
        if window_char not in last_seen:
            # The mismatched character never occurs in the pattern:
            # move the whole pattern just past it.
            return pos + 1
        # Line the mismatched character up with its rightmost occurrence
        # in the pattern, but always advance by at least one position.
        return max(1, pos - last_seen[window_char])

    # Every position matched.
    return 0

In [96]:
def good_suffix_shift(pattern, enable_print=False):
    """
    Build the good-suffix shift table used by the Boyer-Moore algorithm.

    Entry ``j`` of the returned table is the number of positions the pattern
    is shifted to the right when a right-to-left comparison first mismatches
    at pattern index ``j``.

    Args:
        pattern (str): The pattern string; must not be empty.
        enable_print (bool): When True, trace every step of the computation
            to stdout. When False, nothing is printed.

    Returns:
        list[int]: The shift table, one entry per pattern character.

    Raises:
        ValueError: If ``pattern`` is empty.
    """
    pattern_length = len(pattern)
    if pattern_length == 0:
        raise ValueError("pattern must not be empty")
    last = pattern_length - 1

    # suffix[i] is the length of the longest common suffix of
    # pattern[:i + 1] and the whole pattern.
    suffix = [0] * pattern_length
    # Default shift: move by the whole pattern length when no part of the
    # good suffix reoccurs.
    goodSuffix = [pattern_length] * pattern_length

    if enable_print:
        print(f"Initial suffix array: {suffix}")
        print(f"Initial goodSuffix array: {goodSuffix}")

    # The whole pattern is a suffix of itself.
    suffix[last] = pattern_length

    if enable_print:
        print(f"Suffix array after initialization: {suffix}")
        print('\n')

    # f is the right end and g the (exclusive) left end of the most recently
    # matched window pattern[g + 1 .. f], which equals a suffix of the
    # pattern. Inside that window, earlier results can be reused.
    f = last
    g = last
    for i in range(pattern_length - 2, -1, -1):
        mirror = i + last - f
        if i > g and suffix[mirror] < i - g:
            # i lies strictly inside the known window and the mirrored
            # value does not reach the window's left edge: copy it.
            suffix[i] = suffix[mirror]
            if enable_print:
                print(f"i = {i}: suffix[{i}] can be reused from suffix[{mirror}] = {suffix[i]}")
        else:
            # Start (or extend) an explicit comparison ending at i.
            if i < g:
                g = i
            f = i
            while g >= 0 and pattern[g] == pattern[g + last - f]:
                if enable_print:
                    print(f"i = {i}: suffix[{i}] is comparing pattern[{g}] ('{pattern[g]}') with pattern[{g + last - f}] ('{pattern[g + last - f]}')")
                g -= 1
            suffix[i] = f - g
            if enable_print:
                if g >= 0:
                    print(f"i = {i}: suffix[{i}] is calculated as {suffix[i]}, pattern[{g}] ('{pattern[g]}') != pattern[{g + last - f}] ('{pattern[g + last - f]}')")
                else:
                    # The comparison ran off the left end of the pattern.
                    print(f'It is last character of pattern. Thus suffix is equal {i+1}')
        if enable_print:
            print(f"Suffix array at step {pattern_length - 1 - i}: {suffix} \n")

    # Case 1: a prefix of the pattern equals a suffix of the pattern
    # (suffix[i] == i + 1). Every mismatch position left of that border may
    # shift so the prefix lines up with the suffix. Scanning i from right to
    # left visits longer borders first, so smaller shifts are kept.
    j = 0
    for i in range(last, -1, -1):
        if suffix[i] == i + 1:
            while j < last - i:
                if goodSuffix[j] == pattern_length:  # only fill unset entries
                    goodSuffix[j] = last - i
                    if enable_print:
                        print(f"goodSuffix[{j}] is set to {goodSuffix[j]} because suffix[{i}] = {suffix[i]} indicates a full suffix match")
                j += 1
        if enable_print:
            print(f"GoodSuffix array at step {pattern_length - 1 - i}: {goodSuffix} \n")

    # Case 2: the good suffix reoccurs inside the pattern, ending at index i.
    # Increasing i means later writes carry smaller shifts and win.
    for i in range(last):
        goodSuffix[last - suffix[i]] = last - i
        if enable_print:
            print(f"GoodSuffix array after final adjustment at step {i + 1}: {goodSuffix}")

    return goodSuffix

# Most recently computed good-suffix table; rebound on every call to
# good_suffix_heuristic.
goodSuffix = []

def good_suffix_heuristic(pattern,text,enable_print=False):
    """
    Return the Boyer-Moore good-suffix shift for one alignment.

    The shift table for ``pattern`` is rebuilt on each call and stored in the
    module-level ``goodSuffix``. The pattern is then compared with the text
    window from right to left.

    Args:
        pattern (str): The pattern string.
        text (str): The text window aligned with the pattern; it is indexed
            at pattern positions, so it must be at least as long as the
            pattern.
        enable_print (bool): Forwarded to ``good_suffix_shift`` to control
            its tracing output.

    Returns:
        int: The table entry for the first mismatching index (scanning from
        the right), or 0 when the window matches the pattern completely.
    """
    global goodSuffix
    goodSuffix = good_suffix_shift(pattern, enable_print)

    # Walk from the last pattern index down to 0; the first mismatch
    # selects the shift.
    for pos in range(len(pattern) - 1, -1, -1):
        if pattern[pos] != text[pos]:
            return goodSuffix[pos]

    # No mismatch anywhere: the window equals the pattern.
    return 0

In [97]:
def boyer_moore(pattern, text, enable_print=False, enable_suffix_print=False):
    """
    Find occurrences of ``pattern`` in ``text`` with the Boyer-Moore algorithm.

    At each alignment, the bad-character and good-suffix heuristics are
    evaluated on the current text window, and the pattern advances by the
    larger of the two shifts. After a match, the pattern advances by its full
    length, so the reported matches never overlap.

    Args:
        pattern (str): The pattern to search for.
        text (str): The text to search in.
        enable_print (bool): When True, print the text, the current
            alignment and the chosen shift at every step.
        enable_suffix_print (bool): Forwarded to ``good_suffix_heuristic``
            to control tracing of the good-suffix table construction.

    Returns:
        list[int]: Start indices of the non-overlapping matches, in
        increasing order. Empty when ``pattern`` is empty or longer than
        ``text``.
    """
    pattern_length = len(pattern)
    text_length = len(text)

    # An empty pattern has nothing to compare and would advance the window by
    # zero positions after each "match"; a pattern longer than the text
    # cannot occur in it.
    if pattern_length == 0 or text_length < pattern_length:
        return []

    matches = []  # start indices of all matches found so far
    i = 0  # current alignment of the pattern within the text

    while i <= text_length - pattern_length:
        # Slice the window once and share it between both heuristics.
        window = text[i:i + pattern_length]

        # Shift suggested by the bad-character heuristic (0 on a full match).
        shift_bad_char = bad_character_heuristic(pattern, window)

        # Shift suggested by the good-suffix heuristic (0 on a full match).
        shift_good_suffix = good_suffix_heuristic(pattern, window, enable_suffix_print)

        # Show the current alignment.
        if enable_print:
            print(text)
            print(' ' * i + pattern)

        if shift_bad_char == 0 and shift_good_suffix == 0:
            # Both heuristics report a full match at this alignment.
            matches.append(i)
            if enable_print:
                print(f"Pattern found at index {i}.")
            i += pattern_length  # skip past the matched occurrence
        else:
            # Advance by the larger of the two suggested shifts.
            shift = max(shift_bad_char, shift_good_suffix)
            if enable_print:
                print(f"Pattern not found at index {i}. Bad char shift: {shift_bad_char}, Good suffix shift: {shift_good_suffix}, Shifting by: {shift}")
            i += shift
            if enable_print:
                print('---' * (pattern_length + 1))  # separator between steps

    return matches

In [98]:
# Demo: search for "AXA" in a sample text with alignment tracing enabled.
# The comparison is case-sensitive, so the lowercase 'x' at index 7 does not
# match the pattern's 'X'.
text = "AXAAXAAxAAAXAXXAAA"
pattern = "AXA"

result = boyer_moore(pattern, text, enable_print=True)
print(f"Pattern found at indices: {result}")



AXAAXAAxAAAXAXXAAA
AXA
Pattern found at index 0.


AXAAXAAxAAAXAXXAAA
   AXA
Pattern found at index 3.


AXAAXAAxAAAXAXXAAA
      AXA
Pattern not found at index 6. Bad char shift: 2, Good suffix shift: 2, Shifting by: 2
------------


AXAAXAAxAAAXAXXAAA
        AXA
Pattern not found at index 8. Bad char shift: 1, Good suffix shift: 2, Shifting by: 2
------------


AXAAXAAxAAAXAXXAAA
          AXA
Pattern found at index 10.


AXAAXAAxAAAXAXXAAA
             AXA
Pattern not found at index 13. Bad char shift: 1, Good suffix shift: 2, Shifting by: 2
------------


AXAAXAAxAAAXAXXAAA
               AXA
Pattern not found at index 15. Bad char shift: 1, Good suffix shift: 2, Shifting by: 2
------------
Pattern found at indices: [0, 3, 10]
