In [1]:
def build_bad_char_table(pattern):
    """
    Map every character of the pattern to the index of its rightmost occurrence.

    Args:
        pattern (str): The pattern string.

    Returns:
        dict: One entry per distinct character in the pattern; the value is the
            largest index at which that character appears.
    """
    # Later occurrences overwrite earlier ones, so each key ends up holding
    # the rightmost index of its character.
    return {symbol: position for position, symbol in enumerate(pattern)}


In [2]:
def build_good_suffix_table(pattern):
    """
    Builds the good suffix shift table for a given pattern.

    Entry ``j`` is the number of positions the pattern can be moved to the
    right when, comparing right to left, ``pattern[j + 1:]`` matched the text
    and the comparison failed at ``pattern[j]``. Entry ``0`` equals the
    smallest period of the pattern, so it is also the shift to apply after a
    complete match. Every entry is at least 1.

    Args:
        pattern (str): The pattern to build the good suffix table for.

    Returns:
        list: The good suffix table, one shift per pattern index (empty for an
            empty pattern).

    """

    length = len(pattern)
    good_suffix_table = [0] * length
    last_prefix_position = length

    # Case "a prefix of the pattern lines up with the end of the matched
    # suffix". Scanning right to left keeps last_prefix_position at the
    # smallest p >= i for which pattern[p:] is a prefix of the pattern; moving
    # the pattern by p aligns that prefix with the text already matched.
    for i in range(length, 0, -1):
        if is_prefix(pattern, i):
            last_prefix_position = i
        # A mismatch at index i - 1 means pattern[i:] matched.
        good_suffix_table[i - 1] = last_prefix_position

    # Case "the matched suffix occurs again inside the pattern, preceded by a
    # different character". The substring ending at i repeats exactly
    # len_suffix characters of the pattern's tail, so after a mismatch at
    # index length - 1 - len_suffix the pattern can move by length - 1 - i.
    # Increasing i means later writes store smaller (safer) shifts.
    for i in range(length - 1):
        len_suffix = suffix_length(pattern, i)
        good_suffix_table[length - 1 - len_suffix] = length - 1 - i

    return good_suffix_table

def is_prefix(pattern, p):
    """
    Check if the substring from position p to the end of the pattern is a prefix of the pattern.

    Args:
        pattern (str): The pattern to check.
        p (int): The starting position of the substring.

    Returns:
        bool: True if the substring is a prefix of the pattern (always True
            when p is at or past the end), False otherwise.
    """

    length = len(pattern)
    # Compare pattern[p:], character by character, against the start of the pattern.
    return all(pattern[k] == pattern[k - p] for k in range(p, length))

def suffix_length(pattern, p):
    """
    Calculate how many characters, read backwards from position p, agree with
    the pattern read backwards from its last character.

    Args:
        pattern (str): The pattern string.
        p (int): The position at which the compared substring ends.

    Returns:
        int: The length of the longest substring ending at p that is also a
            suffix of the pattern.
    """

    length = len(pattern)
    matched = 0
    while p - matched >= 0 and pattern[p - matched] == pattern[length - 1 - matched]:
        matched += 1
    return matched


In [3]:
def boyer_moore(text, pattern):
    """
    Searches for all occurrences of a pattern in a given text using the Boyer-Moore algorithm.

    The pattern is compared against the text right to left at each alignment.
    On a mismatch at pattern index ``j`` the alignment advances by the largest
    of the bad character shift, ``good_suffix[j]`` and 1.

    Args:
        text (str): The text to search in.
        pattern (str): The pattern to search for.

    Returns:
        list: A list of positions where the pattern occurs in the text, in
            ascending order. An empty pattern matches at every offset from 0
            to len(text) inclusive.
    """

    m = len(pattern)
    n = len(text)
    if m == 0:
        # There is no good_suffix[0] to consult for an empty pattern; it
        # trivially matches at every offset, including the end of the text.
        return list(range(n + 1))

    bad_char = build_bad_char_table(pattern)
    good_suffix = build_good_suffix_table(pattern)
    positions = []

    s = 0  # index in text where the current alignment of the pattern starts
    while s <= n - m:
        j = m - 1
        while j >= 0 and pattern[j] == text[s + j]:
            j -= 1
        if j < 0:
            positions.append(s)
            # Always move forward by at least one so the scan cannot stall
            # on a zero table entry.
            s += max(1, good_suffix[0])
        else:
            # -1 for characters absent from the pattern moves it fully past them.
            bad_char_shift = bad_char.get(text[s + j], -1)
            s += max(1, j - bad_char_shift, good_suffix[j])

    return positions


In [4]:
# Demo: report every index at which the sample pattern occurs in the sample text.
text, pattern = "THIS IS A TEST TEXT", "TEST"
found_positions = boyer_moore(text=text, pattern=pattern)
print("Pattern found at positions:", found_positions)


Pattern found at positions: [10]
