# 2D pattern matching

Zadanie dotyczy wyszukiwania wzorców dwuwymiarowych.

1. Zaimplementuj algorytm wyszukiwania wzorca 2-wymiarowego 
2. Znajdź w załączonym pliku "haystack.txt" wszyskie sytuacje, gdy taka sama litera występuje na tej samej pozycji w dwóch kolejnych linijkach. Zwróć uwagę, na nierówną długość linii w pliku. 
3. Znajdź wszystkie wystąpienia "th" oraz "t h" w dwóch kolejnych liniach na tej samej pozycji. 
4. Wybierz przynajmniej 4 litery (małe). Znajdź wszystkie wystąpienia tej litery w załączonym pliku "haystack.png" 
5. Znajdź wszystkie wystąpienia słowa "p a t t e r n" w haystack.png. 
6. Porównaj czas budowania automatu i czas wyszukiwania dla różnych rozmiarów wzorca 
7. Podziel plik na 2, 4 i 8 fragmentów (w poziomie) i porównaj czas przeszukiwania 

Załączone pliki to fragmenty książki "Jewels of Stringology".

In [15]:
from collections import deque
from collections import defaultdict
from PIL import Image

In [16]:
class Node:
    def __init__(self, letter=None, parent=None, terminal=False):
        self.letter = letter
        self.parent = parent
        self.terminal = terminal
        self.children = {}
        self.suffix_link = None
        self.dict_link = None
        self.word = self._build_word()

    def _build_word(self):
        if self.parent is None:
            return ''
        else:
            return self.parent.word + self.letter

    def __repr__(self):
        return self.word

### Ex 1

In [164]:
class AhoCorasick:
    def __init__(self, patterns):
        self.root = Node()
        self.pattern_idx = self.build_pattern_idx(patterns)
        self.build_trie(patterns)
        self.build_suffix_links()
        self.build_dict_links()

    def build_pattern_idx(self, patterns):
        pattern_idx = defaultdict(list)
        for idx, pattern in enumerate(patterns):
            pattern_idx[pattern] = idx
        return pattern_idx

    def build_trie(self, patterns):
        for pattern in patterns:
            node = self.root
            for idx, letter in enumerate(pattern):
                terminal = (idx == len(pattern) - 1)
                if letter in node.children:
                    node = node.children[letter]
                    if terminal:
                        node.terminal = True
                else:
                    new_node = Node(letter, parent=node, terminal=terminal)
                    node.children[letter] = new_node
                    node = new_node

    def build_suffix_links(self):
        queue = deque()
        queue.append(self.root)
        while queue:
            node = queue.popleft()
            for letter, child in node.children.items():
                queue.append(child)
                if node == self.root:
                    child.suffix_link = self.root
                else:
                    suffix = node.suffix_link
                    while suffix is not None and letter not in suffix.children:
                        suffix = suffix.suffix_link
                    if suffix is None:
                        child.suffix_link = self.root
                    else:
                        child.suffix_link = suffix.children[letter]

    def build_dict_links(self):
        queue = deque()
        queue.append(self.root)
        while queue:
            node = queue.popleft()
            for letter, child in node.children.items():
                queue.append(child)
                suffix = child.suffix_link
                while suffix is not None and not suffix.terminal:
                    suffix = suffix.suffix_link
                if suffix is None:
                    child.dict_link = self.root
                else:
                    child.dict_link = suffix

    def search(self, text):
        node = self.root
        result = []
        result_idx = ['#' for _ in range(len(text))]
        for idx, letter in enumerate(text):
            while node is not None and letter not in node.children:
                node = node.suffix_link
            if node is None:
                node = self.root
            else:
                node = node.children[letter]
            if node is None:
                node = self.root
            else:
                temp = node
                while temp is not None:
                    if temp.terminal:
                        result.append((idx - len(temp.word) + 1, temp.word))
                        result_idx[idx - len(temp.word) + 1] = self.pattern_idx[temp.word]
                    temp = temp.dict_link
        result_idx = ''.join([str(idx) for idx in result_idx])
        return (result, result_idx)

In [100]:
def align_text(text, splited=True):
    if not splited:
        lines = text.split('\n')
    else:
        lines = text
    max_length = max(len(line) for line in lines)
    aligned_lines = [line.ljust(max_length) for line in lines]
    return aligned_lines

def split_columns(matrix):
    result = []
    splited = [list(column) for column in zip(*matrix)]
    for column in splited:
        result.append(''.join(column))
    return result
        

In [165]:
def pattern_matching_2d(text, pattern):
    # przygotowanie patternu
    pattern = split_columns(align_text(pattern))
    ac_vertical = AhoCorasick(pattern)
    # pattern do znalezienia poziomo
    horizontal_pattern = ""
    for column in pattern:
        horizontal_pattern += str(ac_vertical.pattern_idx[column])
    
    # # przygotowanie tekstu
    text = split_columns(align_text(text, splited=False))
    result = []
    for line in text:
        _, res = ac_vertical.search(line)
        result.append(res)
    print(result)
    print()
    
    # w tym szukamy poziomo
    transposed = [''.join(element) for element in zip(*result)]
    transposed = '\n'.join(transposed)
    # print(transposed)
    print(horizontal_pattern)
    
    ac_horizontal = AhoCorasick([horizontal_pattern])
    result = []
    for idx, line in enumerate(transposed.split('\n')):
        res, _ = ac_horizontal.search(line)
        for found in res:
            result.append((idx, found[0]))
    return result


### Ex 2

In [102]:
with open('haystack.txt', 'r') as f:
    text = f.read()

In [103]:
alphabet = 'abcdefghijklmnopqrstuvwxyz'

for letter in alphabet:
    pattern = [letter, letter]
    result = pattern_matching_2d(text, pattern)
    print("PATTERN: ", pattern)
    print("NUMBER: ", len(result))
    print(result)
    print("--------------------------------")


PATTERN:  ['a', 'a']
NUMBER:  28
[(0, 82), (3, 30), (5, 60), (6, 63), (20, 6), (28, 69), (31, 50), (31, 73), (33, 66), (37, 4), (52, 12), (53, 12), (53, 48), (56, 11), (57, 36), (58, 36), (59, 24), (64, 2), (64, 14), (64, 22), (65, 35), (69, 35), (76, 21), (76, 74), (77, 42), (77, 61), (78, 59), (79, 37)]
--------------------------------
PATTERN:  ['b', 'b']
NUMBER:  0
[]
--------------------------------
PATTERN:  ['c', 'c']
NUMBER:  6
[(3, 54), (10, 45), (13, 10), (41, 0), (68, 0), (82, 41)]
--------------------------------
PATTERN:  ['d', 'd']
NUMBER:  1
[(37, 19)]
--------------------------------
PATTERN:  ['e', 'e']
NUMBER:  48
[(0, 63), (1, 8), (4, 77), (7, 65), (10, 1), (10, 64), (14, 2), (15, 43), (17, 6), (18, 27), (20, 10), (21, 61), (22, 53), (24, 3), (24, 65), (28, 67), (28, 73), (29, 38), (29, 43), (37, 48), (40, 11), (40, 26), (41, 57), (42, 36), (42, 48), (46, 52), (47, 50), (51, 31), (57, 54), (58, 50), (58, 54), (59, 73), (63, 66), (65, 69), (66, 72), (67, 17), (68, 46)

### Ex 3

In [104]:
patterns = [["th", "th"], ["t h", "t h"]]

for pattern in patterns:
    result = pattern_matching_2d(text, pattern)
    print("PATTERN: ", pattern)
    print("NUMBER: ", len(result))
    print(result)
    print("--------------------------------")

PATTERN:  ['th', 'th']
NUMBER:  0
[]
--------------------------------
PATTERN:  ['t h', 't h']
NUMBER:  1
[(37, 0)]
--------------------------------


### Ex 4

In [136]:
def image_to_matrix(file_name):
    '''
    nie jestem dumny z tej funkcji :c
    '''
    image = Image.open(file_name)
    pixels = list(image.getdata())
    width, height = image.size
    text = []
    i = width
    for pixel in pixels:
        if i == width:
            i = 0
            text.append([])

        # wyrównanie do 3 znaków
        if pixel[0] < 10:
            pixel_val = '00' + str(pixel[0])
        elif pixel[0] < 100:
            pixel_val = '0' + str(pixel[0])
        else:
            pixel_val = str(pixel[0])

        text[-1].append(pixel_val)
        i += 1
    result = []
    for line in text:
        line = ''.join(line)
        result.append(line)
    return result

In [167]:
a_img = image_to_matrix('img/a.png')
# for line in a_img:
#     print(line)
with open('test.txt', 'r') as f:
    text = f.read()

result = pattern_matching_2d(text, a_img)
print(result)

e_img = image_to_matrix('img/e.png')
p_img = image_to_matrix('img/p.png')
t_img = image_to_matrix('img/t.png')
haystack_img = image_to_matrix('haystack.png')

haystack_img = '\n'.join(haystack_img)

['#############', '#30###########', '#32###########', '#32###########', '#3###########', '#4###########', '#5###########', '#6###########', '#7###########', '#8###########', '#9###########', '#10###########', '#11###########', '#15###########', '#13###########', '#14###########', '#15###########', '#16###########', '#17###########', '#18###########', '#19###########', '#20###########', '#21###########', '#22###########', '#23###########', '#24###########', '#25###########', '#26###########', '#27###########', '#28###########', '#29###########', '#30###########', '#32###########', '#32###########', '#############']

30323234567891011151314151617181920212223242526272829303232
[]


In [146]:
print("A")
result = pattern_matching_2d(haystack_img, a_img)
print("NUMBER: ", len(result))
print(result)
print("--------------------------------")

# print("E")
# result = pattern_matching_2d(haystack_img, e_img)
# print("NUMBER: ", len(result))  
# print(result)
# print("--------------------------------")

# print("P")
# result = pattern_matching_2d(haystack_img, p_img)
# print("NUMBER: ", len(result))
# print(result)
# print("--------------------------------")

# print("T")
# result = pattern_matching_2d(haystack_img, t_img)
# print("NUMBER: ", len(result))
# print(result)
# print("--------------------------------")

A
NUMBER:  0
[]
--------------------------------
