## Algorytmy tekstowe

## Lab 6 - Wyszukiwanie wzorca 2d

### Jakub Janicki

In [20]:
import time
from collections import defaultdict
from PIL import Image
import imageio
import numpy as np

In [21]:
class Node:
    def __init__(self):
        super().__init__()
        self.state = 0
        self.fail = None
        self.transition = dict()
        self.output = set()


class AhoCorasick:
    def __init__(self, pattern):
        self.nodes = []
        self.make_output_transition(pattern)
        self.make_fail()

    def make_output_transition(self, patterns):
        self.nodes.append(Node())

        for pattern in patterns:
            current = 0
            for i, letter in enumerate(pattern):
                if letter in self.nodes[current].transition:
                    current = self.nodes[current].transition[pattern[i]]
                else:
                    self.nodes.append(Node())
                    self.nodes[current].transition[letter] = len(self.nodes) - 1
                    current = len(self.nodes) - 1
                if i == len(pattern) - 1:
                    self.nodes[current].output.add(tuple(pattern))

    def make_fail(self):
        q = []
        self.nodes[0].fail = 0

        for key, val in self.nodes[0].transition.items():
            if val != -1:
                q.append(val)
                self.nodes[val].fail = 0

        while q:
            curr = q.pop(0)
            for key, val in self.nodes[curr].transition.items():
                q.append(val)
                fail = self.nodes[curr].fail
                while key not in self.nodes[fail].transition and fail != 0:
                    fail = self.nodes[fail].fail
                if key in self.nodes[fail].transition:
                    self.nodes[val].fail = self.nodes[fail].transition[key]
                else:
                    self.nodes[val].fail = fail
                self.nodes[val].output.update(self.nodes[self.nodes[val].fail].output)

In [22]:
def prepare_text(text):
    max_len = max([len(l) for l in text])
    matrix = [[None for _ in range(max_len)] for _ in range(len(text))]
    for i, line in enumerate(text):
        for j, letter in enumerate(line):
            matrix[i][j] = letter
    return matrix

def prepare_pattern(pattern):
    pattern = prepare_text(pattern)
    pattern_columns = [[None for _ in range(len(pattern))] for _ in range(len(pattern[0]))]
    for i, line in enumerate(pattern):
        for j, letter in enumerate(line):
            pattern_columns[j][i] = letter
    return pattern_columns

In [23]:
def search_2d(text, pattern, return_time = False):
    text = prepare_text(text)
    pattern = prepare_pattern(pattern)
    n = len(text)
    m = len(text[0])
    start_t = time.time()

    automata = AhoCorasick(pattern)

    prep_t = time.time() - start_t
    start_t = time.time()
    state_matrix = [[0 for _ in range(m)] for _ in range(n)]
    for j in range(m):
        for i in range(n):
            if i == 0:
                prev =0
            else:
                prev = state_matrix[i-1][j]
            if text[i][j] in automata.nodes[prev].transition:
                state_matrix[i][j] = automata.nodes[state_matrix[i - 1][j]].transition[text[i][j]]
            else:
                while prev != 0 and text[i][j] not in automata.nodes[prev].transition:
                    prev = automata.nodes[prev].fail
                if text[i][j] in automata.nodes[prev].transition:
                    state_matrix[i][j] = automata.nodes[prev].transition[text[i][j]]
                else:
                    state_matrix[i][j] = prev


    seq = [0 for _ in range(len(pattern))]
    for i in range(len(pattern)):
        for l in pattern[i]:
            seq[i] = automata.nodes[seq[i]].transition[l]

    result = []
    for i in range(n):
        for j in range(m - len(pattern)+1):
            if state_matrix[i][j:j + len(pattern)] == seq:
                result.append((i - len(pattern) + 1, j))
    find_t = time.time() - start_t

    if return_time: return result, (prep_t, find_t)
    return result


In [24]:
text = ["ccdas",
        "ccccd",
        "cdccc",
        "dacdc"]

pattern = ["ccd", "ccc", "cdc"]

search_2d(text, pattern)

[(0, 0), (1, 2)]

In [25]:
def image_to_matrix(image):
    pixel_arr = image.load()
    pixels = []
    for i in range(image.height):
        row = []
        for j in range(image.width):
            row.append(pixel_arr[j, i][0])
        pixels.append(row)
    return pixels

In [29]:
text_path = 'assets/haystack.txt'
text_img_path = 'assets/haystack.png'
letters_paths = ['letters/a.png', 'letters/s.png', 'letters/m.png', 'letters/o.png']
pattern_path = 'assets/pattern.png'

with open(text_path, 'r') as file:
    lines = file.readlines()

## Zadanie 2

In [30]:
for char in [chr(i) for i in range(ord('a'), ord('z'))]:
    print(f"Letter: {char}-{char} appeared {len(search_2d(lines, [char, char]))} times \n {search_2d(lines, [char, char])}")

Letter: a-a appeared 28 times 
 [(1, 82), (4, 30), (6, 60), (7, 63), (21, 6), (29, 69), (32, 50), (32, 73), (34, 66), (38, 4), (53, 12), (54, 12), (54, 48), (57, 11), (58, 36), (59, 36), (60, 24), (65, 2), (65, 14), (65, 22), (66, 35), (70, 35), (77, 21), (77, 74), (78, 42), (78, 61), (79, 59), (80, 37)]
Letter: b-b appeared 0 times 
 []
Letter: c-c appeared 6 times 
 [(4, 54), (11, 45), (14, 10), (42, 0), (69, 0), (83, 41)]
Letter: d-d appeared 1 times 
 [(38, 19)]
Letter: e-e appeared 48 times 
 [(1, 63), (2, 8), (5, 77), (8, 65), (11, 1), (11, 64), (15, 2), (16, 43), (18, 6), (19, 27), (21, 10), (22, 61), (23, 53), (25, 3), (25, 65), (29, 67), (29, 73), (30, 38), (30, 43), (38, 48), (41, 11), (41, 26), (42, 57), (43, 36), (43, 48), (47, 52), (48, 50), (52, 31), (58, 54), (59, 50), (59, 54), (60, 73), (64, 66), (66, 69), (67, 72), (68, 17), (69, 46), (70, 15), (71, 49), (72, 38), (73, 23), (74, 27), (77, 6), (78, 6), (79, 65), (81, 6), (82, 14), (83, 47)]
Letter: f-f appeared 2 times

In [31]:
for char in [chr(i) for i in range(ord('A'), ord('Z'))]:
    print(f"Letter: {char}-{char} appeared {len(search_2d(lines, [char, char]))} times \n {search_2d(lines, [char, char])}")

Letter: A-A appeared 0 times 
 []
Letter: B-B appeared 0 times 
 []
Letter: C-C appeared 0 times 
 []
Letter: D-D appeared 0 times 
 []
Letter: E-E appeared 0 times 
 []
Letter: F-F appeared 0 times 
 []
Letter: G-G appeared 0 times 
 []
Letter: H-H appeared 0 times 
 []
Letter: I-I appeared 0 times 
 []
Letter: J-J appeared 0 times 
 []
Letter: K-K appeared 0 times 
 []
Letter: L-L appeared 0 times 
 []
Letter: M-M appeared 0 times 
 []
Letter: N-N appeared 0 times 
 []
Letter: O-O appeared 0 times 
 []
Letter: P-P appeared 0 times 
 []
Letter: Q-Q appeared 0 times 
 []
Letter: R-R appeared 0 times 
 []
Letter: S-S appeared 0 times 
 []
Letter: T-T appeared 0 times 
 []
Letter: U-U appeared 0 times 
 []
Letter: V-V appeared 0 times 
 []
Letter: W-W appeared 0 times 
 []
Letter: X-X appeared 0 times 
 []
Letter: Y-Y appeared 0 times 
 []


## Zadanie 3 

In [32]:
print(f"[ th ] appeared {len(search_2d(lines, ['th', 'th']))} times \n {search_2d(lines, ['th', 'th'])}")
print(f"[ t h ] appeared {len(search_2d(lines, ['t h', 't h']))} times \n {search_2d(lines, ['t h', 't h'])}")

[ th ] appeared 0 times 
 []
[ t h ] appeared 1 times 
 [(36, 0)]


## Zadanie 4

In [33]:
def find_letter(letter_string, text_img, l_img):
    img = image_to_matrix(text_img)
    letter = image_to_matrix(l_img)
    print(f"Letter: {letter_string} appeared {len(search_2d(img, letter))} times \n {search_2d(img, letter)}\n")

In [34]:
with Image.open(text_img_path) as text_img, Image.open(letters_paths[0]) as a_img, Image.open(letters_paths[1]) as s_img, \
        Image.open(letters_paths[2]) as m_img, Image.open(letters_paths[3]) as o_img, Image.open(pattern_path) as pattern_img:

    find_letter("a",text_img, a_img)
    find_letter("s",text_img, s_img)

    find_letter("m",text_img, m_img)
    find_letter("o",text_img, o_img)

Letter: a appeared 356 times 
 [(37, 206), (37, 262), (37, 322), (37, 486), (37, 622), (37, 750), (59, 302), (59, 332), (59, 390), (59, 641), (59, 699), (81, 176), (81, 202), (81, 398), (81, 564), (81, 588), (103, 55), (103, 104), (103, 271), (125, 273), (125, 327), (125, 618), (147, 125), (147, 155), (147, 246), (147, 353), (147, 548), (147, 631), (169, 55), (169, 89), (169, 193), (169, 321), (169, 517), (169, 556), (169, 585), (169, 707), (191, 83), (191, 257), (191, 405), (191, 576), (191, 620), (191, 679), (213, 434), (213, 535), (235, 119), (235, 471), (235, 592), (257, 93), (257, 445), (279, 62), (279, 125), (279, 268), (279, 459), (279, 562), (301, 36), (323, 87), (323, 175), (323, 434), (323, 494), (323, 572), (345, 97), (345, 221), (345, 272), (345, 408), (345, 576), (367, 25), (367, 247), (367, 273), (367, 603), (367, 656), (389, 119), (389, 227), (389, 369), (389, 704), (411, 25), (433, 247), (433, 282), (433, 511), (433, 593), (433, 654), (455, 83), (455, 127), (455, 244), 

Letter: m appeared 131 times 
 [(32, 140), (32, 469), (32, 722), (98, 611), (120, 256), (120, 555), (142, 365), (142, 674), (164, 107), (164, 390), (164, 500), (186, 66), (186, 503), (186, 521), (208, 503), (230, 49), (230, 322), (230, 540), (252, 133), (252, 311), (252, 489), (274, 251), (318, 254), (318, 555), (340, 204), (340, 471), (340, 500), (340, 553), (340, 649), (362, 224), (362, 569), (384, 73), (384, 210), (384, 341), (406, 67), (428, 201), (450, 384), (472, 93), (472, 291), (472, 437), (472, 547), (494, 95), (494, 671), (516, 370), (538, 161), (538, 546), (538, 620), (560, 429), (582, 732), (648, 279), (648, 369), (648, 455), (648, 666), (670, 653), (692, 104), (714, 198), (736, 263), (736, 644), (780, 386), (780, 404), (780, 596), (802, 362), (802, 554), (802, 668), (824, 389), (846, 210), (846, 587), (846, 698), (868, 623), (890, 82), (890, 152), (912, 58), (934, 47), (934, 65), (1000, 26), (1000, 643), (1022, 26), (1022, 55), (1088, 82), (1088, 414), (1132, 56), (1176, 2

## Zadanie 5

In [35]:
def find_pattern(image_path, pattern_path):
    img = image_to_matrix(image_path)
    pattern = image_to_matrix(pattern_path)
    print("Pattern appeared:", search_2d(img, pattern))

In [36]:
with Image.open(text_img_path) as text_img, Image.open(letters_paths[0]) as a_img, Image.open(letters_paths[1]) as s_img, \
        Image.open(letters_paths[2]) as m_img, Image.open(letters_paths[3]) as o_img, Image.open(pattern_path) as pattern_img:
    find_pattern(text_img, pattern_img)

Pattern appeared: [(391, 183), (413, 427), (457, 241), (501, 141), (545, 247)]


## Zadanie 6

In [37]:
def measure_search_time(lines, text, img, text_img):
    print("Text finding")
    x, (prep_t , find_t) = search_2d(lines, text, return_time=True)
    print(f"Building automata took: {prep_t} s\nFinding took: {find_t} s")
    img = image_to_matrix(img)
    text_img = image_to_matrix(text_img)
    print("Image finding")
    x, (prep_t , find_t) = search_2d(img, text_img, return_time=True)
    print(f"Building automata took: {prep_t} s \nFinding took: {find_t} s")

In [38]:
small_text = "a"
medium_text = "pattern"
big_text =["To the contarary, English contains",
              "t h a n 700,000 words. T h e rep",
              "problem a bit more challenging."]
big_img_path = "assets/big.png"
large_img_path = "assets/large.png"
large_text = ["T h e search of words or p a t t e r n s in static texts is quite a different question",
             "t h a n the previous pattern-matching mechanism. Dictionaries, for example,",
             "are organized in order to speed u p the access to entries. Another example",
             "of the same question is given by indexes. Technical books often contain a n",
             "index of chosen terms t h a t gives pointers to p a r t s of the text related to words",
             "in the index. T h e algorithms involved in the creation of an index form a",
             "specific group. T h e use of dictionaries or lexicons is often related t o n a t u r a l",
             "language processing. Lexicons of programming languages are small, and their",
             "representation is not a difficult problem during the development of a compiler."]

with Image.open(text_img_path) as text_img, Image.open(letters_paths[0]) as a_img, \
    Image.open(big_img_path) as big_img, Image.open(pattern_path) as pattern_img, \
    Image.open(large_img_path) as large_img:
        print("=== Small text ===")
        measure_search_time(lines, small_text, text_img, a_img)
        print("\n=== Medium text ===")
        measure_search_time(lines, medium_text, text_img, pattern_img)
        print("\n=== Big text ===")
        measure_search_time(lines, big_text, text_img, big_img)
        print("\n=== Large text ===")
        measure_search_time(lines, large_text, text_img, large_img)

=== Small text ===
Text finding
Building automata took: 0.0 s
Finding took: 0.010270833969116211 s
Image finding
Building automata took: 0.03339266777038574 s 
Finding took: 3.43282413482666 s

=== Medium text ===
Text finding
Building automata took: 0.0 s
Finding took: 0.011501312255859375 s
Image finding
Building automata took: 0.03166627883911133 s 
Finding took: 5.1952126026153564 s

=== Big text ===
Text finding
Building automata took: 0.0010478496551513672 s
Finding took: 0.011143684387207031 s
Image finding
Building automata took: 0.13703489303588867 s 
Finding took: 4.288925886154175 s

=== Large text ===
Text finding
Building automata took: 0.011591196060180664 s
Finding took: 0.02822113037109375 s
Image finding
Building automata took: 0.948235273361206 s 
Finding took: 2.9291017055511475 s


## Zadanie 7

In [40]:
def divide_and_measure(lines, pattern):
    for div in [1, 2, 4, 8]:
        parts = []
        curr = 0
        for i in range(div):
            if i < div - 1:
                parts.append(lines[curr:curr+(len(lines) // div)])
            else:
                parts.append(lines[curr:curr+len(lines)])
            curr += len(lines) // div

        t = 0
        for p in parts:
            x, (pre_t, find_t) = search_2d(p, pattern, return_time=True)
            t += pre_t + find_t
        print(f"{div} parts took {t} s")

In [41]:

with Image.open(text_img_path) as text_img, Image.open(letters_paths[0]) as a_img, \
        Image.open(big_img_path) as big_img, Image.open(pattern_path) as pattern_img:
    divide_and_measure(lines,big_text)

1 parts took 0.007799386978149414 s
2 parts took 0.02195119857788086 s
4 parts took 0.012528657913208008 s
8 parts took 0.01105499267578125 s
