## Lab4 Edit distance and lcs

In [2]:
# -*- coding: utf-8 -*-


from enum import Enum
import codecs
import random
from bisect import bisect
from enum import Enum
from spacy.lang.pl import Polish
from spacy.tokenizer import Tokenizer

In [3]:
text1a = "los"
text1b = "kloc"
text2a = "Łódź"
text2b = "Lodz"
text3a = "kwintesencja"
text3b = "quintessence"
text4a = "ATGAATCTTACCGCCTCG"
text4b = "ATGAGGCTCTGGCCCCTG"
data = [[text1a,text1b], [text2a,text2b], [text3a,text3b], [text4a,text4b]]

### Edit distance

In [4]:
class Operation(Enum):
    NO_CHANGE = 0
    ADD = 1
    DELETE = 2
    REPLACE = 3

def delta1(a, b):
    return a != b

In [5]:
def edit_distance(x, y, d):
    path = [[Operation.DELETE for _ in range(len(y) + 1)] for _ in range(len(x) + 1)]
    edit_table = [[float('inf') for _ in range(len(y) + 1)] for _ in range(len(x) + 1)]
    for i in range(len(x) + 1):
        edit_table[i][0] = i
        path[i][0] = Operation.DELETE
    for j in range(len(y) + 1):
        edit_table[0][j] = j
        path[0][j] = Operation.ADD

    for i in range(1, len(x) + 1):
        for j in range(1, len(y) + 1):
            if edit_table[i][j] > edit_table[i - 1][j - 1] + d(x[i - 1], y[j - 1]):
                edit_table[i][j] = edit_table[i - 1][j - 1] + d(x[i - 1], y[j - 1])
                if x[i - 1] == y[i - 1]:
                    path[i][j] = Operation.NO_CHANGE
                else:
                    path[i][j] = Operation.REPLACE

            if edit_table[i][j] > edit_table[i - 1][j] + 1:
                edit_table[i][j] = edit_table[i - 1][j] + 1
                path[i][j] = Operation.DELETE
            if edit_table[i][j] > edit_table[i][j - 1] + 1:
                edit_table[i][j] = edit_table[i][j - 1] + 1
                path[i][j] = Operation.ADD

    return path, edit_table[-1][-1]

In [6]:
def visualise_edit(x, y, path):
    back_path = []
    i, j = len(x), len(y)
    while i != 0 or j != 0:
        step = path[i][j]
        back_path.append(step)
        if step == Operation.ADD:
            j -= 1
        elif step == Operation.DELETE:
            i -= 1
        else:
            i -= 1
            j -= 1

    forward_path = reversed(back_path)
    i, j = 0, 0
    for direction in forward_path:
        if direction == Operation.NO_CHANGE:
            i += 1
            j += 1
        elif direction == Operation.REPLACE:
            print(f"{x[:i]} : ({x[i]} -> {y[j]}) : {x[i + 1:]}")
            x = x[:i] + y[j] + x[i + 1:]
            i += 1
            j += 1
        elif direction == Operation.DELETE:
            print(f"{x[:i]} * {x[i + 1:]}")
            x = x[:i] + x[i + 1:]
        else:
            print(f"{x[:i]} + {y[j]} + {x[i:]}")
            x = x[:i] + y[j] + x[i:]
            i += 1
            j += 1
    print(x)

In [7]:
x = "los"
y = "kloc"
print(f"{x} ---> {y}")
path, edit_dis = edit_distance(x,y,delta1)
print(f"edit distance: {edit_dis}")
visualise_edit(x,y,path)

los ---> kloc
edit distance: 2
 + k + los
k : (l -> l) : os
kl : (o -> o) : s
klo : (s -> c) : 
kloc


In [8]:
for x,y in data:
    print(f"{x} ---> {y}")
    path, edit_dis = edit_distance(x,y,delta1)
    print(f"edit distance: {edit_dis}")
    visualise_edit(x,y,path)
    print()

los ---> kloc
edit distance: 2
 + k + los
k : (l -> l) : os
kl : (o -> o) : s
klo : (s -> c) : 
kloc

Łódź ---> Lodz
edit distance: 3
 : (Ł -> L) : ódź
L : (ó -> o) : dź
Lod : (ź -> z) : 
Lodz

kwintesencja ---> quintessence
edit distance: 5
 : (k -> q) : wintesencja
q : (w -> u) : intesencja
quinte + s + sencja
quintess : (e -> e) : ncja
quintesse : (n -> n) : cja
quintessen : (c -> c) : ja
quintessenc * a
quintessenc : (a -> e) : 
quintessence

ATGAATCTTACCGCCTCG ---> ATGAGGCTCTGGCCCCTG
edit distance: 7
ATGA : (A -> G) : TCTTACCGCCTCG
ATGAG : (T -> G) : CTTACCGCCTCG
ATGAGGCT + C + TACCGCCTCG
ATGAGGCTC : (T -> T) : ACCGCCTCG
ATGAGGCTCT : (A -> G) : CCGCCTCG
ATGAGGCTCTG : (C -> G) : CGCCTCG
ATGAGGCTCTGG : (C -> C) : GCCTCG
ATGAGGCTCTGGC : (G -> C) : CCTCG
ATGAGGCTCTGGCCCC : (T -> T) : CG
ATGAGGCTCTGGCCCCT * G
ATGAGGCTCTGGCCCCTG



### LCS

In [9]:
class Direction(Enum):
    NO_CHANGE = 0
    UP = 1
    LEFT = 2

In [10]:
def edit_distance_lcs(x, y):
    path = [[Direction.UP for _ in range(len(y) + 1)] for _ in range(len(x) + 1)]
    edit_table = [[0 for _ in range(len(y) + 1)] for _ in range(len(x) + 1)]
    for i in range(len(x) + 1):
        path[i][0] = Direction.UP
    for j in range(len(y) + 1):
        path[0][j] = Direction.LEFT

    for i in range(1, len(x) + 1):
        for j in range(1, len(y) + 1):
            if x[i - 1] != y[j - 1]:
                if edit_table[i - 1][j] > edit_table[i][j - 1]:
                    path[i][j] = Direction.UP
                    edit_table[i][j] = edit_table[i - 1][j]
                else:
                    path[i][j] = Direction.LEFT
                    edit_table[i][j] = edit_table[i][j - 1]
            else:
                edit_table[i][j] = edit_table[i - 1][j - 1] + 1
                path[i][j] = Direction.NO_CHANGE

    return path, edit_table[-1][-1]

In [11]:
def visualise_lcs(x, y, path):
    result = []
    i, j = len(x), len(y)
    while i != 0 or j != 0:
        step = path[i][j]
        if step == Direction.UP:
            i -= 1
        elif step == Direction.LEFT:
            j -= 1
        else:
            i -= 1
            j -= 1
            result.append(x[i])
    result.reverse()
    print("lcs: "+"".join(result))

In [12]:
x = "los"
y = "kloc"
print(f"{x} --- {y}")
path, edit_dis = edit_distance_lcs(x,y)
print(f"lcs length: {edit_dis}")
visualise_lcs(x,y,path)
print()

los --- kloc
lcs length: 2
lcs: lo



In [13]:
for x,y in data:
    print(f"{x} --- {y}")
    path, edit_dis = edit_distance_lcs(x,y)
    print(f"lcs length: {edit_dis}")
    visualise_lcs(x,y,path)
    print()

los --- kloc
lcs length: 2
lcs: lo

Łódź --- Lodz
lcs length: 1
lcs: d

kwintesencja --- quintessence
lcs length: 8
lcs: intesenc

ATGAATCTTACCGCCTCG --- ATGAGGCTCTGGCCCCTG
lcs length: 13
lcs: ATGACTTCCCCTG



In [14]:
def lcs(x, y):
    ranges = [len(y)]
    y_letters = list(y)
    for char in x:
        positions = [j for j, l in enumerate(y_letters) if l == char]
        positions.reverse()
        for p in positions:
            k = bisect(ranges, p)
            if k == bisect(ranges, p - 1):
                if k < len(ranges) - 1:
                    ranges[k] = p
                else:
                    ranges[k:k] = [p]

    return len(ranges) - 1

In [15]:
def new_text(tokens):
    res = []
    for t in tokens:
        if random.random() >= 0.03:
            res.append(t)
    return res


def write_to_file(tokens, path):
    with open(path, 'w') as new_file:
        for token in tokens:
            new_file.write(token.text_with_ws)


def get_from_file(path):
    with codecs.open(path, 'r', 'utf-8') as new_file:
        return new_file.readlines()

In [16]:
def diff(x, y):
    edit_table = [[0 for _ in range(len(y) + 1)] for _ in range(len(x) + 1)]
    for i in range(len(x) + 1):
        for j in range(len(y) + 1):
            if i > 0 and j > 0:
                if x[i - 1] == y[j - 1]:
                    edit_table[i][j] = edit_table[i - 1][j - 1] + 1
                else:
                    edit_table[i][j] = max(edit_table[i - 1][j],
                                           edit_table[i][j - 1])
    res = []
    i = len(x) - 1
    j = len(y) - 1
    while i >= 0 and j >= 0:
        if x[i] == y[j]:
            i -= 1
            j -= 1
        elif edit_table[i][j - 1] >= edit_table[i - 1][j]:
            res.append(f"> ({j}) {y[j]}")
            j -= 1
        elif edit_table[i][j - 1] < edit_table[i - 1][j]:
            res.append(f"< ({i}) {x[i]}")
            i -= 1
    while i >= 0:
        res.append(f"< ({i}) {x[i]}")
        i -= 1

    while j >= 0:
        res.append(f"> ({j}) {y[j]}")
        j -= 1

    for line in reversed(res):
        print(line)

In [None]:
def test():
    tokenizer = Tokenizer(Polish().vocab)
    with open('assets/romeo-i-julia-700.txt', "r") as file:
        text = file.read()
        tokens = tokenizer(text)

        tokens1 = new_text(tokens)
        tokens2 = new_text(tokens)
        write_to_file(tokens1, 'assets/tokens1.txt')
        write_to_file(tokens2, 'assets/tokens2.txt')

        print(f"Length of token: {len(tokens)}")
        print(f"Length of first new token: {len(tokens1)}")
        print(f"Length of second new token: {len(tokens2)}")

        path, lcs1 = edit_distance_lcs(tokens1, tokens2)
        print(f"lcs1: {lcs1}")
        print(f"lcs2: {lcs(tokens1, tokens2)}")

        tokens1 = get_from_file('assets/tokens1.txt')
        tokens2 = get_from_file('assets/tokens2.txt')
        diff(tokens1, tokens2)

In [46]:
test()

Length of token: 2272
Length of first new token: 2209
Length of second new token: 2215
lcs1: 2155
lcs2: 2155
< (2) Romeo Julia

> (2) Romeo i Julia

< (21)  * SAMSON, GRZEGORZ — Kapuleta

> (21)  * SAMSON, GRZEGORZ — słudzy Kapuleta

< (29)  * PANI KAPULET — małżonka 

> (29)  * PANI KAPULET — małżonka Kapuleta

< (37) Rzecz odbywa się przez większą część sztuki w Weronie, przez część piątego aktu w Mantui.

> (37) odbywa się przez większą część sztuki w Weronie, przez część piątego aktu w Mantui.

< (45) Dwa rody, zacne jednako i sławne —Tam, gdzie się rzecz ta rozgrywa, w Weronie,

> (45) Dwa rody, zacne jednako i sławne —

> (46) Tam, gdzie się rzecz ta rozgrywa, w Weronie,

< (49) Z łon tych dwu wrogów wzięło bowiem 

> (50) Z łon tych dwu wrogów wzięło bowiem życie,

< (50) Pod z gwiazd, kochanków dwoje;

> (51) Pod najstraszliwszą z gwiazd, kochanków dwoje;

< (59) Które otoczcie cierpliwymi względy,

> (60) Które otoczcie względy,

< (69) SCENA PIERWSZA

> (70) SCENA 

< (70) 

