In [3]:
import numpy as np
from collections import defaultdict as dd
from typing import List, Dict, Set
import nltk
from tqdm import tqdm_notebook, tnrange
import pickle
from termcolor import colored

from enum import Enum

In [4]:
def load_word_set():
    all_lemmas = set()
    for line in open('Dane/polimorfologik-2.1.txt', encoding='utf-8'):
        L = line.split(';')[1].lower()
        all_lemmas.add(L)
    return all_lemmas

In [5]:
def get_normal_form(w):
    polish_map = {
        "ż": "z",
        "ź": "z",
        "x": "z",
        "ó": "o",
        "ł": "l",
        "ć": "c",
        "ą": "a",
        "ń": "n",
        "ę": "e",
    }
    ortographic_map = {
        "u": "o",
        "om": "a",
        "en": "e",
        "em": "e",
        "on": "a",
        "rz": "e",
        "ch": "h",
#         "fk": "wk",
#         "af": "aw",
        "f": "w",
        "sz": "z",
    }
    tmp = ""
    for c in w:
        tmp += c if c not in polish_map else polish_map[c]
    l = len(w)
    res = ""
    i = 0
    while i < l:
        if i < l - 1 and tmp[i:(i+2)] in ortographic_map:
            res += ortographic_map[tmp[i:(i+2)]]
            i += 2
            continue
        if tmp[i] in ortographic_map:
            res += ortographic_map[tmp[i]]
            i += 1
            continue
        res += tmp[i]
        i += 1
    return res

In [6]:
def get_dictionaries():
    ws = load_word_set()
    res1 = dd(list)
    res2 = dd(list)
    for w in word_set:
        norm_w = get_normal_form(w)
        res1[norm_w].append(w)
        res2[(w[0], len(w))].append(w)
        if len(w) > 1:
            res2[(w[1], len(w))].append(w)
    return res1, res2

In [22]:
class EditOp(Enum):
    INS = 1
    DEL = 2
    CHAN = 3

In [52]:
def levenshtein(seq1, seq2, ins_cost=1, del_cost=1, ch_cost=1):
    size_x = len(seq1) + 1
    size_y = len(seq2) + 1
    matrix = np.zeros ((size_x, size_y))
    matrix_op = [[[] for _ in range(size_y)] for _ in range(size_x)]
#     print(del_cost)
#     print(size_x)
    for x in range(size_x):
#         print(x)
        matrix[x, 0] = x*del_cost
        matrix_op[x][0] = [EditOp.DEL] * x
    for y in range(size_y):
        matrix[0, y] = y*ins_cost
        matrix_op[0][y] = [EditOp.INS] * y

#     print(matrix)
    for x in range(1, size_x):
        for y in range(1, size_y):
#             print(f"{x},{y}")
#             print(matrix[x-1, y] + del_cost)
            matrix[x,y] = matrix[x-1, y] + del_cost
            matrix_op[x][y] = matrix_op[x-1][y] + [EditOp.DEL]
            cost = matrix[x, y-1] + ins_cost
            if cost <= matrix[x, y]:
                matrix[x, y] = cost
                matrix_op[x][y] =  matrix_op[x][y-1] + [EditOp.INS]
            cost = matrix[x-1, y-1] if seq1[x-1] == seq2[y-1] else matrix[x-1, y-1] + ch_cost
            ops = matrix_op[x-1][y-1] if seq1[x-1] == seq2[y-1] else matrix_op[x-1][y-1] + [EditOp.CHAN]
            if cost <= matrix[x, y]:
                matrix[x, y] = cost
                matrix_op[x][y] =  ops
    return (matrix[size_x - 1, size_y - 1], matrix_op[size_x-1][size_y-1])

In [50]:
def argmin(l):
    res = None
    for i, el in enumerate(l):
        if res is None or el < l[res]:
            res = i
    return res
            

In [25]:
def find_closest(w, words, ins_cost=1, del_cost=1, ch_cost=1):
    edit_distances_and_ops = [levenshtein(w, w1, ins_cost, del_cost, ch_cost) for w1 in tqdm_notebook(words)]
    edit_distances = [el[0] for el in edit_distances_and_ops]
    i = argmin(edit_distances)
    return words[i], edit_distances_and_ops[i]

In [26]:
def correct_word(w, norm_dicts, ins_cost=1, del_cost=1, ch_cost=1):
    d1, d2 = norm_dicts
    norm_w = get_normal_form(w)
    if norm_w in d1:
        return find_closest(w, d1[norm_w], ins_cost, del_cost, ch_cost)
    l = len(w)
    w_candidates = []
    for i in range(l-1, l+2):
        w_candidates += d2[(w[0], i)][:]
    if len(w) > 1:
        for i in range(l -1, l + 2):
            w_candidates += d2[(w[1], i)]
    return find_closest(w, w_candidates, ins_cost, del_cost, ch_cost)

In [14]:
word_set = load_word_set()

In [16]:
d1, d2 = get_dictionaries()

In [None]:
result = []
with open("Dane/literowki1.txt", "r") as ifile:
    for line in ifile:
        line = nltk.word_tokenize(line)
        correct, incorrect = line[0], line[1]
        corrected, ops = correct_word(line[1], (d1, d2))
        result.append((corrected, ops))
        print(f"{line[0]}, {line[1]} : {corrected}")

Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  


HBox(children=(FloatProgress(value=0.0, max=406510.0), HTML(value='')))


lokomotywa, lokomowtuwa : lokomotywa


HBox(children=(FloatProgress(value=0.0, max=435137.0), HTML(value='')))


lokomotywa, kolokotywa : kolektiwa


HBox(children=(FloatProgress(value=0.0, max=406510.0), HTML(value='')))


lokomotywa, lokonowaywa : lokomotywa


HBox(children=(FloatProgress(value=0.0, max=435137.0), HTML(value='')))


lokomotywa, kolomotywa : lokomotywa


HBox(children=(FloatProgress(value=0.0, max=311426.0), HTML(value='')))


lokomotywa, lokotaywa : lokowana


HBox(children=(FloatProgress(value=0.0, max=142557.0), HTML(value='')))


prawdopodobieństwo, prawodpodoniestso : prawdopodobieństwo


HBox(children=(FloatProgress(value=0.0, max=54659.0), HTML(value='')))


prawdopodobieństwo, prawdopopdobieństwo : prawdopodobieństwo


HBox(children=(FloatProgress(value=0.0, max=91000.0), HTML(value='')))


prawdopodobieństwo, prawdopodobieńśtwo : prawdopodobieństwo


HBox(children=(FloatProgress(value=0.0, max=31347.0), HTML(value='')))


prawdopodobieństwo, prawodpodoniesnitswo : prawdopodobieństwom


HBox(children=(FloatProgress(value=0.0, max=31347.0), HTML(value='')))


prawdopodobieństwo, prawwdopodobeinistwo : prawdopodobieństwom


HBox(children=(FloatProgress(value=0.0, max=31347.0), HTML(value='')))


prawdopodobieństwo, prawdopodobiensitwao : prawdopodobieństwom


HBox(children=(FloatProgress(value=0.0, max=54659.0), HTML(value='')))

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)




prawdopodobieństwo, prawdopdoodobienśtwo : prawdopodobieństwom


HBox(children=(FloatProgress(value=0.0, max=142557.0), HTML(value='')))


prawdopodobieństwo, prawdopodbieństwi : prawdopodobieństw


HBox(children=(FloatProgress(value=0.0, max=91000.0), HTML(value='')))


prawdopodobieństwo, prawodpoodbieństwo : prawdopodobieństwo


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


prawdomówny, prawdomównu : prawdomówno


HBox(children=(FloatProgress(value=0.0, max=486580.0), HTML(value='')))


prawdomówny, proawdomówny : prawdomówny


HBox(children=(FloatProgress(value=0.0, max=295969.0), HTML(value='')))


prawdomówny, prawdowny : prawiony


HBox(children=(FloatProgress(value=0.0, max=464487.0), HTML(value='')))


komputerek, kompteurerk : dompteurek


HBox(children=(FloatProgress(value=0.0, max=435137.0), HTML(value='')))


komputerek, komputerer : komputerem


HBox(children=(FloatProgress(value=0.0, max=1.0), HTML(value='')))


komputerek, komputerek : komputerek


HBox(children=(FloatProgress(value=0.0, max=435137.0), HTML(value='')))

In [None]:
print(result)