In [1]:
import re
from collections import Counter
from time import perf_counter
import os

In [None]:
def tokens(text):
    """Return every word-character run (regex `\\w+`) of `text`, lowercased, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)

# Location of the plain-text corpus the word counts are built from.
# NOTE(review): the path is empty in this notebook, so open() raises
# FileNotFoundError on a fresh kernel -- fill in the real corpus file.
CORPUS_PATH = ''

# Word -> occurrence count for every token in the corpus. The context manager
# closes the file as soon as it has been read instead of leaking the handle.
with open(CORPUS_PATH) as corpus_file:
    WORDS = Counter(tokens(corpus_file.read()))

def probability(word, N=sum(WORDS.values())):
    """Probability of `word`: its count in WORDS divided by the total count N.

    word -- the token to look up; WORDS is a Counter, so a token that never
            occurred has count 0 and therefore probability 0.
    N    -- total number of tokens. The default is evaluated once, when this
            `def` statement runs, so WORDS must already be populated and
            later changes to WORDS are not reflected in the default.
    """
    # The builtin is lowercase `sum`; `SUM` is not defined and made this
    # definition fail with NameError before the function could be used.
    return WORDS[word] / N

def correction(word):
    """Return the candidate spelling of `word` that scores highest under probability()."""
    options = candidates(word)
    best = max(options, key=probability)
    return best

def candidates(word):
    """Return the first non-empty pool of spelling candidates for `word`.

    Pools are tried in order: the word itself if known, known words one edit
    away, known words two edits away, and finally `[word]` unchanged.
    """
    exact = known([word])
    if exact:
        return exact

    one_edit = known(edits1(word))
    if one_edit:
        return one_edit

    # Only reached when nothing closer matched, so the two-edit expansion is
    # computed lazily, exactly like the short-circuiting `or` chain.
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits

    return [word]

def known(tokens):
    """Return, as a set, the items of `tokens` that are keys of WORDS."""
    # Inside this body `tokens` is the parameter (an iterable of strings);
    # it shadows the module-level tokens() function here only.
    return {candidate for candidate in tokens if candidate in WORDS}

def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    The edits are: delete one character, swap two adjacent characters,
    replace one character with a letter a-z, or insert a letter a-z.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    results = set()
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]

        # An insertion is possible at every cut, including the very end.
        for ch in alphabet:
            results.add(head + ch + tail)

        if not tail:
            continue  # nothing right of the cut to delete, replace or swap

        results.add(head + tail[1:])                            # deletion
        for ch in alphabet:
            results.add(head + ch + tail[1:])                   # replacement
        if len(tail) > 1:
            results.add(head + tail[1] + tail[0] + tail[2:])    # transposition
    return results

def edits2(word):
    """Return a generator over the strings obtained by applying edits1 twice to `word`.

    The result is lazy and duplicates are not removed.
    """
    first_round = edits1(word)
    return (twice for once in first_round for twice in edits1(once))

In [None]:
# Scratch demo of the (prefix, suffix) split that edits1 builds internally:
# cut the sample phrase at every position from 0 through len(word) inclusive.
word = 'bulan juni'
splits = [(word[:cut], word[cut:]) for cut in range(1 + len(word))]
splits

In [None]:
# Ad-hoc check: evaluate the corrector on the sample input 'kucink'; as the
# last expression of the cell, the returned suggestion is shown as its output.
correction('kucink')

# Test Code Unigram

In [None]:
def unit_tests():
    """Assert expected behaviour of the corrector, the tokenizer and the corpus.

    The expected corrections and the hard-coded corpus statistics are only
    valid for the specific corpus WORDS was built from; a different corpus
    makes these assertions fail. Returns the string 'unit_tests pass' when
    every assertion holds, otherwise AssertionError propagates.
    """
    # Corrections, labelled by the edit(s) needed to reach the expected word.
    assert correction('kalkulatf') == 'kalkulatif'              # insert
    assert correction('mmandan') == 'memandang'                 # insert 2
    assert correction('minjadi') == 'menjadi'                   # replace
    assert correction('permusyawartn') == 'permusyawaratan'     # insert 2
    assert correction('teersebut') == 'tersebut'                # delete
    assert correction('naivgasi') == 'navigasi'                 # transpose
    assert correction('menginformasiknn') == 'menginformasikan' # replace
    assert correction('memang') == 'memang'                     # known
    # Original comment: "unknown" -- presumably no known candidate exists, so
    # the input comes back unchanged; verify against the corpus.
    assert correction('sampan') == 'sampan'

    # Tokenizer behaviour.
    assert tokens('Ini adalah sebuah TEST.') == ['ini', 'adalah', 'sebuah', 'test']
    assert Counter(tokens('Ini adalah sebuah test. 123; sebuah TEST adalah ini.')) == (Counter({'123': 1, 'sebuah': 2, 'adalah': 2, 'test': 2, 'ini': 2}))

    # Corpus statistics.
    assert len(WORDS) == 432184
    assert sum(WORDS.values()) == 4902106
    assert WORDS.most_common(10) == [('yang', 151796),('dan', 109411),('di', 70168),('dengan', 54857),('itu', 51588), ('ini', 44693), ('tidak', 43062), ('untuk', 42823), ('dari', 41478), ('dalam', 39131)]
    assert WORDS['yang'] == 151796

    # The probability function in this notebook is named `probability`;
    # the former calls to `P` referenced an undefined name (NameError).
    assert probability('luring') == 0
    assert 0.01 < probability('yang') < 0.4
    return 'unit_tests pass'

In [None]:
def spelltest(tests, verbose=False):
    """Run correction(wrong) on all (right, wrong) pairs; report results.

    tests   -- sized collection of (right, wrong) string pairs.
    verbose -- when true, print one line for every wrong correction, with
               the WORDS counts of the suggestion and of the expected word.

    Prints a summary line with the share of correct results, the share of
    failures whose expected word is not in WORDS, and the words processed
    per second. Returns None. An empty `tests` reports zeros instead of
    raising ZeroDivisionError.
    """
    start = perf_counter()
    good, unknown = 0, 0
    n = len(tests)
    for right, wrong in tests:
        w = correction(wrong)
        if w == right:
            good += 1
            continue
        # A miss counts as "unknown" when the expected word is not in WORDS.
        unknown += (right not in WORDS)
        if verbose:
            print('correction({}) => {} ({}); expected {} ({})'
                  .format(wrong, w, WORDS[w], right, WORDS[right]))
    dt = perf_counter() - start

    # Guard both divisions: n is 0 for an empty test set, and dt can be 0 if
    # the clock did not advance between the two perf_counter() readings.
    good_share = good / n if n else 0.0
    unknown_share = unknown / n if n else 0.0
    if dt > 0:
        words_per_second = n / dt
    else:
        words_per_second = float('inf') if n else 0.0

    print('{:.0%} dari {} kata benar, ({:.0%} unknown) dengan {:.0f} kata per second '
          .format(good_share, n, unknown_share, words_per_second))
    
def Testset(lines):
    """Parse 'right: wrong1 wrong2' lines into [('right', 'wrong1'), ('right', 'wrong2')] pairs.

    lines -- iterable of strings (for example an open text file).

    Both words of every pair are lowercased and whitespace around `right` is
    removed. Blank lines are skipped, so a trailing empty line in the file no
    longer fails the unpacking. A non-blank line without exactly one ':'
    still raises ValueError, which keeps malformed data visible.
    """
    pairs = []
    for line in lines:
        if not line.strip():
            continue  # blank or whitespace-only line: nothing to parse
        right, wrongs = line.split(':')
        right = right.strip().lower()
        for wrong in wrongs.split():
            pairs.append((right, wrong.lower()))
    return pairs

In [None]:
# Run the corpus-dependent assertions first; a mismatch raises AssertionError.
print(unit_tests())
print("hasil Unigram : ")
# Read the (right, wrong) pairs inside a context manager so the file handle
# is closed before the evaluation starts instead of being leaked.
with open('/kaggle/input/mix-sentence-indonesia/spell-error.txt') as spell_error_file:
    test_pairs = Testset(spell_error_file)
spelltest(test_pairs, verbose=True)