# Portuguese spell checker

## Open the file (corpus)

In [1]:
# Read the whole corpus into memory. The encoding is passed explicitly because
# the text contains accented Portuguese characters and the default encoding of
# open() depends on the platform/locale.
# NOTE(review): assumes artigos.txt is saved as UTF-8 - confirm.
with open("artigos.txt", "r", encoding="utf-8") as file:
    articles = file.read()

# Preview only the first 500 characters of the corpus.
print(articles[:500])




imagem 

Temos a seguinte classe que representa um usuário no nosso sistema:

java

Para salvar um novo usuário, várias validações são feitas, como por exemplo: Ver se o nome só contém letras, [**o CPF só números**] e ver se o usuário possui no mínimo 18 anos. Veja o método que faz essa validação:

java 

Suponha agora que eu tenha outra classe, a classe `Produto`, que contém um atributo nome e eu quero fazer a mesma validação que fiz para o nome do usuário: Ver se só contém letras. E aí? Vou


## Analyzing the file

In [2]:
# len() on a string returns the number of characters, not the number of words
len(articles)

2605046

In [10]:
len("Hello")

5

In [7]:
text_example = "Hello, are you ok?"
tokens = text_example.split()

In [8]:
print(len(tokens))

4


In [9]:
# split() only breaks on whitespace, so punctuation stays attached to the words
print(tokens)

['Hello,', 'are', 'you', 'ok?']


## Using Natural Language Toolkit (NLTK)

In [12]:
import nltk

# nltk.download('punkt') # It is needed to download some packages
separated_words = nltk.tokenize.word_tokenize(text_example)
print(separated_words)

['Hello', ',', 'are', 'you', 'ok', '?']


In [14]:
# It includes the symbols
len(separated_words)

6

In [66]:
# str.isalpha() returns True only if the string is non-empty and every character is a letter (accented letters such as "é" count too).
'./'.isalpha()

False

In [16]:
# Creating a function to get just the words in a token list
def separate_words(tokens_list):
    """Keep only the tokens that consist entirely of letters.

    Args:
        tokens_list: iterable of string tokens (words, punctuation, ...).

    Returns:
        A new list with the tokens for which ``str.isalpha()`` is true,
        in their original order.
    """
    return [candidate for candidate in tokens_list if candidate.isalpha()]

separate_words(separated_words)

['Hello', 'are', 'you', 'ok']

In [17]:
list_tokens = nltk.tokenize.word_tokenize(articles)
words_list = separate_words(list_tokens)
print(f"The number of words is {len(words_list)}")

The number of words is 403031


In [18]:
print(words_list[:10])

['imagem', 'Temos', 'a', 'seguinte', 'classe', 'que', 'representa', 'um', 'usuário', 'no']


## Normalize the words

In [22]:
# Creating a function to normalize the words (convert every word to lowercase)
def normalization(words_list):
    """Return a new list holding the lowercase form of every word.

    Args:
        words_list: iterable of strings.

    Returns:
        List with ``word.lower()`` for each input word, in the same order.
    """
    return [entry.lower() for entry in words_list]

normalized_list = normalization(words_list)
print(normalized_list[:10])

['imagem', 'temos', 'a', 'seguinte', 'classe', 'que', 'representa', 'um', 'usuário', 'no']


In [23]:
# Remove the repeated words. This is the number of distinct words our model will know (this can be a limitation)
len(set(normalized_list))

18464

In [24]:
word_test = "lgica"
(word_test[:1],word_test[1:])

('l', 'gica')

## Creating the spell checker

In [27]:
word_example = "lgica"

def insert_letters(slices):
    """Build candidate words by inserting one letter at each split point.

    Args:
        slices: iterable of ``(left, right)`` string pairs.

    Returns:
        List containing ``left + letter + right`` for every pair and every
        letter of the alphabet below, grouped by pair in input order.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyzáâàãéêèẽíîìĩóôõòúûùũç'
    return [
        head + char + tail
        for head, tail in slices
        for char in alphabet
    ]

def words_generator(word):
    """Generate candidate corrections for ``word`` by inserting one letter.

    The word is cut at every position (including before the first and after
    the last character) and each cut is handed to ``insert_letters``.

    Args:
        word: the word to expand.

    Returns:
        The list produced by ``insert_letters`` for all cuts of ``word``.
    """
    cut_points = range(len(word) + 1)
    pieces = [(word[:pos], word[pos:]) for pos in cut_points]
    return insert_letters(pieces)

generated_words = words_generator(word_example)
print(generated_words)

['algica', 'blgica', 'clgica', 'dlgica', 'elgica', 'flgica', 'glgica', 'hlgica', 'ilgica', 'jlgica', 'klgica', 'llgica', 'mlgica', 'nlgica', 'olgica', 'plgica', 'qlgica', 'rlgica', 'slgica', 'tlgica', 'ulgica', 'vlgica', 'wlgica', 'xlgica', 'ylgica', 'zlgica', 'álgica', 'âlgica', 'àlgica', 'ãlgica', 'élgica', 'êlgica', 'èlgica', 'ẽlgica', 'ílgica', 'îlgica', 'ìlgica', 'ĩlgica', 'ólgica', 'ôlgica', 'õlgica', 'òlgica', 'úlgica', 'ûlgica', 'ùlgica', 'ũlgica', 'çlgica', 'lagica', 'lbgica', 'lcgica', 'ldgica', 'legica', 'lfgica', 'lggica', 'lhgica', 'ligica', 'ljgica', 'lkgica', 'llgica', 'lmgica', 'lngica', 'logica', 'lpgica', 'lqgica', 'lrgica', 'lsgica', 'ltgica', 'lugica', 'lvgica', 'lwgica', 'lxgica', 'lygica', 'lzgica', 'lágica', 'lâgica', 'làgica', 'lãgica', 'légica', 'lêgica', 'lègica', 'lẽgica', 'lígica', 'lîgica', 'lìgica', 'lĩgica', 'lógica', 'lôgica', 'lõgica', 'lògica', 'lúgica', 'lûgica', 'lùgica', 'lũgica', 'lçgica', 'lgaica', 'lgbica', 'lgcica', 'lgdica', 'lgeica', 'lgfica',

In [28]:
def spell_checker(word):
    """Return the generated candidate with the highest corpus probability.

    Candidates come from ``words_generator(word)`` and are ranked with
    ``probability``; on ties ``max`` keeps the first candidate it meets.

    Args:
        word: the (possibly misspelled) word.

    Returns:
        The best-ranked candidate string.
    """
    return max(words_generator(word), key=probability)

In [29]:
# Count how many times each normalized word occurs in the corpus.
frequency = nltk.FreqDist(normalized_list) # frequency of each word
# Total number of word occurrences (repeats included).
all_words = len(normalized_list)
# Show the ten most frequent words with their counts.
frequency.most_common(10)

[('de', 15502),
 ('o', 14056),
 ('que', 12230),
 ('a', 11099),
 ('e', 10501),
 ('para', 7710),
 ('um', 6367),
 ('é', 5899),
 ('uma', 5220),
 ('do', 5124)]

In [30]:
# Global variables (frequency, all_words) are used here only because this function will be called many times.
def probability(generated_words):
    """Return the relative frequency of a word in the corpus.

    Reads the module-level ``frequency`` table and ``all_words`` total.

    Args:
        generated_words: the single word to look up (despite the plural name).

    Returns:
        ``frequency[generated_words] / all_words`` as a float.
    """
    occurrences = frequency[generated_words]
    return occurrences / all_words

probability("logica")

0.0

In [31]:
 probability("lógica")

0.00023819507680550628

In [32]:
spell_checker(word_example)

'lógica'

In [None]:
def create_test_data(file_name):
    """Load ``(correct, wrong)`` word pairs from a text file.

    Each non-blank line must hold exactly two whitespace-separated tokens:
    the correctly spelled word followed by its misspelled version.

    Args:
        file_name: path of the file to read.
            NOTE(review): the file is read as UTF-8 - confirm its encoding.

    Returns:
        List of ``(correct, wrong)`` tuples in file order.

    Raises:
        ValueError: if a non-blank line does not contain exactly two tokens.
    """
    list_test_words = []
    # The with-block closes the file even if a line fails to parse; the
    # previous version called close() on an undefined name (f), which raised
    # NameError after the loop.
    with open(file_name, "r", encoding="utf-8") as file:
        for line in file:
            # Skip blank lines (e.g. a trailing newline at the end of the file).
            if not line.strip():
                continue
            correct, wrong = line.split()
            list_test_words.append((correct, wrong))
    return list_test_words

test_list = create_test_data("palavras.txt")

## Creating an evaluator for the spell checker

In [34]:
def evaluator(tests):
    """Print the percentage of test words that ``spell_checker`` fixes.

    Args:
        tests: sequence of ``(correct, wrong)`` word pairs.

    Prints:
        The hit rate (rounded to 2 decimals) and the number of test pairs.
    """
    total = len(tests)
    hits = 0
    for expected, misspelled in tests:
        if spell_checker(misspelled) == expected:
            hits += 1
    percentage = round(hits * 100 / total, 2)
    print(f"{percentage}% of {total} words")

evaluator(test_list)

1.08% of 186 words


## Deleting a character

In [44]:
def deleting_character(slices):
    """Build candidate words by dropping the first character of each right part.

    Args:
        slices: iterable of ``(left, right)`` string pairs.

    Returns:
        List with ``left + right[1:]`` for every pair; a pair whose right part
        is empty therefore yields ``left`` unchanged.
    """
    return [prefix + suffix[1:] for prefix, suffix in slices]

## Improving the word generator

In [48]:
def words_generator(word):
    """Generate candidates for ``word`` by inserting or deleting one character.

    Args:
        word: the word to expand.

    Returns:
        List with the ``insert_letters`` candidates followed by the
        ``deleting_character`` candidates for every cut of ``word``.
    """
    pieces = [(word[:cut], word[cut:]) for cut in range(len(word) + 1)]
    return insert_letters(pieces) + deleting_character(pieces)

In [49]:
word_example = "lóigica"
generated_words = words_generator(word_example)
print(generated_words)

['alóigica', 'blóigica', 'clóigica', 'dlóigica', 'elóigica', 'flóigica', 'glóigica', 'hlóigica', 'ilóigica', 'jlóigica', 'klóigica', 'llóigica', 'mlóigica', 'nlóigica', 'olóigica', 'plóigica', 'qlóigica', 'rlóigica', 'slóigica', 'tlóigica', 'ulóigica', 'vlóigica', 'wlóigica', 'xlóigica', 'ylóigica', 'zlóigica', 'álóigica', 'âlóigica', 'àlóigica', 'ãlóigica', 'élóigica', 'êlóigica', 'èlóigica', 'ẽlóigica', 'ílóigica', 'îlóigica', 'ìlóigica', 'ĩlóigica', 'ólóigica', 'ôlóigica', 'õlóigica', 'òlóigica', 'úlóigica', 'ûlóigica', 'ùlóigica', 'ũlóigica', 'çlóigica', 'laóigica', 'lbóigica', 'lcóigica', 'ldóigica', 'leóigica', 'lfóigica', 'lgóigica', 'lhóigica', 'lióigica', 'ljóigica', 'lkóigica', 'llóigica', 'lmóigica', 'lnóigica', 'loóigica', 'lpóigica', 'lqóigica', 'lróigica', 'lsóigica', 'ltóigica', 'luóigica', 'lvóigica', 'lwóigica', 'lxóigica', 'lyóigica', 'lzóigica', 'láóigica', 'lâóigica', 'làóigica', 'lãóigica', 'léóigica', 'lêóigica', 'lèóigica', 'lẽóigica', 'líóigica', 'lîóigica', 'lì

In [50]:
evaluator(test_list)

41.4% of 186 words


## Making more words variation

In [51]:
def insert_letters(slices):
    """Create words with one extra letter placed between each left/right pair.

    Args:
        slices: iterable of ``(left, right)`` string pairs.

    Returns:
        List of ``left + letter + right`` strings, one per pair and per
        letter of the alphabet below, grouped by pair in input order.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyzáâàãéêèẽíîìĩóôõòúûùũç'
    results = []
    for head, tail in slices:
        results.extend(head + symbol + tail for symbol in alphabet)
    return results

def deleting_character(slices):
    """Create words with the first character of each right part removed.

    Args:
        slices: iterable of ``(left, right)`` string pairs.

    Returns:
        List with ``left + right[1:]`` per pair; when the right part is empty
        the result is just ``left``.
    """
    return [head + tail[1:] for head, tail in slices]

def change_letter(slices):
    """Create words where the first character of each right part is replaced.

    Args:
        slices: iterable of ``(left, right)`` string pairs.

    Returns:
        List of ``left + letter + right[1:]`` strings for every pair and
        every letter of the alphabet below. When the right part is empty the
        letter is simply appended to ``left``.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyzáâàãéêèẽíîìĩóôõòúûùũç'
    return [
        head + symbol + tail[1:]
        for head, tail in slices
        for symbol in alphabet
    ]

def inverte_letter(slices):
    """Create words by swapping the first two characters of each right part.

    Args:
        slices: iterable of ``(left, right)`` string pairs.

    Returns:
        List of swapped words; pairs whose right part has fewer than two
        characters contribute nothing.
    """
    swapped = []
    for head, tail in slices:
        if len(tail) < 2:
            continue
        first, second, remainder = tail[0], tail[1], tail[2:]
        swapped.append(head + second + first + remainder)
    return swapped

def words_generator(word):
    """Generate all candidates one edit operation away from ``word``.

    The word is cut at every position and the cuts are passed, in this order,
    to ``insert_letters``, ``deleting_character``, ``change_letter`` and
    ``inverte_letter``.

    Args:
        word: the word to expand.

    Returns:
        One list with the results of the four operations concatenated.
    """
    pieces = [(word[:cut], word[cut:]) for cut in range(len(word) + 1)]
    candidates = []
    for edit in (insert_letters, deleting_character, change_letter, inverte_letter):
        candidates.extend(edit(pieces))
    return candidates

def spell_checker(word):
    """Pick the most probable candidate produced by ``words_generator``.

    Args:
        word: the (possibly misspelled) word.

    Returns:
        The candidate with the largest ``probability`` value; on ties ``max``
        keeps the first candidate it meets.
    """
    options = words_generator(word)
    return max(options, key=probability)

def evaluator(tests):
    """Print the share of test pairs that ``spell_checker`` corrects.

    Args:
        tests: sequence of ``(correct, wrong)`` word pairs.

    Prints:
        The hit rate (rounded to 2 decimals) and the number of test pairs.
    """
    total = len(tests)
    hits = 0
    for expected, misspelled in tests:
        suggestion = spell_checker(misspelled)
        hits += 1 if suggestion == expected else 0
    percentage = round(hits * 100 / total, 2)
    print(f"{percentage}% of {total} words")

In [54]:
evaluator(test_list)

76.34% of 186 words


## Calculation of the rate of unknown words

In [59]:
def evaluator(tests, vocabulary):
    """Print the hit rate of ``spell_checker`` and the rate of unknown words.

    Args:
        tests: sequence of ``(correct, wrong)`` word pairs.
        vocabulary: container of the words known from the corpus.

    Prints:
        The percentage of pairs corrected and the percentage of pairs whose
        correct word is missing from ``vocabulary`` (both rounded to 2
        decimals).
    """
    words_number = len(tests)
    got_right = 0
    unknown = 0
    for right, wrong in tests:
        corrected_word = spell_checker(wrong)
        # Whether a word is unknown is a property of the vocabulary, not of
        # the correction result, so it is counted for every pair instead of
        # only in the failure branch.
        unknown += (right not in vocabulary)
        if corrected_word == right:
            got_right += 1
    right_rate = round(got_right*100/words_number, 2)
    unknown_rate = round(unknown*100/words_number, 2)
    print(f"{right_rate}% of {words_number} words, unkown is {unknown_rate}%")

vocabulary = set(normalized_list)
evaluator(test_list, vocabulary)

76.34% of 186 words, unkown is 6.99%


## Creating a turbine generator

In [67]:
# This function generates candidate words at a distance of 2 edits from the misspelled word
# by applying words_generator a second time, to each word that was already generated
def turbine_generator(generated_words):
    """Apply ``words_generator`` to every word and concatenate the results.

    Args:
        generated_words: iterable of words (typically the output of a first
            ``words_generator`` pass).

    Returns:
        A single flat list with the candidates of each input word, in input
        order.
    """
    return [
        variant
        for base in generated_words
        for variant in words_generator(base)
    ]

In [68]:
word = "lóiigica"
all_words_generated = turbine_generator(words_generator(word))
# Check whether the correct word is in the generated list of words
"lógica" in all_words_generated

True

In [69]:
# A single misspelled word generates a huge number of candidates, so this is not efficient
len(all_words_generated)

787396

## Creating a new spell checker, avoiding unnecessary words generated in the process.

In [70]:
def new_spell_checker(word):
    """Correct ``word`` using candidates up to two edit operations away.

    Only generated candidates that exist in the module-level ``vocabulary``
    are ranked; the input word itself is always kept as a candidate, so it is
    returned when nothing more probable is found.

    Args:
        word: the (possibly misspelled) word.

    Returns:
        The candidate with the highest ``probability``.
    """
    first_pass = words_generator(word)
    second_pass = turbine_generator(first_pass)
    # De-duplicate before filtering: the two passes overlap heavily.
    pool = set(first_pass + second_pass)
    candidates = [word] + [option for option in pool if option in vocabulary]
    return max(candidates, key=probability)

In [71]:
word = "lóiigica"
new_spell_checker(word)

'lógica'

## Improving the evaluator

In [72]:
def evaluator(tests, vocabulary):
    """Evaluate ``new_spell_checker`` and print every pair it gets wrong.

    Args:
        tests: sequence of ``(correct, wrong)`` word pairs.
        vocabulary: container of the words known from the corpus.

    Prints:
        For each failure: the misspelled word, the ``spell_checker`` result
        and the ``new_spell_checker`` result, joined by "-". Then the hit
        rate and the rate of correct words missing from ``vocabulary``.
    """
    total = len(tests)
    hits = 0
    missing = 0
    for expected, misspelled in tests:
        suggestion = new_spell_checker(misspelled)
        # Counted for every pair, regardless of the correction result.
        if expected not in vocabulary:
            missing += 1
        if suggestion == expected:
            hits += 1
            continue
        print(misspelled + "-" + spell_checker(misspelled) + "-" + suggestion)
    hit_pct = round(hits * 100 / total, 2)
    missing_pct = round(missing * 100 / total, 2)
    print(f"{hit_pct}% of {total} words, unkown is {missing_pct}%")

In [None]:
def evaluator(tests, vocabulary):
    """Evaluate ``new_spell_checker`` and print every pair it gets wrong.

    Args:
        tests: sequence of ``(correct, wrong)`` word pairs.
        vocabulary: container of the words known from the corpus.

    Prints:
        For each failure: the misspelled word, the ``spell_checker`` result
        and the ``new_spell_checker`` result, joined by "-". Then the hit
        rate and the rate of correct words missing from ``vocabulary``.
    """
    words_number = len(tests)
    got_right = 0
    # A single counter name is used throughout; the previous version
    # initialised one name and incremented another, which raised
    # UnboundLocalError on the first iteration.
    unknown = 0
    for right, wrong in tests:
        corrected_word = new_spell_checker(wrong)
        # Counted for every pair: whether a word is unknown does not depend
        # on whether the correction succeeded.
        unknown += (right not in vocabulary)
        if corrected_word == right:
            got_right += 1
        else:
            print(wrong + "-" + spell_checker(wrong) + "-" + corrected_word)
    right_rate = round(got_right*100/words_number, 2)
    unknown_rate = round(unknown*100/words_number, 2)
    print(f"{right_rate}% of {words_number} words, unkown is {unknown_rate}%")

In [73]:
# Each printed line is: misspelled word - spell_checker result - new_spell_checker result
# The performance decreases
vocabulary = set(normalized_list)
evaluator(test_list, vocabulary) # the new evaluator corrects a lot the words

esje-esse-se
sãêo-são-não
dosa-dos-do
eme-em-de
eàssa-essa-esse
daõs-das-da
céda-cada-da
noâ-no-o
enêão-então-não
tĩem-tem-em
nossah-nossa-nosso
teb-tem-de
atĩ-até-a
âem-em-de
foo-foi-o
serr-ser-se
entke-entre-então
van-vai-a
çeus-seus-seu
eû-e-de
temeo-tempo-temos
semre-sempre-ser
elaá-ela-ele
síó-só-se
siàe-site-se
seém-sem-em
peln-pelo-ele
aléra-alura-agora
tdia-dia-da
tuúo-tudo-tipo
jé-é-de
sãô-são-não
odos-dos-do
siua-sua-seu
elpe-ele-esse
teos-temos-os
eũsa-essa-esse
vjmos-vamos-temos
dms-dos-de
cava-java-para
ános-nos-no
èaso-caso-as
túem-tem-em
daáos-dados-dos
nossk-nosso-nosso
tãer-ter-ser
vté-até-é
búm-bem-um
sçerá-será-ser
entró-entre-então
uai-vai-a
sâus-seus-seu
ìeu-seu-de
fual-qual-sua
elal-ela-ele
skó-só-se
secm-sem-em
aluéa-alura-além
dil-dia-de
sód-só-se
eúaa-aeúaa-essa
ró-só-de
dĩaz-adĩaz-da
correptor-corretor-correto
trtica-tática-prática
ewpoderamento-aewpoderamento-ewpoderamento
îgato-gato-fato
cakvalo-acakvalo-carvalho
canelac-acanelac-janela
tênisy-atênisy-tênisy

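The performance decreases because, in this test set, every misspelled word is only 1 edit away from the correct word. Searching at distance 2 adds more frequent but wrong candidates (e.g. "esje" becomes "se" instead of "esse").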

In [74]:
# The best evaluator for this data set
def evaluator(tests, vocabulary):
    """Print the hit rate of ``spell_checker`` and the rate of unknown words.

    Args:
        tests: sequence of ``(correct, wrong)`` word pairs.
        vocabulary: container of the words known from the corpus.

    Prints:
        The percentage of pairs corrected and the percentage of pairs whose
        correct word is missing from ``vocabulary`` (both rounded to 2
        decimals).
    """
    total = len(tests)
    # One (fixed?, unknown?) flag pair per test, evaluated in test order.
    outcomes = [
        (spell_checker(misspelled) == expected, expected not in vocabulary)
        for expected, misspelled in tests
    ]
    hits = sum(1 for fixed, _ in outcomes if fixed)
    missing = sum(1 for _, absent in outcomes if absent)
    hit_pct = round(hits * 100 / total, 2)
    missing_pct = round(missing * 100 / total, 2)
    print(f"{hit_pct}% de {total} palavras, unkown é {missing_pct}%")

vocabulary = set(normalized_list)
evaluator(test_list, vocabulary)

76.34% de 186 palavras, unkown é 6.99%


In [75]:
word = "lóiigica"
print(new_spell_checker(word))
print(spell_checker(word))

lógica
alóiigica
