# Alura Course - Spell Checker PT-BR

## Download data

In [1]:
from helper.data_acquisition import from_url
from helper.data_manipulation import unzip

In [2]:
from_url('https://github.com/alura-cursos/corretor/archive/master.zip')

In [3]:
unzip('./data/master.zip', './data/')

## Read the file with articles

In [4]:
with open('data/corretor-master/artigos.txt', 'r', encoding='utf-8') as file:
    corpus = file.read()

In [5]:
print(corpus[:500])




imagem 

Temos a seguinte classe que representa um usuário no nosso sistema:

java

Para salvar um novo usuário, várias validações são feitas, como por exemplo: Ver se o nome só contém letras, [**o CPF só números**] e ver se o usuário possui no mínimo 18 anos. Veja o método que faz essa validação:

java 

Suponha agora que eu tenha outra classe, a classe `Produto`, que contém um atributo nome e eu quero fazer a mesma validação que fiz para o nome do usuário: Ver se só contém letras. E aí? Vou


## Tokenizing the corpus and create the word catalog

In [6]:
import nltk

In [7]:
nltk.download('punkt', quiet=True)

True

**Getting a list with all words from curpus string**

In [8]:
tokens = nltk.tokenize.word_tokenize(corpus)

In [9]:
len(tokens)

515905

In [10]:
words_catalog = [ w for w in tokens if w.isalpha() ]

In [11]:
len(words_catalog)

403104

**normalizing the words list**

In [12]:
words_catalog = [ w.lower() for w in words_catalog ]

**Removing repeated words**

In [13]:
words_catalog = set(words_catalog)

In [14]:
len(words_catalog)

18465

## Implementing a spell checker scenario: a missing character

In [15]:
def insert_character(slices):
    characters = 'abcdefghijklmnopqrstuvwxyzàáâãèéêìíîòóôõùúûç'
    return [ l+c+r for l, r in slices for c in characters ]

def word_generator(word):
    slices = [ (word[:ix], word[ix:]) for ix in range(len(word)+1) ]
    return insert_character(slices)

In [16]:
word_generator('lgica')[:5]

['algica', 'blgica', 'clgica', 'dlgica', 'elgica']

In [17]:
def spell_checker_suggestions(word):
    generated_words = word_generator(word)
    return [ w for w in generated_words if w in words_catalog ]

len_word_catalog = len(words_catalog)
word_frequency = nltk.FreqDist([ w for w in tokens if w.isalpha() ])
def spell_checker(word):
    generated_words = word_generator(word)
    return max(generated_words, key=lambda w: word_frequency[w]/len_word_catalog)

In [18]:
print("Spell checker for word 'lgica':", spell_checker('lgica'))
print("Spell checker for word 'jva':", spell_checker('jva'))
print("Spell checker for word 'pra':", spell_checker('pra'))

print("Suggestions for word 'lgica':", spell_checker_suggestions('lgica'))
print("Suggestions checker for word 'jva'", spell_checker_suggestions('jva'))
print("Suggestions checker for word 'pra':", spell_checker_suggestions('pra'))

Spell checker for word 'lgica': lógica
Spell checker for word 'jva': java
Spell checker for word 'pra': para
Suggestions for word 'lgica': ['lógica']
Suggestions checker for word 'jva' ['java']
Suggestions checker for word 'pra': ['para', 'pera', 'pura', 'pára', 'pras']


## Creating the method to evaluate the spell checker

In [19]:
def load_test_words():
    with open('./data/corretor-master/palavras.txt', 'r', encoding='utf-8') as f:
        return [ tuple(l.split()) for l in f.readlines() ]

In [20]:
test_words = load_test_words()
def evaluate():
    success_rate = sum([ 1 for t in test_words if spell_checker(t[1]) == t[0] ]) / len(test_words)
    print(f'The spell checker success rate is: {success_rate}')

In [21]:
evaluate()

The spell checker success rate is: 0.010752688172043012


## Continue the spell checker implementation

In [22]:
def remove_character(slices):
    return [ l+r[1:] for l, r in slices ]

In [23]:
def word_generator(word):
    slices = [ (word[:ix], word[ix:]) for ix in range(len(word)+1) ]
    return insert_character(slices) + remove_character(slices)

In [24]:
print("Spell checker for word 'lógicaa':", spell_checker('lgica'))
print("Spell checker for word 'jaava':", spell_checker('jva'))
print("Spell checker for word 'prax':", spell_checker('pra'))

print("Suggestions for word 'lógicaa':", spell_checker_suggestions('lgica'))
print("Suggestions checker for word 'jaava'", spell_checker_suggestions('jva'))
print("Suggestions checker for word 'prax':", spell_checker_suggestions('pra'))

Spell checker for word 'lógicaa': lógica
Spell checker for word 'jaava': java
Spell checker for word 'prax': para
Suggestions for word 'lógicaa': ['lógica']
Suggestions checker for word 'jaava' ['java', 'ja']
Suggestions checker for word 'prax': ['para', 'pera', 'pura', 'pára', 'pras', 'pr', 'pra']


In [25]:
evaluate()

The spell checker success rate is: 0.41397849462365593


In [26]:
def replace_character(slices):
    characters = 'abcdefghijklmnopqrstuvwxyzàáâãèéêìíîòóôõùúûç'
    return [ l+c+r[1:] for l, r in slices for c in characters ]

In [27]:
def word_generator(word):
    slices = [ (word[:ix], word[ix:]) for ix in range(len(word)+1) ]
    return insert_character(slices) + remove_character(slices) + replace_character(slices)

In [28]:
print("Spell checker for word 'lugica':", spell_checker('lugica'))
print("Spell checker for word 'jxva':", spell_checker('jxva'))
print("Spell checker for word 'paxa':", spell_checker('paxa'))

print("Suggestions for word 'lugica':", spell_checker_suggestions('lugica'))
print("Suggestions checker for word 'jxva'", spell_checker_suggestions('jxva'))
print("Suggestions checker for word 'paxa':", spell_checker_suggestions('paxa'))

Spell checker for word 'lugica': lógica
Spell checker for word 'jxva': java
Spell checker for word 'paxa': para
Suggestions for word 'lugica': ['lógica']
Suggestions checker for word 'jxva' ['java']
Suggestions checker for word 'paxa': ['taxa', 'poxa', 'puxa', 'paga', 'para']


In [29]:
evaluate()

The spell checker success rate is: 0.7688172043010753


In [30]:
def change_character_order(slices):
    return [ l+r[1]+r[0]+r[2:] for l, r in slices if len(r) > 1]

In [31]:
def word_generator(word):
    slices = [ (word[:ix], word[ix:]) for ix in range(len(word)+1) ]
    return insert_character(slices) + remove_character(slices) + replace_character(slices) + change_character_order(slices)

In [32]:
print("Spell checker for word 'ólgica':", spell_checker('ólgica'))
print("Spell checker for word 'ajva':", spell_checker('ajva'))
print("Spell checker for word 'paar':", spell_checker('paar'))

print("Suggestions for word 'ólgica':", spell_checker_suggestions('ólgica'))
print("Suggestions checker for word 'ajva'", spell_checker_suggestions('ajva'))
print("Suggestions checker for word 'paar':", spell_checker_suggestions('paar'))

Spell checker for word 'ólgica': lógica
Spell checker for word 'ajva': java
Spell checker for word 'paar': para
Suggestions for word 'ólgica': ['lógica']
Suggestions checker for word 'ajva' ['java']
Suggestions checker for word 'paar': ['pagar', 'parar', 'par', 'par', 'phar', 'pair', 'paas', 'para']


In [33]:
evaluate()

The spell checker success rate is: 0.7688172043010753


## Add unknown words rate to evaluation function

In [34]:
def evaluate():
    success_rate = sum([ 1 for t in test_words if spell_checker(t[1]) == t[0] ]) / len(test_words)
    print(f'The spell checker success rate is: {success_rate}')
    
    unknown_rate = sum([ 1 for t in test_words if t[0] not in words_catalog ]) / len(test_words)
    print(f'The unknown words rate is: {unknown_rate}')

In [35]:
evaluate()

The spell checker success rate is: 0.7688172043010753
The unknown words rate is: 0.06989247311827956


## Enhance the spell checker

In [36]:
def word_generator_enhanced(word):
    generated_words = word_generator(word)
    other_generated_words = [ w2 for w in generated_words for w2 in word_generator(w) ]
    all_generated_words = list(set(generated_words + other_generated_words))
    return [word] + [ w for w in all_generated_words if w in words_catalog ]

word_generator_enhanced('lóoogica')

['lóoogica', 'lógica']

In [37]:
def new_spell_checker_suggestions(word):
    return word_generator_enhanced(word)

def new_spell_checker(word):
    generated_words = word_generator_enhanced(word)
    return max(generated_words, key=lambda w: word_frequency[w]/len_word_catalog)

In [38]:
print("Spell checker for word 'óllgica':", new_spell_checker('óllgica'))
print("Spell checker for word 'jaaava':", new_spell_checker('jaaava'))
print("Spell checker for word 'paxr':", new_spell_checker('paxr'))

print("Suggestions for word 'óllgica':", new_spell_checker_suggestions('óllgica'))
print("Suggestions checker for word 'paxr'", new_spell_checker_suggestions('paxr'))
print("Suggestions checker for word 'paxr':", new_spell_checker_suggestions('paxr'))

Spell checker for word 'óllgica': lógica
Spell checker for word 'jaaava': java
Spell checker for word 'paxr': para
Suggestions for word 'óllgica': ['óllgica', 'lógica', 'bélgica']
Suggestions checker for word 'paxr' ['paxr', 'pao', 'pass', 'pair', 'por', 'phar', 'puxo', 'puxar', 'pixar', 'puxe', 'para', 'paz', 'pago', 'pr', 'page', 'mar', 'parar', 'par', 'pagar', 'sair', 'paga', 'ax', 'pau', 'paas', 'dar', 'pôr', 'taxa', 'paul', 'puxa', 'papo', 'psr', 'pior', 'pay', 'pad', 'poxa', 'max', 'var', 'per', 'exr', 'país', 'pais', 'pra', 'tar', 'jar', 'pixlr', 'pai', 'cair', 'paro', 'bar', 'pare', 'path', 'ar', 'paira', 'px']
Suggestions checker for word 'paxr': ['paxr', 'pao', 'pass', 'pair', 'por', 'phar', 'puxo', 'puxar', 'pixar', 'puxe', 'para', 'paz', 'pago', 'pr', 'page', 'mar', 'parar', 'par', 'pagar', 'sair', 'paga', 'ax', 'pau', 'paas', 'dar', 'pôr', 'taxa', 'paul', 'puxa', 'papo', 'psr', 'pior', 'pay', 'pad', 'poxa', 'max', 'var', 'per', 'exr', 'país', 'pais', 'pra', 'tar', 'jar', '

In [39]:
def evaluate():
    success_rate = sum([ 1 for t in test_words if new_spell_checker(t[1]) == t[0] ]) / len(test_words)
    print(f'The spell checker success rate is: {success_rate}')
    
    unknown_rate = sum([ 1 for t in test_words if t[0] not in words_catalog ]) / len(test_words)
    print(f'The unknown words rate is: {unknown_rate}')

In [40]:
evaluate()

The spell checker success rate is: 0.553763440860215
The unknown words rate is: 0.06989247311827956


In [41]:
print(spell_checker('lgica'))
print(new_spell_checker('lgica'))

lógica
fica
