# Autocompletion Using Fast autocomplete
Fast autocomplete uses a DWG to search through words that can be fed into an autocomplete object type. As such, we must both populate and rank words to feed into our object before we can use autocomplete

In [None]:
import nltk
from nltk.corpus import brown, words
from fast_autocomplete import AutoComplete
import string

## Build the library
Run this if the word_dict.json file has not been made yet. Otherwise, skip to the next section

In [134]:
import nltk
from nltk.corpus import brown, words
from fast_autocomplete import AutoComplete
import string

# Load the Brown Corpus
nltk.download('brown')
corpus = brown.words()


# Want to filter to only valid words. We use valid words as this is a default
with open('valid_words.txt', 'r') as file:
    valid_words_raw = file.read().splitlines()
valid_words = [word.lower() for word in valid_words_raw]



# # Count the frequency of each word
word_freq = nltk.FreqDist(corpus)

# # Create a list of words sorted by frequency
# Note we remove all cases where words have punctuation besides ' and - 
corpus_raw = [word.lower() for word in corpus if all(c not in string.punctuation or c in ["'", "-"] for c in word)]
corpus = corpus_raw # Can filter here if desired



[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\andre\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


In [172]:
# Get frequency distribution
word_freq = nltk.FreqDist(corpus)
len(word_freq.most_common()) # number of unique words from the corpus

48484

In [173]:
# Create a list of words sorted by frequency
words = [(word, freq) for word, freq in word_freq.most_common(30000) if word in valid_words]
# Create a dictionary of words in the required format
word_dict = {}
words_new = {}
for word, freq in words:
    word_dict[word] = [{}, word, freq]
    words_new[word] = {'count':freq}

# Print the first 10 words in the dictionary
for word, data in list(word_dict.items())[:10]:
    print(word, data)
print("number of words:",len(word_dict.items()))

the [{}, 'the', 69971]
of [{}, 'of', 36412]
and [{}, 'and', 28853]
to [{}, 'to', 26158]
in [{}, 'in', 21337]
that [{}, 'that', 10594]
is [{}, 'is', 10109]
was [{}, 'was', 9815]
he [{}, 'he', 9548]
for [{}, 'for', 9489]
number of words: 22378


Lets first try storing the words into a json and load directly from there

In [175]:
### ONLY RUN IF WE WANT TO REPLACE THE CURRENT JSON DIctionary!!!
# import json
# # Convert the word_dict to a JSON string
# word_dict_json = json.dumps(word_dict)

# # Write the JSON string to a file
# with open('word_dict.json', 'w') as f:
#     f.write(word_dict_json)

## Use the autocomplete
Now we can create an autocomplete object using the json file and run inference

In [186]:
from fast_autocomplete import autocomplete_factory

content_files = {
    'words': {
        'filepath': 'word_dict.json',
        'compress': True  # means compress the graph data in memory
    }
}

autocomplete = autocomplete_factory(content_files=content_files)

In [187]:
autocomplete.search(word='cras',size=10, max_cost=100)

[['crash'], ['crashed'], ['crashing'], ['crass'], ['crashes']]

Below is to troubleshoot any missing words

In [188]:
# If a word is missing, we can check if it is vaid words, or just did not occur in the corpus
test_word = "potato"
if test_word in valid_words:
    try:
        autocomplete.words[test_word]
        print(test_word, "is valid and in corpus")
    except:
        print(test_word,"is valid but NOT in the corpus")
else:
    print(test_word,"not in valid word set")


potato is valid and in corpus


## Draw DWG
We can use this to generate a graph

In [190]:
from fast_autocomplete import AutoComplete, DrawGraphMixin


class AutoCompleteDraw(DrawGraphMixin, AutoComplete):
    pass

autocomplete = AutoCompleteDraw(words=words_new)
autocomplete.draw_graph('graph.png')

You need to install pygraphviz in order to draw graphs


UnboundLocalError: local variable 'pgv' referenced before assignment