In [133]:
'''----------------Import modules START------------------'''

import sys
import time
import re
import editDist

import nltk
nltk.download('wordnet')
from nltk.corpus import wordnet as wn
from nltk import FreqDist
nltk.download('brown')
from nltk.corpus import brown

from ipypb import irange

from operator import itemgetter, attrgetter
from collections import deque
from sortedcontainers import SortedSet,SortedList

'''----------------Import modules END--------------------'''

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Akbar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package brown to
[nltk_data]     C:\Users\Akbar\AppData\Roaming\nltk_data...
[nltk_data]   Package brown is already up-to-date!


'----------------Import modules END--------------------'

In [175]:

'''------------------------------------------------------'''

class DAWG_Node:
    ''' ______________________________________________________ '''
    '''|                DAWG_Node Class START                 |'''
    '''|                        ...                           |'''
    '''|                        ...                           |'''
    NextId = 0
    '''|                        ...                           |'''
    '''|------------------INITIALIZE NODE---------------------|'''
    '''|                        ...                           |'''
    def __init__(self):
        self.id = DAWG_Node.NextId
        DAWG_Node.NextId += 1
        self.final = False
        self.edges = {}
    '''|                        ...                           |'''
    '''|-----------------------STRING-------------------------|'''
    '''|                        ...                           |'''
    def __str__(self):        
        arr = []
        if self.final: 
            arr.append("1")
        else:
            arr.append("0")
        for (label, node) in self.edges.items():
            arr.append( label )
            arr.append( str( node.id ) )
        return "_".join(arr)
    '''|                        ...                           |'''
    '''|-----------------------HASH---------------------------|'''
    '''|                        ...                           |'''
    def __hash__(self):
        return self.__str__().__hash__()
    '''|                        ...                           |'''
    '''|---------------------EQUALITY-------------------------|'''
    '''|                        ...                           |'''
    def __eq__(self, other):
        return self.__str__() == other.__str__()
    def get_candidates(self,prefix,word,tolerance):
        if(len(self.t3)>=3):
            return []
        #print(prefix+"|"+word)
        candidates = []
        if( tolerance==0):
            for (label, node) in self.edges.items():
                if(word and label==word[0]):
                    candidates += node.get_candidates(prefix+label,word[1:],tolerance) # no replace
        if( tolerance>0 ):
            candidates += self.get_candidates(prefix,word[1:],tolerance-1) # delete
            for (label, node) in self.edges.items():
                candidates += node.get_candidates(prefix+label,word,tolerance-1) # insert
                if(word):
                    candidates += node.get_candidates(prefix+label,word[1:],tolerance-1) # bad replace
        if(self.final and not word):
            candidates.append(prefix)
            self.t3.add(prefix)
        return candidates
    '''|                        ...                           |'''
    '''|                        ...                           |'''
    '''|                DAWG_Node Class END                   |'''
    ''' ______________________________________________________ '''

    
    

class myDict:
    wordSet = SortedSet([])
    errorList = []
    t3 = set()
    ''' ______________________________________________________ '''
    '''|                myDict Class START                    |'''
    '''|                        ...                           |'''
    '''|                        ...                           |'''
    '''|                        ...                           |'''
    '''|------------------INITIALIZE DICT---------------------|'''
    '''|                        ...                           |'''
    def __init__(self):
        self.previousWord = ""
        self.root = DAWG_Node()
        self.uncheckedNodes = []
        self.minimizedNodes = {}
        self.load_custom_dict()
        self.load_brown()
        self.finish()
    '''|                        ...                           |'''
    '''|------------------INSERT WORD-------------------------|'''
    '''|                        ...                           |'''
    def insert( self, word ):
        if word < self.previousWord:
            print("ERROR");
            raise Exception("Error: not in alphabetical order.")
        commonPrefix = 0
        maxPrefix =  min( len(word),len(self.previousWord) )
        for i in range( maxPrefix ):
            if word[i] != self.previousWord[i]: break
            commonPrefix += 1
        self._minimize( commonPrefix )
        
        if len(self.uncheckedNodes) == 0:
            node = self.root
        else:
            node = self.uncheckedNodes[-1][2]

        for letter in word[commonPrefix:]:
            nextNode = DAWG_Node()
            node.edges[letter] = nextNode
            self.uncheckedNodes.append( (node, letter, nextNode) )
            node = nextNode
            
        node.final = True
        self.previousWord = word
    '''|                        ...                           |'''
    '''|------------------FINISH MINIMIZATIONS----------------|'''
    '''|                        ...                           |'''
    def finish( self ):
        self._minimize( 0 );
    '''|                        ...                           |'''
    '''|------------------MINIMIZE DAWG-----------------------|'''
    '''|                        ...                           |'''
    def _minimize( self, downTo ):
        for i in range( len(self.uncheckedNodes) - 1, downTo - 1, -1 ):
            (parent, letter, child) = self.uncheckedNodes[i];
            if child in self.minimizedNodes:
                parent.edges[letter] = self.minimizedNodes[child]
            else:
                self.minimizedNodes[child] = child;
            self.uncheckedNodes.pop()
    '''|                        ...                           |'''
    '''|------------------LOOK UP A WORD----------------------|'''
    '''|                        ...                           |'''
    def lookup_DAWG( self, word ):
        node = self.root
        for letter in word:
            if letter not in node.edges: return False
            node = node.edges[letter]
        return node.final
    #!!!!!!!!!!!!!!!!
    def lookup_to_DAWG( self, word ):
        node = self.root
        i=0
        candidates = []
        for letter in word:
            if letter not in node.edges: 
                print(word[:i])
                #print(node.edges.items())
                #print({key: value for key, value in node.edges if node.edges[key].final})
                #print(node.edges)
                for key, value in node.edges.items():
                    if(value=="1"):
                        print(key)
                        print(node.edges[str(key)])
                        candidates.append(node.edges[str(key)])
                print(candidates)
                return node.edges
            node = node.edges[letter]
            i+=1
        return node.final
    
    def goodApostrophe(self, word):
        word_no_apst = re.sub("(\'s$)|(s\'$)",'',word)
        if word == word_no_apst:
            return False
        elif self.lookUp(word_no_apst):
            return True
        else:
            return False
    
    def good_hyphen(self,word):
        pattern_compound = re.compile(r"([^\-]+)")
        accept_compound = True
        roots = list(filter(None, word.split('-')))
        if(len(roots)==1):
            return self.goodApostrophe(word)
        for r , root in enumerate(roots):
            if self.lookUp(root) or self.goodApostrophe(root):
                continue
            else:
                accept_compound = False
                break
        return accept_compound
    
    def lookUp( self, word ):
        if(word in self.wordSet):
            return True
        elif(wn.synsets(word,'asrnv')):
            self.wordSet.add(word)
            return True
        elif(self.good_hyphen(word)):
            self.wordSet.add(word)
            return True
    '''|                        ...                           |'''
    '''|------------------COUNT STATES------------------------|'''
    '''|                        ...                           |'''
    def nodeCount( self ):
        return len(self.minimizedNodes)
    '''|                        ...                           |'''
    '''|------------------COUNT TRANSITIONS-------------------|'''
    '''|                        ...                           |'''
    def edgeCount( self ):
        count = 0
        for node in self.minimizedNodes:
            count += len(node.edges)
        return count
    '''|                        ...                           |'''
    '''|------------------LOAD CUSTOM DICT--------------------|'''
    '''|                        ...                           |'''
    def load_custom_dict( self ):
        CUSTOMDICT = SortedSet([])
        with open("./hardcode/custom_dict.txt",'r') as file:
            for line in file:
                word = "".join(line.split())
                self.wordSet.add(word.lower())
        print("Custom Dictionary Loaded :)")
    def load_brown( self ):
        frequency_list = FreqDist(w.lower() for w in brown.words() if (re.search('[a-zA-Z]+',w) and self.lookUp(w.lower())) )
        print("Brown Loaded :)")
    def readText(self, file):
        pattern = re.compile(r"([\w\-\']*[a-zA-Z]+[\w\-\']*)")
        with open("mobydick.txt") as file:
            for line in file:                      # foreach line
                for match in re.finditer(pattern, line):
                    word = line[match.start():match.end()].lower()
                    if( not self.lookUp(word)):
                        self.errorList.append(word)
        print("Moby-Dick Loaded :)")
    def add(self,word):
        self.lookUp(word)
    def gen_DAWG(self):
        for word in self.wordSet:
            self.insert(word)
    def wordCount(self):
        return len(self.wordSet)
    def print_about(self):
        print("Dictionary(DAWG) contains "
              , Dictionary.wordCount()
              , "words as (" 
              ,Dictionary.nodeCount()
              , "STATES ," 
              ,Dictionary.edgeCount()
              , "TRANSITIONS )")
    def f7(self,seq):
        seen = set()
        seen_add = seen.add
        return [x for x in seq if not (x in seen or seen_add(x))]
    def corrections(self,misspelled):
        maxEdit = 100
        top3 = [[" ",maxEdit],[" ",maxEdit],[" ",maxEdit]]
        candidates = self.candidates(misspelled,1)
        for c in candidates:
            entry = [ c , 1 ]
            top3 += [entry]
            top3 = sorted(top3, key=lambda x: (x[1]))[:3]
            maxEdit = int(top3[2][1])
        for Dword in self.wordSet:
            if(maxEdit==1):
                break
            d = editDist.minEditDist(misspelled , Dword , maxEdit)
            if d < maxEdit:
                entry = [ Dword , d ]
                top3 += [entry]
                top3 = sorted(top3, key=lambda x: (x[1]))[:3]
                maxEdit = int(top3[2][1])
        print(misspelled + ": " + ', '.join( list(map( lambda x : x[0],top3))))
    def errors(self,):
        e = self.errorList
        el = self.f7(e)
        print(len(el))
        for i in irange(0,len(el[:100]),1):
            error = el[i]
            self.corrections(error)
    def candidates(self,word,tolerance):
        self.t3 = self.t3.clear()
        root = self.root
        c = list(set(root.get_candidates("",word,tolerance)[:3]))
        return(c)
    '''|                        ...                           |'''
    '''|                        ...                           |'''
    '''|                myDict Class END                      |'''
    ''' ______________________________________________________ '''


In [176]:
%time Dictionary = myDict()
Dictionary.add("at")
Dictionary.add("mat")
Dictionary.add("hat")
Dictionary.add("rat")
Dictionary.add("rats")
Dictionary.add("fill")

Custom Dictionary Loaded :)
Brown Loaded :)
Wall time: 10.1 s


In [177]:
%time Dictionary.readText("mobydick.txt")

Moby-Dick Loaded :)
Wall time: 468 ms


In [178]:
%time Dictionary.gen_DAWG()

Wall time: 1.42 s


In [179]:
Dictionary.print_about()

Dictionary(DAWG) contains  42956 words as ( 30991 STATES , 58178 TRANSITIONS )


In [180]:
Dictionary.candidates("zat",1)

UnboundLocalError: local variable 't3' referenced before assignment

In [None]:
Dictionary.errors()