# 定义 lexnode

In [1]:
from collections import Counter
class LexNode:
    """One character node of the lexical tree (trie) used by the spell checker.

    Attributes:
        val: the character stored in this node ('*' is used for the root).
        children: list of child LexNode objects.
        property: node kind -- 0 normal node, 1 start (root) node,
            2 end-of-word node.
    """

    def __init__(self, val):
        self.val = val
        self.children = []
        # set the property so that we can differentiate the start node, normal(between) node and end of word node
        # 0: normal node
        # 1: start node
        # 2: end-of-word node
        self.property = 0

    # pretty print tree in command line
    def pretty_str(self, level=0):
        """Return an ASCII rendering of the subtree rooted at this node.

        Args:
            level: depth of this node in the rendering (0 for the root).

        Returns:
            A string in which every end-of-word node terminates a line.
        """
        if self.property==0:
            ret = "--"+self.val
        elif self.property==2:
            # an end-of-word node closes the current output line
            ret="--"+self.val+"\n"
        else:
            ret=self.val
        num_children=len(self.children)
        for i in range(num_children):
            if level!=0:
                if num_children>=2 and i!=0:
                    # siblings after the first start a new, indented branch line
                    ret += "|  "*level+"|"+self.children[i].pretty_str(level+1)
                elif num_children==1 and self.children[0].val=="*":
                    # the only child is the loop-back to the root: stop here
                    return ret
                else:
                    ret += self.children[i].pretty_str(level+1) 
            elif level==0:
                if num_children>=2 and i!=0:
                    ret += "|\n"+"|"+self.children[i].pretty_str(level+1)
                else:
                    ret += self.children[i].pretty_str(level+1)
            
        return ret
    
    def set_level(self, level=0):
        """Store the depth of every node of the subtree in its ``level`` attribute.

        Children that are start nodes (property == 1) are skipped: they are
        loop-back edges to the root, and following them would never terminate.
        """
        self.level=level
        for child in self.children:
            if child.property == 1:
                continue
            child.set_level(level + 1)
        return
    
    def get_max_level(self, level=0):
        """Return the depth of the deepest node below this one.

        Args:
            level: depth assigned to this node.

        Loop-back edges to a start node (property == 1) are ignored so the
        traversal terminates on trees whose word ends point back to the root.
        """
        real_children = [child for child in self.children if child.property != 1]
        if len(real_children) == 0:
            return level
        child_levels = [child.get_max_level(level=level + 1) for child in real_children]
        return max(child_levels)

# 定义建 lextree

In [2]:
class BuildLextree:
    """Builds a lexical tree (trie of LexNode objects) from a word-list file.

    The file is read with one word per line. After construction ``self.words``
    holds the words right-padded with '$' to the length of the longest word,
    ``self.max_word_len`` holds that length and ``self.tree`` is the root node
    ('*', property 1). Call ``build_lextree`` to populate the tree.
    """

    def __init__(self, dic_file_path):
        """Read the dictionary at ``dic_file_path`` and create the root node."""
        self.txt2words(dic_file_path)
        self.tree=LexNode('*')
        # dummy symbol for the root of the tree
        self.tree.property = 1
        # find the longest word in this dictionary (0 for an empty dictionary)
        self.max_word_len = max((len(w) for w in self.words), default=0)
        # pad the words to the same length using '$', in order to avoid index out of range
        self.words = [w.ljust(self.max_word_len, '$') for w in self.words]
        print("There are {} words in this dictionary".format(len(self.words)))
        
    def txt2words(self,file_path):
        """Load the words of ``file_path`` (one per line) into ``self.words``.

        Blank lines are skipped: an empty word would otherwise be padded to
        all '$' and end up in the tree as a bogus '$' word.
        """
        self.words=[]
        # the context manager guarantees the file handle is closed
        with open(file_path, "r") as dictionary:
            for line in dictionary:
                word = line.strip('\n')
                if word:
                    self.words.append(word)
        
    def append_lex_node(self,parent, child):
        """Append ``child`` to ``parent.children``; both must be LexNode objects."""
        assert type(parent) is LexNode and type(child) is LexNode
        parent.children.append(child)
    
    def build_lextree(self):
        """Build the lextree below the root node '*' from ``self.words``.

        The root's children are reset first so that calling this method more
        than once does not duplicate the tree, and ``self.words`` itself is
        left untouched.
        """
        self.tree.children = []
        self._build_lextree(self.tree, list(self.words), 0)
        
        
    def _build_lextree(self,node, words, i):
        '''
        Add to ``node`` the children for character position ``i`` and recurse.

        input:
        1. node: the node that needs children
        2. words: the padded words that share the prefix leading to ``node``
           (this list is not modified)
        3. i: the index of the character to look at

        A word whose last real character is at position ``i`` gets its own
        end-of-word node (property 2) even if the character could be shared,
        so that every leaf represents exactly one word. Each end-of-word node
        gets the root as its only child (the loop back to '*').
        '''
        # nothing left to add once we are past the longest word
        if i >= self.max_word_len:
            return
        last_position = self.max_word_len - 1
        # character -> words continuing through that character, in first-seen order
        continuing = {}
        for word in words:
            if i == last_position or word[i + 1] == '$':
                child = LexNode(word[i])
                child.property = 2
                # add "*" at the end of the word
                child.children.append(self.tree)
                self.append_lex_node(node, child)
            else:
                continuing.setdefault(word[i], []).append(word)
        # for the remaining words, share one node per distinct character
        for character, words_for_character in continuing.items():
            child = LexNode(character)
            self.append_lex_node(node, child)
            self._build_lextree(child, words_for_character, i + 1)

# 读取字典

In [3]:
# Path of the dictionary file that BuildLextree reads line by line.
# Alternative dictionary, kept for reference:
#dic_file_path="chr_dict.txt"
dic_file_path="dict_1.txt"

# 根据字典建树

In [4]:
# Read the dictionary, build the lexical tree and keep a handle on its root node.
buildlextree=BuildLextree(dic_file_path)
buildlextree.build_lextree()
lextree=buildlextree.tree

There are 6251 words in this dictionary


# 该字典最长的单词的长度是

In [5]:
# Length of the longest dictionary word, as computed in BuildLextree.__init__.
longest_word_lenth=buildlextree.max_word_len

# 按照教授的要求 print 这个字典

In [6]:
# Render the whole lextree as text, starting from the root at level 0.
print(lextree.pretty_str(0))

*--a
|
|--e
|
|--i
|
|--o
|
|--a--n
|  |--m
|  |--n
|  |--s
|  |--t
|  |--b--a--n--d--o--n
|  |  |--b--a--s
|  |  |  |--r--e--v--i--a--t--i--o--n
|  |  |--d--o--m--i--n--a--l
|  |  |--e--l--a
|  |  |  |--r--n--e--t--h--y
|  |  |--i--d--e--s
|  |  |--l--e
|  |  |--o--l--i--s--h--i--n--g
|  |  |  |--r--t--i--o--n--i--s--t--s
|  |  |  |--u--t
|  |  |--r--a--h--a--m
|  |  |  |--i--d--g--e
|  |  |--s--e--n--c--e--s
|  |  |  |--o--l--v--e--d
|  |  |  |--t--i--n--e--n--t
|  |  |--u--n--d--a--n--t--l--y
|  |--c--c--e--d--e--s
|  |  |  |  |--n--t--u--a--t--i--n--g
|  |  |  |  |--p--t
|  |  |  |--i--d--e--n--t
|  |  |  |--o--m--m--o--d--a--t--e--d
|  |  |  |  |  |--p--a--n--y
|  |  |  |  |  |  |--l--i--s--h--m--e--n--t
|  |  |  |  |--r--d
|  |  |  |  |--u--n--t--a--n--c--y
|  |  |  |--r--u--e
|  |  |  |--u--r--i--d--e
|  |  |--e--r--o
|  |  |--h--e--s
|  |  |--k--e--r--'--s
|  |  |  |--r--o--y--d
|  |  |--q--u--a--i--n--t
|  |  |  |  |--i--t
|  |  |--r--o--n--y--m
|  |  |  |  |--s--s
|  |  |--t-

# spell checker的定义

In [18]:
import numpy as np
import copy
class SpellChecker:
    def __init__(self,beam,longest_word_lenth=45,tolerate_num_of_error=2):
        self.beam = int(beam)
        self.lextree=None
        self.dist_fun=lambda *args: int(args[0] != args[1])
        self.longest_word_lenth=longest_word_lenth
        self.tolerate_num_of_error=tolerate_num_of_error

    def fit(self,lextree):
        """Index the lextree so the search methods can work on integer node ids.

        Args:
            lextree: root LexNode of the tree.

        Sets:
            self.nodes: nodes in the order produced by ``get_nodes`` (root first).
            self.get_children: node index -> list of child indices.
            self.transitions: node index -> parent index (the root maps to itself).
            self.word_ends: indices of the end-of-word nodes (property == 2).

        Raises:
            AssertionError: if ``lextree`` is not a LexNode.
            KeyError: if a node has a child that ``get_nodes`` did not collect.
        """
        # validate before touching any state
        assert type(lextree) is LexNode
        self.lextree=lextree
        self.nodes = []
        self.get_nodes(self.lextree)
        # Map each node object to its first position in self.nodes once, instead
        # of calling list.index for every child (which is quadratic).
        index_of = {}
        for position, node in enumerate(self.nodes):
            index_of.setdefault(id(node), position)
        self.transitions = {}
        self.get_children={}
        self.word_ends = []
        # record the end idx of each word so the word costs can be read at the end of the search
        for i, n in enumerate(self.nodes):
            if n.property == 2:
                self.word_ends.append(i)
            child_indices = [index_of[id(child)] for child in n.children]
            self.get_children[i] = child_indices
            # remember the parent of every child
            for child_index in child_indices:
                self.transitions[child_index] = i
        # The end-of-word nodes loop back to the root and would register
        # themselves as its parent; the root is its own parent instead.
        self.transitions[0]=0 # "start point repeating"

    def mute_nodes(self,r):
        self.muted_nodes[r]=1
        #mute it children
        to_mute_children=self.get_children[r]
        #print(to_mute_children)
        if len(to_mute_children)!=0:
            for child_idx in to_mute_children:
                if child_idx!=0:
                    self.mute_nodes(child_idx)
                
    def get_nodes(self, lexnode):
        self.nodes.append(lexnode)
        if lexnode.property==2:
            return
        for child in lexnode.children:
            self.get_nodes(child)
    
    def text_vertibe(self,x,longest=True,get_string=True):
        """Find the dictionary word(s) with the lowest matching cost for ``x``.

        A cost matrix with one row per lextree node and one column per character
        of ``'*' + x`` is filled column by column. The result is read from the
        last column at the end-of-word rows.

        Args:
            x: the input word.
            longest: forwarded to ``nodes_to_word``.
            get_string: when True return the decoded word(s), otherwise the
                raw indices and cost.

        Returns:
            ``self.nodes_to_word(min_idxs, longest=longest)`` when ``get_string``
            is True, otherwise ``(min_idxs, min_cost)`` where ``min_idxs`` are
            positions in ``self.word_ends`` whose cost equals the minimum.

        Side effects:
            Resets ``self.muted_nodes`` and marks pruned nodes in it.
        """
        #initialize the input x by adding "*" at the beginning of x. 
        x = '*' + x
        # initialize cost matrix: everything is unreachable (inf) except the
        # root row, whose cost in column c is c
        n_cols = len(x)
        n_rows = len(self.nodes)
        costs = np.full((n_rows, n_cols), np.inf)
        costs[0,:]=range(n_cols)
        
        # no node is pruned at the start
        n_nodes = len(self.nodes)
        self.muted_nodes=np.zeros([n_nodes])

        # fill cost matrix; muted rows are skipped and keep their inf entries
        for c in range(n_cols):
            for r in range(1,n_rows):
                if not self.muted_nodes[r]:
                    # 0 if the node character equals x[c], 1 otherwise
                    current_cost=self.dist_fun(self.nodes[r].val,x[c])
                    if c==0:#if it is the initialization.
                        if self.transitions[r]==0:
                            costs[r,c]=current_cost
                        else:
                            # for deeper nodes the cost is based on the parent in the same column
                            # (assumes the parent row was filled earlier, i.e. parent index < r -- TODO confirm)
                            costs[r,c]=costs[self.transitions[r],c]+current_cost
                    elif c==1:
                        # only column c-1 exists, so there is no c-2 candidate here
                        if current_cost==0:
                            to_check=[costs[r,c-1]+1, # repeat the node from previous state
                                      costs[self.transitions[r],c-1],#get the node from previous state
                                     ] 
                        else:
                            to_check=[costs[r,c-1], # repeat the node from previous state
                                      costs[self.transitions[r],c-1],#get the node from previous state
                                     ] 
                        nnn=np.argmin(to_check)
                        costs[r,c]=to_check[nnn]+current_cost
                    else:
                        if current_cost==0:
                            to_check=[costs[r,c-1]+1, # repeat the node from previous state
                                      costs[self.transitions[r],c-1],#get the node from previous state
                                      costs[self.transitions[r],c-2]+1 # the c-1 is assumed as the insertion
                                     ]
                        else:
                            #that means they are different
                            to_check=[costs[r,c-1], # repeat the node from previous state
                                      costs[self.transitions[r],c-1],#get the node from previous state
                                      costs[self.transitions[r],c],#update from current state previous node
                                      costs[self.transitions[r],c-2]+1 # the c-1 is assumed as the insertion
                                     ]
                        nnn=np.argmin(to_check)
                        costs[r,c]=to_check[nnn]+current_cost


                    # prune: mute this node and its descendants once the cost reaches the threshold
                    # NOTE(review): len(x) here already includes the prepended '*' -- confirm this is intended
                    if costs[r,c]>=self.tolerate_num_of_error+len(x):
                        self.mute_nodes(r)
            #print(costs[:25,c])
        
        
        #find the minimum cost at the end of each word (last column)
        costs_for_words=[]
        for end_idx in self.word_ends:
            costs_for_words.append(costs[end_idx,-1])
        #print(costs_for_words[0:40])
        
        min_cost=min(costs_for_words)
        #print(min_cost)
        #get the index of minimum cost; these are positions in self.word_ends, not node indices
        min_idxs=np.where(costs_for_words==min_cost)[0]
        #print(min_idxs)
        if get_string:
            result=self.nodes_to_word(min_idxs,longest=longest)
            return result
        else:
            return min_idxs, min_cost
    
    def nodes_to_word(self,nodes,longest=True):
        results=[]
        for idx in nodes:
            #get the real index of the end of the word
            idx=self.word_ends[idx]
            #if the min cost is the real end node, then we can get a real word
            result=""
            while self.transitions[idx]!=0:
                result=self.nodes[idx].val+result
                #print(self.nodes[idx].val)
                idx=self.transitions[idx]
            result=self.nodes[idx].val+result
            results.append(result)
            #print(results)
        if longest:
            #return the longest word in the candidate
            res = max(results, key=len, default='')
            return res
        return results
    
    
    def get_nodes_from_token(self,tokens,end_token):
        nodes=[]
        time=tokens[0]
        value=tokens[1]
        previous=tokens[2]
        previous_time=end_token[-1]
        nodes.append(end_token[1])
        while previous_time!=0:
            idxs=np.where(time==previous_time)[0]
            previous_time=previous[idxs[0]]
            nodes.insert(0,value[idxs[0]])
        return nodes
    
    def nodes_to_sentence(self,nodes):
        '''
        input [0,1,2]
        ouput a an ann
        '''
        sentence=""
        for idx in nodes:
            #if the min cost is the real end node, then we can get a real word
            result=""
            while self.transitions[idx]!=0:
                result=self.nodes[idx].val+result
                #print(self.nodes[idx].val)
                idx=self.transitions[idx]
            result=self.nodes[idx].val+result
            sentence+=result+" "
        
        return  sentence
    
    
    def nodes_list_to_sentences(self,nodes_list):
        words_list=self.nodes_list_to_words(nodes_list)
        num_words_in_each_words_list=list(map(len, words_list))
        num_possible_sentences=np.product(num_words_in_each_words_list)
        print("there are {} possible sentences".format(num_possible_sentences))
        padded_words_list=[]
        for words in words_list:
            if len(words)==0:
                continue
            repeat_time=int(num_possible_sentences/len(words))
            aaa_words_list=[]
            for i in range(repeat_time):
                aaa_words_list+=words
            padded_words_list.append(aaa_words_list)
        #print(padded_words_list)
        
        padded_words_list=np.array(padded_words_list)
        padded_words_list=np.column_stack(padded_words_list)
        sentences=[]
        for i in padded_words_list:
            sentence=""
            for word in i:
                sentence+=word +" "
            sentences.append(sentence)
        
        return sentences
    
    
    
    
    def nodes_list_to_words(self,nodes_list):
        '''
        input [[0],[],[1,2],[2]]
        ouput [["a"],["an", "ann"],["ann"]]
        '''
        all_words=[]
        for current_word_idxs in nodes_list:
            if len(current_word_idxs)==0:
                continue
            else:
                current_words=[]
                for idx in current_word_idxs:
                    #if the min cost is the real end node, then we can get a real word
                    result=""
                    while self.transitions[idx]!=0:
                        result=self.nodes[idx].val+result
                        #print(self.nodes[idx].val)
                        idx=self.transitions[idx]
                    result=self.nodes[idx].val+result
                    current_words.append(result)
                all_words.append(current_words)
        #print(all_words)
        return all_words
    
    def check_end_of_words(self,costs):
        costs_for_words=[]
        for end_idx in self.word_ends:
            costs_for_words.append(costs[end_idx])
        min_cost=min(costs_for_words)
#         print("minimum cost at this point is")
#         print(min_cost)
        #get the index of minimum cost
        min_idxs=np.where(costs_for_words==min_cost)[0]
        # delete the "*" where its cost is always 0
        min_idxs = min_idxs[min_idxs != 0]
        return min_cost,min_idxs
  
    def unsegment_text_vertibe(self,x,longest=True,n_best=10,get_sentence=True,loop_cost=0.002):
        #set different types of cost
        deletion_cost = 1.0
        self.diagonal_cost= 1.0 # 如果单词不符合就会是这样哦
        insertion_cost = 1.0
        match_cost = 0.0
        space_cost = 0.0
        loop_cost = loop_cost
        
        self.n_best=n_best
        # initialize cost matrix
        n_cols = len(x)
        n_rows = len(self.nodes)
        costs = np.full(n_rows, np.inf)
        # * to all other nodes
        initial_cost=np.zeros(n_rows)
        for r in range(n_rows):
            current_cost=1
            if self.transitions[r]==0:
                initial_cost[r]=current_cost
            else:
                #if it is the middle nodes, then the cost is based on it previous node
                initial_cost[r]=initial_cost[self.transitions[r]]+current_cost
        initial_cost[0]=0
        #print(initial_cost)
        #we allow one insertion and one deletion in each word
        initial_nodes=self.get_children[0]
        next_to_check_nodes=[]
        for node in initial_nodes:
            next_to_check_nodes.append(node)
            next_to_check_nodes+=self.get_children[node]
        # we allow the deletion. therefore, each time, we update the current node and its child node
        initial_nodes=next_to_check_nodes
        
        
#         print("initial to check nodes are ")
#         print(initial_nodes)
        y_level=[]
        y_level.append(initial_nodes)
        result_level=[[]]
        
        z_level=[]
        z_level.append(copy.deepcopy(initial_cost))
        # fill cost matrix
        for c in range(n_cols):
            next_result_level=[]
            next_z_level=[]
            next_y_level=[]
           
            next_result_level_filtered=[]
            next_z_level_filtered=[]
            next_y_level_filtered=[]
            
            temp_min_cost_level=[]
            
            #here, Huangrui get the num_nodes in each path
            num_nodes_in_each_path=list(map(len, result_level))
            #print("num of nodes in current sentences in  each path is {}".format(num_nodes_in_each_path))
            #here is the mean of the num_nodes_in_each_path
            mean_num_nodes=int(np.median(num_nodes_in_each_path))
            #print("median num of nodes in current sentences in  each path is {}".format(mean_num_nodes))
            
            print("dealing with the {} th input character".format(c+1))
            #here, Huangrui get the num_nodes in each path
            num_nodes_in_each_path=list(map(len, result_level))
            #print("num of nodes in current sentences in  each path is {}".format(num_nodes_in_each_path))
            #here is the mean of the num_nodes_in_each_path
            mean_num_nodes=int(np.median(num_nodes_in_each_path))
            #print("median num of nodes in current sentences in  each path is {}".format(mean_num_nodes))
            for current_possible_choice in range(len(z_level)):
                next_to_check_nodes=[]
                current_nodes=y_level[current_possible_choice]
                #print(current_nodes)
                z_column=z_level[current_possible_choice]
                current_result=result_level[current_possible_choice]
                temp_cost_column=copy.deepcopy(costs)
                
                #Huangrui try pruning here, basicly  for a sentence, i want some words
                #so i would compare the current "words count" of "each sentence"
            
                for r in current_nodes:
                    #print("updating {} th node ".format(r))
                    next_to_check_nodes+=self.get_children[r]
                    if self.nodes[r].val==x[c]:
                        diagonal_cost=match_cost
                    else:
                        diagonal_cost=self.diagonal_cost
                        
                    #print("current diagonal_cost is {}".format(diagonal_cost))
                    to_check=[z_column[r]+ insertion_cost, # 从左往右 相当于insertion 
                             z_column[self.transitions[r]]+diagonal_cost,#斜对角 
                             temp_cost_column[self.transitions[r]]+deletion_cost#there is a deletion of previous charactor
                            ]
                    #先更新所有的点的cost
                    temp_cost_column[r]= min(to_check)
                    
                #check the end of the words after each updata
                min_cost=min(temp_cost_column[self.word_ends])
                #print(min_cost)
                temp_min_cost_level.append(min_cost)
                #print(temp_min_cost_level)
                next_result_level.append(current_result)
                next_z_level.append(temp_cost_column)                
                next_y_level.append(sorted(list(set(next_to_check_nodes)),reverse=False))
            #print("before filtering there are {} paths".format(len(next_z_level)))
            #now filtering the temp_min_cost
            sorted_temp_min_cost_level=sorted(temp_min_cost_level)
            if len(sorted_temp_min_cost_level)<self.n_best:
                upfloor_min_cost=sorted_temp_min_cost_level[-1]
            else:
                upfloor_min_cost=sorted_temp_min_cost_level[n_best-1]
            
            #print("temp_min_cost_level is {}".format(temp_min_cost_level))
#             print(len(next_y_level))
#             print(len(next_z_level))
            global_min_path_cost=min(temp_min_cost_level)
            print(global_min_path_cost)
            if c==n_cols-1:
                #print("at this point the results are {}".format(len(next_result_level_filtered)))
                for choice in range(len(temp_min_cost_level)):
                    current_min_cost=temp_min_cost_level[choice]
#                     print("reach the end")
#                     print(current_min_cost)
                    min_idxs=np.where(next_z_level[choice][self.word_ends]==current_min_cost)[0]
                    current_possible_words=[]
                    for end_idx in min_idxs:
                        end_idx=self.word_ends[end_idx]
                        current_possible_words.append(end_idx)   
#                     print(current_possible_words)
                    aaa_result=copy.deepcopy(next_result_level[choice])
                    aaa_result.append(current_possible_words)
                    next_z_level_filtered.append(aaa_initial_cost)
                    next_result_level_filtered.append(aaa_result)
                    continue
                #print("at this point the results are {}".format(len(next_result_level_filtered)))
            
            else:
                for choice in range(len(temp_min_cost_level)):
                    current_min_cost=temp_min_cost_level[choice]
                    if current_min_cost<=upfloor_min_cost:
                    #we need to prepare the "new path" here
                        if (current_min_cost<=self.tolerate_num_of_error*(mean_num_nodes+1)):
                            #we find possible new path:
#                            print("current_Min_cost is {}".format(current_min_cost))
#                             print(next_z_level[choice])
                            min_idxs=np.where(next_z_level[choice][self.word_ends]==current_min_cost)[0]
#                             print("current_min_idxs are {}".format(min_idxs))
                            current_possible_words=[]
                            for end_idx in min_idxs:
                                end_idx=self.word_ends[end_idx]
                                current_possible_words.append(end_idx)   
                            #print(current_possible_words)
                            aaa_result=copy.deepcopy(next_result_level[choice])
                            aaa_result.append(current_possible_words)
                            # now add the initial cost with min_cost and loop cost
                            aaa_initial_cost=copy.deepcopy(initial_cost)
                            aaa_initial_cost+=current_min_cost
                            aaa_initial_cost+=loop_cost
                            next_y_level_filtered.append(initial_nodes)
                            next_z_level_filtered.append(aaa_initial_cost)
                            next_result_level_filtered.append(aaa_result)


                        next_y_level_filtered.append(next_y_level[choice])
                        next_z_level_filtered.append(next_z_level[choice])
                        next_result_level_filtered.append(next_result_level[choice])
                     
            #print("after filtering there are {} paths".format(len(next_z_level_filtered)))  
            z_level=next_z_level_filtered
            y_level=next_y_level_filtered
            result_level=next_result_level_filtered

            #print(result_level)
            
        #now, we have reach the end of the input unsegmented sentence, let's review every sentence we could have
#         print("Huangrui has reach the end of the unsegmented sentence!")
#         print(temp_min_cost_level)
#         print(len(temp_min_cost_level))
#         print(len(result_level))
        min_sentence_cost=min(temp_min_cost_level)
#         print("minimum cost at this point is")
#         print(min_sentence_cost)
#         print()
        if get_sentence:
            sentences=[]
            final_results_node_list=[]
            #get the index of minimum cost
            min_sentence_idxs=np.where(temp_min_cost_level==min_sentence_cost)[0]
            for min_sentence_idx in min_sentence_idxs:
                #print(min_sentence_idx)
                result=result_level[min_sentence_idx]
                #print(result)
                final_results_node_list.append(result)
                if get_sentence:
                    current_sentence=self.nodes_list_to_sentences(result)
                    #print(current_sentence)
                    sentences.append(current_sentence)
                return sentences,final_results_node_list,min_sentence_cost
        else:
            return min_sentence_cost

# Initialize the SpellChecker and load the lextree into it

In [19]:
# Create the checker (beam=3) and index the lextree with fit().
# NOTE(review): longest_word_lenth is hard-coded to 9 here rather than reusing the
# `longest_word_lenth` variable computed from the dictionary -- confirm which is intended.
spellchecker=SpellChecker(3,longest_word_lenth=9,tolerate_num_of_error=2)
spellchecker.fit(lextree)

# Test spellchecker

Unsegmented-text part: `n_best` is the beam width, i.e. only the n best paths Huangrui has found so far are kept.

For word correction, this version only uses pruning, because Huangrui prefers accuracy over time efficiency.

In [9]:
# x="axn"
# spellchecker.text_vertibe(x,True)

In [21]:
# Segment one unsegmented sentence; with the default get_sentence=True the call
# returns the candidate sentences, their word-end node lists and the best cost.
x="byhookorbycrooksherepliedhemustbecaughtifidon'tgethimishalldie"
sentences,final_results_node_list,min_sentence_cost=spellchecker.unsegment_text_vertibe(x,n_best=10)

dealing with the 1 th input character
1.0
dealing with the 2 th input character
0.0
dealing with the 3 th input character
1.0
dealing with the 4 th input character
1.002
dealing with the 5 th input character
1.002
dealing with the 6 th input character
0.002
dealing with the 7 th input character
0.004
dealing with the 8 th input character
0.004
dealing with the 9 th input character
1.004
dealing with the 10 th input character
0.006
dealing with the 11 th input character
1.006
dealing with the 12 th input character
1.008
dealing with the 13 th input character
0.008
dealing with the 14 th input character
0.01
dealing with the 15 th input character
0.008
dealing with the 16 th input character
1.008
dealing with the 17 th input character
1.01
dealing with the 18 th input character
0.01
dealing with the 19 th input character
1.01
dealing with the 20 th input character
1.01
dealing with the 21 th input character
1.012
dealing with the 22 th input character
2.012
dealing with the 23 th input c

In [11]:
# Display the lowest sentence cost returned by unsegment_text_vertibe.
min_sentence_cost

In [22]:
# Second candidate sentence of the first minimum-cost path.
sentences[0][1]

"by hook or by crook she rev flied he must be caught if i don't get him i shall die "

# Read typos.txt and correct the words

In [13]:
# file=open("typos.txt","r").readlines()
# sentences=[]
# for sentence in file:
#     words=sentence.strip('\n').split(" ")
#     if len(words)==1:
#         #which means it is an empty line
#         continue
#     corrected_sentence=""
#     for word in words:
#         corrected_word=spellchecker.text_vertibe(word)
#         print(corrected_word)
#         corrected_sentence+=corrected_word+" "
#     print(corrected_sentence)
#     sentences.append(corrected_sentence)

# For the Problem 1 accuracy, please refer to the other notebook

In [23]:
# file=open("segmented.txt","r").readlines()
# correct_seg=[]
# for sentence in file:
#     words=sentence.strip('\n').split(" ")
#     if len(words)==1:
#         #which means it is an empty line
#         continue
#     for word in words:
#         correct_seg.append(word)




# def accuracy_cal(result, correct_seg):
#     result = result.split()
#     d1 = abs(len(result) - len(correct_seg))
#     d2 = 0
#     for i in result:
#         if i not in correct_seg:
#             d2 += 1
#     return d1 + d2
# accuracy = accuracy_cal(result, correct_seg)
# print("accuracy: ", accuracy)
# accuracy_lst.append(accuracy)

# Read the unsegmented0.txt and segment it

In [27]:
n_best=10

# Read all lines up front; the context manager closes the file handle.
with open("unsegmented0.txt","r") as unsegmented_file:
    file=unsegmented_file.readlines()
total_cost=0
for sentence in file:
    sentence=sentence.strip('\n')
    if not sentence:
        # A blank line has nothing to segment, and unsegment_text_vertibe
        # fails on empty input, so skip it.
        continue
    # only the cost is needed here, so sentence decoding is switched off
    min_sentence_cost=spellchecker.unsegment_text_vertibe(sentence,n_best=n_best,get_sentence=False,loop_cost=0.00001)
    print("the min cost for this sentence is {}".format(min_sentence_cost))
    total_cost+=min_sentence_cost
print("Over all cost for this file with n_best equal to {} is {}".format(n_best,total_cost))

dealing with the 1 th input character
0.0
dealing with the 2 th input character
0.0
dealing with the 3 th input character
1.0
dealing with the 4 th input character
0.0
dealing with the 5 th input character
1.0
dealing with the 6 th input character
1e-05
dealing with the 7 th input character
2e-05
dealing with the 8 th input character
1e-05
dealing with the 9 th input character
2e-05
dealing with the 10 th input character
2e-05
dealing with the 11 th input character
3.0000000000000004e-05
dealing with the 12 th input character
1.00003
dealing with the 13 th input character
3.0000000000000004e-05
dealing with the 14 th input character
4e-05
dealing with the 15 th input character
1.00004
dealing with the 16 th input character
1.00004
dealing with the 17 th input character
1.00004
dealing with the 18 th input character
4e-05
dealing with the 19 th input character
1.00004
dealing with the 20 th input character
1.00005
dealing with the 21 th input character
5e-05
dealing with the 22 th input

30.000369999999993
dealing with the 152 th input character
31.000369999999993
dealing with the 153 th input character
31.000369999999993
dealing with the 154 th input character
32.00036999999999
dealing with the 155 th input character
32.00037999999999
dealing with the 156 th input character
32.000389999999996
dealing with the 157 th input character
32.000389999999996
dealing with the 158 th input character
33.000389999999996
dealing with the 159 th input character
34.000389999999996
dealing with the 160 th input character
33.000389999999996
dealing with the 161 th input character
33.000389999999996
dealing with the 162 th input character
33.0004
dealing with the 163 th input character
34.0004
dealing with the 164 th input character
34.0004
dealing with the 165 th input character
34.0004
dealing with the 166 th input character
35.0004
dealing with the 167 th input character
35.00041
dealing with the 168 th input character
35.00041
dealing with the 169 th input character
36.00041
dealin

74.0007100000001
dealing with the 295 th input character
74.0007100000001
dealing with the 296 th input character
74.0007100000001
dealing with the 297 th input character
75.0007100000001
dealing with the 298 th input character
75.0007200000001
dealing with the 299 th input character
75.0007200000001
dealing with the 300 th input character
76.0007200000001
dealing with the 301 th input character
76.0007300000001
dealing with the 302 th input character
76.0007300000001
dealing with the 303 th input character
76.0007300000001
dealing with the 304 th input character
77.0007300000001
dealing with the 305 th input character
77.0007300000001
dealing with the 306 th input character
77.0007400000001
dealing with the 307 th input character
77.00075000000011
dealing with the 308 th input character
77.00075000000011
dealing with the 309 th input character
78.00075000000011
dealing with the 310 th input character
78.00076000000011
dealing with the 311 th input character
78.00076000000011
dealing w

2.0001600000000006
dealing with the 58 th input character
1.0001700000000007
dealing with the 59 th input character
2.0001700000000007
dealing with the 60 th input character
2.0001800000000007
dealing with the 61 th input character
1.0001800000000007
dealing with the 62 th input character
2.0001800000000007
dealing with the 63 th input character
2.000190000000001
dealing with the 64 th input character
1.0001900000000008
dealing with the 65 th input character
2.000190000000001
dealing with the 66 th input character
1.0002000000000009
dealing with the 67 th input character
2.000200000000001
dealing with the 68 th input character
2.000210000000001
dealing with the 69 th input character
2.000210000000001
dealing with the 70 th input character
2.000210000000001
dealing with the 71 th input character
1.000210000000001
dealing with the 72 th input character
2.000210000000001
dealing with the 73 th input character
2.000220000000001
dealing with the 74 th input character
1.000220000000001
the m

4.000200000000001
dealing with the 79 th input character
3.000200000000001
dealing with the 80 th input character
4.000200000000001
dealing with the 81 th input character
4.000210000000001
dealing with the 82 th input character
3.000210000000001
dealing with the 83 th input character
4.000210000000001
dealing with the 84 th input character
4.0002200000000006
dealing with the 85 th input character
3.000220000000001
dealing with the 86 th input character
4.0002200000000006
dealing with the 87 th input character
3.000230000000001
dealing with the 88 th input character
3.000240000000001
dealing with the 89 th input character
4.000230000000001
dealing with the 90 th input character
3.000230000000001
dealing with the 91 th input character
4.000230000000001
dealing with the 92 th input character
3.000240000000001
dealing with the 93 th input character
3.000250000000001
dealing with the 94 th input character
3.0002600000000013
dealing with the 95 th input character
3.000250000000001
the min co

In [28]:
# Display the current value of total_cost as this cell's output.
total_cost

106.00158000000016

# Read unsegmented.txt, then segment and spell-correct it

In [31]:
# Value forwarded as the `n_best` argument of the checker for every sentence.
n_best=10

# Read all lines inside a `with` block so the file handle is closed as soon as
# the read finishes, instead of being left open until garbage collection.
# The list keeps the name `file` so any later cell relying on it still works.
with open("unsegmented.txt","r") as unsegmented_handle:
    file=unsegmented_handle.readlines()

# Sum of the minimum cost reported for each line of the input file.
total_cost=0
for sentence in file:
    # Strip newline characters so they are not passed to the checker as input.
    sentence=sentence.strip('\n')
    min_sentence_cost=spellchecker.unsegment_text_vertibe(sentence,n_best=n_best,get_sentence=False,loop_cost=0.00001)
    print("the min cost for this sentence is {}".format(min_sentence_cost))
    total_cost+=min_sentence_cost
print("Over all cost for this file with n_best equal to {} is {}".format(n_best,total_cost))


dealing with the 1 th input character
0.0
dealing with the 2 th input character
0.0
dealing with the 3 th input character
1.0
dealing with the 4 th input character
1.0
dealing with the 5 th input character
1e-05
dealing with the 6 th input character
1.00001
dealing with the 7 th input character
2e-05
dealing with the 8 th input character
1.00002
dealing with the 9 th input character
1.00002
dealing with the 10 th input character
1.00003
dealing with the 11 th input character
2.00002
dealing with the 12 th input character
2.0000299999999998
dealing with the 13 th input character
2.0000299999999998
dealing with the 14 th input character
2.00003
dealing with the 15 th input character
2.0000400000000003
dealing with the 16 th input character
3.00004
dealing with the 17 th input character
2.0000500000000003
dealing with the 18 th input character
3.0000500000000003
dealing with the 19 th input character
3.0000600000000004
dealing with the 20 th input character
3.0000600000000004
dealing with

46.000400000000006
dealing with the 151 th input character
46.000400000000006
dealing with the 152 th input character
46.000400000000006
dealing with the 153 th input character
47.000400000000006
dealing with the 154 th input character
47.000400000000006
dealing with the 155 th input character
47.000400000000006
dealing with the 156 th input character
48.000400000000006
dealing with the 157 th input character
48.00041000000001
dealing with the 158 th input character
48.00041000000001
dealing with the 159 th input character
49.00041000000001
dealing with the 160 th input character
49.00041000000001
dealing with the 161 th input character
49.00041000000001
dealing with the 162 th input character
49.00041000000001
dealing with the 163 th input character
49.00042000000001
dealing with the 164 th input character
49.00042000000001
dealing with the 165 th input character
49.000430000000016
dealing with the 166 th input character
50.000430000000016
dealing with the 167 th input character
50.00

97.0006800000001
dealing with the 293 th input character
97.0006900000001
dealing with the 294 th input character
97.0006900000001
dealing with the 295 th input character
97.0006900000001
dealing with the 296 th input character
98.0006900000001
dealing with the 297 th input character
98.0006900000001
dealing with the 298 th input character
98.00070000000011
dealing with the 299 th input character
99.00070000000011
dealing with the 300 th input character
99.00070000000011
dealing with the 301 th input character
100.00070000000011
dealing with the 302 th input character
100.00070000000011
dealing with the 303 th input character
100.00070000000011
dealing with the 304 th input character
100.00071000000011
dealing with the 305 th input character
101.00071000000011
dealing with the 306 th input character
101.00071000000011
dealing with the 307 th input character
101.00071000000011
dealing with the 308 th input character
101.00071000000011
dealing with the 309 th input character
101.00072000

9.000239999999998
dealing with the 65 th input character
9.000239999999998
dealing with the 66 th input character
10.000239999999998
dealing with the 67 th input character
9.000249999999998
dealing with the 68 th input character
10.000249999999998
dealing with the 69 th input character
10.000259999999997
dealing with the 70 th input character
11.000259999999997
dealing with the 71 th input character
10.000259999999997
dealing with the 72 th input character
11.000259999999997
dealing with the 73 th input character
11.000259999999997
dealing with the 74 th input character
10.000269999999997
the min cost for this sentence is 10.000269999999997
dealing with the 1 th input character
1.0
dealing with the 2 th input character
1.0
dealing with the 3 th input character
1.0
dealing with the 4 th input character
2.0
dealing with the 5 th input character
2.0
dealing with the 6 th input character
2.0
dealing with the 7 th input character
2.00001
dealing with the 8 th input character
3.00001
dealing

18.00021999999999
dealing with the 82 th input character
18.00021999999999
dealing with the 83 th input character
18.00022999999999
dealing with the 84 th input character
18.00023999999999
dealing with the 85 th input character
18.00024999999999
dealing with the 86 th input character
19.00024999999999
the min cost for this sentence is 19.00024999999999
Over all cost for this file with n_best equal to 10 is 171.00150000000014


In [32]:
# Display the current value of total_cost as this cell's output.
total_cost

171.00150000000014

# Analysis of the result

The first sentence is very long and hard to handle, so it alone accounts for a cost of 126. The remaining sentences cost 10, 16, and 19.


Compared with unsegmented0, this file indeed contains more errors: the second total_cost (about 171) is larger than the first one (about 106).