In [2]:
import jieba
import string
from IPython.display import HTML

class InvertSeach2():
    '''
    Inverted-index search over the lines of a text file.

    Every non-blank line of the file is one document. Documents are
    lower-cased, tokenised by parse_doc() (ASCII runs kept whole, everything
    else segmented with jieba) and indexed in `terms`, a dict mapping each
    token to the set of document ids containing it.

    Attributes:
        terms:    dict token -> set of document ids.
        doc_id:   id that the next inserted document will receive.
        doc_list: documents in insertion order; doc_list[i] has id i.
    '''

    # Non-alphanumeric ASCII characters that are kept inside an ASCII token.
    _ASCII_EXTRA = ('-', ':', '.', '!')
    # Tokens that convert() treats as query syntax rather than search terms.
    _OPERATORS = ('(', ')', 'and', 'AND', 'or', 'OR', 'not', 'NOT', '-', ' ')
    # Markup wrapped around every highlighted fragment.
    _HIGHLIGHT = '<span style="color:red">{}</span>'

    #init the data from doc_file
    def __init__(self, doc_file, encoding=None):
        '''
        Build the index from doc_file, one document per line.

        Lines are stripped of their line terminator, lower-cased and
        de-duplicated in file order, so document ids are the same on every
        run. Blank lines are skipped.

        doc_file: path of the text file to index.
        encoding: text encoding passed to open(); None keeps the platform
                  default.
        '''
        self.terms = dict()
        self.doc_id = 0
        self.doc_list = []

        seen = set()
        # The with-block closes the file; no explicit close() is needed.
        with open(doc_file, 'r', encoding=encoding) as f:
            for line in f:
                doc = line.rstrip('\r\n').lower()
                if not doc.strip() or doc in seen:
                    continue
                seen.add(doc)
                self.insert(doc)

    #extender function to better adjust the jieba cut function
    def extender(self, state, cache, cutflage=0):
        '''
        Turn one buffered run of characters into a list of tokens.

        state:    truthy when cache is an ASCII run, which is kept as a single
                  token; falsy when it has to be segmented with jieba.
        cache:    the buffered text; an empty buffer yields no tokens.
        cutflage: 0 -> jieba.cut_for_search, 1 -> jieba.cut. Any other value
                  prints a warning and falls back to cut_for_search.
        '''
        # Nothing buffered: skip the tokenizer call entirely.
        if not cache:
            return []
        if state:
            return [cache]
        if cutflage == 1:
            return list(jieba.cut(cache))
        if cutflage != 0:
            print('Worning! cutflage only 0 and 1 is available. predefined as 0 cut_for_search')
        return list(jieba.cut_for_search(cache))

    #split chinese words and english words
    def parse_doc(self, doc, cutflage=0):
        '''
        Tokenise doc into a flat list of tokens.

        Consecutive ASCII letters, digits and the characters in _ASCII_EXTRA
        form one token; every other run of characters is handed to jieba
        (see extender for cutflage). Each space character ends the current
        run and is emitted as its own ' ' token.
        '''
        tokens = []
        cache = ''
        # None means "no run in progress" (start of input or just after a space).
        last_state = None
        for c in doc:
            if c == ' ':
                tokens.extend(self.extender(last_state, cache, cutflage))
                tokens.append(' ')
                cache = ''
                last_state = None
                continue
            cur_state = (c in string.ascii_letters
                         or c.isdigit()
                         or c in self._ASCII_EXTRA)
            if cur_state == last_state:
                cache += c
            else:
                # The character class changed: flush the finished run.
                tokens.extend(self.extender(last_state, cache, cutflage))
                cache = c
            last_state = cur_state
        if cache:
            tokens.extend(self.extender(last_state, cache, cutflage))
        return tokens

    #add one document to the index
    def insert(self, doc):
        '''
        Append doc to doc_list and index its tokens.

        Whitespace-only tokens are not indexed. Returns the id assigned to
        doc, i.e. its position in doc_list.
        '''
        assigned_id = self.doc_id
        self.doc_list.append(doc)
        for term in self.parse_doc(doc):
            if not term.strip():
                continue
            self.terms.setdefault(term, set()).add(assigned_id)
        self.doc_id += 1
        return assigned_id

    #look up the documents matching every token of keywords
    def find(self, keywords):
        '''
        Return the ids of the documents that contain every token of keywords.

        keywords is tokenised with parse_doc(cutflage=1); whitespace tokens
        are ignored. The result is always a new set, so callers can modify it
        without touching the index. No usable token gives an empty set.
        '''
        result = None
        for word in self.parse_doc(keywords, cutflage=1):
            if not word.strip():
                continue
            postings = self.terms.get(word, set())
            if result is None:
                # Copy so the posting set stored in self.terms is never exposed.
                result = set(postings)
            else:
                result = result & postings
        if result is None:
            result = set()
        return result

    #build the source text of one self.find(...) call
    @staticmethod
    def _find_call(word):
        '''
        Return 'self.find("<word>")' with backslashes and double quotes in
        word escaped, so the token cannot end the string literal early.
        '''
        escaped = word.replace('\\', '\\\\').replace('"', '\\"')
        return 'self.find("{}")'.format(escaped)

    #translate a query into a set expression
    def convert(self, keywords):
        '''
        Translate keywords into the source text of a Python set expression.

        Terms become self.find("term") calls, and/or/not (also upper-case,
        and '-') become & | -, parentheses and spaces are copied through, and
        ' & ' is inserted between terms that have no explicit operator
        between them. The token list is printed joined by '|'.
        '''
        #normalise whitespace so tokens are separated by at most one space
        keywords = ' '.join(keywords.split())
        word = self.parse_doc(keywords, cutflage=1)
        print('|'.join(word))
        sign = self._OPERATORS
        parts = []
        idx = 0
        num = len(word)
        while idx < num:
            token = word[idx]
            has_second = idx + 2 < num
            if token in ('(', ')', ' '):
                # ')' followed two tokens later by a term: implicit AND.
                # The token in between is skipped.
                if token == ')' and has_second and word[idx + 2] not in sign:
                    parts.append(' )  & ')
                    idx += 2
                    continue
                parts.append(token)
            elif token in ('and', 'AND'):
                parts.append('&')
            elif token in ('or', 'OR'):
                parts.append('|')
            elif token in ('not', 'NOT', '-'):
                parts.append('-')
            # term directly followed by another token (no space, no ')')
            elif idx + 1 < num and word[idx + 1] not in (' ', ')'):
                parts.append(self._find_call(token) + ' & ')
            # term, one skipped token, then another term or '(': implicit AND
            elif has_second and (word[idx + 2] not in sign
                                 or word[idx + 2] == '('):
                parts.append(self._find_call(token) + ' & ')
                idx += 2
                continue
            else:
                parts.append(self._find_call(token))
            idx += 1
        return ''.join(parts)

    #highlight by plain string replacement
    def hightlight(self, doc, keywords):
        '''
        Return doc with every occurrence of each query term wrapped in a red
        <span>.

        Terms are replaced one after another on the whole string, so a later
        term can also match inside markup inserted for an earlier one;
        forwardSplit does not have that limitation.
        '''
        for word in self.parse_doc(keywords, cutflage=1):
            if word not in self._OPERATORS:
                doc = doc.replace(word, self._HIGHLIGHT.format(word))
        return doc

    #forward max split subfunction
    def forwardSplit(self, doc, keywords):
        '''
        Return doc with query terms wrapped in a red <span>, found by forward
        maximum matching.

        At each position the longest substring of doc that is a query term is
        wrapped; when nothing matches, one character is copied unchanged.
        Operators and whitespace tokens of the query are not highlighted.
        doc is compared as given, so it should already be lower-cased like
        the query.
        '''
        word_set = set()
        for query_part in self.parse_doc(keywords):
            if query_part.strip() and query_part not in self._OPERATORS:
                word_set.add(query_part)
        # Without a document or without terms there is nothing to mark up.
        if not doc or not word_set:
            return doc
        #longest term bounds the window that has to be tried
        n = max(len(term) for term in word_set)
        total = len(doc)
        pieces = []
        i = 0
        while i < total:
            end_idx = min(i + n, total)
            for j in range(end_idx, i, -1):
                if doc[i:j] in word_set:
                    pieces.append(self._HIGHLIGHT.format(doc[i:j]))
                    break
            else:
                # No term starts here: emit a single plain character.
                j = i + 1
                pieces.append(doc[i])
            i = j
        return ''.join(pieces)

    #display a document highlighted with forward max split
    def highlighter2(self, doc, keywords):
        '''
        Render forwardSplit(doc, keywords) as HTML in the notebook output.

        NOTE(review): `display` is not imported in this file; presumably the
        IPython kernel provides it as a global - confirm before running this
        outside a notebook.
        '''
        display(HTML(self.forwardSplit(doc, keywords)))

    #run a query and show the matching documents
    def search(self, keywords):
        '''
        Search doc_list for keywords and display every matching document.

        The query is lower-cased, converted to a set expression by convert()
        and evaluated. The expression and the resulting id set are printed,
        then each match is rendered with highlighter2. A query that converts
        to an empty expression displays nothing.
        '''
        keywords = keywords.lower()
        expression = self.convert(keywords)
        print(expression)
        if not expression.strip():
            return
        # SECURITY: the expression is built from user input and evaluated.
        # Terms are emitted as escaped string literals and the namespace
        # exposes only `self`, but do not feed untrusted queries to this
        # method without a real parser.
        matches = eval(expression, {'__builtins__': {}}, {'self': self})
        print(matches)
        for doc_id in matches:
            self.highlighter2(self.doc_list[doc_id], keywords)

In [4]:
# Sample mixed Chinese/English query. search() lower-cases it before matching,
# so the capital "M" in "Mate30" does not matter.
keywords = '华为 Mate30'
# Index every title in titles.txt, then print the parsed query and the
# matching ids and display the matching titles.
searcher = InvertSeach2('titles.txt')
searcher.search(keywords)

华为| |mate30
self.find("华为") & self.find("mate30")
{376, 73, 83, 412}
