Deterministic finite automaton
===

确定有限自动机（DFA）

https://en.wikipedia.org/wiki/Deterministic_finite_automaton



In [3]:
from collections import defaultdict


# Sample keyword lexicon; it is passed to DFAFilter.build_chains further down.
keywords = (
    '测试一',
    '测试二',
    '天安门',
)


class DFAFilter:
    """Find which keywords from a lexicon occur in a piece of text.

    ``build_chains`` compiles the lexicon into a trie of nested dicts keyed
    by character.  A node at which a keyword ends additionally carries the
    ``_END`` sentinel key, so a keyword that is a prefix of a longer keyword
    (e.g. ``'ab'`` next to ``'abc'``) is still recognised.

    Examples:
    ::
        dfa_filter = DFAFilter()
        dfa_filter.build_chains(keywords_set)
        dfa_filter.load_keywords(raw_text)
    """

    # Marks "a keyword ends at this node".  A unique object can never be
    # equal to an element of the scanned text, so it cannot collide with a
    # real transition key.
    _END = object()

    def load_keywords(self, raw_text):
        """Return the keywords found in ``raw_text``.

        Args:
            raw_text (str): text to scan.

        Returns:
            set: keywords that in raw_text

        Raises:
            AssertionError: if ``build_chains`` has not been called yet.
        """
        # Raised explicitly rather than via ``assert`` so the guard survives
        # ``python -O``; compared against None so that an empty lexicon
        # (which compiles to an empty dict) still counts as "built".
        if getattr(self, '_chains', None) is None:
            raise AssertionError('Should invoke build_chains first')
        return self.filter_keyword(raw_text)

    def build_chains(self, keywords):
        """Compile ``keywords`` into the trie stored on ``self._chains``.

        Args:
            keywords (set): lexicon of keywords.  Any iterable of strings
                works; empty strings are ignored.
        """
        chains = {}
        for word in keywords:
            if not word:
                # An empty keyword would mark the root as terminal and
                # "match" everywhere; skip it.
                continue

            node = chains
            for char in word:
                node = node.setdefault(char, {})

            node[self._END] = True

        self._chains = chains

    def _iter_match_ends(self, chains, raw_text, n_len, i):
        """Yield, in increasing order, every index at which a keyword that
        starts at ``raw_text[i]`` ends."""
        node = chains
        for pos in range(i, n_len):
            node = node.get(raw_text[pos])
            if node is None:
                return

            # ``not node`` keeps chains built without end markers working:
            # in that format a keyword ends exactly at an empty (leaf) node.
            if self._END in node or not node:
                yield pos

    def is_word_in_chains(self, chains, raw_text, n_len, i):
        """Locate the longest keyword that starts at ``raw_text[i]``.

        Args:
            chains (dict): trie node to start matching from.
            raw_text (str): text being scanned.
            n_len (int): exclusive upper bound of indexes to examine,
                normally ``len(raw_text)``.
            i (int): index of the first character to match.

        Returns:
            int or None: index of the last character of the longest matching
            keyword, or None when no keyword starts at ``i``.
        """
        last_end = None
        for last_end in self._iter_match_ends(chains, raw_text, n_len, i):
            pass

        return last_end

    def filter_keyword(self, raw_text):
        """Collect every keyword occurring anywhere in ``raw_text``.

        Args:
            raw_text (str): text to scan.

        Returns:
            set: the distinct keywords found, as substrings of ``raw_text``.
        """
        n_len = len(raw_text)
        return {
            raw_text[start:end + 1]
            for start in range(n_len)
            for end in self._iter_match_ends(self._chains, raw_text, n_len, start)
        }
            
    
    
# Build a filter from the sample lexicon and scan a text with it.
dfa_f = DFAFilter()
dfa_f.build_chains(keywords)
# NOTE(review): `raw_text` is not defined anywhere in the visible source;
# presumably it comes from another notebook cell. Confirm before running.
dfa_f.load_keywords(raw_text)

{'天安门'}

In [13]:
# Generate the keywords txt: normalise the file in place by stripping
# surrounding whitespace, dropping blank and duplicate entries, and writing
# the result back one keyword per line.
keywords_path = '/Users/laisky/Downloads/keywords.txt'

# The lexicon holds non-ASCII text, so the encoding is pinned instead of
# relying on the platform default.
with open(keywords_path, 'r', encoding='utf-8') as f:
    results = {word.strip() for word in f}

# A blank line is not a keyword; without this it would be written back as an
# empty entry.
results.discard('')

# Sorted so that re-running the cell produces an identical file instead of
# reshuffling it according to set iteration order.
with open(keywords_path, 'w', encoding='utf-8') as f:
    f.writelines('{}\n'.format(word) for word in sorted(results))