In [7]:
import re
from collections import Counter
import toolz #处理迭代器、字典、列表等数据结构的工具
'''
re这个模块的主要作用是让你能够在 Python 程序中使用正则表达式来处理字符串。
匹配 (Matching)：检查一个字符串是否符合某个特定的模式（格式）。
搜索 (Searching)：在一个字符串中查找符合特定模式的部分。
替换 (Substitution)：查找字符串中符合模式的部分，并将其替换为其他内容。
分割 (Splitting)：根据模式将字符串分割成多个部分。

re 模块提供了很多函数来实现这些功能，比如：
re.match(): 从字符串开头尝试匹配模式。
re.search(): 在整个字符串中搜索第一个匹配模式的位置。
re.findall(): 查找字符串中所有匹配模式的子串，并以列表形式返回。
re.split(): 根据模式分割字符串。
re.sub(): 查找并替换。
re.compile(): 编译正则表达式。
'''
def wordpunct_tokenize(text):
    #\w匹配Unicode字符     ^\w\s匹配非Unicode字符和非空白字符（标点符号）

    _pattern = r"\w+|[^\w\s]+" 
    
    #编译为对象   re.MULTILINE表示$和^可以匹配下一行开头和结尾，re.DOTALL表示点号也可以匹配换行符
    _regexp = re.compile(_pattern, flags=re.UNICODE | re.MULTILINE | re.DOTALL) 
    return _regexp.findall(text)

corpus = [
        "Baby, I don't feel so good", 
        "six words you never understood",
        "I'll never let you go", 
        "five words you'll never say (aww)",
        "I laugh along like nothing's wrong" ,
        "four days has never felt so long",
        "If three's a crowd and two was us",
        "one slipped away (hahahahaha)",
        "I just wanna make you feel okay",
        "But all you do is look the other way",
        "I can't tell you how much I wish I didn't wanna stay",
        "I just kinda wish you were gay",
        "Is there a reason we're not through?",
        "Is there a 12-step just for you?",
        "Our conversation's all in blue",
        "11 \"heys\" (Hey, hey, hey, hey)",
        "Ten fingers tearin' out my hair",
        "Nine times, you never made it there",
        "I ate alone at seven, you were six minutes away",
        "How am I supposed to make you feel okay",
        "When all you do is walk the other way?",
        "I can't tell you how much I wish I didn't wanna stay",
        "I just kinda wish you were gay",
        "To spare my pride",
        "To give your lack of interest, an explanation",
        "Don't say I'm not your type",
        "Just say that I'm not your preferred sexual orientation",
        "I'm so selfish",
        "But you make me feel helpless, yeah",
        "And I can't stand another day",
        "Stand another day",
        "I just wanna make you feel okay",
        "But all you do is look the other way, hmm",
        "I can't tell you how much I wish I didn't wanna stay",
        "I just kinda wish you were gay",
        "I just kinda wish you were gay",
        "I just kinda wish you were gay",
]

print(wordpunct_tokenize(''.join(corpus)))

['Baby', ',', 'I', 'don', "'", 't', 'feel', 'so', 'goodsix', 'words', 'you', 'never', 'understoodI', "'", 'll', 'never', 'let', 'you', 'gofive', 'words', 'you', "'", 'll', 'never', 'say', '(', 'aww', ')', 'I', 'laugh', 'along', 'like', 'nothing', "'", 's', 'wrongfour', 'days', 'has', 'never', 'felt', 'so', 'longIf', 'three', "'", 's', 'a', 'crowd', 'and', 'two', 'was', 'usone', 'slipped', 'away', '(', 'hahahahaha', ')', 'I', 'just', 'wanna', 'make', 'you', 'feel', 'okayBut', 'all', 'you', 'do', 'is', 'look', 'the', 'other', 'wayI', 'can', "'", 't', 'tell', 'you', 'how', 'much', 'I', 'wish', 'I', 'didn', "'", 't', 'wanna', 'stayI', 'just', 'kinda', 'wish', 'you', 'were', 'gayIs', 'there', 'a', 'reason', 'we', "'", 're', 'not', 'through', '?', 'Is', 'there', 'a', '12', '-', 'step', 'just', 'for', 'you', '?', 'Our', 'conversation', "'", 's', 'all', 'in', 'blue11', '"', 'heys', '"', '(', 'Hey', ',', 'hey', ',', 'hey', ',', 'hey', ')', 'Ten', 'fingers', 'tearin', "'", 'out', 'my', 'hairNine

In [3]:
class BPEtokenizer():
    special = ['<UKN>', '<END>', '<PAD>','<MAD>']  #特殊字符填充
    
    def __init__(self,vocab_size = 10000, lowercase = True,basic_tokenizer = wordpunct_tokenize,
                 unk='<UNK>', sep='<SEP>', pad='<PAD>', cls='<CLS>', mask='<MASK>', user_specials=None):
        self.vocal_size = vocab_size
        self.lowercase = lowercase
        self.tokenizer = basic_tokenizer
        self.special = [unk, sep, pad, cls, mask]
        


    def loadAndTransform(self, vocab_fn=None, vocab=None):
        if vocab:
            self.vocab = vocab
        else:
            self.vocab = [l.strip() for l in open(vocab_fn, 'r')]
        vocab_len = len(self. vocab)
        self.voToid = {x: y for x, y in enumerate(self.vocab)} #把字符转换为索引
        self.idTovo = {y: x for x, y in self.voto2d.items()}  #把索引转换为字符
         
    def train(self, corpus = list, max_step = 10000, out_fn = 'vocabulary.txt'):

        #########################################统计词频################################################
        if self.lowercase:
            corpus = [s.lower() for s in corpus]
        
        #map用于把一个函数依次对一个数据结构中的元素使用，并返回一个迭代器
        corpus = list(map(self.tokenizer,corpus))
        
        
        #展平该列表
        corpus = toolz.concat(corpus)
        
        #把每个元素转换成元组并加入结尾符，计算每个单词出现的次数。Counter返回一个元素计数的字典。
        split_corpus = Counter(tuple(word) + ('/w',) for word in corpus)

        #split_corpus = Counter([tuple(word)+ ('<\W>', ) for word in toolz.concat(map(self.tokenizer, corpus))])
        

        ##########################################逐步合并高频词为token并生成词表#################################
        vocab = self._count_vocab(split_corpus)

        for i in range(max_step):
            split_corpus, vocab_cnt = self._countAndMerge(split_corpus)   #保留单词结构，统计一个单词内出现的二元字词的词频
            vocab = self._count_vocab(split_corpus)   #把单词切碎，只统计字符级别的词频
            if len(vocab) > self.vocal_size or vocab_cnt < 0 : break
        
        ##########插入特殊词######################
        for s in self.special:
            if s not in vocab:
                vocab.insert(0,(s,9999))
        
        #####导出列表#####

        with open(out_fn,'w') as f:
            f.write('\n'.join(token for token, _ in vocab))
        
        self.vocab = [token for token, _ in vocab]

        return vocab 


      
    
    def _count_vocab(self, split_corpus): 
        _countWord = Counter([data for data in toolz.concat([w * x  for w, x in split_corpus.items()])])      #.items()顺序访问字典中的元素，enumerate访问元组或者列表
        _sortWord = sorted(_countWord.items(), key = lambda x : x[1], reverse = True) #按第一维度降序排序字符
        return _sortWord
    
    def _countAndMerge(self, split_corpus):
        ngram = 2
        bigramCounter = Counter()

        for token, count in split_corpus.items():  #循环扫描每个单词和其出现频率
            if count < 2 : continue          #跳过小于2的子词
            for subwords in toolz.sliding_window(ngram, token):  #使用2的滑动窗口在单词上滚动
                bigramCounter[subwords] += count  #将每个长度为2的子词的出现次数记录下来

        if len(bigramCounter) > 0 :
            max_bigram_key = max(bigramCounter, key=bigramCounter.get)    #找出最大频率的二元子词，max会循环读取可迭代对象，并对每个对象执行key对应的函数，比较函数计算出的值。（对于字典，读出的是键）
        else: return split_corpus , -1
        
        max_bigram_cnt = bigramCounter.get(max_bigram_key)
        
        list_split_corpus_key = list(split_corpus.keys())
        for tokens in list_split_corpus_key:
        
            temp_tokens = ' '.join(tokens)   #jion方法可以把数据结构中的参数合并成字符串，.前面的符号是合并时每个元素间插入的字符
            
            temp_tokens = temp_tokens.replace(' '.join(max_bigram_key), ''.join(max_bigram_key))   #把原始的token中的分离字符替换为合并在一起的二元高频字符

            new_tokens = tuple(temp_tokens.split(' '))  #split方法通过括号里的字符把字符串分开，返回一个列表，再转换成元组得到例如(I lo v e) 的形式，其中lo是之前统计得到的高频二元字词

            #temp_split_corpus = tuple(' '.join(tokens).replace(' '.join(list_split_corpus_key), ''.join(list_split_corpus_key)).split(' '))
            if  new_tokens != tokens:
                split_corpus[new_tokens] = split_corpus[tokens]
                split_corpus.pop(tokens)
        return split_corpus, max_bigram_cnt
    
    
BPE = BPEtokenizer()
sequence = BPE.train(corpus = corpus)
print(sequence)

    



    


[('<MASK>', 9999), ('<CLS>', 9999), ('<PAD>', 9999), ('<SEP>', 9999), ('<UNK>', 9999), ('a', 27), ('i/w', 25), ('/w', 23), ('you/w', 21), ('o', 20), ('i', 20), ("'/w", 19), ('e', 17), ('r', 16), ('t/w', 15), ('l', 12), ('e/w', 12), ('s/w', 12), ('n', 11), ('u', 11), ('p', 11), ('g', 10), ('d', 10), ('s', 10), (',/w', 9), ('just/w', 9), ('n/w', 9), ('w', 8), ('h', 8), ('wish/w', 8), ('f', 7), ('t', 7), ('were/w', 6), ('is/w', 5), ('gay/w', 5), ('feel/w', 5), ('wanna/w', 5), ('never/w', 5), ('kinda/w', 5), ('m/w', 5), ('v', 4), ('y', 4), ('m', 4), ('a/w', 4), ('h/w', 4), ('all/w', 4), ('can/w', 4), ('hey/w', 4), ('how/w', 4), ('make/w', 4), ('c', 3), ('1', 3), ('k', 3), ('b', 3), ('th', 3), ('er', 3), ('way/w', 3), ('ne/w', 3), ('other/w', 3), ('the/w', 3), ('there/w', 3), ('he', 3), ('(/w', 3), (')/w', 3), ('?/w', 3), ('say/w', 3), ('so/w', 3), ('do/w', 3), ('to/w', 3), ('te', 3), ('okay/w', 3), ('stay/w', 3), ('but/w', 3), ('not/w', 3), ('your/w', 3), ('tell/w', 3), ('much/w', 3), ('di

In [17]:
def countTokens(corpus):
    
        corpus = [s.lower() for s in corpus]
        
        #map用于把一个函数依次对一个数据结构中的元素使用，并返回一个迭代器
        corpus = list(map(wordpunct_tokenize,corpus))
        print(corpus)
        #展平该列表
        corpus = toolz.concat(corpus)
        
        #把每个元素转换成元组并加入结尾符，计算每个单词出现的次数
        split_corpus = Counter(tuple(word) + ('/w',) for word in corpus)
        return split_corpus
print (countTokens(corpus))

[['baby', ',', 'i', 'don', "'", 't', 'feel', 'so', 'good'], ['six', 'words', 'you', 'never', 'understood'], ['i', "'", 'll', 'never', 'let', 'you', 'go'], ['five', 'words', 'you', "'", 'll', 'never', 'say', '(', 'aww', ')'], ['i', 'laugh', 'along', 'like', 'nothing', "'", 's', 'wrong'], ['four', 'days', 'has', 'never', 'felt', 'so', 'long'], ['if', 'three', "'", 's', 'a', 'crowd', 'and', 'two', 'was', 'us'], ['one', 'slipped', 'away', '(', 'hahahahaha', ')'], ['i', 'just', 'wanna', 'make', 'you', 'feel', 'okay'], ['but', 'all', 'you', 'do', 'is', 'look', 'the', 'other', 'way'], ['i', 'can', "'", 't', 'tell', 'you', 'how', 'much', 'i', 'wish', 'i', 'didn', "'", 't', 'wanna', 'stay'], ['i', 'just', 'kinda', 'wish', 'you', 'were', 'gay'], ['is', 'there', 'a', 'reason', 'we', "'", 're', 'not', 'through', '?'], ['is', 'there', 'a', '12', '-', 'step', 'just', 'for', 'you', '?'], ['our', 'conversation', "'", 's', 'all', 'in', 'blue'], ['11', '"', 'heys', '"', '(', 'hey', ',', 'hey', ',', 'hey

In [None]:
def _count_vocab(split_corpus): 
    _countWord = Counter([data for data in toolz.concat([w * x  for w, x in split_corpus.items()])])      #.items()顺序访问字典中的元素，enumerate访问元组或者列表
    print(_countWord)
    print(([w * x  for w, x in split_corpus.items()]))
    _sortWord = sorted(_countWord.items(), key = lambda x : x[1], reverse = True) #按第一维度降序排序字符，返回列表
    return _sortWord

print(_count_vocab(sequence))

Counter({'/w': 238, 'e': 103, 'a': 90, 'o': 82, 'n': 64, 's': 63, 'u': 50, 'h': 49, 'r': 45, 'i': 44, 'l': 41, 't': 40, 'w': 39, 'd': 34, 't/w': 31, 'y': 28, 'y/w': 28, 'i/w': 25, "'": 19, 'm': 19, 'k': 17, 'g': 15, 'f': 14, 'p': 11, 'c': 10, ',': 9, 'v': 9, 'j': 9, 'b': 6, 'x': 4, '(': 3, ')': 3, '?': 3, '1': 3, '"': 2, '2': 1, '-': 1})
[(',', '/w', ',', '/w', ',', '/w', ',', '/w', ',', '/w', ',', '/w', ',', '/w', ',', '/w', ',', '/w'), ('d', 'o', 'n', '/w', 'd', 'o', 'n', '/w'), ("'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w', "'", '/w'), ('f', 'e', 'e', 'l', '/w', 'f', 'e', 'e', 'l', '/w', 'f', 'e', 'e', 'l', '/w', 'f', 'e', 'e', 'l', '/w', 'f', 'e', 'e', 'l', '/w'), ('s', 'o', '/w', 's', 'o', '/w', 's', 'o', '/w'), ('g', 'o', 'o', 'd', '/w'), ('s', 'i', 'x', '/w', 's', 'i', 'x', '/w'), ('w', 'o', 'r', 'd', 's', '/w', 'w', 'o', 'r', 'd

In [21]:
def _countAndMerge( split_corpus):
    ngram = 2
    bigramCounter = Counter()

    for token, count in split_corpus.items():  #循环扫描每个单词和其出现频率
        if count < 2 : continue          #跳过小于2的子词
        for subwords in toolz.sliding_window(ngram, token):  #使用2的滑动窗口在单词上滚动
            bigramCounter[subwords] += count  #将每个长度为2的子词的出现次数记录下来

    if len(bigramCounter) > 0 :
        max_bigram_key = max(bigramCounter, key=bigramCounter.get)    #找出最大频率的二元子词，max会循环读取可迭代对象，并对每个对象执行key对应的函数，比较函数计算出的值。（对于字典，读出的是键）
    else: return split_corpus , -1
    
    max_bigram_cnt = bigramCounter.get(max_bigram_key)
    
    list_split_corpus_key = list(split_corpus.keys())
    for tokens in list_split_corpus_key:
       
        temp_tokens = ' '.join(tokens)   #jion方法可以把数据结构中的参数合并成字符串，.前面的符号是合并时每个元素间插入的字符
        
        temp_tokens = temp_tokens.replace(' '.join(max_bigram_key), ''.join(max_bigram_key))   #把原始的token中的分离字符替换为合并在一起的二元高频字符

        new_tokens = tuple(temp_tokens.split(' '))  #split方法通过括号里的字符把字符串分开，返回一个列表，再转换成元组得到例如(I lo v e) 的形式，其中lo是之前统计得到的高频二元字词

        #temp_split_corpus = tuple(' '.join(tokens).replace(' '.join(list_split_corpus_key), ''.join(list_split_corpus_key)).split(' '))
        if  new_tokens != tokens:
            split_corpus[new_tokens] = split_corpus[tokens]
            split_corpus.pop(tokens)
    return split_corpus, max_bigram_cnt

print(_countAndMerge(sequence))


    




(Counter({('i/w',): 25, ('yo', 'u', '/w'): 21, ("'", '/w'): 19, (',', '/w'): 9, ('t/w',): 9, ('j', 'u', 's', 't/w'): 9, ('w', 'i', 's', 'h', '/w'): 8, ('w', 'e', 'r', 'e', '/w'): 6, ('f', 'e', 'e', 'l', '/w'): 5, ('n', 'e', 'v', 'e', 'r', '/w'): 5, ('w', 'a', 'n', 'n', 'a', '/w'): 5, ('i', 's', '/w'): 5, ('k', 'i', 'n', 'd', 'a', '/w'): 5, ('g', 'a', 'y/w'): 5, ('m', 'a', 'k', 'e', '/w'): 4, ('a', 'l', 'l', '/w'): 4, ('c', 'a', 'n', '/w'): 4, ('h', 'o', 'w', '/w'): 4, ('h', 'e', 'y/w'): 4, ('s', 'o', '/w'): 3, ('(', '/w'): 3, (')', '/w'): 3, ('s', '/w'): 3, ('a', '/w'): 3, ('d', 'o', '/w'): 3, ('t', 'h', 'e', '/w'): 3, ('o', 't', 'h', 'e', 'r', '/w'): 3, ('t', 'e', 'l', 'l', '/w'): 3, ('m', 'u', 'c', 'h', '/w'): 3, ('d', 'i', 'd', 'n', '/w'): 3, ('t', 'h', 'e', 'r', 'e', '/w'): 3, ('?', '/w'): 3, ('t', 'o', '/w'): 3, ('m', '/w'): 3, ('s', 'a', 'y/w'): 3, ('o', 'k', 'a', 'y/w'): 3, ('w', 'a', 'y/w'): 3, ('s', 't', 'a', 'y/w'): 3, ('b', 'u', 't/w'): 3, ('n', 'o', 't/w'): 3, ('yo', 'u', '