In [1]:
import os
import getConfig
import jieba
from zhon.hanzi import punctuation
import re
import collections

In [2]:
# @save
def get_convs():
    """Read the raw dialogue file and group its utterances into conversations.

    The file path and the two single-character line markers are taken from
    ``getConfig.get_config()``: ``resource_data`` is the corpus path, a line
    whose first character equals ``gConfig['e']`` starts a new conversation,
    and a line whose first character equals ``gConfig['m']`` carries one
    utterance (the second space-separated field of the line).

    Returns:
        list[list[str]]: one inner list of utterance strings per conversation.
        Empty conversations are not emitted.
    """
    gConfig = getConfig.get_config()
    conv_path = gConfig['resource_data']
    # Compiled once instead of being rebuilt for every line.
    punct_re = re.compile(r"[%s]+" % punctuation)
    convs = []  # all conversations collected so far
    with open(conv_path, encoding='utf-8') as f:
        one_conv = []  # utterances of the conversation currently being read
        for line in f:
            # Strip the newline and remove '?' characters; the original author
            # notes these are leftover segmentation marks, the text being
            # re-segmented with jieba later.
            line = line.strip('\n').replace('?', '')
            # Drop every character listed in zhon.hanzi's `punctuation`.
            line = punct_re.sub("", line)
            if line == '':
                continue
            if line[0] == gConfig['e']:
                # Conversation boundary: store the finished one, start afresh.
                if one_conv:
                    convs.append(one_conv)
                one_conv = []
            elif line[0] == gConfig['m']:
                fields = line.split(' ')
                if len(fields) < 2:
                    # Marker without any payload: nothing to store (previously
                    # this raised IndexError).
                    continue
                tmp = fields[1]
                # Utterances containing '=' are skipped.
                if '=' not in tmp:
                    one_conv.append(tmp)
        # The file may end without a trailing boundary line; without this
        # flush the last conversation would be silently lost.
        if one_conv:
            convs.append(one_conv)
    return convs
# Load every conversation once and preview the first three of them.
convs = get_convs()
print(convs[:3])

[['呵呵', '是王若猫的'], ['不是', '那是什么'], ['怎么了', '我很难过安慰我~']]


In [3]:
# @save
def tokenize(convs, token='word'):  # "word": whole words, "char": single characters
    """Turn conversations into tokenized [question, answer] pairs.

    Args:
        convs: Conversations, each a sequence of utterance strings.
        token: 'word' to segment every utterance with jieba, 'char' to split
            it into single characters.

    Returns:
        A list of ``[ask_tokens, answer_tokens]`` pairs, or ``None`` (after an
        error message has been printed) when ``token`` is not a known mode.
    """
    if token == 'word':
        # jieba.cut yields the words lazily; materialise them as a list.
        def split(sentence):
            return list(jieba.cut(sentence))
    elif token == 'char':
        split = list
    else:
        print('ERROR: unknown token type: ' + token)
        return None

    pairs = []
    for dialogue in convs:
        if len(dialogue) == 1:
            continue
        # Utterances are taken as alternating question/answer, so an unpaired
        # trailing utterance is dropped to keep the count even.
        turns = dialogue[:-1] if len(dialogue) % 2 != 0 else dialogue
        # Even positions are the questions, odd positions the answers.
        for question, reply in zip(turns[0::2], turns[1::2]):
            pairs.append([split(question), split(reply)])
    return pairs



In [4]:
# Word-level tokenization of all conversations; print the first five
# tokenized pairs next to the raw conversations they came from.
tokens = tokenize(convs, "word")
print("tokens:", tokens[:5], '\n', "convs:", convs[:5])

Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\RuiLi\AppData\Local\Temp\jieba.cache
Loading model cost 0.578 seconds.
Prefix dict has been built succesfully.
tokens: [[['呵呵'], ['是', '王若', '猫', '的']], [['不是'], ['那', '是', '什么']], [['怎么', '了'], ['我', '很', '难过', '安慰', '我', '~']], [['开心', '点哈', ',', '一切', '都', '会', '好', '起来'], ['嗯']], [['我', '还', '喜欢', '她', ',', '怎么办'], ['我', '帮', '你', '告诉', '她', '发短信', '还是', '打电话']]] 
 convs: [['呵呵', '是王若猫的'], ['不是', '那是什么'], ['怎么了', '我很难过安慰我~'], ['开心点哈,一切都会好起来', '嗯'], ['我还喜欢她,怎么办', '我帮你告诉她发短信还是打电话']]


In [5]:
def count_corpus(tokens):  #@save
    """Count token frequencies.

    Args:
        tokens: A flat list of tokens, or lists of tokens nested to any depth
            (e.g. the ``[[ask, answer], ...]`` structure built in this
            notebook). Only ``list`` objects are treated as containers, so
            hashable non-list items such as strings or tuples are counted as
            tokens.

    Returns:
        collections.Counter mapping each token to its number of occurrences,
        with keys in first-seen order.
    """
    def _leaves(items):
        # Depth-first walk that yields every non-list item in order. The
        # previous fixed three-level flatten split the strings of a 2D input
        # into single characters.
        for item in items:
            if isinstance(item, list):
                yield from _leaves(item)
            else:
                yield item

    return collections.Counter(_leaves(tokens))

In [6]:
# Count every token of the corpus, then peek at the first 11 entries
# of the counter (in the counter's iteration order, not sorted by frequency).
ct = count_corpus(tokens)
i = 0
for token, freq in ct.items():
    i = i + 1
    print(token, freq)
    # Stop after the 11th entry has been printed.
    if i>10:
        break
   

呵呵 8751
是 108569
王若 16
猫 338
的 141211
不是 13468
那 23474
什么 26808
怎么 17398
了 109369
我 187151


In [7]:
class Vocab:  # @save
    """Bidirectional mapping between tokens and integer indices.

    Index 0 is the unknown token ``'<unk>'``, followed by the reserved
    tokens, followed by the corpus tokens in descending frequency order.

    Attributes are documented where they are assigned in ``__init__``.
    """

    def __init__(self, tokens=None, min_freq=0, reserved_tokens=None):
        """Build the vocabulary.

        Args:
            tokens: Token list (flat or nested) handed to ``count_corpus``;
                ``None`` is treated as an empty corpus.
            min_freq: Corpus tokens occurring fewer times than this are left
                out of the vocabulary.
            reserved_tokens: Tokens placed right after ``'<unk>'`` whatever
                their frequency; ``None`` means no reserved tokens.
        """
        # None defaults avoid sharing one mutable list between all calls.
        if tokens is None:
            tokens = []
        if reserved_tokens is None:
            reserved_tokens = []
        counter = count_corpus(tokens)
        # (token, frequency) pairs, most frequent first.
        self.token_freqs = sorted(counter.items(), key=lambda x: x[1], reverse=True)
        # Index of the unknown token, returned for out-of-vocabulary lookups.
        self.unk = 0
        uniq_tokens = ['<unk>'] + list(reserved_tokens)
        # Set lookup keeps the "already listed" check constant-time.
        already_listed = set(uniq_tokens)
        uniq_tokens += [token for token, freq in self.token_freqs
                        if freq >= min_freq and token not in already_listed]
        # idx_to_token: list indexed by id; token_to_idx: the reverse lookup.
        self.idx_to_token, self.token_to_idx = [], dict()
        for token in uniq_tokens:
            self.idx_to_token.append(token)
            self.token_to_idx[token] = len(self.idx_to_token) - 1

    def __len__(self):
        """Return the number of entries in the vocabulary."""
        return len(self.idx_to_token)

    def __getitem__(self, tokens):
        """Map a token, or a (possibly nested) list/tuple of tokens, to indices.

        Tokens missing from the vocabulary map to ``self.unk``.
        """
        if not isinstance(tokens, (list, tuple)):
            return self.token_to_idx.get(tokens, self.unk)
        return [self.__getitem__(token) for token in tokens]

    def to_tokens(self, indices):
        """Map an index, or a (possibly nested) list/tuple of indices, to tokens."""
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        # Call the method recursively; subscripting it (`self.to_tokens[idx]`)
        # raised TypeError for every list or tuple input.
        return [self.to_tokens(idx) for idx in indices]

In [8]:
# Build the vocabulary from the tokenized corpus and show its first ten
# (token, index) entries.
vocab = Vocab(tokens)
print(list(vocab.token_to_idx.items())[:10])

[('<unk>', 0), ('你', 1), ('我', 2), ('的', 3), ('了', 4), ('是', 5), (',', 6), ('不', 7), ('啊', 8), ('~', 9)]


In [9]:
# Show tokenized pairs 0 and 10 next to their vocabulary indices.
for i in [0, 10]:
    print("words", tokens[i])
    print('indices:', vocab[tokens[i]])

words [['呵呵'], ['是', '王若', '猫', '的']]
indices: [[60], [5, 10629, 1107, 3]]
words [['许兵', '是', '谁'], ['是', '我', '善良', '可爱', '的', '主人', '的', '老公', '啊']]
indices: [[16623, 5, 27], [5, 2, 774, 91, 3, 35, 3, 163, 8]]


In [10]:
def load_corpus_xiaohuangji50w_nofenci(max_tokens=-1):  # @save
    """Load the dialogue corpus as one flat list of token indices.

    Args:
        max_tokens: When positive, keep only the first ``max_tokens`` indices;
            otherwise the whole corpus is returned.

    Returns:
        tuple: ``(corpus, vocab)``, where ``corpus`` is the flat list of
        vocabulary indices and ``vocab`` is the ``Vocab`` built from the
        word-level tokens.
    """
    dialogues = get_convs()
    qa_pairs = tokenize(dialogues, token="word")
    vocab = Vocab(qa_pairs)

    # Walk pair -> sentence -> word and append each word's index in order.
    corpus = []
    for pair in qa_pairs:
        for sentence in pair:
            corpus.extend(vocab[word] for word in sentence)

    if max_tokens > 0:
        # Truncate to the requested number of leading tokens.
        del corpus[max_tokens:]
    return corpus, vocab

# Rebuild corpus and vocabulary through the loader, then display the result.
# NOTE(review): the last line displays the complete token_freqs list, which
# makes the cell output very long.
corpus, vocab = load_corpus_xiaohuangji50w_nofenci()
vocab.token_freqs, len(corpus), len(vocab)

([('你', 260265),
  ('我', 187151),
  ('的', 141211),
  ('了', 109369),
  ('是', 108569),
  (',', 53891),
  ('不', 47106),
  ('啊', 38570),
  ('~', 32130),
  ('好', 30579),
  ('说', 28503),
  ('吗', 27327),
  ('什么', 26808),
  ('就', 24647),
  ('那', 23474),
  ('在', 23044),
  ('吃', 22918),
  ('有', 22861),
  ('他', 22745),
  ('么', 22511),
  ('*', 21891),
  ('吧', 21172),
  ('啦', 20862),
  ('都', 19825),
  ('去', 19194),
  ('鸡', 17672),
  ('谁', 17433),
  ('怎么', 17398),
  ('也', 17325),
  ('爱', 17322),
  ('知道', 17315),
  ('给', 16283),
  ('喜欢', 16127),
  ('想', 16086),
  ('主人', 15618),
  ('会', 15247),
  ('就是', 13818),
  ('不是', 13468),
  ('还', 12528),
  ('!', 12410),
  ('呢', 12290),
  ('人', 11899),
  ('个', 11845),
  ('要', 11778),
  ('没', 11608),
  ('不要', 11566),
  ('嘛', 11185),
  ('小通', 11133),
  ('和', 10905),
  ('她', 10626),
  ('很', 10579),
  ('来', 10227),
  ('嗯', 10019),
  ('一个', 9782),
  ('^', 9562),
  ('对', 9252),
  ('小', 9094),
  ('傻', 9034),
  ('才', 8969),
  ('呵呵', 8751),
  ('叫', 8739),
  ('这', 8444),
 