In [1]:
import collections
import os
import pickle

import numpy as np
import tensorflow as tf
import tensorflow.contrib.keras as kr

In [2]:
# 1. Split a sentence into individual Chinese characters and punctuation marks.
# Open question: for text classification, how much does Chinese word
# segmentation actually affect the result?
# For spam detection in particular, obfuscated content such as "f*ck" or
# "草~泥~马" seems to be handled even worse by a word segmenter.
def cn_token(str_text):
    """Split text into a list with one entry per character.

    No word segmentation is done: letters, digits, Chinese characters,
    punctuation and spaces each become their own token.
    """
    return list(str_text)

# Demo: mixed Chinese/ASCII text is split one character per token, spaces included.
print(cn_token("利用tensorflow框架实现CNN中文文本分类 - GitHub"))

['利', '用', 't', 'e', 'n', 's', 'o', 'r', 'f', 'l', 'o', 'w', '框', '架', '实', '现', 'C', 'N', 'N', '中', '文', '文', '本', '分', '类', ' ', '-', ' ', 'G', 'i', 't', 'H', 'u', 'b']


In [3]:
# 2. Build the vocabulary
# https://github.com/alicelmx/Chinese-classification
def build_vocab(text_list,vocab_size=100):
    """Build a frequency-ranked character vocabulary and persist it to disk.

    Open question from the author: is there a ready-made, higher-level tool
    for building vocabularies?

    Args:
        text_list: iterable of strings; each one is tokenised with ``cn_token``.
        vocab_size: maximum vocabulary length including the ``'<PAD>'`` entry,
            so at most ``vocab_size - 1`` distinct tokens are kept.

    Returns:
        list: ``'<PAD>'`` at index 0 followed by the kept tokens, most
        frequent first. Input without any token yields ``['<PAD>']``.

    Side effects:
        Writes the vocabulary to ``./tmp/nlp_1_vocab.txt`` (one token per
        line, UTF-8) and to ``./tmp/nlp_1_vocab.pickle``, creating ``./tmp``
        when it does not exist yet.
    """
    # Count text by text instead of first collecting every token of the
    # corpus in one list; only the counter has to fit in memory.
    counter = collections.Counter()
    for text in text_list:
        counter.update(cn_token(text))

    count_pairs = counter.most_common(vocab_size-1)
    # A comprehension rather than `words, _ = zip(*count_pairs)`: unpacking
    # the zip raises ValueError when there are no (token, count) pairs.
    # '<PAD>' comes first so that id 0 can be used to pad all texts to the
    # same length.
    words = ['<PAD>'] + [word for word, _ in count_pairs]

    # Persist in two formats: plain text and pickle.
    os.makedirs('./tmp', exist_ok=True)
    # Explicit UTF-8 so Chinese tokens do not depend on the platform's
    # default encoding.
    with open('./tmp/nlp_1_vocab.txt','w', encoding='utf-8') as f:
        f.write('\n'.join(words) + '\n')
    with open('./tmp/nlp_1_vocab.pickle','wb') as f:
        pickle.dump(words,f)

    return words

def build_vocab_4(text_list,vocab_size=100):
    """Build a frequency-ranked character vocabulary without touching disk.

    Open question from the author: is there a ready-made, higher-level tool
    for building vocabularies?

    Args:
        text_list: iterable of strings; each one is tokenised with ``cn_token``.
        vocab_size: maximum vocabulary length including the ``'<PAD>'`` entry,
            so at most ``vocab_size - 1`` distinct tokens are kept.

    Returns:
        list: ``'<PAD>'`` at index 0 followed by the kept tokens, most
        frequent first. Input without any token yields ``['<PAD>']``.
    """
    counter = collections.Counter()
    for text in text_list:
        counter.update(cn_token(text))

    count_pairs = counter.most_common(vocab_size-1)
    # A comprehension rather than `words, _ = zip(*count_pairs)`: unpacking
    # the zip raises ValueError when there are no (token, count) pairs.
    words = ['<PAD>'] + [word for word, _ in count_pairs]

    return words

def build_vocab_1(text_list):
    """Collect the distinct tokens of all texts in order of first appearance.

    Args:
        text_list: iterable of strings; each one is tokenised with ``cn_token``.

    Returns:
        list: every distinct token exactly once, ordered by first occurrence.
        There is no ``'<PAD>'`` entry and no size limit.
    """
    # The set gives constant-time "already seen?" checks; the list keeps the
    # first-appearance order that the result must have.
    seen = set()
    vocabulary = []
    for text in text_list:
        for t in cn_token(text):
            if t not in seen:
                seen.add(t)
                vocabulary.append(t)
    return vocabulary

def build_vocab_2(text_list):
    """Count, per token, the documents containing it and its total occurrences.

    Args:
        text_list: iterable of strings; each one is tokenised with ``cn_token``.

    Returns:
        dict: ``token -> {'doc_count': int, 'token_count': int}`` where
        ``doc_count`` is the number of texts containing the token at least
        once and ``token_count`` is the number of occurrences over all texts.
        Keys are inserted in order of first appearance.
    """
    d_token = {}
    for text in text_list:
        tokens = cn_token(text)
        for t in tokens:
            if t not in d_token:
                d_token[t] = {'doc_count':0,'token_count':0}
            d_token[t]['token_count'] += 1
        # Document frequency: each distinct token of this text counts once,
        # however often it is repeated inside the text.
        for t in set(tokens):
            d_token[t]['doc_count'] += 1
    return d_token

def build_vocab_3(text_list):
    """List-based variant of ``build_vocab_2``.

    Args:
        text_list: iterable of strings; each one is tokenised with ``cn_token``.

    Returns:
        list: one ``[token, doc_count, token_count]`` entry per distinct
        token, ordered by first appearance. ``doc_count`` is the number of
        texts containing the token, ``token_count`` the number of occurrences
        over all texts.
    """
    vocabulary = []
    # token -> position in `vocabulary`, so an entry is found without
    # rebuilding and scanning the list of known tokens for every token.
    position = {}
    for text in text_list:
        seen_in_doc = set()
        for t in cn_token(text):
            if t not in position:
                position[t] = len(vocabulary)
                vocabulary.append([t,0,0])
            entry = vocabulary[position[t]]
            entry[2] += 1
            # Only the first occurrence inside a text raises doc_count.
            if t not in seen_in_doc:
                seen_in_doc.add(t)
                entry[1] += 1
    return vocabulary


# Toy corpus of four short Chinese sentences used by the demo calls below.
x_text = ['叙利亚叙政府军和俄罗斯方面20日','对叙利亚首都大马士革东郊东古塔地区发动空袭','导致平民伤亡', '一所医院停止运转']
# print(build_vocab_3(x_text))
# print(build_vocab_2(x_text))
# build_vocab also writes the vocabulary files under ./tmp; build_vocab_4 only
# returns the list. Both print the same vocabulary for this corpus.
print(build_vocab(x_text))
print(build_vocab_4(x_text))

['<PAD>', '叙', '利', '亚', '东', '政', '府', '军', '和', '俄', '罗', '斯', '方', '面', '2', '0', '日', '对', '首', '都', '大', '马', '士', '革', '郊', '古', '塔', '地', '区', '发', '动', '空', '袭', '导', '致', '平', '民', '伤', '亡', '一', '所', '医', '院', '停', '止', '运', '转']
['<PAD>', '叙', '利', '亚', '东', '政', '府', '军', '和', '俄', '罗', '斯', '方', '面', '2', '0', '日', '对', '首', '都', '大', '马', '士', '革', '郊', '古', '塔', '地', '区', '发', '动', '空', '袭', '导', '致', '平', '民', '伤', '亡', '一', '所', '医', '院', '停', '止', '运', '转']


In [4]:
# 3. Represent the training data numerically (as token ids)
def process_file_1(text_list, max_length=21):
    """Convert texts to fixed-length rows of vocabulary ids.

    The vocabulary is built from ``text_list`` itself with ``build_vocab``
    (which also writes the vocabulary files to disk). Tokens that did not
    make it into the size-limited vocabulary are skipped, as in
    ``process_file``.

    Args:
        text_list: sequence of strings. It is traversed twice, so it must not
            be a one-shot iterator.
        max_length: length every id sequence is padded or truncated to.

    Returns:
        The result of Keras ``pad_sequences`` called with its default
        padding/truncating settings (``'pre'`` according to the signature
        quoted in ``process_file``); 0, the ``'<PAD>'`` id, is the filler.
    """
    words = build_vocab(text_list)
    # One dictionary lookup per token instead of a linear words.index() scan,
    # which additionally raises ValueError for out-of-vocabulary tokens.
    word_to_id = dict(zip(words, range(len(words))))

    data_id = [
        [word_to_id[t] for t in cn_token(text) if t in word_to_id]
        for text in text_list
    ]

    # Use Keras' pad_sequences to bring every text to the same length.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)

    return x_pad

def process_file(text_list, max_length=25):
    """Convert texts to fixed-length rows of vocabulary ids, padded at the end.

    The vocabulary is built from ``text_list`` itself with ``build_vocab``
    (which also writes the vocabulary files to disk). Tokens missing from the
    size-limited vocabulary are dropped.

    Args:
        text_list: sequence of strings. It is traversed twice, so it must not
            be a one-shot iterator.
        max_length: length every id sequence is padded or truncated to.

    Returns:
        The result of Keras ``pad_sequences`` with ``padding='post'``: short
        sequences are filled with 0 (the ``'<PAD>'`` id) at the end.

    Side effects:
        Prints the shape of the unpadded id data and the token count of the
        longest text.
    """
    words = build_vocab(text_list)
    word_to_id = dict(zip(words, range(len(words))))

    # Tokenise once; the token lists feed both the id conversion and the
    # max-length diagnostic.
    contents = [cn_token(txt) for txt in text_list]
    data_id = [[word_to_id[x] for x in tokens if x in word_to_id]
               for tokens in contents]

    # The id lists usually differ in length. dtype=object keeps the shape
    # diagnostic working on NumPy versions that refuse to build an array from
    # ragged nested lists implicitly.
    x = np.array(data_id, dtype=object)
    print(x.shape)

    print("max_doc_len:" + str(max([len(tokens) for tokens in contents]) ))

    # Use Keras' pad_sequences to bring every text to the same length.
    # Signature, for reference:
    # pad_sequences(sequences, maxlen=None, dtype='int32',
    # padding='pre', truncating='pre', value=0.)
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length,padding='post')
    return x_pad

def process_file_2(x_text,max_document_length=25):
    """Convert texts to fixed-length id rows with TensorFlow's VocabularyProcessor.

    Args:
        x_text: the texts to encode.
        max_document_length: length of every output row.

    Returns:
        numpy.ndarray built from the rows produced by ``fit_transform``.
    """
    # VocabularyProcessor(max_document_length, min_frequency=0, vocabulary=None, tokenizer_fn=None)
    # max_document_length: maximum document length. Longer texts are cut,
    #   shorter ones are padded with 0.
    # min_frequency: minimum frequency; tokens occurring less often are not
    #   added to the vocabulary.
    # vocabulary: a CategoricalVocabulary object.
    # tokenizer_fn: the tokenising function.
    # NOTE(review): VocabularyProcessor presumably calls tokenizer_fn on the
    # whole iterable of documents rather than on each document; cn_token then
    # returns the documents unchanged and iterating each string yields its
    # characters. Confirm against the tf.contrib.learn documentation.
    vocab_pro = tf.contrib.learn.preprocessing.VocabularyProcessor(max_document_length,
                                                                   min_frequency=0,
                                                                   tokenizer_fn=cn_token)
    # A single fit_transform pass is enough; the former second pass only
    # filled a local that was never read.
    x = np.array(list(vocab_pro.fit_transform(x_text)))
    return x

# Same toy corpus as in the vocabulary cell, redefined here.
x_text = ['叙利亚叙政府军和俄罗斯方面20日','对叙利亚首都大马士革东郊东古塔地区发动空袭','导致平民伤亡', '一所医院停止运转']

# print(code(build_vocab(x_text),x_text))
# print(process_file_1(x_text))
# print(process_file(x_text))
# Pad every text to 22 ids; the longest text has 21 tokens (see "max_doc_len"
# in the output).
max_document_length=22
print(process_file(x_text,max_document_length))
print('~~~')
# NOTE(review): despite its name, `labels` holds the padded id matrix of the
# input texts, not class labels.
labels = process_file_2(x_text,max_document_length)
print(labels)

# one-hot ?
# targets = tf.one_hot(labels, max_document_length, on_value=1.0, off_value=0.0, axis=-1)
# with tf.Session() as sess:
#     print(sess.run(targets))

# Author's question: why do some ids differ between the two encodings?
# Author's guess: when sorting by count, tokens with equal counts may end up
# in arbitrary order.
# NOTE(review): in the output the ids diverge at '东', which occurs twice and
# therefore gets id 4 from build_vocab's frequency ranking; process_file_2
# appears to number tokens in order of first appearance instead - confirm.

(4,)
max_doc_len:21
[[ 1  2  3  1  5  6  7  8  9 10 11 12 13 14 15 16  0  0  0  0  0  0]
 [17  1  2  3 18 19 20 21 22 23  4 24  4 25 26 27 28 29 30 31 32  0]
 [33 34 35 36 37 38  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [39 40 41 42 43 44 45 46  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]
~~~
[[ 1  2  3  1  4  5  6  7  8  9 10 11 12 13 14 15  0  0  0  0  0  0]
 [16  1  2  3 17 18 19 20 21 22 23 24 23 25 26 27 28 29 30 31 32  0]
 [33 34 35 36 37 38  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [39 40 41 42 43 44 45 46  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [5]:
# Declare the word-embedding matrix. Convert the words of a sentence to
# indices, then turn the indices into one-hot vectors; these vectors form an
# identity matrix.
# Then use that matrix to look up the coefficient vector of every word and
# add it to the word coefficient vectors.
# NOTE(review): the code below does not create an embedding matrix. It passes
# `labels` (the id matrix) as `params` and fetches its rows 0-3, so the
# printed result is `labels` itself.

# embedding_lookup(params, 
# ids, 
# partition_strategy='mod', 
# name=None, 
# validate_indices=True, 
# max_norm=None)
# Returns the rows of params selected by ids, in the order given by ids.
# E.g. ids=[1,3,2] returns a tensor made of rows 1, 3 and 2 of params.
x_embed = tf.nn.embedding_lookup(labels,[0,1,2,3])

# Error recorded by the author (presumably raised when ids contains an index
# of 4 or more, since `labels` has only 4 rows):
#InvalidArgumentError: indices[0,4] = 4 is not in [0, 4)

# NOTE(review): the session is not closed in the visible cells; later cells
# may still use `sess`.
sess = tf.Session()
sess.run(tf.global_variables_initializer())
print(sess.run(x_embed)) 

# x_col_sums = tf.reduce_sum(x_embed, 0)  # why sum here?


[[ 1  2  3  1  4  5  6  7  8  9 10 11 12 13 14 15  0  0  0  0  0  0]
 [16  1  2  3 17 18 19 20 21 22 23 24 23 25 26 27 28 29 30 31 32  0]
 [33 34 35 36 37 38  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0  0]
 [39 40 41 42 43 44 45 46  0  0  0  0  0  0  0  0  0  0  0  0  0  0]]


In [6]:
# 4. one-hot?
def one_hot(text_list):
    """Encode each text as a 0/1 presence vector over the vocabulary.

    Strictly speaking this is a multi-hot (bag-of-characters) encoding: one
    vector per text, not one vector per token. The vocabulary is built from
    ``text_list`` itself with ``build_vocab`` (which also writes the
    vocabulary files to disk).

    Args:
        text_list: sequence of strings. It is traversed twice, so it must not
            be a one-shot iterator.

    Returns:
        list: one list per text, as long as the vocabulary, holding 1 where
        the vocabulary entry occurs in the text and 0 elsewhere. The
        ``'<PAD>'`` position stays 0 for string input because tokens are
        single characters.
    """
    text_dict = build_vocab(text_list)

    vocabulary = []
    for text in text_list:
        # Set membership keeps the per-entry check constant-time.
        present = set(cn_token(text))
        vocabulary.append([1 if t in present else 0 for t in text_dict])

    return vocabulary

# Demo: one presence vector per text of the toy corpus `x_text` defined in an earlier cell.
print(one_hot(x_text))

[[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]
