In [1]:
import collections
import os
import pickle

import numpy as np
import tensorflow as tf
import tensorflow.contrib.keras as kr

In [2]:
# 1.对句子中的汉字、标点符号拆分
# 对于文本分类问题，是否采用中文切词对分类结果会产生多大影响？ 
# 特别是垃圾信息识别中，类似这样：f*ck, 草~泥~马这样的内容，对中文进行分词似乎更不好用 
def cn_token(str_text):
    """Split a sentence into single-character tokens.

    Every element produced by iterating ``str_text`` (Chinese characters,
    punctuation, digits, Latin letters, ...) becomes one token, in order.
    """
    return list(str_text)

# print(cn_token("叙利亚政府军和俄罗斯方面20日对叙利亚首都大马士革东郊东古塔地区发动空袭，导致平民伤亡，一所医院停止运转。test"))

In [3]:
# 2.构建词汇表
# https://github.com/alicelmx/Chinese-classification
def build_vocab(text_list,vocab_size=100):
    """Build a character-level vocabulary and persist it under ``./tmp``.

    Args:
        text_list: iterable of texts; each one is tokenized with ``cn_token``.
        vocab_size: maximum number of distinct tokens to keep, most frequent
            first (tokens with equal counts keep their first-seen order).

    Returns:
        list: ``'<PAD>'`` at index 0 followed by at most ``vocab_size`` tokens.
        An empty ``text_list`` yields ``['<PAD>']``.

    Side effects:
        Writes the vocabulary to ``./tmp/nlp_1_vocab.txt`` (one token per line,
        UTF-8) and to ``./tmp/nlp_1_vocab.pickle``, creating ``./tmp`` if needed.
    """
    # TODO: the original author asked whether an off-the-shelf vocabulary
    # builder could replace this hand-rolled one.
    # Count text by text so the corpus is never flattened into one big list.
    token_counts = collections.Counter()
    for text in text_list:
        token_counts.update(cn_token(text))

    # '<PAD>' comes first so that id 0 is reserved for padding.
    words = ['<PAD>'] + [token for token, _ in token_counts.most_common(vocab_size)]

    # Persist in two formats: plain text and pickle.
    os.makedirs('./tmp', exist_ok=True)
    # Explicit UTF-8 so Chinese tokens do not depend on the platform default encoding.
    with open('./tmp/nlp_1_vocab.txt','w',encoding='utf-8') as f:
        f.write('\n'.join(words) + '\n')
    with open('./tmp/nlp_1_vocab.pickle','wb') as f:
        pickle.dump(words,f)

    return words

def build_vocab_1(text_list):
    """Collect per-token statistics over a corpus as a dict.

    Args:
        text_list: iterable of texts; each one is tokenized with ``cn_token``.

    Returns:
        dict: ``token -> {'doc_count': int, 'token_count': int}`` where
        ``doc_count`` is the number of texts containing the token and
        ``token_count`` is its total number of occurrences. Keys appear in
        first-seen order.
    """
    d_token = {}
    for text in text_list:
        # Counting within one text first lets doc_count grow once per text,
        # no matter how often the token repeats inside it.
        per_text = collections.Counter(cn_token(text))
        for token, occurrences in per_text.items():
            stats = d_token.setdefault(token, {'doc_count':0,'token_count':0})
            stats['doc_count'] += 1
            stats['token_count'] += occurrences
    return d_token

def build_vocab_2(text_list):
    """Collect per-token statistics over a corpus as a list of rows.

    Args:
        text_list: iterable of texts; each one is tokenized with ``cn_token``.

    Returns:
        list: rows of ``[token, doc_count, token_count]`` in first-seen order,
        where ``doc_count`` is the number of texts containing the token and
        ``token_count`` is its total number of occurrences.
    """
    vocabulary = []
    row_of = {}  # token -> index of its row in `vocabulary`, avoids rescanning the list
    for text in text_list:
        # Counting within one text first lets doc_count grow once per text.
        per_text = collections.Counter(cn_token(text))
        for token, occurrences in per_text.items():
            if token not in row_of:
                row_of[token] = len(vocabulary)
                vocabulary.append([token,0,0])
            row = vocabulary[row_of[token]]
            row[1] += 1
            row[2] += occurrences
    return vocabulary

def build_vocab_3(text_list):
    """Return the distinct tokens of a corpus in first-seen order.

    Args:
        text_list: iterable of texts; each one is tokenized with ``cn_token``.

    Returns:
        list: every token that occurs in any text, without duplicates.
    """
    vocabulary = []
    seen = set()  # constant-time membership instead of scanning the growing list
    for text in text_list:
        for token in cn_token(text):
            if token not in seen:
                seen.add(token)
                vocabulary.append(token)
    return vocabulary


# Sample corpus for the demo calls further down: four short Chinese news fragments.
x_text = [
    '叙利亚叙政府军和俄罗斯方面20日',
    '对叙利亚首都大马士革东郊东古塔地区发动空袭',
    '导致平民伤亡',
    '一所医院停止运转',
]
# NOTE(review): gen_dict_1 / gen_dict_2 are not defined in the visible cells;
# presumably earlier names of build_vocab_1 / build_vocab_2 -- confirm before re-enabling.
# print(gen_dict_1(x_text))
# print(gen_dict_2(x_text))
# print(build_vocab(x_text))

In [4]:
# 3.用词向量表示训练数据
def process_file_1(text_list, max_length=21):
    """Convert texts into fixed-length sequences of vocabulary ids.

    Args:
        text_list: texts to encode; the vocabulary is built from these same
            texts via ``build_vocab`` (which also writes it to disk).
        max_length: target length handed to Keras ``pad_sequences``.

    Returns:
        The padded id matrix returned by ``pad_sequences``, one row per text.

    Tokens that did not make it into the (size-capped) vocabulary are skipped,
    matching ``process_file``; looking them up with ``list.index`` used to
    raise ``ValueError``.
    """
    words = build_vocab(text_list)
    # One dict lookup per token instead of a linear list.index() scan.
    word_to_id = {word: idx for idx, word in enumerate(words)}

    data_id = [
        [word_to_id[token] for token in cn_token(text) if token in word_to_id]
        for text in text_list
    ]

    # Use Keras' pad_sequences to bring every id list to a fixed length.
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, max_length)

    return x_pad

def process_file(text_list, max_length=21):
    """Encode each text as a fixed-length sequence of vocabulary ids.

    The vocabulary comes from ``build_vocab(text_list)``; each text is split
    with ``cn_token`` and every token found in the vocabulary is replaced by
    its index. The id lists are then passed to Keras ``pad_sequences`` with
    ``max_length`` and that result is returned.
    """
    vocab = build_vocab(text_list)
    word_to_id = {word: idx for idx, word in enumerate(vocab)}

    encoded = []
    for text in text_list:
        ids = []
        for token in cn_token(text):
            # Tokens absent from the vocabulary are dropped.
            if token in word_to_id:
                ids.append(word_to_id[token])
        encoded.append(ids)

    # pad_sequences brings every id list to max_length so all rows share one length.
    return kr.preprocessing.sequence.pad_sequences(encoded, max_length)

# Demo: encode the sample sentences as padded id sequences and print the matrix.
# NOTE(review): `code` and `process_file_x` are not defined in the visible cells;
# the commented-out calls that mention them are kept only as history.
# print(code(build_vocab(x_text),x_text))
# print(process_file_1(x_text))
print(process_file(x_text))
# process_file_x(x_text)

[[ 0  0  0  0  0  1  2  3  1  5  6  7  8  9 10 11 12 13 14 15 16]
 [17  1  2  3 18 19 20 21 22 23  4 24  4 25 26 27 28 29 30 31 32]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0  0  0 33 34 35 36 37 38]
 [ 0  0  0  0  0  0  0  0  0  0  0  0  0 39 40 41 42 43 44 45 46]]


In [5]:
# 4.one-hot
def one_hot(text_list):
    """Encode each text as a 0/1 presence vector over the vocabulary.

    Args:
        text_list: texts to encode; the vocabulary is built from these same
            texts via ``build_vocab`` (which also writes it to disk).

    Returns:
        list: one list per text, as long as the vocabulary; position ``i`` is
        1 when vocabulary entry ``i`` occurs in the text and 0 otherwise.
        Several positions can be 1 at once, so despite the function name this
        is a multi-hot (bag-of-tokens) encoding.
    """
    vocab = build_vocab(text_list)

    vocabulary = []
    for text in text_list:
        # A set makes each membership test constant-time instead of rescanning the tokens.
        present = set(cn_token(text))
        vocabulary.append([1 if word in present else 0 for word in vocab])

    return vocabulary

# Demo: print the 0/1 presence vectors for the sample sentences (one list per sentence).
print(one_hot(x_text))

[[0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0], [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]]
