### 通过临时环境变量设置keras后端

In [10]:
import os
os.environ['KERAS_BACKEND'] = 'tensorflow'
import re
import codecs
import pickle
import fire

import numpy as np

import keras
from keras.models import Sequential, Model
from keras.optimizers import *  
from keras.layers import Dense, Input, Concatenate ,Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding ,GlobalMaxPooling1D 
from keras.preprocessing.text import Tokenizer  
from keras.preprocessing.sequence import pad_sequences  
from keras.utils.np_utils import to_categorical  
#from gensim.models.word2vec import Word2Vec
#from gensim.models.keyedvectors import KeyedVectors

Using TensorFlow backend.


### 语料库预处理  list(text)<->list(label_id)<->dict(label)
- 文本列表   texts.append()
- 文本标签id列表  labels.append(label_id)
- 标签名：标签id 字典  labels_index[label_name] = label_id  
- label_id = len(labels_index)

```
texts = []  # list of text samples
labels_index = {}  # dictionary mapping label name to numeric id
labels = []  # list of label ids
for name in sorted(os.listdir(TEXT_DATA_DIR)):
    path = os.path.join(TEXT_DATA_DIR, name)
    if os.path.isdir(path):
        label_id = len(labels_index)
        labels_index[name] = label_id
        for fname in sorted(os.listdir(path)):
            if fname.isdigit():
                fpath = os.path.join(path, fname)
                f = open(fpath)
                texts.append(f.read())
                f.close()
                labels.append(label_id)

print('Found %s texts.' % len(texts))
```

### 文本预处理   texts【每个文本转换为id序列，对齐最大文本长度】，label【每个标签转换为独热编码】

```
from keras.preprocessing.text import Tokenizer        #标记生成器
from keras.preprocessing.sequence import pad_sequences   #填充序列

tokenizer = Tokenizer()   #num_words：None或整数，处理的最大单词数量，只处理数据集词典中最常见的
tokenizer.fit_on_texts(texts)  #在文本序列texts上执行标记生成器
sequences = tokenizer.texts_to_sequences(texts)  #文本向量化，将文本转换为词id序列

word_index = tokenizer.word_index   #词id（索引）
print('Found %s unique tokens.' % len(word_index)) #所有唯一id

data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)  #将每个文本的词id序列填充为最大长度

labels = keras.utils.to_categorical(np.asarray(labels))  #将标签id转换为独热编码形式
print('Shape of data tensor:', data.shape)  #文本列表的形状  length*MAX_SEQUENCE_LENGTH
print('Shape of label tensor:', labels.shape)  #标签列表的形状  length*num_classes

# split the data into a training set and a validation set #将数据分割为训练集和验证集
indices = np.arange(data.shape[0])  #文本列表长度
np.random.shuffle(indices)       #混排文本序号
data = data[indices]           #混排后的文本列表
labels = labels[indices]        #对应混排后的标签列表
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0]) #根据验证分割比例计算数据分割点

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]
```

### 读取词向量字典  word->vector
```
embeddings_index = {}
f = open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'))
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))
```

### 加载词向量字典  python3 使用KeyedVectors

### 建立词向量矩阵

### 加载词向量矩阵到嵌入层

### 使用cnn实现文本分类 序列模型
```
from keras.models import Sequential
from keras.layers import Embedding,Conv1D,MaxPooling1D,GlobalMaxPooling1D,Dropout,Dense

model = Sequential()
model.add(Embedding(10000,256,weights=[embedding_matrix],input_length=100,trainable=True))
model.add((100,[3,4,5],strides=[1,1,1],activation='relu'))     #filters:100个卷积核，kernel_size:多个卷积窗口大小
#model.add(MaxPooling1d(pool_size=[100-3+1,100-4+1,100-5+1],strides=[1,1,1]) 
model.add(GlobalMaxPool1D())                  #窗口为样本长度（最大文本长度）-卷积窗口大小+1，步长为1的最大池化层
model.add(Dropout(0.5))                          #防止过拟合，随机丢弃50%的连接
model.add(Dense(len(labels_index),activation='softmax')

model.compile('rmsprop','categorical_crossentropy',['acc'])

model.fit(x_train,y_train,validation_data=(x_val,y_val),epochs=10,batch_size=128)

#score = model.evaluate(x_test,y_test,batch_size=128)
```

### 使用cnn实现文本分类 函数式模型
```
sequence_input = keras.Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')  #后端张量+某些属性=keras张量
embedded_sequences = embedding_layer(sequence_input)     #给层赋值
x = Conv1D(100, 5, activation='relu')(embedded_sequences) #一维卷积层（时序卷积），位置参数，卷积个数，卷积核大小
x = MaxPooling1D(5)(x) #一维池化层，最大池化窗口
x = Conv1D(128, 5, activation='relu')(x)
x = GlobalMaxPooling1D(35)(x)  # global max pooling
x = Flatten()(x)
x = Dense(128, activation='relu')(x)
preds = Dense(len(labels_index), activation='softmax')(x)

model = Model(sequence_input, preds)         #函数式模型`model = Model(input=[a, b], output=c)`，通过两端的输入输出初始化模型
model.compile(loss='categorical_crossentropy', 
              optimizer='rmsprop',
              metrics=['acc'])

# happy learning!
model.fit(x_train, y_train, validation_data=(x_val, y_val),
          nb_epoch=2, batch_size=128)
```

### 模型可视化
```
from keras.utils import plot_model
plot_model(model, to_file='model.png')
```

### TextCNN

### corpus_preprocess

In [7]:
import os
import re
import codecs
import time

import numpy as np

corpus_path =  'train-zhihu6-title-desc-single-100000.txt'
def corpus_preprocess(corpus_path,label_re):
    label_pattern = label_re+'[\w]+'
    corpus_path =  os.path.abspath(corpus_path)
    corpus_size = os.path.getsize(corpus_path)/(1024*1024*1024)
    texts = []
    labels = []
    label_index = {}
    with codecs.open(corpus_path,'r',encoding='utf-8') as f:
        start = time.time()
        if corpus_size > 0:
            while True:
                try:
                    line = f.readline()
                    line = line.strip()
                    re_labels = re.findall(label_pattern,line)
                    if re_labels != None and len(re_labels) > 0:
                        texts.append(re.sub(label_pattern,'',line))
                        for i in re_labels:
                            if i not in label_index:
                                label_id = len(label_index)
                                label_index[i] = label_id
                                labels.append(label_id)
                            else:
                                labels.append(label_index[i])
                except:
                    print(time.time()-start)
                    break       
        else:
            lines = f.readlines()
            for line in lines:
                line = line.strip()
                re_labels = re.findall(label_pattern,line)
                if re_labels != None and len(re_labels) > 0:
                    texts.append(re.sub(label_pattern,'',line))
                    for i in re_labels:
                        if i not in label_index:
                            label_id = len(label_index)
                            label_index[i] = label_id
                            labels.append(label_id)
                        else:
                            labels.append(label_index[i])
            print(time.time()-start)
                
        f.close()
    return texts,labels,label_index

In [8]:
texts,labels,label_index = corpus_preprocess(corpus_path,'__label__')

2060.216057777405


### text_preprocess

### word2vec_preprocess

### Corpus

In [79]:
import os
import re
import time
import gzip
import codecs
import _pickle as pickle
from functools import reduce

import numpy as np

from keras.preprocessing.sequence import pad_sequences  
from keras.utils.np_utils import to_categorical

def _to_categorical(num,max_num):
    arr = np.zeros((1,max_num),dtype=np.int8)
    arr[0][num] = 1
    return arr[0]

def index2categorical(labels,max_num):
    labels = list(np.array(labels).flatten())
    num_labels = len(labels)
    categorical = []
    for num in range(max_num):
        arr = np.zeros((1,max_num),dtype=np.float16) #np.int8
        arr[0][num] = labels.count(num)/num_labels  #prior probability for each label 
        # uniform distribution?
        categorical.append(arr[0])
    return categorical

def normalize(distribution):
    print(distribution)
    total = sum(distribution)
    return [p/total for p in distribution]

class Corpus(object):
    '''
    Build train/dev/test data easily and quickly!
    '''
    def __init__(self,path,word2vec_path=None,label_pattern='__label__[\-\w]+'):
        self.path = os.path.abspath(path)
        self.filename = os.path.basename(self.path).split('.')[0] 
        self.label_pattern = label_pattern
        self.size = round(os.path.getsize(path)/(1024*1024*1024),2)
        self.texts = []
        self.max_text_length = 0
        self.labels = []
        self.word_index = {'__PADDING__':0}
        self.label_index = {}
        self.word2vec_path = word2vec_path

    def preprocess(self):
        start = time.time()
        with codecs.open(self.path,'r',encoding='utf-8') as f:
            for line in f.readlines():
                line = line.strip()
                re_labels = re.findall(self.label_pattern,line)
                text = re.sub(self.label_pattern,'',line)
                # if each line with multilabels
                if re_labels != None and len(re_labels) > 0:# for multilabel
                    word_ids = []
                    for word in text.split(' '):#text preprocess
                        if word not in self.word_index:
                            word_id = len(self.word_index)
                            self.word_index[word] = word_id
                            word_ids.append(word_id)
                        else:
                            word_ids.append(self.word_index[word])
                    word_ids_length = len(word_ids)
                    if word_ids_length > self.max_text_length:
                        self.max_text_length = word_ids_length
                    self.texts.append(word_ids)
                    label_ids = []
                    for label in re_labels:
                        if label not in self.label_index:
                            label_id = len(self.label_index)
                            self.label_index[label] = label_id
                            label_ids.append(label_id)
                        else:
                            label_ids.append(self.label_index[label])
                    self.labels.append(label_ids)
        self.num_words = len(self.word_index)
        self.texts = np.array(pad_sequences(self.texts,
                                   maxlen=self.max_text_length,
                                   padding='post',
                                   truncating='post',
                                   value=0),dtype=np.int32)
        self.num_texts = len(self.texts)
        self.num_classes = len(self.label_index)
        #self.index_label = dict(zip(self.label_index.values(),self.label_index.keys()))#index2label
        #self.labels = np.array([to_categorical(label,self.num_classes)[0] for label in self.labels])
        categorical = index2categorical(self.labels,self.num_classes)
        for index,label_ids in enumerate(self.labels):
            self.labels[index] = sum([categorical[label_id] for label_id in label_ids])
        self.labels = np.array(self.labels)
        print(self.labels[:10])
        self.num_labels = len(self.labels)
        assert self.num_texts == self.num_labels
        # preprocess pretrained word2vec
        if not self.word2vec_path == None: 
            self.embeddings_index = {}
            vectors = 0
            with codecs.open(self.word2vec_path,'r',encoding='utf-8') as f:
                f.readline()
                while True:
                    try:
                        line = f.readline()
                        values = line.split()
                        word = values[0]
                        vectors = np.asarray(values[1:], dtype='float16')#float32
                        self.embeddings_index[word] = vectors
                        f.close()
                    except:
                        break
            self.vector_dim = len(vectors)
            self.embedding_matrix = np.zeros((self.num_words + 1,self.vector_dim))
            for word, index in self.word_index.items():                                 
                if word in self.embeddings_index:                
                    self.embedding_matrix[index] = self.embeddings_index[word]
                else:
                    self.embedding_matrix[index] = np.random.uniform(-1,1,size=(self.vector_dim)) #unlogin word  
            self.num_embeddings = len(self.embeddings_index)
            self.embedding_matrix_shape =self.embedding_matrix.shape
        else:
            self.embedding_matrix = None
        self.preprocess_time = round(time.time() - start,2)

    def summary(self):
        print('path:',self.path,
              '\nfilename:',self.filename,
              '\nlabel_pattern:',self.label_pattern,
              '\nsize: %sGB'%self.size,
              '\nnum_texts:',self.num_texts,
              '\ntexts_shape:',self.texts.shape,
              '\nnum_labels:',self.num_labels,
              '\nlabels_shape:',self.labels.shape,
              '\nnum_words:',self.num_words,
              '\nnum_classes:',self.num_classes,
              '\nmax_text_length:',self.max_text_length,
              '\npreprocess_time: %ss'%self.preprocess_time
             )
        if not self.word2vec_path == None:
            print('num_embeddings:',self.num_embeddings,
                  '\nvector_dim:',self.vector_dim,
                  '\nembedding_matrix_shape:',self.embedding_matrix_shape
             )

    @staticmethod
    def dump(corpus):
        corpus_object_path = os.path.join(os.path.dirname(corpus.path),
                            corpus.filename+'.'+corpus.__class__.__name__+'.pkl.gz')
        with gzip.open(corpus_object_path,'wb') as f:
            pickle.dump(corpus,f)
            print(corpus_object_path,
                ': %sGB'%round(os.path.getsize(corpus_object_path)/(1024*1024*1024),2))

    @staticmethod
    def load(corpus_path):
        corpus_path = os.path.abspath(corpus_path)
        with gzip.open(corpus_path,'rb') as f:
            return pickle.load(f)

    @classmethod
    def transform(cls,corpus):
        corpus.preprocess()
        corpus.summary()
        #cls.dump(corpus)

def main():
    pass

In [80]:
corpus = Corpus('fsd_gs.txt')

In [81]:
Corpus.transform(corpus)

[[ 0.58251953  0.          0.          0.        ]
 [ 0.58251953  0.          0.          0.        ]
 [ 0.58251953  0.          0.          0.        ]
 [ 0.58251953  0.          0.          0.        ]
 [ 0.58251953  0.          0.          0.        ]
 [ 0.58251953  0.          0.          0.        ]
 [ 0.58251953  0.          0.          0.        ]
 [ 0.          0.26611328  0.          0.        ]
 [ 0.          0.          0.11309814  0.        ]
 [ 0.58251953  0.          0.          0.        ]]
path: C:\Users\lxp\Desktop\nlp\quora\kaggle\zhihu\ieee_zhihu_cup\fsd_gs.txt 
filename: fsd_gs 
label_pattern: __label__[\-\w]+ 
size: 0.0GB 
num_texts: 3033 
texts_shape: (3033, 108) 
num_labels: 3033 
labels_shape: (3033, 4) 
num_words: 12139 
num_classes: 4 
max_text_length: 108 
preprocess_time: 0.13s


In [82]:
a = [[0,1],[2,3]]

In [68]:
for i in a:
    print(sum([j+1 for j in i]))

3
7


In [72]:
sum(np.array([0,1]))

1

In [None]:
corpus.labels.size

In [114]:
corpus = Corpus('fsd_gs.txt',)

In [115]:
corpus.preprocess()

In [116]:
corpus.summary()

path: C:\Users\lxp\Desktop\nlp\quora\kaggle\zhihu\ieee_zhihu_cup\fsd_gs.txt 
filename: fsd_gs 
label_pattern: __label__[\-\w]+ 
size: 0.0GB 
num_texts: 3033 
num_labels: 3033 
num_words: 12139 
num_classes: 4 
max_text_length: 108 
preprocess_time: 0.16s


In [117]:
corpus.dump(corpus)

C:\Users\lxp\Desktop\nlp\quora\kaggle\zhihu\ieee_zhihu_cup\fsd_gs.Corpus.pkl.gz : 0.0GB


### 深度学习模型基类

In [207]:
class NN(object):
    '''
    A simple Neural Network inplemetation in keras
    '''
    def __init__(self):
        self.name = self.__class__.__name__
        
    def build(self):
        pass
        
    def compile(self):
        self.model.compile(loss=self.loss, 
                          optimizer=self.optimizer,
                          metrics=self.metrics)

    def summary(self):
        self.model.summary()

    def plot_model(self):
        plot_model(self.model,
                to_file=self.name+'.png',
                show_shapes=True,
                show_layer_names=True)

    def fit(self,x,y,epochs=5,batch_size=128,validation_split=0.1):
        self.model.fit(x,y,validation_split=validation_split,
                       epochs=epochs, batch_size=batch_size)

    def evaluate(self,x_test,y_test,batch_size=128):
        score = self.model.evaluate(x_test, y_test, batch_size)
        
    @staticmethod
    def save_to_yaml(model):
        yaml_string = model.model.to_yaml()
        with gzip.open(model.name+'.config.yml.gz','wb') as f:
            f.write(yaml_string.encode('utf-8'))
        model.model.save_weights(model.name+'.weights.h5')
        
    @staticmethod
    def load_from_yaml(config_path,weights_path):
        with gzip.open(config_path,'rb') as f:
            yaml_string = f.read()
        model = model_from_yaml(yaml_string.decode('utf-8'))
        model.load_weights(weights_path, by_name=True)
        return model

    @staticmethod
    def dump_to_pickle(model):
        with gzip.open(model.name+'.pkl.gz','wb') as f:
            pickle.dump(model,f)

    @staticmethod
    def load_from_pickle(model_pickle_path):
        model_pickle_path = os.path.abspath(model_pickle_path)
        with gzip.open(model_pickle_path,'rb') as f:
            return pickle.load(f)
        
    @classmethod   
    def train(model):
        model.build()
        model.compile()
        model.summary()
        model.fit()
        NN.save_to_yaml(model)
        NN.dump_to_pickle(model)
        
    @staticmethod
    def predict(self):
        pass
#if __name__ == '__main__':
 #   fire.Fire(NN.train)

In [273]:
class TextRNN(NN):
    def __init__(self,**kw):
        print(kw)
        self.a = kw
        del kw
        pass
    def train(num_filters=100, filter_sizes=[1,2,3],):
        pass
    @classmethod
    def test(cls):
        print(type(cls.__name__))

In [274]:
a = {'a':1}

In [275]:
tr = TextRNN(**a)

{'a': 1}


In [276]:
a.update({'b':2})

In [277]:
tr.a

{'a': 1}

In [278]:
TextRNN.test()

<class 'str'>


In [279]:
NN.train

<bound method NN.train of <class '__main__.NN'>>

In [280]:
dir(NN)

['__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 'build',
 'compile',
 'dump_to_pickle',
 'evaluate',
 'fit',
 'load_from_pickle',
 'load_from_yaml',
 'plot_model',
 'predict',
 'save_to_yaml',
 'summary',
 'train']