In [7]:
# tool:
# Deep learning：keras
# Traditional machine learning：sklearn

# The model involved in the training are as follows：
# CNN, Wordvec+CNN
# LSTM, word2vec+LSTM
# KNN
# SVM
# naive bayes
# Logistic Regression
# Random forest classifier
# Decition tree classifier

In [8]:
# !pip install keras 
# !pip install tensorflow==2.2
# !pip install numpy
# !pip install gensim

# print(np.__version__)
# print(tensorflow.__version__)

In [9]:
def load_text(train_texts,train_labels,test_texts,test_labels):
    #coding:utf-8
    print ('*load texts:')
    train_texts = open(train_texts,encoding='UTF-8').read().split('\n')
    train_labels = open(train_labels,encoding='UTF-8').read().split('\n')
    test_texts = open(test_texts,encoding='UTF-8').read().split('\n')
    test_labels = open(test_labels,encoding='UTF-8').read().split('\n')
    all_texts = train_texts + test_texts
    all_labels = train_labels + test_labels
    return all_texts,all_labels,train_texts,train_labels,test_texts,test_labels
    
def creat_tokenizer(all_texts):
    print ('*tokenizer:')
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.utils import to_categorical
    import numpy as np

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_texts)
    sequences = tokenizer.texts_to_sequences(all_texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(all_labels))
    print('The shape of data tensor:', data.shape)
    print('The shape of label tensor:', labels.shape)
    return data,labels,word_index

In [10]:
import tensorflow.keras
import keras

MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 400
VALIDATION_SPLIT = 0.15
TEST_SPLIT = 0.2

all_texts,all_labels,train_texts,train_labels,test_texts,test_labels=load_text('text_stopwords.txt','categroy.txt','new_text_stopwords.txt','ner_cate.txt')

data,labels,word_index=creat_tokenizer(all_texts)

print ('*split data set:')
# split the data into training set, validation set, and test set
p1 = int(len(data)*(1-VALIDATION_SPLIT-TEST_SPLIT))
p2 = int(len(data)*(1-TEST_SPLIT))
x_train = data[:p1]
y_train = labels[:p1]
x_val = data[p1:p2]
y_val = labels[p1:p2]
x_test = data[p2:]
y_test = labels[p2:]
print ('train docs: '+str(len(x_train)))
print ('val docs: '+str(len(x_val)))
print ('test docs: '+str(len(x_test)))

*load texts:
*tokenizer:
Found 205393 unique tokens.
The shape of data tensor: (14104, 100)
The shape of label tensor: (14104, 14)
*split data set:
train docs: 9167
val docs: 2116
test docs: 2821


In [11]:
def CNN_define(embedding_layer,embedding_dim,max_sequence_length):
    #define for CNN
    print ('*define model CNN:')
    from keras.layers import Dense, Input, Flatten, Dropout
    from keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalMaxPooling1D
    from keras.models import Sequential
#     from keras.utils import plot_model

    model = Sequential()
    if embedding_layer==None:  
        model.add(Embedding(len(word_index) + 1, embedding_dim, input_length=max_sequence_length))
    else:
        model.add(embedding_layer)
    model.add(Dropout(0.2))
    model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
    model.add(MaxPooling1D(3))
    model.add(Flatten())
    model.add(Dense(embedding_dim, activation='relu'))
    model.add(Dense(labels.shape[1], activation='softmax'))
    model.summary()
#     plot_model(model, to_file='model.png',show_shapes=True)

    model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])
    print (model.metrics_names)
    return model

def LSTM_define(embedding_layer,embedding_dim,max_sequence_length):
    #trainning for lstm
    print ('*define model lstm：')
    from keras.layers import Dense, Input, Flatten, Dropout
    from keras.layers import LSTM, Embedding
    from keras.models import Sequential

    model = Sequential()
    if embedding_layer==None:
        model.add(Embedding(len(word_index) + 1, embedding_dim, input_length=max_sequence_length))
    else:
        model.add(embedding_layer)
    model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    model.add(Dense(labels.shape[1], activation='softmax'))
    model.summary()

    model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])
    print (model.metrics_names)
    return model
    
def model_train(model_name,model):
    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=128)
    model.save(model_name)

    print ('*testing model:')
    print (model.evaluate(x_test, y_test))

In [12]:
model=CNN_define(None,EMBEDDING_DIM,MAX_SEQUENCE_LENGTH)
model_train('CNN.h5',model)

*define model CNN:
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 100, 400)          82157600  
_________________________________________________________________
dropout_2 (Dropout)          (None, 100, 400)          0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 98, 250)           300250    
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 32, 250)           0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_3 (Dense)              (None, 400)               3200400   
_________________________________________________________________
dense_4 (Dense)              (None,

In [13]:
model=LSTM_define(None,EMBEDDING_DIM,MAX_SEQUENCE_LENGTH)
model_train('LSTM.h5',model)

*define model lstm：
Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 400)          82157600  
_________________________________________________________________
lstm_1 (LSTM)                (None, 200)               480800    
_________________________________________________________________
dropout_3 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_5 (Dense)              (None, 14)                2814      
Total params: 82,641,214
Trainable params: 82,641,214
Non-trainable params: 0
_________________________________________________________________
['loss', 'acc']
Train on 9167 samples, validate on 2116 samples
Epoch 1/2
Epoch 2/2
*testing model:
[1.535922124258215, 0.5108117461204529]


In [14]:
print ('(4) load word2vec as embedding...')
import gensim
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence
import numpy as np


w2v_model = gensim.models.KeyedVectors.load_word2vec_format('wiki.zh.text.vector', binary=False)
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
not_in_model = 0
in_model = 0
for word, i in word_index.items(): 
    if str(word) in w2v_model:
        in_model += 1
        embedding_matrix[i] = np.asarray(w2v_model[str(word)], dtype='float32')
    else:
        not_in_model += 1
print (str(not_in_model)+' words not in w2v model')
from keras.layers import Embedding
embedding_layer = Embedding(len(word_index) + 1,EMBEDDING_DIM,weights=[embedding_matrix],
                            input_length=MAX_SEQUENCE_LENGTH,trainable=False)

(4) load word2vec as embedding...
59395 words not in w2v model


In [15]:
model=CNN_define(embedding_layer,EMBEDDING_DIM,MAX_SEQUENCE_LENGTH)
model_train('CNN_word2vec.h5',model)

*define model CNN:
Model: "sequential_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 400)          82157600  
_________________________________________________________________
dropout_4 (Dropout)          (None, 100, 400)          0         
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 98, 250)           300250    
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 32, 250)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 400)               3200400   
_________________________________________________________________
dense_7 (Dense)              (None,

In [16]:
model=LSTM_define(embedding_layer,EMBEDDING_DIM,MAX_SEQUENCE_LENGTH)
model_train('LSTM_word2vec.h5',model)

*define model lstm：
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, 100, 400)          82157600  
_________________________________________________________________
lstm_2 (LSTM)                (None, 200)               480800    
_________________________________________________________________
dropout_5 (Dropout)          (None, 200)               0         
_________________________________________________________________
dense_8 (Dense)              (None, 14)                2814      
Total params: 82,641,214
Trainable params: 483,614
Non-trainable params: 82,157,600
_________________________________________________________________
['loss', 'acc']
Train on 9167 samples, validate on 2116 samples
Epoch 1/2
Epoch 2/2
*testing model:
[0.8895783752931935, 0.7465437650680542]


In [26]:
def tfid(all_texts):
    print ('*tfidtransfor:')
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer   
    count_v0= CountVectorizer();  
    counts_all = count_v0.fit_transform(all_texts);
    count_v1= CountVectorizer(vocabulary=count_v0.vocabulary_);  
    counts_train = count_v1.fit_transform(train_texts);   
    print ("the shape of train is "+repr(counts_train.shape))  
    count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_);  
    counts_test = count_v2.fit_transform(test_texts);  
    print ("the shape of test is "+repr(counts_test.shape))

    tfidftransformer = TfidfTransformer();    
    train_data = tfidftransformer.fit(counts_train).transform(counts_train);
    test_data = tfidftransformer.fit(counts_test).transform(counts_test); 

    x_train = train_data
    y_train = train_labels
    x_test = test_data
    y_test = test_labels
    return x_train,y_train,x_test,y_test

In [27]:
x_train,y_train,x_test,y_test=tfid(all_texts)

*tfidtransfor:
the shape of train is (5590, 200364)
the shape of test is (8514, 200364)


In [28]:
def test_model(model,x_test,y_test):
    preds=model.predict(x_test)
    num=0
    preds=preds.tolist()
    for i,pred in enumerate(preds):
        if int(pred) == int(y_test[i]):
            num += 1
    return num, preds

In [31]:
print ('*KNN:')
from sklearn.neighbors import KNeighborsClassifier  
from sklearn.metrics import f1_score

y_pred=[0,1,1,1,2,2]
y_true=[0,1,0,2,1,1]

for x in range(1,15):  
    knnclf = KNeighborsClassifier(n_neighbors=x)
    knnclf.fit(x_train,y_train)  
    num, preds=test_model(knnclf,x_test,y_test)
    print ('K= '+str(x)+', precision_score:' + str(float(num) / len(preds)))
    print ('f1_score:' + str(f1_score(y_test,preds,average='macro')))

*KNN:


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 1, precision_score:0.8202959830866807
f1_score:0.7174386211598571


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 2, precision_score:0.7378435517970402
f1_score:0.6404781546359531


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 3, precision_score:0.7807141179234203
f1_score:0.6876219828738407


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 4, precision_score:0.7788348602302091
f1_score:0.6840624285501783


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 5, precision_score:0.7885835095137421
f1_score:0.698792451711796


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 6, precision_score:0.785882076579751
f1_score:0.6965410305564748


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 7, precision_score:0.7926943857176415
f1_score:0.704232499412351


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 8, precision_score:0.7932816537467701
f1_score:0.7025800806454455


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 9, precision_score:0.7964529011040639
f1_score:0.7082660223448582


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 10, precision_score:0.7962179938924125
f1_score:0.707351958213011


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 11, precision_score:0.7963354474982381
f1_score:0.7049330720342548


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 12, precision_score:0.7935165609584214
f1_score:0.7044715414057753


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 13, precision_score:0.7939863753817242
f1_score:0.7045985503237356
K= 14, precision_score:0.7937514681700728
f1_score:0.7054861869935919


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [32]:
print ('*SVM：')
from sklearn.svm import SVC   
svclf = SVC(kernel = 'linear') 
svclf.fit(x_train,y_train)  
num, preds=test_model(svclf,x_test,y_test)
print ('precision_score:' + str(float(num) / len(preds)))
print ('f1_score:' + str(f1_score(y_test,preds,average='macro')))

*SVM：
precision_score:0.9007517030772845
f1_score:0.8149666268208976


In [33]:
print ('*Naive Bayes：')
from sklearn.naive_bayes import MultinomialNB  

clf = MultinomialNB(alpha = 0.01)   
clf.fit(x_train, y_train)
num, preds=test_model(clf,x_test,y_test)
print ('precision_score:' + str(float(num) / len(preds)))
print ('f1_score:' + str(f1_score(y_test,preds,average='macro')))

*Naive Bayes：
precision_score:0.9021611463471929
f1_score:0.8212736580324363


In [34]:
print ('*Logistic Regression：')
from sklearn.linear_model import LogisticRegression

log =  LogisticRegression(C=1e10,max_iter=1000) 
log.fit(x_train, y_train)
num, preds=test_model(log,x_test,y_test)
print ('precision_score:' + str(float(num) / len(preds)))
print ('f1_score:' + str(f1_score(y_test,preds,average='macro')))

*Logistic Regression：
precision_score:0.9003993422598073
f1_score:0.8039468213952466


In [35]:
print ('*Random Forest Classifier：')
from sklearn.ensemble import RandomForestClassifier

rfc =  RandomForestClassifier(n_estimators=500,  max_features='sqrt',random_state=10)
rfc.fit(x_train, y_train)
num, preds=test_model(rfc,x_test,y_test)
print ('precision_score:' + str(float(num) / len(preds)))
print ('f1_score:' + str(f1_score(y_test,preds,average='macro')))

*Random Forest Classifier：
precision_score:0.8803147756636128
f1_score:0.814149079603147


In [36]:
print ('*Decision Tree Classifier：')
from sklearn.tree import DecisionTreeClassifier

dtc =  DecisionTreeClassifier(random_state=77)
dtc.fit(x_train, y_train)
num, preds=test_model(dtc,x_test,y_test)
print ('precision_score:' + str(float(num) / len(preds)))
print ('f1_score:' + str(f1_score(y_test,preds,average='macro')))

*Decision Tree Classifier：
precision_score:0.7668545924359877
f1_score:0.6721781847170548
