In [70]:
# 工具
# 深度学习：keras
# 传统机器学习：sklearn
# 参与比较的机器学习方法

# CNN 
# LSTM 
# 朴素贝叶斯
# KNN
# SVM
# Logisticre Gression

In [71]:
# !pip install tensorflow==2.2
# !pip install numpy
# !pip install gensim

# print(np.__version__)
# print(tensorflow.__version__)

In [103]:
def load_text(train_texts,train_labels,test_texts,test_labels):
    #coding:utf-8
    print ('*load texts:')
    train_texts = open(train_texts,encoding='UTF-8').read().split('\n')
    train_labels = open(train_labels,encoding='UTF-8').read().split('\n')
    test_texts = open(test_texts,encoding='UTF-8').read().split('\n')
    test_labels = open(test_labels,encoding='UTF-8').read().split('\n')
    all_texts = train_texts + test_texts
    all_labels = train_labels + test_labels
    return all_texts,all_labels,train_texts,train_labels,test_texts,test_labels
    
def creat_tokenizer(all_texts):
    print ('*tokenizer:')
    from tensorflow.keras.preprocessing.text import Tokenizer
    from tensorflow.keras.preprocessing.sequence import pad_sequences
    from tensorflow.keras.utils import to_categorical
    import numpy as np

    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(all_texts)
    sequences = tokenizer.texts_to_sequences(all_texts)
    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))
    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    labels = to_categorical(np.asarray(all_labels))
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)
    return data,labels,word_index

In [104]:
import tensorflow.keras

MAX_SEQUENCE_LENGTH = 100
EMBEDDING_DIM = 200
VALIDATION_SPLIT = 0.16
TEST_SPLIT = 0.2

all_texts,all_labels,train_texts,train_labels,test_texts,test_labels=load_text('text.txt','categroy.txt','text.txt','categroy.txt')

data,labels,word_index=creat_tokenizer(all_texts)

print ('*split data set:')
# split the data into training set, validation set, and test set
p1 = int(len(data)*(1-VALIDATION_SPLIT-TEST_SPLIT))
p2 = int(len(data)*(1-TEST_SPLIT))
x_train = data[:p1]
y_train = labels[:p1]
x_val = data[p1:p2]
y_val = labels[p1:p2]
x_test = data[p2:]
y_test = labels[p2:]
print ('train docs: '+str(len(x_train)))
print ('val docs: '+str(len(x_val)))
print ('test docs: '+str(len(x_test)))

*load texts:
*tokenizer:
Found 150140 unique tokens.
Shape of data tensor: (11180, 100)
Shape of label tensor: (11180, 14)
*split data set:
train docs: 7155
val docs: 1789
test docs: 2236


In [105]:
def CNN_define(embedding_dim,max_sequence_length):
    #define for CNN
    print ('*define model CNN:')
    from tensorflow.keras.layers import Dense, Input, Flatten, Dropout
    from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding, GlobalMaxPooling1D
    from tensorflow.keras.models import Sequential
    from tensorflow.keras.utils import plot_model

    model = Sequential()
    model.add(Embedding(len(word_index) + 1, embedding_dim, input_length=max_sequence_length))
    model.add(Dropout(0.2))
    model.add(Conv1D(250, 3, padding='valid', activation='relu', strides=1))
    model.add(MaxPooling1D(3))
    model.add(Flatten())
    model.add(Dense(embedding_dim, activation='relu'))
    model.add(Dense(labels.shape[1], activation='softmax'))
    model.summary()
    # plot_model(model, to_file='model.png',show_shapes=True)

    model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])
    print (model.metrics_names)
    return model

def LSTM_define(embedding_dim,max_sequence_length):
    #trainning for lstm
    print ('*define model lstm：')
    from tensorflow.keras.layers import Dense, Input, Flatten, Dropout
    from tensorflow.keras.layers import LSTM, Embedding
    from tensorflow.keras.models import Sequential

    model = Sequential()
    model.add(Embedding(len(word_index) + 1, embedding_dim, input_length=max_sequence_length))
    model.add(LSTM(200, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dropout(0.2))
    model.add(Dense(labels.shape[1], activation='softmax'))
    model.summary()

    model.compile(loss='categorical_crossentropy',optimizer='rmsprop',metrics=['acc'])
    print (model.metrics_names)
    return model
    
def model_train(model_name,model):
    model.fit(x_train, y_train, validation_data=(x_val, y_val), epochs=2, batch_size=128)
    model.save(model_name)

    print ('*testing model:')
    print (model.evaluate(x_test, y_test))

In [106]:
model=CNN_define(EMBEDDING_DIM,MAX_SEQUENCE_LENGTH)
model_train('CNN.h5',model)

*define model CNN:
Model: "sequential_13"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_16 (Embedding)     (None, 100, 200)          30028200  
_________________________________________________________________
dropout_13 (Dropout)         (None, 100, 200)          0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 98, 250)           150250    
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 32, 250)           0         
_________________________________________________________________
flatten_10 (Flatten)         (None, 8000)              0         
_________________________________________________________________
dense_23 (Dense)             (None, 200)               1600200   
_________________________________________________________________
dense_24 (Dense)             (None

In [107]:
model=LSTM_define(EMBEDDING_DIM,MAX_SEQUENCE_LENGTH)
model_train('LSTM.h5',model)

*define model lstm：
Model: "sequential_14"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_17 (Embedding)     (None, 100, 200)          30028200  
_________________________________________________________________
lstm_3 (LSTM)                (None, 200)               320800    
_________________________________________________________________
dropout_14 (Dropout)         (None, 200)               0         
_________________________________________________________________
dense_25 (Dense)             (None, 14)                2814      
Total params: 30,351,814
Trainable params: 30,351,814
Non-trainable params: 0
_________________________________________________________________
[]
Epoch 1/2
Epoch 2/2
*testing model:
[1.725166916847229, 0.43649372458457947]


In [108]:
def tfid(all_texts):
    print ('*doc to var:')
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer   
    count_v0= CountVectorizer();  
    counts_all = count_v0.fit_transform(all_texts);
    count_v1= CountVectorizer(vocabulary=count_v0.vocabulary_);  
    counts_train = count_v1.fit_transform(train_texts);   
    print ("the shape of train is "+repr(counts_train.shape))  
    count_v2 = CountVectorizer(vocabulary=count_v0.vocabulary_);  
    counts_test = count_v2.fit_transform(test_texts);  
    print ("the shape of test is "+repr(counts_test.shape))

    tfidftransformer = TfidfTransformer();    
    train_data = tfidftransformer.fit(counts_train).transform(counts_train);
    test_data = tfidftransformer.fit(counts_test).transform(counts_test); 

    x_train = train_data
    y_train = train_labels
    x_test = test_data
    y_test = test_labels
    return x_train,y_train,x_test,y_test

In [109]:
x_train,y_train,x_test,y_test=tfid(all_texts)

*doc to var:
the shape of train is (5590, 145672)
the shape of test is (5590, 145672)


In [111]:
print ('*KNN:')
from sklearn.neighbors import KNeighborsClassifier  

for x in range(1,15):  
    knnclf = KNeighborsClassifier(n_neighbors=x)
    knnclf.fit(x_train,y_train)  
    preds = knnclf.predict(x_test);
    num = 0
    preds = preds.tolist()
    for i,pred in enumerate(preds):
        if int(pred) == int(y_test[i]):
            num += 1
    print ('K= '+str(x)+', precision_score:' + str(float(num) / len(preds)))


*KNN:


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 1, precision_score:1.0


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 2, precision_score:0.8395348837209302


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 3, precision_score:0.8518783542039357


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 4, precision_score:0.8339892665474061


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 5, precision_score:0.8311270125223613


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 6, precision_score:0.8196779964221824


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 7, precision_score:0.8223613595706619


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 8, precision_score:0.8178890876565296


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 9, precision_score:0.8194991055456172


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 10, precision_score:0.8161001788908766


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 11, precision_score:0.8141323792486583


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 12, precision_score:0.8110912343470483


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


K= 13, precision_score:0.8114490161001789
K= 14, precision_score:0.807871198568873


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [112]:
print ('*SVM：')
from sklearn.svm import SVC   
svclf = SVC(kernel = 'linear') 
svclf.fit(x_train,y_train)  
preds = svclf.predict(x_test);  
num = 0
preds = preds.tolist()
for i,pred in enumerate(preds):
    if int(pred) == int(y_test[i]):
        num += 1
print ('precision_score:' + str(float(num) / len(preds)))


*SVM：
precision_score:0.9837209302325581


In [110]:
print ('*Naive Bayes：')
from sklearn.naive_bayes import MultinomialNB  
from sklearn import metrics
clf = MultinomialNB(alpha = 0.01)   
clf.fit(x_train, y_train);  
preds = clf.predict(x_test);
num = 0
preds = preds.tolist()
for i,pred in enumerate(preds):
    if int(pred) == int(y_test[i]):
        num += 1
print ('precision_score:' + str(float(num) / len(preds)))


*Naive Bayes：
precision_score:0.9824686940966011


In [113]:
print ('*Logisticre Gression：')
from sklearn.linear_model import LogisticRegression
from sklearn import metrics
log =  LogisticRegression(C=1e10,max_iter=10000) 
log.fit(x_train, y_train);  
preds = log.predict(x_test);
num = 0
preds = preds.tolist()
for i,pred in enumerate(preds):
    if int(pred) == int(y_test[i]):
        num += 1
print ('precision_score:' + str(float(num) / len(preds)))


*Logisticre Gression：
precision_score:1.0
