In [3]:
import os
import re

import jieba
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pymysql
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Location of the pre-trained Tencent AI Lab Chinese word-vector text file.
# The path can be overridden with the TENCENT_VECTOR_PATH environment variable
# so the notebook is not tied to one machine; the default is the original
# hard-coded location.
TENCENT_VECTOR_PATH = os.environ.get(
    'TENCENT_VECTOR_PATH',
    '/Users/huan/Desktop/Tencent_AILab_ChineseEmbedding/Tencent_AILab_ChineseEmbedding.txt',
)
# Upper bound on the vocabulary size passed to the Keras Tokenizer.
MAX_VOCAB_SIZE = 50000
# Every sentence is padded / truncated to this many tokens.
MAX_SEQUENCE_LENGTH = 20
# Dimensionality of each pre-trained word vector (columns of the embedding matrix).
EMBEDDING_DIM = 200

In [4]:
# Load the training set from a path relative to the notebook's working
# directory; later cells read its 'sentence' and 'label' columns.
data = pd.read_csv("data/train_data.csv")

In [5]:
# Keep only the rows where both the text and its label are present.
null_filter = data['sentence'].notna() & data['label'].notna()
X = data.loc[null_filter, 'sentence']
Y = data.loc[null_filter, 'label']

# Reproducible 80/20 split that preserves the label distribution in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    stratify=Y,
    random_state=42,
)

In [18]:
# Fit a word-level tokenizer on every available sentence; at most
# MAX_VOCAB_SIZE indices are used when texts are converted to sequences.
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(X)

# word -> integer index mapping learned by the tokenizer.
word2index = tokenizer.word_index

# Number of rows needed for the embedding matrix: one per known word plus one
# extra slot, capped at MAX_VOCAB_SIZE.
num_words = min(len(word2index) + 1, MAX_VOCAB_SIZE)
print('Number of word considered: {}'.format(num_words))

Number of word considered: 17390


In [23]:
# Turn every sentence into a list of vocabulary indices and bring all of them
# to the same length (pad_sequences' default padding/truncating mode is used).
sequences = tokenizer.texts_to_sequences(X)
padded_sequences = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)

# Read the pre-trained vectors, keeping only words that occur in our
# vocabulary so the whole embedding file never has to be held in memory.
embeddings_index = dict()
with open(TENCENT_VECTOR_PATH, 'r', encoding='utf-8') as f:
    # The first line is skipped, as in the original code (presumably the
    # "<count> <dim>" header of the word2vec text format -- TODO confirm).
    next(f, None)
    for line in f:
        # Split from the right so that exactly EMBEDDING_DIM numbers are taken
        # as the vector and everything before them is the token, even when the
        # token itself contains a space. rstrip() removes the trailing newline.
        values = line.rstrip().rsplit(' ', EMBEDDING_DIM)
        if len(values) != EMBEDDING_DIM + 1:
            # Malformed / truncated line: skip instead of storing a vector of
            # the wrong length that would break the matrix assignment below.
            continue
        word = values[0]
        if word in word2index:
            coefs = np.asarray(values[1:], dtype='float32')
            embeddings_index[word] = coefs
# No explicit f.close(): the `with` block closes the file.

# Row i holds the vector of the word with index i; words without a
# pre-trained vector (and the unused index 0) stay all-zero.
embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
for word, ind in word2index.items():
    # Guard with the actual number of rows so an index can never fall outside
    # the matrix.
    if ind < num_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[ind] = embedding_vector

In [29]:
from tensorflow.keras.layers import Input,Dense, GlobalMaxPooling1D
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Embedding,concatenate,Flatten
from tensorflow.keras.models import Model
from sklearn.metrics import roc_auc_score
from tensorflow.keras.utils import plot_model

In [27]:
# Frozen embedding layer initialised from the pre-trained embedding matrix.
embedding_layer = Embedding(num_words,
                    EMBEDDING_DIM,
                    weights = [embedding_matrix],
                    input_length = MAX_SEQUENCE_LENGTH,
                    trainable = False)

inputs = Input(shape=(MAX_SEQUENCE_LENGTH,))
# Reuse the layer defined above instead of building a second, identical
# Embedding (the original left `embedding_layer` unused and hard-coded 200).
embedded_sequences = embedding_layer(inputs)

# Three parallel convolution branches with kernel widths 3, 4 and 5 over the
# same embedded sequence, each followed by max pooling.
l_conv01 = Conv1D(128, 3,activation='relu',padding='same')(embedded_sequences)
l_pool01 = MaxPooling1D(3)(l_conv01)
l_conv02 = Conv1D(128, 4,activation='relu',padding='same')(embedded_sequences)
l_pool02 = MaxPooling1D(5)(l_conv02)
l_conv03 = Conv1D(128, 5,activation='relu',padding='same')(embedded_sequences)
l_pool03 = MaxPooling1D(5)(l_conv03)
# Stack the pooled branch outputs along axis 1.
l_merge = concatenate([l_pool01,l_pool02,l_pool03],axis=1)

# Second convolution + pooling stage over the merged features, flattened and
# projected to a 128-unit dense representation.
l_cov2 = Conv1D(128, 3, activation='relu',padding='same')(l_merge)
l_pool2 = MaxPooling1D(10)(l_cov2)
l_pool2 = Flatten()(l_pool2)
l_dense = Dense(128, activation='relu')(l_pool2)
# `encoder` maps a padded index sequence to the 128-unit dense vector.
encoder = Model(inputs, l_dense)

encoded_vector = encoder(inputs)

# Two-way softmax head on top of the encoder output.
preds = Dense(2, activation='softmax')(encoded_vector)

model=Model(inputs,preds)
# Sparse categorical cross-entropy takes integer class labels rather than
# one-hot vectors.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics = ['accuracy'])

In [33]:
# Train on the stratified training split and validate on the held-out split
# created by train_test_split. The original call used validation_split=0.2 on
# the full data, which ignores that split and simply holds out the last 20% of
# rows in file order -- not representative if the CSV is ordered in any way.
X_train_pad = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAX_SEQUENCE_LENGTH)
X_test_pad = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAX_SEQUENCE_LENGTH)
model.fit(X_train_pad, y_train, batch_size=128, epochs=10, validation_data=(X_test_pad, y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7ff93c7e0d90>

In [34]:
import jieba

#### 还是要深度学习模型哟

In [52]:
def predict_sentence(sentence, threshold=0.5):
    """Classify one raw sentence with the trained model.

    The sentence is segmented with jieba, the segments are joined with single
    spaces, mapped to vocabulary indices by the fitted ``tokenizer``, padded
    to ``MAX_SEQUENCE_LENGTH`` and passed to ``model``.

    Args:
        sentence: Raw, unsegmented text to classify.
        threshold: Cut-off applied to the model's score for class index 1.
            Defaults to 0.5, the value the original implementation used.

    Returns:
        A boolean (the result of the NumPy comparison) that is true when the
        score for class index 1 is strictly greater than ``threshold``.
    """
    # join() consumes the jieba generator directly; no intermediate list needed.
    segmented = ' '.join(jieba.cut(sentence))
    x = tokenizer.texts_to_sequences([segmented])
    x_pad = pad_sequences(x, maxlen=MAX_SEQUENCE_LENGTH)
    # predict() returns one row per input; take column 1 of the only row.
    return model.predict(x_pad)[:, 1][0] > threshold

In [53]:
# Manual spot-check of the trained classifier on a hand-written abusive
# example (recorded output in this notebook: True).
predict_sentence('小婊子真骚')

True

In [54]:
# Manual spot-check on a hand-written explicit example
# (recorded output in this notebook: True).
predict_sentence('做爱了太爽了')

True

In [55]:
# Manual spot-check on a benign sentence about the weather
# (recorded output in this notebook: False).
predict_sentence('今天天气不错')

False

In [56]:
# Manual spot-check on a sentence that mentions arms smuggling
# (recorded output in this notebook: True).
predict_sentence('军火走私是犯法的')

True

In [58]:
# Manual spot-check on a politically sensitive question
# (recorded output in this notebook: True).
predict_sentence('新疆独立么')

True