In [32]:
import tensorflow as tf
import pandas as pd
import numpy as np
import jieba
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM,Dense,Embedding

In [58]:
MAX_VOCAB_SIZE = 20000
MAX_SEQ_LEN = 100
EMBEDDING_DIM = 128
BATCH_SIZE = 64
EPOCHS = 6
split = 0.3

In [38]:
def load_dataset(tabel='train'):
    file = './datasets/'+tabel+'.tsv'
    df = pd.read_csv(file, sep='\t')
    df = df.dropna()
    texts = df['text_a'].astype(str).tolist()
    if tabel == 'test':
        labels = None
    else:
        labels = df['label'].astype(int).tolist()
        
    return texts, labels

In [48]:
train_texts, train_labels = load_dataset()
test_texts, test_labels = load_dataset('dev')

In [42]:
def load_stopwords(filep='./datasets/stopwords.txt'):
    with open(filep,encoding='utf-8') as f:
        return set([w.strip() for w in f.readlines()])
stopwords = load_stopwords()
def chinese_tokenizer(text):
    return [w for w in jieba.lcut(text) if w not in stopwords and w.strip()]
def text_cut(texts):
    return [' '.join(chinese_tokenizer(text)) for text in texts]

In [51]:
train_cut = text_cut(train_texts)
test_cut = text_cut(test_texts)

In [55]:
tokenizer = Tokenizer(num_words=MAX_VOCAB_SIZE)
tokenizer.fit_on_texts(train_cut)

In [56]:
train_sequences = tokenizer.texts_to_sequences(train_cut)
test_sequences = tokenizer.texts_to_sequences(test_cut)

In [57]:
x_train = pad_sequences(train_sequences,maxlen=MAX_SEQ_LEN)
y_train = np.array(train_labels)

x_test = pad_sequences(test_sequences, maxlen=MAX_SEQ_LEN)
y_test = np.array(test_labels)

In [68]:
db_train = tf.data.Dataset.from_tensor_slices((x_train,y_train))
db_train = db_train.shuffle(1000).batch(BATCH_SIZE,drop_remainder=True)

db_val = db_train.skip(int(len(db_train)*(1-split)))
db_train = db_train.take(int(len(db_train)*(1-split)))

db_test = tf.data.Dataset.from_tensor_slices((x_test,y_test))
db_test = db_test.shuffle(1000).batch(BATCH_SIZE,drop_remainder=True)

In [70]:
model = Sequential()
model.add(Embedding(MAX_VOCAB_SIZE,EMBEDDING_DIM,input_length=MAX_SEQ_LEN))
model.add(LSTM(128,dropout=0.3))
model.add(Dense(32,activation='relu'))
model.add(Dense(1,activation='sigmoid'))

In [71]:
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 128)          2560000   
_________________________________________________________________
lstm_3 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dense_3 (Dense)              (None, 32)                4128      
_________________________________________________________________
dense_4 (Dense)              (None, 1)                 33        
Total params: 2,695,745
Trainable params: 2,695,745
Non-trainable params: 0
_________________________________________________________________


In [72]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [73]:
history = model.fit(db_train,epochs=EPOCHS, validation_data=db_val)

Epoch 1/6
Epoch 2/6
Epoch 3/6
Epoch 4/6
Epoch 5/6
Epoch 6/6


In [74]:
model.evaluate(db_test)



[0.5456445813179016, 0.8654513955116272]

In [75]:
import os
os.makedirs('./models',exist_ok=True)
model.save('./models/lstm_model.h5')