In [1]:
# ---- Notebook configuration ------------------------------------------------
# Input CSV files.
train_name, test_name = 'train.csv', 'test.csv'

# JSON artefacts: word -> id vocabulary and sentence-id -> id-sequence maps.
vocab_name = 'vocab.json'
sent2seq_name = 'sent2seq.json'
sent2seq_test_name = 'sent2seq_test.json'

# Path prefix used when saving / loading the model ('.hdf5' is appended).
model_name = 'models/RNNv1'

# Selects which sentence columns are read from the test CSV: 'english' or 'chinese'.
mode = 'chinese'

num_threads = 4

In [2]:
from multiprocessing.pool import ThreadPool as Pool
import json
from tqdm import tqdm_notebook as tqdm
import jieba.posseg as pseg
import csv
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [3]:
# Special vocabulary tokens (looked up in `vocab` to obtain their integer ids).
BOS, EOS, PAD, UNK = '<bos>', '<eos>', '<pad>', '<unk>'

# Initial sequence-length limits; both are reassigned later from the shapes of
# the padded training arrays.
MAX_Q_LEN = MAX_A_LEN = 30

In [4]:
##### Data Loader
# Loads the vocabulary, the precomputed sentence-id -> token-id-sequence map and
# the training pairs. Every file is opened through `with` so the handle is
# closed deterministically (the kernel is long-lived, leaked handles pile up).
print('loading ' + vocab_name)
with open(vocab_name, 'r', encoding='utf-8') as fin:
    vocab = json.load(fin)
print('done')

print('loading ' + sent2seq_name)
with open(sent2seq_name, 'r') as fin:
    Sent2Seq = json.load(fin)
print('done')

# Integer ids of the special tokens.
iBOS = vocab[BOS]
iPAD = vocab[PAD]
iEOS = vocab[EOS]

print("Loading {} ...".format(train_name))
data1 = []
data2 = []
label = []

# First pass: count CSV rows (header included) so the progress bar has a total.
with open(train_name, newline='', encoding='utf-8') as fin:
    row_count = sum(1 for _ in csv.reader(fin, delimiter=',', quotechar='"'))

# Second pass: collect the two token-id sequences and the label of every pair.
with open(train_name, newline='', encoding='utf-8') as fin:
    Reader = csv.reader(fin, delimiter=',', quotechar='"')
    for i, fields in tqdm(enumerate(Reader), total=row_count):
        if i == 0:
            # Skip the header row.
            continue
        # Columns 1 and 2 hold the ids of the two sentences; column 7 the label.
        tid1, tid2 = fields[1:3]
#         sent1 = [iBOS] + Sent2Seq[tid1] + [iEOS]
#         sent2 = [iBOS] + Sent2Seq[tid2] + [iEOS]
        sent1 = Sent2Seq[tid1]
        sent2 = Sent2Seq[tid2]
        data1.append(sent1)
        data2.append(sent2)
        label.append(fields[7])
NUM_DATA = len(data1)
print("done. {} data loaded.".format(NUM_DATA))

loading vocab.json
done
loading sent2seq.json
done
Loading train.csv ...


HBox(children=(IntProgress(value=0, max=320553), HTML(value='')))


done. 320552 data loaded.


In [5]:
# data1 = pad_sequences(data1, maxlen=MAX_Q_LEN, padding='pre', truncating='pre', value=iPAD)
# data2 = pad_sequences(data2, maxlen=MAX_A_LEN, padding='pre', truncating='pre', value=iPAD)
# Left-pad both sides with the PAD id; maxlen=None lets each side grow to its
# own longest sequence.
data1, data2 = (
    pad_sequences(seqs, maxlen=None, padding='pre', truncating='pre', value=iPAD)
    for seqs in (data1, data2)
)
print('done')

# String label -> integer class id.
lbtype = {'agreed': 0, 'disagreed': 1, 'unrelated': 2}

# Per-class sample counts (only referenced by the commented-out weighting below).
num_agree = label.count('agreed')
num_disagree = label.count('disagreed')
num_unrelated = label.count('unrelated')

# Fixed per-class weights keyed by class id.
lbweight = {
    lbtype[name]: weight
    for name, weight in (('agreed', 1/15), ('disagreed', 1/5), ('unrelated', 1/16))
}
# lbweight = {lbtype['agreed']: NUM_DATA/num_agree, lbtype['disagreed']: NUM_DATA/num_disagree, lbtype['unrelated']: NUM_DATA/num_unrelated}
# print(lbweight)

# Integer-encoded labels, in the same order as data1 / data2.
labelcat = [lbtype[lb] for lb in label]

done


In [6]:
# Show the per-class weights (keys are the integer class ids from lbtype).
print(lbweight)
# print(lbweight2)

{0: 0.06666666666666667, 1: 0.2, 2: 0.0625}


In [7]:
# Replace the initial length limits with the actual padded widths of the
# training arrays; the model inputs and the test padding both use these.
MAX_Q_LEN, MAX_A_LEN = data1.shape[1], data2.shape[1]

In [8]:
# Vocabulary size; used as the input dimension of the embedding layer.
VOCAB_SZ = len(vocab)
print(VOCAB_SZ)

23225


In [9]:
# Pretrained word-vector matrix, used to initialise the embedding layer.
wv_matrix = np.load('wv_matrix.npy')

In [10]:
# Layer widths are all derived from the width of the word-vector matrix.
EMBED_DIM = wv_matrix.shape[1]
DENSE_HIDDEN = EMBED_DIM
RNN_HIDDEN = DENSE_HIDDEN + 1

In [11]:
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
# core
from tensorflow.keras.layers import Input, Dense, Embedding, Activation, BatchNormalization, Dropout, Softmax, Subtract
from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM, GRU, LSTM
from tensorflow.keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D
from tensorflow.keras.layers import Dot, Bidirectional, TimeDistributed, Lambda, Multiply, Concatenate, Flatten

# ---- Model definition -------------------------------------------------------
# Two integer token-id inputs go through one shared embedding layer and one
# shared bidirectional LSTM, are combined by an attention step, max-pooled and
# classified into 3 classes.

# Embedding initialised from wv_matrix and fine-tuned (trainable=True).
# mask_zero=False: padded positions are NOT masked in the layers below.
L_EmbeddingLayer = Embedding(VOCAB_SZ, EMBED_DIM, mask_zero=False, weights=[wv_matrix], trainable=True, name='WordEmbedding')
# L_EmbeddingLayer = Embedding(VOCAB_SZ, EMBED_DIM, name='WordEmbedding')

# First sentence: token ids -> embeddings.
t_query = Input(shape=(MAX_Q_LEN,), dtype='int32', name='Sentence1')
t_enc_Q = L_EmbeddingLayer(t_query)

# Second sentence, embedded with the same layer object (shared weights).
t_answer = Input(shape=(MAX_A_LEN,), dtype='int32', name='Sentence2')
t_enc_A = L_EmbeddingLayer(t_answer)

# Layer objects. RNNLayer1 returns the full sequence and concatenates the
# forward and backward outputs; ATTNWLayer's width (RNN_HIDDEN*2) matches that.
RNNLayer1 = Bidirectional(CuDNNLSTM(RNN_HIDDEN, unit_forget_bias=True, return_sequences=True, name='RNN1'), merge_mode='concat')
MaxPoolingLayer = GlobalMaxPooling1D(name='Pooling')
ATTNWLayer = Dense(RNN_HIDDEN*2, activation=None, use_bias=False)
ATTNLayer = Dot(axes=-1, name="Attention")
QReWeightLayer = Dot(axes=-2, name="ReweightQ")
DenseLayer1 = Dense(DENSE_HIDDEN, activation='tanh', use_bias=True)
DenseLayer2 = Dense(3, activation='softmax', use_bias=True)
# NOTE: both dropout layers are created but never applied — their calls below
# are commented out.
DropLayer1 = Dropout(0.5)
DropLayer2 = Dropout(0.5)

# The same recurrent encoder is applied to both sentences.
h_q = RNNLayer1(t_enc_Q)
h_a = RNNLayer1(t_enc_A)

# Bilinear-style attention scores: h_q . (W h_a), contracted over the feature
# axis. Expected shape (batch, MAX_Q_LEN, MAX_A_LEN) — TODO confirm against
# model.summary().
Wh_a = ATTNWLayer(h_a)
ATTN_mat = ATTNLayer([h_q, Wh_a])
# Softmax over axis 1, i.e. (given the shape above) over Sentence1 positions.
ATTN_q2a = Softmax(axis=1)(ATTN_mat)

# Contract attention weights with h_q over axis -2 of each operand; presumably
# yields one attention-weighted summary of Sentence1 per Sentence2 position.
h_hat_q = QReWeightLayer([ATTN_q2a, h_q])


# Element-wise product and difference between the summary and h_a, each reduced
# by the same global max-pooling layer.
inter_mul = Multiply(name='Interaction')([h_hat_q, h_a])
inter_sub = Subtract(name='Interaction2')([h_hat_q, h_a])
max_filtered_mul = MaxPoolingLayer(inter_mul)
max_filtered_sub = MaxPoolingLayer(inter_sub)

# Classifier head: concatenated features -> tanh dense -> 3-way softmax.
feature_concat = Concatenate(axis=-1, name='MultiInclusion')([max_filtered_mul, max_filtered_sub])
# feature_concat = DropLayer1(feature_concat)
semi_out_qa = DenseLayer1(feature_concat)
# semi_out_qa = DropLayer2(semi_out_qa)
output = DenseLayer2(semi_out_qa)

model = Model(inputs=[t_query, t_answer], outputs=output)

# Print the layer table and write a diagram of the graph to model_summary.png.
model.summary()
from tensorflow.keras.utils import plot_model
plot_model(model, to_file='model_summary.png', show_shapes=True)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Sentence1 (InputLayer)          (None, 74)           0                                            
__________________________________________________________________________________________________
Sentence2 (InputLayer)          (None, 81)           0                                            
__________________________________________________________________________________________________
WordEmbedding (Embedding)       multiple             4645000     Sentence1[0][0]                  
                                                                 Sentence2[0][0]                  
__________________________________________________________________________________________________
bidirectional (Bidirectional)   multiple             648024      WordEmbedding[0][0]              
          

In [12]:
# from tensorflow.keras.models import load_model

# L_EmbeddingLayer = load_model('ElmoLayer.h5')
# for layer in L_EmbeddingLayer.layers:
#     layer.trainable = False

In [13]:
# from tensorflow.keras.models import Model, load_model
# from tensorflow.keras import backend as K
# # core
# from tensorflow.keras.layers import Input, Dense, Embedding, Activation, BatchNormalization, Dropout, Softmax, Subtract
# from tensorflow.keras.layers import CuDNNGRU, CuDNNLSTM, GRU, LSTM
# from tensorflow.keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D
# from tensorflow.keras.layers import Dot, Bidirectional, TimeDistributed, Lambda, Multiply, Concatenate, Flatten

# t_query = Input(shape=(MAX_Q_LEN,), dtype='int32', name='Sentence1')
# t_enc_Q = L_EmbeddingLayer(t_query)

# t_answer = Input(shape=(MAX_A_LEN,), dtype='int32', name='Sentence2')
# t_enc_A = L_EmbeddingLayer(t_answer)

# print(t_enc_Q.shape, t_enc_A.shape)

# RNNLayer1 = Bidirectional(CuDNNLSTM(RNN_HIDDEN, unit_forget_bias=True, return_sequences=True, name='RNN1'), merge_mode='concat')
# MaxPoolingLayer = GlobalMaxPooling1D(name='Pooling')
# ATTNWLayer = Dense(RNN_HIDDEN*2, activation=None, use_bias=False)
# ATTNLayer = Dot(axes=-1, name="Attention")
# QReWeightLayer = Dot(axes=-2, name="ReweightQ")
# DenseLayer1 = Dense(DENSE_HIDDEN, activation='tanh', use_bias=True)
# DenseLayer2 = Dense(3, activation='softmax', use_bias=True)
# DropLayer1 = Dropout(0.5)
# DropLayer2 = Dropout(0.5)

# h_q = RNNLayer1(t_enc_Q)
# h_a = RNNLayer1(t_enc_A)

# Wh_a = ATTNWLayer(h_a)
# ATTN_mat = ATTNLayer([h_q, Wh_a])
# ATTN_q2a = Softmax(axis=1)(ATTN_mat)

# h_hat_q = QReWeightLayer([ATTN_q2a, h_q])


# inter_mul = Multiply(name='Interaction')([h_hat_q, h_a])
# inter_sub = Subtract(name='Interaction2')([h_hat_q, h_a])
# max_filtered_mul = MaxPoolingLayer(inter_mul)
# max_filtered_sub = MaxPoolingLayer(inter_sub)

# feature_concat = Concatenate(axis=-1, name='MultiInclusion')([max_filtered_mul, max_filtered_sub])
# # feature_concat = DropLayer1(feature_concat)
# semi_out_qa = DenseLayer1(feature_concat)
# # semi_out_qa = DropLayer2(semi_out_qa)
# output = DenseLayer2(semi_out_qa)

# model = Model(inputs=[t_query, t_answer], outputs=output)

# model.summary()
# from tensorflow.keras.utils import plot_model
# plot_model(model, to_file='model_summary.png', show_shapes=True)

In [14]:
import os

from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, LambdaCallback
from tensorflow.keras.optimizers import RMSprop, Adam

# ---- Training ---------------------------------------------------------------
epochs = 30
starting = 0

# The optimizer object is now actually handed to compile(). Previously
# compile() received the string 'adam', so the learning rate and clipnorm
# configured here were silently ignored. Adam is kept because it is the
# algorithm the string selected; swap the two lines to try RMSprop.
optimizer = Adam(lr=0.001, clipnorm=15.)
# optimizer = RMSprop(lr=0.001, clipnorm=15.)
# Labels are integer class ids, hence the sparse categorical loss.
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=["accuracy"])

# Make sure the output directory exists *before* training, so that neither the
# per-epoch checkpoints nor the final save can fail on a missing directory.
model_dir = os.path.dirname(model_name)
if model_dir:
    os.makedirs(model_dir, exist_ok=True)

# Callbacks are prepared here but only take effect if the `callbacks=` argument
# of fit() below is re-enabled.
earlystp = EarlyStopping(monitor="val_loss", patience=10, verbose=1, mode="auto")
checkpoint = ModelCheckpoint(model_name+'_{epoch:02d}.hdf5', monitor='val_loss', \
                             verbose=0, save_best_only=False, save_weights_only=False, \
                             mode='auto', period=1)
lrreduc = ReduceLROnPlateau(monitor='val_loss', factor=0.5,
                              patience=5, min_lr=0.00001, verbose=1, cooldown=3)
# evaluate(None,None)
# 5% of the data is held out for validation.
model.fit(x=[data1, data2],y=labelcat, batch_size=512, epochs=epochs, \
           initial_epoch=starting, shuffle=True, validation_split=0.05)#, class_weight=lbweight)
#           callbacks=[earlystp, lrreduc, checkpoint])

# Persist the full model once training finishes.
model.save(model_name+'.hdf5')

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 304524 samples, validate on 16028 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
 56832/304524 [====>.........................] - ETA: 1:06 - loss: 0.0981 - acc: 0.9629

KeyboardInterrupt: 

In [15]:
# Save the model in its current state (e.g. after training was interrupted
# before the save at the end of the training cell could run).
model.save('{}.hdf5'.format(model_name))

In [16]:
# Restore the weights from the file written by model.save.
model.load_weights('{}.hdf5'.format(model_name))

In [17]:
##### Test Data Loader
# Reads the test CSV and collects, per row, the row id and the ids of its two
# sentences, plus a map from sentence id to raw sentence text (first occurrence
# wins).
print("Loading {} ...".format(test_name))

# Resolve the text columns once. An unknown mode used to assign nothing, which
# silently reused whatever `sent1` / `sent2` were left over from earlier cells.
if mode == 'english':
    sent1_col, sent2_col = 5, 6
elif mode == 'chinese':
    sent1_col, sent2_col = 3, 4
else:
    raise ValueError("mode must be 'english' or 'chinese', got {!r}".format(mode))

testsents = {}
testdata1 = []
testdata2 = []
testid = []
# `with` guarantees the file handle is closed even if a row fails to parse.
with open(test_name, newline='', encoding='utf-8') as fin:
    Reader = csv.reader(fin, delimiter=',', quotechar='"')
    for i, fields in enumerate(Reader):
        if i == 0:
            # Skip the header row.
            continue
        testid.append(fields[0])
        tid1, tid2 = fields[1:3]
        # Empty sentences are replaced by the UNK token string.
        sent1 = fields[sent1_col] or UNK
        sent2 = fields[sent2_col] or UNK
        testsents.setdefault(tid1, sent1)
        testsents.setdefault(tid2, sent2)
        testdata1.append(tid1)
        testdata2.append(tid2)
# Note: this counts unique sentences, not CSV rows.
NUM_DATA = len(testsents)
print("done. {} data loaded.".format(NUM_DATA))

Loading test.csv ...
done. 62767 data loaded.


In [18]:
##### Sent2Seq
def sent2seq():
    """Tokenise every test sentence and cache the resulting id sequences.

    Each value of the module-level ``testsents`` dict (raw sentence text) is
    segmented with ``pseg.cut`` and replaced in place by a list of vocabulary
    ids. Tokens whose part-of-speech flag equals ``'x'`` are dropped; words
    missing from ``vocab`` map to the id of ``UNK``. The converted dict is then
    written as JSON to ``sent2seq_test_name``.

    Raises:
        KeyError: if ``UNK`` itself is not in ``vocab``.
    """
    unk_id = vocab[UNK]
    for key, sent in tqdm(testsents.items()):
        # Compare the flag by value: `is not 'x'` tested object identity, which
        # is implementation-dependent for strings (and a SyntaxWarning on 3.8+).
        # Only existing keys are reassigned, so iterating the dict stays safe.
        testsents[key] = [
            vocab.get(w, unk_id)
            for w, flag in pseg.cut(sent)
            if flag != 'x'
        ]

    # Write to the path from the configuration cell (the former hard-coded local
    # had the same value) and close the file deterministically.
    with open(sent2seq_test_name, 'w') as fout:
        json.dump(testsents, fout)
# sent2seq()

In [19]:
# Load the cached sentence-id -> token-id-sequence map for the test set,
# closing the file handle once it has been read.
with open(sent2seq_test_name) as fin:
    testsents = json.load(fin)

# Replace each sentence id by its token-id sequence.
# NOTE: this cell is not idempotent — after it runs, testdata1 / testdata2 hold
# padded arrays instead of ids, so re-run the test-loader cell before re-running
# this one.
# (BOS/EOS wrapping is disabled: [iBOS] + testsents[d] + [iEOS])
testdata1 = [testsents[tid] for tid in testdata1]
testdata2 = [testsents[tid] for tid in testdata2]

# Pad / truncate on the left to the widths the model inputs were built with.
testdata1 = pad_sequences(testdata1, maxlen=MAX_Q_LEN, padding='pre', truncating='pre', value=iPAD)
testdata2 = pad_sequences(testdata2, maxlen=MAX_A_LEN, padding='pre', truncating='pre', value=iPAD)
print('done')

done


In [20]:
# Class probabilities for every test pair (one row per pair).
testprobs = model.predict(
    [testdata1, testdata2],
    batch_size=1024,
    verbose=1,
)



In [21]:
# Index of the most probable class for each row of testprobs.
testlabel = list(np.argmax(testprobs, axis=1))

In [22]:
# Class id -> label string; the order mirrors the lbtype mapping used in training.
type2lb = ['agreed', 'disagreed', 'unrelated']

# Write the submission file. `with` closes (and flushes) the file even if a
# write fails part-way, instead of relying on an explicit close() at the end.
with open('predict.csv', 'w') as outcsv:
    outcsv.write("Id,Category\n")
    for t, lb in zip(testid, testlabel):
        outcsv.write("{},{}\n".format(t, type2lb[lb]))