# Basic model

### Setup 1
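- train_name / test_name: the CSV files containing the title pairs (the training file also contains the relationship label of each pair).
- model_name: the name of the model, used as the file-name prefix when saving the model.
- sent2seq_name: the output of the Preprocessing task (maps each title id to its sequence of word ids).
- sent2seq_test_name: the file in which the processed test-set sentences are stored.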

In [1]:
# Run configuration: input/output file names and the language mode.
# NOTE(review): num_threads is not referenced in the visible cells — presumably
# intended for the ThreadPool imported in the next cell; confirm before removing.
num_threads = 4
# CSV files with the title pairs; the training loader also reads a label column from train_name.
train_name = 'train.csv'
test_name = 'test.csv'
# Path prefix of the saved model; '.hdf5' is appended when saving/loading.
model_name = 'models/RNNv1'
mode = 'chinese' # 'english' or 'chinese': selects which title columns of the test CSV are read
# JSON file holding the word -> id vocabulary.
vocab_name = 'vocab.json'
# JSON file mapping each training title id to its sequence of word ids.
sent2seq_name = 'sent2seq.json'
# JSON file in which the converted test titles are stored and later re-read.
sent2seq_test_name = 'sent2seq_test.json'

In [2]:
from multiprocessing.pool import ThreadPool as Pool
import json
from tqdm import tqdm_notebook as tqdm
import jieba.posseg as pseg
import csv
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

### Setup 2
- MAX_Q_LEN: the maximum length of the first input title
- MAX_A_LEN: the maximum length of the second input title

In [3]:
# Special tokens; their ids are looked up in the vocabulary by the data loader.
BOS = '<bos>'
EOS = '<eos>'
PAD = '<pad>'
UNK = '<unk>'
# Fixed sequence lengths the two titles are padded/truncated to (model input sizes).
MAX_Q_LEN = 20
MAX_A_LEN = 20

### Data loader
This cell loads the sentence relationships and stores them in `label`, and stores the corresponding sentences in `data1` and `data2`.

In [4]:
##### Data Loader
# Loads the vocabulary, the pre-computed title-id -> word-id-sequence table and
# the training pairs. Produces:
#   vocab, Sent2Seq      - the two JSON tables
#   iBOS / iPAD / iEOS   - ids of the special tokens
#   data1, data2         - word-id sequences of the first / second title of each pair
#   label                - the raw label string of each pair (column 7 of the CSV)
# Every file is opened in a `with` block so the handle is closed deterministically.
print('loading ' + vocab_name)
with open(vocab_name, 'r', encoding='utf-8') as vocab_file:
    vocab = json.load(vocab_file)
print('done')

print('loading ' + sent2seq_name)
with open(sent2seq_name, 'r', encoding='utf-8') as sent2seq_file:
    Sent2Seq = json.load(sent2seq_file)
print('done')

iBOS = vocab[BOS]
iPAD = vocab[PAD]
iEOS = vocab[EOS]

print("Loading {} ...".format(train_name))
data1 = []
data2 = []
label = []

# First pass: count the rows (header included) so the progress bar has a total.
with open(train_name, newline='', encoding='utf-8') as train_file:
    row_count = sum(1 for _ in csv.reader(train_file, delimiter=',', quotechar='"'))

# Second pass: collect the sequences and labels.
with open(train_name, newline='', encoding='utf-8') as train_file:
    Reader = csv.reader(train_file, delimiter=',', quotechar='"')
    for i, fields in tqdm(enumerate(Reader), total=row_count):
        if i == 0:
            # Row 0 is the CSV header.
            continue
        tid1, tid2 = fields[1:3]
        # NOTE: sequences are used exactly as stored; they are not wrapped in iBOS/iEOS.
        data1.append(Sent2Seq[tid1])
        data2.append(Sent2Seq[tid2])
        label.append(fields[7])
NUM_DATA = len(data1)
print("done. {} data loaded.".format(NUM_DATA))

loading vocab.json
done
loading sent2seq.json
done
Loading train.csv ...


HBox(children=(IntProgress(value=0, max=320553), HTML(value='')))


done. 320552 data loaded.


### Padding & Label transform
This cell pads or truncates the sentences to the specified length, then assigns a number (0–2) to each label.

In [5]:
# Bring every title to its fixed length: shorter sequences are filled with iPAD
# at the front, longer ones lose their leading tokens.
data1 = pad_sequences(data1, maxlen=MAX_Q_LEN, padding='pre', truncating='pre', value=iPAD)
data2 = pad_sequences(data2, maxlen=MAX_A_LEN, padding='pre', truncating='pre', value=iPAD)
print('done')

# Number of training pairs per class.
num_agree = label.count('agreed')
num_disagree = label.count('disagreed')
num_unrelated = label.count('unrelated')

# Integer class id of each label string, in the same order as `label`.
lbtype = {'agreed': 0, 'disagreed': 1, 'unrelated': 2}
labelcat = [lbtype[name] for name in label]

done


### Hyperparameters
hyperparameters for the model.

In [10]:
# Number of units of the LSTM layer.
RNN_HIDDEN = 128
# Dimensionality of the word embeddings.
EMBED_DIM = 200
# NOTE(review): DENSE_HIDDEN is not referenced in the visible cells — confirm whether it is still needed.
DENSE_HIDDEN = 1
# Vocabulary size = number of entries in the loaded vocab table; used as the Embedding input dimension.
VOCAB_SZ = len(vocab)
print(VOCAB_SZ)

71055


### Embedding
Load the embedding matrix pre-trained in the WordEmbedding task.

In [11]:
# Load the pre-trained word-vector matrix from 'wv_matrix.npy'.
# Presumably shaped (VOCAB_SZ, EMBED_DIM), since it initialises the Embedding layer — TODO confirm.
wv_matrix = np.load('wv_matrix'+'.npy')

### Architecture
this cell specifies the architecture of the model.

In [12]:
from tensorflow.keras.models import Model
from tensorflow.keras import backend as K
from tensorflow.keras.layers import Input, Dense, Embedding, CuDNNLSTM, Concatenate

# Model overview: both titles go through the SAME embedding layer and the SAME
# LSTM; the two resulting vectors are concatenated and classified by a 3-way softmax.

# Embedding initialised from the pre-trained matrix and kept frozen (trainable=False).
# mask_zero=False: padded positions are not masked and are fed to the LSTM like any other token.
L_EmbeddingLayer = Embedding(VOCAB_SZ, EMBED_DIM, mask_zero=False, weights=[wv_matrix], trainable=False, name='WordEmbedding')
# Alternative kept for reference: a randomly initialised, trainable embedding.
# L_EmbeddingLayer = Embedding(VOCAB_SZ, EMBED_DIM, name='WordEmbedding')

# First title: MAX_Q_LEN word ids -> embedded sequence.
t_query = Input(shape=(MAX_Q_LEN,), dtype='int32', name='Sentence1')
t_enc_Q = L_EmbeddingLayer(t_query)

# Second title: MAX_A_LEN word ids -> embedded sequence.
t_answer = Input(shape=(MAX_A_LEN,), dtype='int32', name='Sentence2')
t_enc_A = L_EmbeddingLayer(t_answer)

# One LSTM instance applied to both titles, so its weights are shared.
# NOTE(review): CuDNNLSTM presumably requires a CUDA GPU and a TF 1.x-era tf.keras — confirm the target environment.
RNNLayer1 = CuDNNLSTM(RNN_HIDDEN, unit_forget_bias=True, name='RNN1')

semi_out_q = RNNLayer1(t_enc_Q)

semi_out_a = RNNLayer1(t_enc_A)

# Output layer: 3 units with softmax, one per relation class.
DenseLayer2 = Dense(3, activation='softmax', use_bias=True)

# Join the two title encodings along the feature axis.
semi_out_qa = Concatenate(axis=-1, name='Interaction')([semi_out_q, semi_out_a])

output = DenseLayer2(semi_out_qa)

model = Model(inputs=[t_query, t_answer], outputs=output)

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Sentence1 (InputLayer)          (None, 20)           0                                            
__________________________________________________________________________________________________
Sentence2 (InputLayer)          (None, 20)           0                                            
__________________________________________________________________________________________________
WordEmbedding (Embedding)       (None, 20, 200)      14211000    Sentence1[0][0]                  
                                                                 Sentence2[0][0]                  
__________________________________________________________________________________________________
RNN1 (CuDNNLSTM)                (None, 128)          168960      WordEmbedding[0][0]              
          

### Training
this cell trains the model with the data prepared above.

In [13]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau, LambdaCallback
from tensorflow.keras.optimizers import RMSprop, Adam

# Epoch range of this run: training goes from epoch `starting` up to `epochs`.
starting = 0
epochs = 20

# Integer class ids are used as targets, hence the sparse categorical loss.
model.compile(
    loss='sparse_categorical_crossentropy',
    optimizer='adam',
    metrics=["accuracy"],
)

# Fit on the padded title pairs, reshuffling the samples every epoch.
model.fit(
    x=[data1, data2],
    y=labelcat,
    batch_size=512,
    epochs=epochs,
    initial_epoch=starting,
    shuffle=True,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f5c65f017b8>

In [None]:
import os

# Save the trained model to model_name + '.hdf5'.
# The configured name contains a directory part ('models/...'); create that
# directory first so the save does not fail after a full training run.
_model_path = model_name + '.hdf5'
_model_dir = os.path.dirname(_model_path)
if _model_dir:
    # An empty dirname means "current directory": nothing to create.
    os.makedirs(_model_dir, exist_ok=True)
model.save(_model_path)

In [None]:
# Restore the model weights from the HDF5 file at model_name + '.hdf5'
# (the same path the save cell writes to).
model.load_weights(model_name+'.hdf5')

### Testdata processing
The following two cells process the test data in the same way as the training data was processed in the Preprocessing task and in the cells above.

In [14]:
##### Test Data Loader
# Reads the test CSV and produces:
#   testid                - the id (column 0) of every test pair, in file order
#   testdata1, testdata2  - the title ids (columns 1 and 2) of every pair
#   testsents             - title id -> raw title text (first occurrence wins)
print("Loading {} ...".format(test_name))

# Columns holding the two title texts for each supported language.
_TITLE_COLUMNS = {'english': (5, 6), 'chinese': (3, 4)}
if mode not in _TITLE_COLUMNS:
    # Fail early with a clear message instead of hitting undefined sent1/sent2 below.
    raise ValueError("unsupported mode {!r}; expected 'english' or 'chinese'".format(mode))
col1, col2 = _TITLE_COLUMNS[mode]

testsents = {}
testdata1 = []
testdata2 = []
testid = []
with open(test_name, newline='', encoding='utf-8') as test_file:
    Reader = csv.reader(test_file, delimiter=',', quotechar='"')
    next(Reader, None)  # skip the header row (no-op on an empty file)
    for fields in Reader:
        testid.append(fields[0])
        tid1, tid2 = fields[1:3]
        # An empty title is replaced by the UNK token so it still yields a sequence.
        sent1 = fields[col1] or UNK
        sent2 = fields[col2] or UNK
        # Keep the text seen first for each title id.
        testsents.setdefault(tid1, sent1)
        testsents.setdefault(tid2, sent2)
        testdata1.append(tid1)
        testdata2.append(tid2)
# Here NUM_DATA counts the distinct titles, not the pairs.
NUM_DATA = len(testsents)
print("done. {} data loaded.".format(NUM_DATA))

Loading test.csv ...
done. 62767 data loaded.


In [15]:
##### Sent2Seq
def sent2seq():
    """Convert every raw test title in ``testsents`` to a list of word ids.

    Each title is segmented with ``pseg.cut``. Tokens whose part-of-speech
    flag equals ``'x'`` are dropped; every remaining word is mapped to its
    id in ``vocab``, falling back to the id of ``UNK`` for unknown words.

    Side effects: the values of the global ``testsents`` are replaced in
    place by the id lists, and the resulting table is written as JSON to
    the configured ``sent2seq_test_name`` (the file the padding cell reads).
    """
    unk_id = vocab[UNK]
    for key, sent in tqdm(testsents.items()):
        # Compare the flag by value: `is not 'x'` tested object identity, which
        # only works when both strings happen to be interned.
        # Assigning to existing keys does not resize the dict, so this is safe
        # while iterating over items().
        testsents[key] = [
            vocab.get(w, unk_id)
            for w, flag in pseg.cut(sent)
            if flag != 'x'
        ]

    # Close the file explicitly so the data is flushed before it is re-read.
    with open(sent2seq_test_name, 'w', encoding='utf-8') as out_file:
        json.dump(testsents, out_file)
sent2seq()

HBox(children=(IntProgress(value=0, max=62767), HTML(value='')))

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.715 seconds.
Prefix dict has been built succesfully.





### Testdata padding
Loads the processed test data and pads it to the input size of the model.

In [16]:
# Re-read the converted test titles (title id -> list of word ids); the file is
# closed as soon as it has been parsed.
with open(sent2seq_test_name, encoding='utf-8') as seq_file:
    testsents = json.load(seq_file)

# Replace every title id by its word-id sequence.
# NOTE: sequences are used exactly as stored; they are not wrapped in iBOS/iEOS.
testdata1 = [testsents[tid] for tid in testdata1]
testdata2 = [testsents[tid] for tid in testdata2]

# Same padding/truncation settings as for the training data.
testdata1 = pad_sequences(testdata1, maxlen=MAX_Q_LEN, padding='pre', truncating='pre', value=iPAD)
testdata2 = pad_sequences(testdata2, maxlen=MAX_A_LEN, padding='pre', truncating='pre', value=iPAD)
print('done')

done


### Inferencing
Run inference with the model to obtain the probability distribution over the relations for each title pair.

In [17]:
# Run the model on the padded test pairs. The output layer is a 3-unit softmax,
# so each row of testprobs holds the 3 class probabilities of one pair.
testprobs = model.predict(x=[testdata1, testdata2], batch_size=1024, verbose=1)



### Determine class & output
Use argmax to determine the most probable relation for each pair, then write the test results to predict.csv.

In [18]:
# Pick the most probable class of every test pair and write the submission file.
# type2lb[i] is the label string of class id i.
type2lb = ['agreed', 'disagreed', 'unrelated']
# One argmax over the class axis instead of a Python-level loop over rows.
testlabel = np.argmax(testprobs, axis=1).tolist()

# The with-block closes predict.csv even if a write fails part-way.
with open('predict.csv', 'w', encoding='utf-8') as outcsv:
    outcsv.write("Id,Category\n")
    outcsv.writelines(
        "{},{}\n".format(t, type2lb[lb]) for t, lb in zip(testid, testlabel)
    )