In [1]:
import os
import hgtk
os.environ["CUDA_VISIBLE_DEVICES"]="0,1,2"

import tensorflow as tf
from keras import layers
from keras.preprocessing.sequence import pad_sequences
import numpy as np
from keras import backend as K
from keras.models import Model
import keras

def int_sentence(left_texts, right_texts, vocab_index):
    """Convert paired token sequences into sequences of vocabulary ids.

    Args:
        left_texts: sequence of token sequences (left side of each pair).
        right_texts: sequence of token sequences; indexed by the same
            positions as ``left_texts``.
        vocab_index: mapping from token to integer id.

    Returns:
        A ``(left_int, right_int)`` tuple of lists of id lists, one entry per
        position of ``left_texts``.

    Raises:
        KeyError: if a token is missing from ``vocab_index``.
        IndexError: if ``right_texts`` is shorter than ``left_texts``.
    """
    encoded_left = []
    encoded_right = []

    # Iteration is driven by left_texts; right_texts is looked up by position.
    for row, left_tokens in enumerate(left_texts):
        encoded_left.append([vocab_index[token] for token in left_tokens])
        encoded_right.append([vocab_index[token] for token in right_texts[row]])

    return encoded_left, encoded_right

Using TensorFlow backend.


In [2]:
# Every sentence of the full corpus (left and right columns interleaved) as a
# list of tokens; used below to derive max_len and the vocabulary.
all_sentence = []

# The corpus file is consumed entirely inside this cell, so it is closed as
# soon as the loop ends instead of staying open for the kernel's lifetime.
with open('file_name_1', 'r', encoding='utf-8-sig') as All:
    for line in All:
        fields = line.split('\t')
        left = hgtk.text.decompose(fields[0])
        right = hgtk.text.decompose(fields[1])

        for decomposed in (left, right):
            # Drop the 'ᴥ' marker characters found in the decomposed text
            # (presumably hgtk's syllable terminator) and suffix every
            # remaining character with a space to form a token.
            all_sentence.append([ch + ' ' for ch in decomposed if ch != 'ᴥ'])

# These handles are intentionally left open: the cell that builds the
# train/cv/test sentence lists iterates over them.
train = open('file_name_2', 'r', encoding='utf-8-sig')
cv = open('file_name_3', 'r', encoding='utf-8-sig')
test = open('file_name_4', 'r', encoding='utf-8-sig')

# Longest sentence in tokens; later used as the padded sequence length.
max_len = max(len(sentence) for sentence in all_sentence)

# Sorted list of the distinct tokens, so ids are stable across runs.
vocab = sorted({token for sentence in all_sentence for token in sentence})

# +1 because ids start at 1 (id 0 is never assigned to a token).
vocab_size = len(vocab) + 1

# token -> id, with ids 1..len(vocab) in sorted-token order.
vocab_index = {token: idx for idx, token in enumerate(vocab, start=1)}

In [3]:
def _read_pairs(handle):
    """Parse one tab-separated split file into token lists and labels.

    Each line is expected to hold ``left<TAB>right<TAB>label``. Both sentences
    are decomposed with ``hgtk.text.decompose``; the 'ᴥ' marker characters are
    dropped and every remaining character is suffixed with a space, matching
    the tokens used to build the vocabulary.

    Args:
        handle: an open, seekable text file.

    Returns:
        ``(left_sentences, right_sentences, labels)`` where the first two are
        lists of token lists and ``labels`` is a list of stripped strings.

    Raises:
        IndexError: if a line has fewer than three tab-separated fields.
    """
    # Rewind so re-running this cell re-reads the file instead of silently
    # producing empty lists from an already exhausted handle.
    handle.seek(0)

    left_sentences, right_sentences, labels = [], [], []
    for line in handle:
        fields = line.split('\t')
        left = hgtk.text.decompose(fields[0])
        right = hgtk.text.decompose(fields[1])

        left_sentences.append([ch + ' ' for ch in left if ch != 'ᴥ'])
        right_sentences.append([ch + ' ' for ch in right if ch != 'ᴥ'])
        # strip() removes the trailing newline from the last column.
        labels.append(fields[2].strip())

    return left_sentences, right_sentences, labels


train_left_sen, train_right_sen, train_label = _read_pairs(train)
cv_left_sen, cv_right_sen, cv_label = _read_pairs(cv)
test_left_sen, test_right_sen, test_label = _read_pairs(test)


In [4]:
def _pad_post(sequences):
    """Pad id sequences at the end (padding='post') to max_len columns."""
    return pad_sequences(sequences, padding='post', maxlen=max_len)


# Tokens -> vocabulary ids, one split at a time.
train_left_int, train_right_int = int_sentence(train_left_sen, train_right_sen, vocab_index)
cv_left_int, cv_right_int = int_sentence(cv_left_sen, cv_right_sen, vocab_index)
test_left_int, test_right_int = int_sentence(test_left_sen, test_right_sen, vocab_index)

# Fixed-width id matrices for the two model inputs.
train_left = _pad_post(train_left_int)
train_right = _pad_post(train_right_int)

cv_left = _pad_post(cv_left_int)
cv_right = _pad_post(cv_right_int)

test_left = _pad_post(test_left_int)
test_right = _pad_post(test_right_int)

# Replace the string labels with their one-hot encoding (names are reused, so
# this cell must run exactly once after the labels are loaded).
train_label = keras.utils.to_categorical(train_label)
cv_label = keras.utils.to_categorical(cv_label)
test_label = keras.utils.to_categorical(test_label)

In [5]:
# Siamese-style sentence-pair classifier: both inputs go through the same
# embedding, convolution and recurrent layer objects, so weights are shared
# between the left and right sentence.

# One id sequence of length max_len per side of the pair.
left_input = layers.Input(shape=(max_len,))
right_input = layers.Input(shape=(max_len,))

# Single embedding layer (vocab_size rows, 300 dimensions) used for both sides.
embedded_layer = layers.Embedding(vocab_size, 300)

left_emd = embedded_layer(left_input)
right_emd = embedded_layer(right_input)

# Three parallel convolutions over the embeddings with kernel sizes 3, 4 and 5;
# padding='same' so every branch keeps the input sequence length.
cnn3 = layers.Conv1D(filters=256, kernel_size=3, padding='same', activation='relu')
cnn4 = layers.Conv1D(filters=256, kernel_size=4, padding='same', activation='relu')
cnn5 = layers.Conv1D(filters=256, kernel_size=5, padding='same', activation='relu')

left_cnn3 = cnn3(left_emd)
right_cnn3 = cnn3(right_emd)

left_cnn4 = cnn4(left_emd)
right_cnn4 = cnn4(right_emd)

left_cnn5 = cnn5(left_emd)
right_cnn5 = cnn5(right_emd)

# One bidirectional LSTM per convolution branch; return_sequences=True keeps
# the per-timestep outputs so they can be pooled below.
lstm3 = layers.Bidirectional(layers.LSTM(50, return_sequences=True))
lstm4 = layers.Bidirectional(layers.LSTM(50, return_sequences=True))
lstm5 = layers.Bidirectional(layers.LSTM(50, return_sequences=True))

left_lstm3 = lstm3(left_cnn3)
right_lstm3 = lstm3(right_cnn3)

left_lstm4 = lstm4(left_cnn4)
right_lstm4 = lstm4(right_cnn4)

left_lstm5 = lstm5(left_cnn5)
right_lstm5 = lstm5(right_cnn5)

# Global max pooling of every LSTM output sequence (a fresh pooling layer
# instance per tensor).
left_max3 = layers.GlobalMaxPooling1D()(left_lstm3)
left_max4 = layers.GlobalMaxPooling1D()(left_lstm4)
left_max5 = layers.GlobalMaxPooling1D()(left_lstm5)

right_max3 = layers.GlobalMaxPooling1D()(right_lstm3)
right_max4 = layers.GlobalMaxPooling1D()(right_lstm4)
right_max5 = layers.GlobalMaxPooling1D()(right_lstm5)

# Global average pooling of the same LSTM output sequences.
left_avg3 = layers.GlobalAveragePooling1D()(left_lstm3)
left_avg4 = layers.GlobalAveragePooling1D()(left_lstm4)
left_avg5 = layers.GlobalAveragePooling1D()(left_lstm5)

right_avg3 = layers.GlobalAveragePooling1D()(right_lstm3)
right_avg4 = layers.GlobalAveragePooling1D()(right_lstm4)
right_avg5 = layers.GlobalAveragePooling1D()(right_lstm5)

# All twelve pooled vectors (left max/avg, then right max/avg) joined along
# the feature axis into one representation of the pair.
concat_layer = layers.Concatenate(axis=1)([left_max3, left_max4, left_max5, left_avg3, left_avg4, left_avg5, right_max3, right_max4, right_max5, right_avg3, right_avg4, right_avg5])

# Two-way softmax over the concatenated features.
outputs = layers.Dense(2, activation='softmax')(concat_layer)

model = Model(inputs=[left_input, right_input], outputs=[outputs])

model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 270)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 270)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 270, 300)     41100       input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
conv1d_1 (Conv1D)               (None, 270, 256)     230656      embedding_1[0][0]                
          

In [6]:
from keras.callbacks import EarlyStopping

# Replicate the model across the three GPUs exposed via CUDA_VISIBLE_DEVICES.
# NOTE: this rebinds `model`, so re-running this cell would wrap the already
# wrapped model again; rebuild the model first if the cell must be re-run.
model = keras.utils.multi_gpu_model(model, gpus=3)

# The network ends in a 2-unit softmax and the labels are one-hot encoded with
# to_categorical, so the matching loss is categorical cross-entropy. For
# exactly two classes it yields the same value as the element-wise binary
# cross-entropy used before, but it stays correct if classes are added.
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=['accuracy'])

# Stop once val_loss has not improved for 3 epochs and roll back to the
# weights of the best epoch.
es = EarlyStopping(monitor='val_loss',mode='min', verbose=1, patience=3, restore_best_weights=True)

# epochs=100 is only an upper bound; early stopping decides the actual length.
model.fit([train_left, train_right], [train_label], batch_size=64, epochs=100, validation_data=([cv_left, cv_right], [cv_label]), callbacks=[es])

Train on 8800 samples, validate on 1100 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Restoring model weights from the end of the best epoch
Epoch 00013: early stopping


<keras.callbacks.History at 0x7f51221a4438>

In [7]:
# Score the trained model on the held-out test pairs.
evaluation = model.evaluate([test_left, test_right], [test_label])

# Index 0 is reported as the loss and index 1 as the accuracy.
test_loss, test_accuracy = evaluation[0], evaluation[1]

print('Accuracy: ' + str(test_accuracy))
print('Loss: ' + str(test_loss))

Accuracy: 0.8654545456712897
Loss: 0.3541742006215182
