In [9]:
import os
from gensim.models import KeyedVectors
word2vec = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

In [10]:
def file_open(input):
    """Read a tab-separated sentence-pair file and blank out punctuation.

    Each data line is split on tab characters; the first two fields are the
    two sentences of a pair (any further fields are ignored). Trailing
    whitespace is stripped from each sentence first, and then every one of
    the characters . " ' - : % * ! ? @ / , ( ) # is replaced by a single
    space (so a sentence that ended in punctuation ends in a space).

    Args:
        input: Path of the UTF-8 encoded text file to read.

    Returns:
        A tuple ``(sentences1, sentences2)`` of two equally long lists of
        strings holding the first and second sentence of every pair, in file
        order.

    Raises:
        ValueError: If a non-blank line contains no tab separator.
    """
    # One translation table shared by both columns, so the two sentences are
    # guaranteed to be cleaned in exactly the same way.
    punct_to_space = str.maketrans({ch: ' ' for ch in '."\'-:%*!?@/,()#'})

    sentences1 = []
    sentences2 = []
    with open(input, 'r', encoding='utf-8') as fileobj:
        for line_no, line in enumerate(fileobj, start=1):
            if '\t' not in line:
                if line.strip():
                    raise ValueError(
                        "%s, line %d: expected two tab-separated sentences, got %r"
                        % (input, line_no, line)
                    )
                # Blank line (typically a trailing newline at end of file):
                # nothing to parse, so skip it instead of failing on s[1].
                continue
            s = line.split('\t')
            sentences1.append(s[0].rstrip().translate(punct_to_space))
            sentences2.append(s[1].rstrip().translate(punct_to_space))
    return sentences1, sentences2

In [11]:
def label_open(input):
    """Load one numeric score per line and rescale each by dividing by 5.

    Args:
        input: Path of a text file with one number per line.

    Returns:
        A 1-D numpy array containing every parsed score divided by 5, in
        file order.

    Raises:
        ValueError: If a line (after stripping whitespace) is not a valid
            float literal; this includes blank lines.
    """
    scaled = []
    with open(input, "r") as fileobj:
        for raw in fileobj:
            score = float(raw.strip())
            scaled.append(score / 5.)
    return np.asarray(scaled)

In [12]:
import numpy as np
# Training corpora. Each name selects a sentence-pair file
# input/STS.input.<name>.txt and its gold-score file gs/STS.gs.<name>.txt.
# The corpora are concatenated in exactly this order.
sentences1_parts, sentences2_parts, labels_parts = [], [], []
for corpus in ("OnWN", "MSRvid", "SMTeuroparl", "tweet-news"):
    sentences1n, sentences2n = file_open("input/STS.input.%s.txt" % corpus)
    labelsn = label_open("gs/STS.gs.%s.txt" % corpus)

    # Pair i and score i must describe the same example; catch a mismatch
    # here, per corpus, rather than training on silently misaligned data.
    if len(sentences1n) != len(labelsn):
        raise ValueError(
            "corpus %r: %d sentence pairs but %d gold scores"
            % (corpus, len(sentences1n), len(labelsn))
        )

    sentences1_parts.append(sentences1n)
    sentences2_parts.append(sentences2n)
    labels_parts.append(labelsn)

# One concatenation per column instead of re-copying the growing arrays
# after every corpus.
sentences1 = np.concatenate(sentences1_parts, axis=0)
sentences2 = np.concatenate(sentences2_parts, axis=0)
labels = np.concatenate(labels_parts, axis=0)


In [13]:
from nltk.stem import *
from keras.preprocessing.text import *
from keras.preprocessing.sequence import pad_sequences
import nltk


def lemmatize(texts, max_len=75, dim=300):
    """Turn sentences into a zero-padded tensor of word2vec embeddings.

    Despite its name this function performs no lemmatization: each text is
    tokenized with ``text_to_word_sequence`` and every token that is present
    in the module-level ``word2vec`` vectors is replaced by its embedding.

    Tokens missing from ``word2vec`` keep an all-zero row at their own
    position (later tokens are not shifted forward). Positions after the
    last token are zero as well. Tokens beyond ``max_len`` are dropped;
    previously such a sentence raised ``IndexError``.

    Args:
        texts: Sequence of sentence strings.
        max_len: Number of token positions per sentence in the output.
        dim: Length of one embedding vector; must match ``word2vec``.

    Returns:
        A numpy array of shape ``(len(texts), max_len, dim)``.
    """
    a = np.zeros((len(texts), max_len, dim))
    for i, text in enumerate(texts):
        # Slice before enumerating so j can never reach max_len.
        tokens = text_to_word_sequence(text)[:max_len]
        for j, word in enumerate(tokens):
            if word in word2vec:
                a[i, j, :] = word2vec[word]
    return a

In [None]:
# Number of token positions per sentence; build_model() reads this global
# for its input shape, so it must equal the width lemmatize() produces.
max_len = 75
data_left = lemmatize(sentences1)
data_right = lemmatize(sentences2)

labels = np.asarray(labels)

# Fail early if the two inputs and the targets are not aligned, or if the
# embedded width disagrees with max_len.
if not (data_left.shape[0] == data_right.shape[0] == labels.shape[0]):
    raise ValueError(
        "misaligned data: %d left, %d right, %d labels"
        % (data_left.shape[0], data_right.shape[0], labels.shape[0])
    )
if data_left.shape[1] != max_len:
    raise ValueError(
        "lemmatize() produced %d positions but max_len is %d"
        % (data_left.shape[1], max_len)
    )

print('Shape of data tensor:', data_left.shape)
print('Shape of labels tensor:', labels.shape)

# Seeded permutation: Keras' validation_split takes the trailing fraction of
# the arrays as given, so a fixed order makes the train/validation split
# reproducible across kernel restarts.
# NOTE: this cell overwrites `labels` with the shuffled copy while
# data_left/data_right are rebuilt from the unshuffled sentences. Running
# it a second time without re-running the loading cell first therefore
# misaligns labels and inputs.
indices = np.random.RandomState(42).permutation(data_left.shape[0])
data_left = data_left[indices]
data_right = data_right[indices]
labels = labels[indices]


In [15]:
import keras
from keras.layers import *
from keras.models import *
from keras_self_attention import SeqSelfAttention

def build_model():
    """Build and compile the siamese sentence-similarity network.

    Two inputs of shape ``(max_len, 300)`` (``max_len`` is read from the
    module-level global) are passed through the *same* bidirectional LSTM
    and the *same* self-attention layer, so both branches share weights.
    Each branch is averaged over the sequence axis and the two resulting
    vectors are compared with a normalized dot product (cosine similarity).

    The model is compiled with MSE loss, the Adam optimizer and MAE as a
    metric, and its summary is printed as a side effect.

    Returns:
        The compiled Keras ``Model`` taking ``[left, right]`` inputs.
    """
    input_left = Input(shape=(max_len, 300), name='input1')
    input_right = Input(shape=(max_len, 300), name='input2')

    # Shared encoder: one layer object called on both inputs.
    lstm = Bidirectional(
        LSTM(32, dropout=0.5, recurrent_dropout=0.5, return_sequences=True),
        merge_mode='ave',
    )

    # Shared attention layer, likewise applied to both branches.
    attention = SeqSelfAttention(
        attention_activation='sigmoid',
        kernel_regularizer=keras.regularizers.l2(1e-6),
        bias_regularizer=keras.regularizers.l1(1e-4),
        attention_regularizer_weight=1e-4,
    )

    left = lstm(input_left)
    left = attention(left)
    left = GlobalAveragePooling1D()(left)

    right = lstm(input_right)
    right = attention(right)
    right = GlobalAveragePooling1D()(right)

    out = Dot(axes=-1, normalize=True)([left, right])

    # `outputs=` is the Keras 2 keyword; the previous `output=` spelling is
    # the legacy Keras 1 name.
    model = Model(inputs=[input_left, input_right], outputs=out)

    model.compile(loss='mse', optimizer='adam', metrics=['mae'])
    model.summary()
    return model

In [None]:
from keras.callbacks import *

# Train on the prepared tensors, holding out 20% of the samples for
# validation. The checkpoint callback rewrites 'my_model_mask.h5' only when
# the monitored validation loss improves.
model = build_model()
history = model.fit(
    [data_left, data_right],
    labels,
    validation_split=0.2,
    epochs=100,
    batch_size=64,
    callbacks=[
        ModelCheckpoint(
            filepath='my_model_mask.h5',
            monitor='val_loss',
            save_best_only=True,
        ),
    ],
)


In [None]:
import matplotlib.pyplot as plt
import numpy as np
# Not read anywhere in this cell: the x-axis below is derived from the
# length of the recorded history instead.
num_epochs=100

mae = history.history["mae"]
val_mae = history.history['val_mae']
loss = history.history['loss']
val_loss = history.history['val_loss']

# Epochs are numbered from 1 for display.
epochs = range(1, len(mae) + 1)

# Loss curves: dots for training, solid line for validation.
fig_loss, ax_loss = plt.subplots()
ax_loss.plot(epochs, loss, 'bo', label='Training loss')
ax_loss.plot(epochs, val_loss, 'b', label='Validation loss')
ax_loss.set_title('Training and validation loss')
ax_loss.set_xlabel('Epoch')
ax_loss.set_ylabel('Loss')
ax_loss.legend()

# Mean-absolute-error curves on a separate figure.
fig_mae, ax_mae = plt.subplots()
ax_mae.plot(epochs, mae, 'bo', label='Training mae')
ax_mae.plot(epochs, val_mae, 'b', label='Validation mae')
ax_mae.set_title('Training and validation mae')
ax_mae.set_xlabel('Epoch')
ax_mae.set_ylabel('MAE')
ax_mae.legend()

plt.show()

In [25]:
# Read the held-out sentence pairs and embed each side with the same
# function used for the training data.
test1, test2 = (
    lemmatize(side) for side in file_open("input/STS.input.images.txt")
)

# Load the weights from the checkpoint file written during training, then
# score every test pair.
model.load_weights('my_model_mask.h5')
prediction = model.predict([test1, test2])