In [1]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

E: Package 'python-software-properties' has no installation candidate
··········


In [2]:
!mkdir -p drive
!google-drive-ocamlfuse drive

fuse: mountpoint is not empty
fuse: if you are sure this is safe, use the 'nonempty' mount option


In [3]:
%cd drive/Colab Notebooks/drive

/content/drive/Colab Notebooks/drive


In [4]:
import keras
from keras.layers import *
from keras.activations import softmax
from keras.models import Model
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization


def get_ESIM_model(nb_words, embedding_dim, embedding_matrix, recurrent_units, dense_units, dropout_rate, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                # embeddings_initializer='uniform',
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)

    input_q1_layer = Input(shape=(max_sequence_length,), dtype='int32', name='q1')
    input_q2_layer = Input(shape=(max_sequence_length,), dtype='int32', name='q2')

    embedding_sequence_q1 = BatchNormalization(axis=2)(embedding_layer(input_q1_layer))
    embedding_sequence_q2 = BatchNormalization(axis=2)(embedding_layer(input_q2_layer))

    final_embedding_sequence_q1 = SpatialDropout1D(0.25)(embedding_sequence_q1)
    final_embedding_sequence_q2 = SpatialDropout1D(0.25)(embedding_sequence_q2)

    rnn_layer_q1 = Bidirectional(LSTM(recurrent_units, return_sequences=True))(final_embedding_sequence_q1)
    rnn_layer_q2 = Bidirectional(LSTM(recurrent_units, return_sequences=True))(final_embedding_sequence_q2)

    attention = Dot(axes=-1)([rnn_layer_q1, rnn_layer_q2])
    w_attn_1 = Lambda(lambda x: softmax(x, axis=1))(attention)
    w_attn_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2))(attention))
    align_layer_1 = Dot(axes=1)([w_attn_1, rnn_layer_q1])
    align_layer_2 = Dot(axes=1)([w_attn_2, rnn_layer_q2])

    subtract_layer_1 = subtract([rnn_layer_q1, align_layer_1])
    subtract_layer_2 = subtract([rnn_layer_q2, align_layer_2])

    multiply_layer_1 = multiply([rnn_layer_q1, align_layer_1])
    multiply_layer_2 = multiply([rnn_layer_q2, align_layer_2])

    m_q1 = concatenate([rnn_layer_q1, align_layer_1, subtract_layer_1, multiply_layer_1])
    m_q2 = concatenate([rnn_layer_q2, align_layer_2, subtract_layer_2, multiply_layer_2])

    v_q1_i = Bidirectional(LSTM(recurrent_units, return_sequences=True))(m_q1)
    v_q2_i = Bidirectional(LSTM(recurrent_units, return_sequences=True))(m_q2)

    avgpool_q1 = GlobalAveragePooling1D()(v_q1_i)
    avgpool_q2 = GlobalAveragePooling1D()(v_q2_i)
    maxpool_q1 = GlobalMaxPooling1D()(v_q1_i)
    maxpool_q2 = GlobalMaxPooling1D()(v_q2_i)

    merged_q1 = concatenate([avgpool_q1, maxpool_q1])
    merged_q2 = concatenate([avgpool_q2, maxpool_q2])

    final_v = BatchNormalization()(concatenate([merged_q1, merged_q2]))
    output = Dense(units=dense_units, activation='relu')(final_v)
    output = BatchNormalization()(output)
    output = Dropout(dropout_rate)(output)
    output = Dense(units=out_size, activation='sigmoid')(output)

    model = Model(inputs=[input_q1_layer, input_q2_layer], output=output)
    adam_optimizer = keras.optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)

    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=['binary_crossentropy', 'accuracy'])

    return model

Using TensorFlow backend.


In [5]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
words_stop= stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [0]:
keep_punctuation=True

In [8]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
words_stop= stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [0]:
import numpy as np
from tqdm import tqdm
from string import punctuation as p
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
import json
import re

class Dataloader(object):
    def __init__(self,clean_data=True, remove_stopwords=True):
        self.q1_data, self.q2_data, self.label = self.read_dataset("train_temp_wikiSent.json")
        self.embedding_index = self.load_pretrain_embedding("model.txt")
        self.word_num=len(self.embedding_index.wv.vocab)
        if clean_data:
            if remove_stopwords:
                self.ignored_word = words_stop
            self.cleaned_q1_data, self.cleaned_q2_data = [], []
            for text in self.q1_data:
                self.cleaned_q1_data.append(self.clean_data(text))
            for text in self.q2_data:
                self.cleaned_q2_data.append(self.clean_data(text))
        self.q1_sequences, self.q2_sequences, self.word_index = self.tokenizer()
        self.nb_words, self.embedding_matrix = self.prepare_embedding_matrix()
       
    def read_dataset(self, train_path):
        q1_data=[]
        q2_data=[]
        label=[]
        with open(train_path,'r') as f:
            file_content=json.load(f)

        for key in file_content.keys():
            detail_content = file_content[key]
            q1_data.append (detail_content['claim'])

            evidences=detail_content['evidence']
            temp_evidence=""
            for evidence in evidences:
              temp_evidence.join(evidence[2])
            
            q2_data.append(temp_evidence)

        label = np.array(label)    
        return q1_data, q2_data, label

    def load_pretrain_embedding(self, file):
        print('Indexing word vector...')
        embedding_index = KeyedVectors.load_word2vec_format(file,binary=False)

        return embedding_index

    def clean_data(self, text):
        replace_numbers = re.compile(r'\d+', re.IGNORECASE)
        try:
          text = text.lower()
        except:
          print(text)
        text = re.sub(r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)", "", text)
        text = re.sub(r"(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(\.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}", "", text)
        text = re.sub(r"what's", "what is ", text)
        text = re.sub(r"\'s", " ", text)
        text = re.sub(r"\'ve", " have ", text)
        text = re.sub(r"can't", "cannot ", text)
        text = re.sub(r"n't", " not ", text)
        text = re.sub(r"i'm", "i am ", text)
        text = re.sub(r"i’m", "i am", text)
        text = re.sub(r"\'re", " are ", text)
        text = re.sub(r"\'d", " would ", text)
        text = re.sub(r"\'ll", " will ", text)
        text = re.sub(r"e - mail", "email", text)
        text = re.sub(r" +", "", text)

        stop_p = p + "~·！@#￥%……&*（）——=+-{}【】：；“”‘’《》，。？、|、"

        if keep_punctuation:
            text = re.sub(r"”", "\"", text)
            text = re.sub(r"“", "\"", text)
            text = re.sub(r"´", "'", text)
            text = re.sub(r"—", " ", text)
            text = re.sub(r"’", "'", text)
            text = re.sub(r"‘", "'", text)
            text = re.sub(r",", " ", text)
            text = re.sub(r"\.", " ", text)
            text = re.sub(r"!", " ! ", text)
            text = re.sub(r"\/", " ", text)
            text = re.sub(r"\^", " ^ ", text)
            text = re.sub(r"\+", " + ", text)
            text = re.sub(r"\-", " - ", text)
            text = re.sub(r"\=", " = ", text)
            text = re.sub(r"'", " ", text)
            text = re.sub(r":", " : ", text)
            text = re.sub(r"−", " ", text)
            text = re.sub(r"\?", " ? ", text)
            text = re.sub(r"\^", " ^ ", text)
            text = re.sub(r"#", " # ", text)
            text = re.sub(r"￥", "$", text)
        else:
            for token in stop_p:
                text = re.sub(token, "", text)

        text = replace_numbers.sub('', text)

        text = "".join([word for word in text if word not in self.ignored_word])

        return text

    def tokenizer(self):
        tokenizer = Tokenizer(num_words=None)
        tokenizer.fit_on_texts(self.cleaned_q1_data + self.cleaned_q2_data)
        q1_sequences = tokenizer.texts_to_sequences(self.cleaned_q1_data)
        q2_sequences = tokenizer.texts_to_sequences(self.cleaned_q2_data)

        word_index = tokenizer.word_index
        print('Found %s unique tokens' % len(word_index))

        # Padding
        q1_data = pad_sequences(q1_sequences, maxlen=60)
        print('Shape of q1_data tensor: ', q1_data.shape)
        q2_data = pad_sequences(q2_sequences, maxlen=60)
        print('Shape of q2_data tensor: ', q2_data.shape)
        print('Shape of label tensor: ', self.label.shape)

        return q1_data, q2_data, word_index

    def prepare_embedding_matrix(self):
        nb_words = len(self.word_index)
        embedding_matrix = np.zeros((nb_words + 1, 300))

        print('Creating embedding matrix ...')
        for word, idx in self.word_index.items():
            if word in self.embedding_index.wv.vocab:
                embedding_vector = self.embedding_index.wv[word]
                embedding_matrix[idx] = embedding_vector

        return nb_words, embedding_matrix

In [0]:
class Testloader(object):
    def __init__(self,clean_data=True, remove_stopwords=True):
        self.q1_data, self.q2_data, self.label = self.read_dataset("test-unlabelled_evidence_0.8_siameseLSTM_epoch9_with_sentence.json")
        self.embedding_index = self.load_pretrain_embedding("model.txt")
        self.word_num=len(self.embedding_index.wv.vocab)
        if clean_data:
            if remove_stopwords:
                self.ignored_word = words_stop
            self.cleaned_q1_data, self.cleaned_q2_data = [], []
            for text in self.q1_data:
                self.cleaned_q1_data.append(self.clean_data(text))
            for text in self.q2_data:
                self.cleaned_q2_data.append(self.clean_data(text))
        self.q1_sequences, self.q2_sequences, self.word_index = self.tokenizer()
        self.nb_words, self.embedding_matrix = self.prepare_embedding_matrix()
       
    def read_dataset(self, train_path):
        q1_data=[]
        q2_data=[]
        label=[]
        with open(train_path,'r') as f:
            file_content=json.load(f)

        for key in file_content.keys():
            detail_content = file_content[key]
            q1_data.append (detail_content['claim'])

            evidences=detail_content['evidence']
            temp_evidence=""
            for evidence in evidences:
              temp_evidence.join(evidence[2])
            
            q2_data.append(temp_evidence)

        label = np.array(label)    
        return q1_data, q2_data, label

    def load_pretrain_embedding(self, file):
        print('Indexing word vector...')
        embedding_index = KeyedVectors.load_word2vec_format(file,binary=False)

        return embedding_index

    def clean_data(self, text):
        replace_numbers = re.compile(r'\d+', re.IGNORECASE)
        try:
          text = text.lower()
        except:
          print(text)
       
        text = "".join([word for word in text if word not in self.ignored_word])

        return text

    def tokenizer(self):
        tokenizer = Tokenizer(num_words=None)
        tokenizer.fit_on_texts(self.cleaned_q1_data + self.cleaned_q2_data)
        q1_sequences = tokenizer.texts_to_sequences(self.cleaned_q1_data)
        q2_sequences = tokenizer.texts_to_sequences(self.cleaned_q2_data)

        word_index = tokenizer.word_index
        print('Found %s unique tokens' % len(word_index))

        # Padding
        q1_data = pad_sequences(q1_sequences, maxlen=60)
        print('Shape of q1_data tensor: ', q1_data.shape)
        q2_data = pad_sequences(q2_sequences, maxlen=60)
        print('Shape of q2_data tensor: ', q2_data.shape)
        print('Shape of label tensor: ', self.label.shape)

        return q1_data, q2_data, word_index

    def prepare_embedding_matrix(self):
        nb_words = len(self.word_index)
        embedding_matrix = np.zeros((nb_words + 1, 300))

        print('Creating embedding matrix ...')
        for word, idx in self.word_index.items():
            if word in self.embedding_index.wv.vocab:
                embedding_vector = self.embedding_index.wv[word]
                embedding_matrix[idx] = embedding_vector

        return nb_words, embedding_matrix

In [24]:
data_loader = Dataloader()

Indexing word vector...
Found 135360 unique tokens
Shape of q1_data tensor:  (145449, 60)
Shape of q2_data tensor:  (145449, 60)
Shape of label tensor:  (0,)
Creating embedding matrix ...


In [23]:
test_loader=Testloader()

Indexing word vector...
Found 15551 unique tokens
Shape of q1_data tensor:  (14997, 60)
Shape of q2_data tensor:  (14997, 60)
Shape of label tensor:  (0,)
Creating embedding matrix ...


In [36]:
import warnings, os
from keras.models import load_model
warnings.filterwarnings('ignore')

model_path='model_esimESIM_2.h5'
model = get_ESIM_model(data_loader.nb_words+1, 300, data_loader.embedding_matrix,
                           300, 300, 0.5, 60, 1)

model.load_weights(model_path)
eval_predict = model.predict([test_loader.q1_sequences, test_loader.q2_sequences], 
                                     batch_size=1024, verbose=1)



In [0]:
with open("test-unlabelled_evidence_0.8_siameseLSTM_epoch9_with_sentence.json",'r') as f:
      file_content=json.load(f)
      for i,key in enumerate(file_content.keys()):
          detail_content = file_content[key]
          
          if(eval_predict[i]==1):
              detail_content['label']='SUPPORTS'
          elif(eval_predict[i]==0):
              detail_content['label']='REFUTES'
          else:
              detail_content['label']='NOT ENOUGH INFO'

          evidences=detail_content['evidence']
          
          if(evidences==[]):
            detail_content['label']='NOT ENOUGH INFO'
          for evidence in evidences:
            del evidence[2]

with open('word2vec-result2.json','w') as t:
      json.dump(file_content, t, indent=2, separators=(',',':'))

In [40]:
print(test_loader.embedding_matrix)

[[ 0.        0.        0.       ...  0.        0.        0.      ]
 [ 0.064784 -0.094822  0.115844 ...  0.03993  -0.005045 -0.069976]
 [ 0.        0.        0.       ...  0.        0.        0.      ]
 ...
 [ 0.        0.        0.       ...  0.        0.        0.      ]
 [ 0.        0.        0.       ...  0.        0.        0.      ]
 [ 0.        0.        0.       ...  0.        0.        0.      ]]
