#this file does pure ESIM model 
including training and predicting

written by Linghan Zhou

---


In [0]:
import keras
from keras.layers import *
from keras.activations import softmax
from keras.models import Model
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization


def get_ESIM_model(nb_words, embedding_dim, embedding_matrix, recurrent_units, dense_units, dropout_rate, max_sequence_length, out_size):
    embedding_layer = Embedding(nb_words,
                                embedding_dim,
                                weights=[embedding_matrix],
                                input_length=max_sequence_length,
                                trainable=False)

    input_q1_layer = Input(shape=(max_sequence_length,), dtype='int32', name='q1')
    input_q2_layer = Input(shape=(max_sequence_length,), dtype='int32', name='q2')
    
    embedding_sequence_q1 = BatchNormalization(axis=2)(embedding_layer(input_q1_layer))
    embedding_sequence_q2 = BatchNormalization(axis=2)(embedding_layer(input_q2_layer))

    final_embedding_sequence_q1 = SpatialDropout1D(0.25)(embedding_sequence_q1)
    final_embedding_sequence_q2 = SpatialDropout1D(0.25)(embedding_sequence_q2)

    rnn_layer_q1 = Bidirectional(LSTM(recurrent_units, return_sequences=True))(final_embedding_sequence_q1)
    rnn_layer_q2 = Bidirectional(LSTM(recurrent_units, return_sequences=True))(final_embedding_sequence_q2)

    attention = Dot(axes=-1)([rnn_layer_q1, rnn_layer_q2])
    w_attn_1 = Lambda(lambda x: softmax(x, axis=1))(attention)
    w_attn_2 = Permute((2, 1))(Lambda(lambda x: softmax(x, axis=2))(attention))
    align_layer_1 = Dot(axes=1)([w_attn_1, rnn_layer_q1])
    align_layer_2 = Dot(axes=1)([w_attn_2, rnn_layer_q2])

    subtract_layer_1 = subtract([rnn_layer_q1, align_layer_1])
    subtract_layer_2 = subtract([rnn_layer_q2, align_layer_2])

    multiply_layer_1 = multiply([rnn_layer_q1, align_layer_1])
    multiply_layer_2 = multiply([rnn_layer_q2, align_layer_2])

    m_q1 = concatenate([rnn_layer_q1, align_layer_1, subtract_layer_1, multiply_layer_1])
    m_q2 = concatenate([rnn_layer_q2, align_layer_2, subtract_layer_2, multiply_layer_2])

    v_q1_i = Bidirectional(LSTM(recurrent_units, return_sequences=True))(m_q1)
    v_q2_i = Bidirectional(LSTM(recurrent_units, return_sequences=True))(m_q2)

    avgpool_q1 = GlobalAveragePooling1D()(v_q1_i)
    avgpool_q2 = GlobalAveragePooling1D()(v_q2_i)
    maxpool_q1 = GlobalMaxPooling1D()(v_q1_i)
    maxpool_q2 = GlobalMaxPooling1D()(v_q2_i)

    merged_q1 = concatenate([avgpool_q1, maxpool_q1])
    merged_q2 = concatenate([avgpool_q2, maxpool_q2])

    final_v = BatchNormalization()(concatenate([merged_q1, merged_q2]))
    output = Dense(units=dense_units, activation='relu')(final_v)
    output = BatchNormalization()(output)
    output = Dropout(dropout_rate)(output)
    output = Dense(units=out_size, activation='softmax')(output)

    model = Model(inputs=[input_q1_layer, input_q2_layer], output=output)
    adam_optimizer = keras.optimizers.Adam(lr=1e-3, decay=1e-6, clipvalue=5)

    model.compile(loss='binary_crossentropy', optimizer=adam_optimizer, metrics=[ 'binary_crossentropy', 'accuracy'])

    return model

In [2]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
words_stop= stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
keep_punctuation=True

In [0]:
import numpy as np
from tqdm import tqdm
from string import punctuation as p
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from gensim.models import KeyedVectors
from keras.utils import np_utils
import json

class Dataloader(object):
    def __init__(self,clean_data=True, remove_stopwords=True):
        self.q1_data, self.q2_data, self.label = self.read_dataset("word2train.json")
        self.embedding_index = self.load_pretrain_embedding("model.txt")
        self.word_num=len(self.embedding_index.wv.vocab)
        if clean_data:
            if remove_stopwords:
                self.ignored_word = words_stop
            self.cleaned_q1_data, self.cleaned_q2_data = [], []
            for text in self.q1_data:
                self.cleaned_q1_data.append(self.clean_data(text))
            for text in self.q2_data:
                self.cleaned_q2_data.append(self.clean_data(text))
        self.q1_sequences, self.q2_sequences, self.word_index = self.tokenizer()
        self.nb_words, self.embedding_matrix = self.prepare_embedding_matrix()

    def read_dataset(self, train_path):
        q1_data=[]
        q2_data=[]
        label=[]
        with open(train_path,'r') as f:
            file_content=json.load(f)

        for key in file_content.keys():
            detail_content = file_content[key]
            evidences=detail_content['evidence']
          
            temp_evidence=" "
            for evidence in evidences:
              temp_evidence=temp_evidence+evidence[2]
 
            q1_data.append(detail_content['claim'])
            q2_data.append(temp_evidence)
    
            if(detail_content['label']=='SUPPORTS'):
                label.append(1)
            elif(detail_content['label']=='REFUTES'):
                label.append(0)
            else:
                label.append(2)

        label=np_utils.to_categorical(label, 3)
   
        return q1_data, q2_data, label

    def load_pretrain_embedding(self, file):
        print('Indexing word vector...')
        embedding_index = KeyedVectors.load_word2vec_format(file,binary=False)

        return embedding_index
    def clean_data(self, text):
        try:
          text = text.lower()
        except:
          print(text)
        words=nltk.tokenize.word_tokenize(text)
        text = " ".join([word.lower() for word in words if word not in self.ignored_word and word.isalpha()])

        return text

    def tokenizer(self):
        tokenizer = Tokenizer(num_words=None)
        tokenizer.fit_on_texts(self.cleaned_q1_data + self.cleaned_q2_data)
        q1_sequences = tokenizer.texts_to_sequences(self.cleaned_q1_data)
        q2_sequences = tokenizer.texts_to_sequences(self.cleaned_q2_data)

        word_index = tokenizer.word_index
        print('Found %s unique tokens' % len(word_index))

        # Padding
        q1_data = pad_sequences(q1_sequences, maxlen=50)
        print('Shape of q1_data tensor: ', q1_data.shape)
        q2_data = pad_sequences(q2_sequences, maxlen=50)
        print('Shape of q2_data tensor: ', q2_data.shape)
        print('Shape of label tensor: ', self.label.shape)

        return q1_data, q2_data, word_index

    def prepare_embedding_matrix(self):
        nb_words = len(self.word_index)
        embedding_matrix = np.zeros((nb_words + 1, 300))

        print('Creating embedding matrix ...')
        for word, idx in self.word_index.items():
            if word in self.embedding_index.wv.vocab:
                embedding_vector = self.embedding_index.wv[word]
                embedding_matrix[idx] = embedding_vector

        return nb_words, embedding_matrix

In [0]:
model_path="esm3"

In [0]:
import warnings, os
import tensorflow as tf
import numpy as np
from sklearn.metrics import accuracy_score, log_loss, precision_score, recall_score, f1_score
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.backend.tensorflow_backend import set_session
from keras.models import load_model
warnings.filterwarnings('ignore')


# Init settings
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
# config.gpu_options.per_process_gpu_memory_fraction = 0.5
set_session(tf.Session(config=config))


def train_model_by_logloss(model, batch_size, train_q1, train_q2, train_y, val_q1, val_q2, val_y, fold_id):
    early_stopping = EarlyStopping(monitor='val_loss', patience=7)
    best_model_path = model_path + 'ESIM_' + str(fold_id) + '.h5'
    model_checkpoint = ModelCheckpoint(best_model_path, save_best_only=True, save_weights_only=True)
    hist = model.fit([train_q1, train_q2], train_y, validation_data=([val_q1, val_q2], val_y),
                     epochs=2, batch_size=batch_size, shuffle=True,
                     callbacks=[early_stopping, model_checkpoint])
    best_val_score = min(hist.history['val_loss'])
    predictions = model.predict([val_q1, val_q2])

    return model, best_val_score, predictions

def train_folds(q1, q2, y, fold_count, batch_size, get_model_func):
    fold_size = len(q1) // fold_count
    models, fold_predictions = [], []
    score, total_auc = 0, 0
    write_file = open('Logger.txt', 'w', encoding='utf-8')
    for fold_id in range(0, fold_count):
        fold_start = fold_size * fold_id
        fold_end = fold_start + fold_size

        if fold_id == fold_count - 1:
            fold_end = len(q1)

        train_q1 = np.concatenate([q1[:fold_start], q1[fold_end:]])
        train_q2 = np.concatenate([q2[:fold_start], q2[fold_end:]])
        train_y = np.concatenate([y[:fold_start], y[fold_end:]])

        val_q1 = q1[fold_start: fold_end]
        val_q2 = q2[fold_start: fold_end]
        val_y = y[fold_start: fold_end]

        print('In fold {}'.format(fold_id + 1))
        model, best_val_score, fold_prediction = train_model_by_logloss(get_model_func, batch_size,
                                                                             train_q1, train_q2, train_y,
                                                                             val_q1, val_q2, val_y, fold_id)
        score += best_val_score
        fold_predictions.append(fold_prediction)
        models.append(model)
        write_file.write('Fold {}\tLoss {}\n'.format(fold_id + 1, best_val_score))
        write_file.flush()
    
    write_file.close()

    return models, score / fold_count,  fold_predictions


def train():
    # q1 & q2 sequences (after tokenize operation) + label + embedding_matrix
    #data_loader = Dataloader()
    if not os.path.exists(model_path):
        os.makedirs(model_path)

    model = get_ESIM_model(data_loader.nb_words + 1, 300, data_loader.embedding_matrix,
                           300, 300, 0.5,
                           50, 3)
    # model = get_ESIM_model(pm.MAX_NB_WORDS, pm.EMBEDDING_DIM, None,
    #                        pm.RECURRENT_UNITS, pm.DENSE_UNITS, pm.DROPOUT_RATE,
    #                        pm.MAX_SEQUENCE_LENGTH, 1)
    print(model.summary())

    models, val_loss,  fold_predictions = train_folds(data_loader.q1_sequences,
                                                                data_loader.q2_sequences,
                                                                data_loader.label,
                                                                2,
                                                                1024,
                                                                model)

    print('Overall val-loss: {}'.format(val_loss))


def evaluate():
    '''
    For training OOB(out-of-bag) Evaluation.
    '''
    data_loader = Dataloader()
    eval_predicts_list = []
    for fold_id in range(0, 10):
        model = get_ESIM_model(data_loader.nb_words + 1, 300, data_loader.embedding_matrix,
                           300, 300, 0.5, 50, 3)

        model.load_weights(model_path + 'ESIM_' + str(fold_id) + '.h5')
        eval_predict = model.predict([data_loader.q1_sequences, data_loader.q2_sequences], 
                                     batch_size=1024, verbose=1)
        eval_predicts_list.append(eval_predict)
    
        train_loss = log_loss(data_loader.label, eval_predict)
        train_acc = accuracy_score(data_loader.label, eval_predict.round())
        train_precision = precision_score(data_loader.label, eval_predict.round())
        train_recall = recall_score(data_loader.label, eval_predict.round())
        train_f1_score = f1_score(data_loader.label, eval_predict.round())
        print('Training LOSS:{}\tACCURACY:{}\tPRECISION:{}\tRECALL:{}\tF1_SCORE:{}'.format(
             train_loss, train_acc, train_precision, train_recall, train_f1_score))

    
    train_fold_predictions = np.zeros(eval_predicts_list[0].shape)
    for fold_predict in eval_predicts_list:
        train_fold_predictions += fold_predict
    train_fold_predictions /= len(eval_predicts_list)

    train_auc = roc_auc_score(data_loader.label, train_fold_predictions)
    train_loss = log_loss(data_loader.label, train_fold_predictions)
    train_acc = accuracy_score(data_loader.label, train_fold_predictions.round())
    train_precision = precision_score(data_loader.label, train_fold_predictions.round())
    train_recall = recall_score(data_loader.label, train_fold_predictions.round())
    train_f1_score = f1_score(data_loader.label, train_fold_predictions.round())
    print('Training LOSS:{}\tACCURACY:{}\tPRECISION:{}\tRECALL:{}\tF1_SCORE:{}'.format(
         train_loss, train_acc, train_precision, train_recall, train_f1_score))



In [0]:
class Testloader(object):
    def __init__(self,clean_data=True, remove_stopwords=True):
        self.q1_data, self.q2_data, self.label = self.read_dataset("test-unlabelled_evidence_0.8_siameseLSTM_epoch9_with_sentence.json")
        if clean_data:
            if remove_stopwords:
                self.ignored_word = words_stop
            self.cleaned_q1_data, self.cleaned_q2_data = [], []
            for text in self.q1_data:
                self.cleaned_q1_data.append(self.clean_data(text))
            for text in self.q2_data:
                self.cleaned_q2_data.append(self.clean_data(text))
        self.q1_sequences, self.q2_sequences, self.word_index = self.tokenizer()
       
    def read_dataset(self, train_path):
        q1_data=[]
        q2_data=[]
        label=[]
        with open(train_path,'r') as f:
            file_content=json.load(f)

        for key in file_content.keys():
            detail_content = file_content[key]
            q1_data.append (detail_content['claim'])

            evidences=detail_content['evidence']
            temp_evidence=""
            for evidence in evidences:
              temp_evidence+=evidence[2]
            
            q2_data.append(temp_evidence)

        label = np.array(label)    
        return q1_data, q2_data, label

    def clean_data(self, text):
        try:
          text = text.lower()
        except:
          print(text)
       
        words=nltk.tokenize.word_tokenize(text)
        text = " ".join([word.lower() for word in words if word not in self.ignored_word and word.isalpha()])

        return text

    def tokenizer(self):
        tokenizer = Tokenizer(num_words=None)
        tokenizer.fit_on_texts(self.cleaned_q1_data + self.cleaned_q2_data)
        q1_sequences = tokenizer.texts_to_sequences(self.cleaned_q1_data)
        q2_sequences = tokenizer.texts_to_sequences(self.cleaned_q2_data)

        word_index = tokenizer.word_index
        print('Found %s unique tokens' % len(word_index))

        # Padding
        q1_data = pad_sequences(q1_sequences, maxlen=50)
        print('Shape of q1_data tensor: ', q1_data.shape)
        q2_data = pad_sequences(q2_sequences, maxlen=50)
        print('Shape of q2_data tensor: ', q2_data.shape)
        print('Shape of label tensor: ', self.label.shape)

        return q1_data, q2_data, word_index


In [11]:
!apt-get install -y -qq software-properties-common python-software-properties module-init-tools
!add-apt-repository -y ppa:alessandro-strada/ppa 2>&1 > /dev/null
!apt-get update -qq 2>&1 > /dev/null
!apt-get -y install -qq google-drive-ocamlfuse fuse
from google.colab import auth
auth.authenticate_user()
from oauth2client.client import GoogleCredentials
creds = GoogleCredentials.get_application_default()
import getpass
!google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret} < /dev/null 2>&1 | grep URL
vcode = getpass.getpass()
!echo {vcode} | google-drive-ocamlfuse -headless -id={creds.client_id} -secret={creds.client_secret}

E: Package 'python-software-properties' has no installation candidate
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
··········
Please, open the following URL in a web browser: https://accounts.google.com/o/oauth2/auth?client_id=32555940559.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive&response_type=code&access_type=offline&approval_prompt=force
Please enter the verification code: Access token retrieved correctly.


In [0]:
!mkdir -p drive
!google-drive-ocamlfuse -o nonempty drive

In [13]:

%cd drive/Colab Notebooks/drive

/content/drive/Colab Notebooks/drive


In [14]:
data_loader = Dataloader()

Indexing word vector...
Found 39689 unique tokens
Shape of q1_data tensor:  (101053, 50)
Shape of q2_data tensor:  (101053, 50)
Shape of label tensor:  (101053, 3)
Creating embedding matrix ...


In [17]:
train()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
q1 (InputLayer)                 (None, 50)           0                                            
__________________________________________________________________________________________________
q2 (InputLayer)                 (None, 50)           0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 50, 300)      11907000    q1[0][0]                         
                                                                 q2[0][0]                         
__________________________________________________________________________________________________
batch_normalization_5 (BatchNor (None, 50, 300)      1200        embedding_2[0][0]                
__________

KeyboardInterrupt: ignored

In [0]:
test_loader=Testloader()

Found 27726 unique tokens
Shape of q1_data tensor:  (14997, 50)
Shape of q2_data tensor:  (14997, 50)
Shape of label tensor:  (0,)


In [0]:
import warnings, os
from keras.models import load_model
warnings.filterwarnings('ignore')

model_path='esm3ESIM_1.h5ESIM_1.h5'
model = get_ESIM_model(data_loader.nb_words+1, 300, data_loader.embedding_matrix,
                           300, 300, 0.5, 50, 3)

model.load_weights(model_path)
eval_predict = model.predict([test_loader.q1_sequences, test_loader.q2_sequences], 
                                     batch_size=1024, verbose=1)



In [0]:


with open("test-unlabelled_evidence_0.8_siameseLSTM_epoch9_with_sentence.json",'r') as f:
      file_content=json.load(f)
      for i,key in enumerate(file_content.keys()):
          detail_content = file_content[key]
          result=eval_predict[i]
          lab=np.argmax(result)
          if(lab==1):
              detail_content['label']='SUPPORTS'
          elif(lab==0):
              detail_content['label']='REFUTES'
          else:
              detail_content['label']='NOT ENOUGH INFO'

          evidences=detail_content['evidence']
          if(evidences==[]):
            detail_content['label']='NOT ENOUGH INFO'
          for evidence in evidences:
            del evidence[2]

with open('word2vec-result2.json','w') as t:
      json.dump(file_content, t, indent=2, separators=(',',':'))