In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to the local
# helper modules imported below are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# imports
from utils import *
from display_rational import convert_res_to_htmls
from config import *
from losses import imbalanced_bce_bayesian, imbalanced_bce_resampling, exp_interval_loss
from metrices import *
from tqdm import tqdm_notebook
from bert import optimization
from bert import run_classifier
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.models import load_model
from tensorflow.python.keras.backend import set_session
from tensorflow.keras import backend as K
import numpy as np
import pandas as pd
import tensorflow_hub as hub
import os
import pickle
from collections import defaultdict
from datetime import datetime

import bert

import tensorflow
if tensorflow.__version__.startswith('2'):
    import tensorflow.compat.v1 as tf
    tf.disable_v2_behavior()
else:
    import tensorflow as tf

Instructions for updating:
non-resource variables are not supported in the long term



In [3]:
# Run configuration.
# The notebook reuses the command-line interface of the training script: the
# options are declared with argparse and then parsed from a hard-coded
# argument list so that the cell behaves the same on every run.
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('--par_lambda', type=float)
parser.add_argument('--gpu_id', type=str)
parser.add_argument('--batch_size', type=int)
parser.add_argument('--num_epochs', type=int)
parser.add_argument('--dataset', type=str, choices=['fever', 'multirc', 'movies'])
parser.add_argument("--do_train", action='store_true')
parser.add_argument('--exp_visualize', action='store_true')
parser.add_argument('--evaluate', action='store_true')
parser.add_argument('--exp_benchmark', action='store_true')
parser.add_argument('--modeling_structure', type=str, default='bert', choices=['bert', 'lstm'])
parser.add_argument('--exp_structure', type=str, default='gru', choices=['gru', 'rnr'])
parser.add_argument('--delete_checkpoints', action='store_true')
parser.add_argument('--merge_evidences', action='store_true')

# Arguments used inside the notebook. To read the real command line instead,
# replace the explicit list with a bare `parser.parse_args()`.
args = parser.parse_args((
    '--par_lambda 0.01 '
    '--gpu_id -1 '
    '--batch_size 2 '
    '--num_epochs 10 '
    '--dataset movies '
    '--do_train '
    '--evaluate '
    '--exp_benchmark '
    '--modeling_structure lstm '
    '--exp_structure rnr '
    '--delete_checkpoints '
    '--merge_evidences'
).split())

# Expose the parsed options under the module-level names used by later cells.
BATCH_SIZE, NUM_EPOCHS = args.batch_size, args.num_epochs
par_lambda, gpu_id = args.par_lambda, args.gpu_id
exp_structure, dataset = args.exp_structure, args.dataset
DO_DELETE = args.delete_checkpoints
do_train = args.do_train
load_best = not do_train  # complement of the --do_train switch
evaluate, exp_visualize, exp_benchmark = args.evaluate, args.exp_visualize, args.exp_benchmark
merge_evidences = args.merge_evidences

# Fixed here rather than exposed as a command-line option.
LEARNING_RATE = 1e-5

In [4]:
# Download (once) and extract (once) the pre-trained GloVe vectors via chakin.
import chakin

# Index 13 in the `chakin.search(lang='English')` listing is GloVe.6B.200d.
# NUMBER_OF_DIMENSIONS must match the chosen index because it selects which
# text file of the extracted archive is read later on.
CHAKIN_INDEX = 13
NUMBER_OF_DIMENSIONS = 200
SUBFOLDER_NAME = "glove.6B"

DATA_FOLDER = "embeddings"
ZIP_FILE = os.path.join(DATA_FOLDER, "{}.zip".format(SUBFOLDER_NAME))
# Sometimes the archive name starts with an all-lowercase "glove". The prefix
# has to be swapped on the file name only: slicing the full path (as was done
# before) also cut into DATA_FOLDER and produced a path that can never exist.
ZIP_FILE_ALT = os.path.join(DATA_FOLDER, "glove" + "{}.zip".format(SUBFOLDER_NAME)[5:])
UNZIP_FOLDER = os.path.join(DATA_FOLDER, SUBFOLDER_NAME)
if SUBFOLDER_NAME[-1] == "d":
    # The archive name already ends with the dimension (e.g. "...300d").
    GLOVE_FILENAME = os.path.join(UNZIP_FOLDER, "{}.txt".format(SUBFOLDER_NAME))
else:
    GLOVE_FILENAME = os.path.join(UNZIP_FOLDER, "{}.{}d.txt".format(SUBFOLDER_NAME, NUMBER_OF_DIMENSIONS))

# Skip the download when either spelling of the archive, or the extracted
# folder, is already on disk.
if not any(os.path.exists(path) for path in (ZIP_FILE, ZIP_FILE_ALT, UNZIP_FOLDER)):
    # GloVe by Stanford is licensed Apache 2.0:
    #     https://github.com/stanfordnlp/GloVe/blob/master/LICENSE
    #     Copyright 2014 The Board of Trustees of The Leland Stanford Junior University
    print("Downloading embeddings to '{}'".format(ZIP_FILE))
    chakin.download(number=CHAKIN_INDEX, save_dir='./{}'.format(DATA_FOLDER))
else:
    print("Embeddings already downloaded.")

if not os.path.exists(UNZIP_FOLDER):
    import zipfile
    # Fall back to the alternative spelling if that is what was saved.
    if not os.path.exists(ZIP_FILE) and os.path.exists(ZIP_FILE_ALT):
        ZIP_FILE = ZIP_FILE_ALT
    with zipfile.ZipFile(ZIP_FILE, "r") as zip_ref:
        print("Extracting embeddings to '{}'".format(UNZIP_FOLDER))
        zip_ref.extractall(UNZIP_FOLDER)
else:
    print("Embeddings already extracted.")

Embeddings already downloaded.
Embeddings already extracted.


In [5]:
def load_embedding_from_disks(glove_filename, with_indexes=True):
    """
    Read a GloVe txt file (one `word v1 v2 ... vN` entry per line, separated
    by single spaces).

    Args:
        glove_filename: path of the UTF-8 encoded GloVe text file.
        with_indexes: selects the return format, see below.

    Returns:
        If `with_indexes=True`, a tuple `(word_to_index_dict, index_to_embedding_array)`:
        a mapping from each word to its row index, and a 2-D numpy array with one
        row per word plus one final all-zero row. Unknown words map to the index
        of that final row.
        Otherwise a single `word_to_embedding_dict` mapping each word to its numpy
        vector; unknown words map to a list of zeros of the same length.

        Both mappings are `defaultdict`s, so looking up an unknown word also
        inserts it into the mapping.

    Raises:
        ValueError: if the file contains no embedding line at all.
    """
    word_to_index_dict = dict()
    index_to_embedding_array = []
    word_to_embedding_dict = dict()
    representation = None

    # GloVe files are UTF-8; do not depend on the platform's default encoding.
    with open(glove_filename, 'r', encoding='utf-8') as glove_file:
        for line in glove_file:
            line = line.rstrip('\r\n')
            if not line:
                # A blank (e.g. trailing) line would otherwise yield an empty vector.
                continue
            split = line.split(' ')
            word = split[0]
            representation = np.array([float(val) for val in split[1:]])
            if with_indexes:
                # Row index == number of vectors stored so far.
                word_to_index_dict[word] = len(index_to_embedding_array)
                index_to_embedding_array.append(representation)
            else:
                word_to_embedding_dict[word] = representation

    if representation is None:
        raise ValueError("No embeddings found in '{}'.".format(glove_filename))

    _WORD_NOT_FOUND = [0.0] * len(representation)  # Empty representation for unknown words.
    if with_indexes:
        _LAST_INDEX = len(index_to_embedding_array)  # row of the unknown-word vector
        word_to_index_dict = defaultdict(lambda: _LAST_INDEX, word_to_index_dict)
        index_to_embedding_array = np.array(index_to_embedding_array + [_WORD_NOT_FOUND])
        return word_to_index_dict, index_to_embedding_array
    else:
        # Seed the defaultdict with the loaded vectors; without the second
        # argument every word, known or not, would resolve to the zero vector.
        word_to_embedding_dict = defaultdict(lambda: _WORD_NOT_FOUND, word_to_embedding_dict)
        return word_to_embedding_dict

In [13]:
# Parse the GloVe text file into a word -> row-index mapping and the matching
# embedding matrix (the last row is the all-zero vector for unknown words).
print("Loading embedding from disks...")
word_to_index, index_to_embedding = load_embedding_from_disks(GLOVE_FILENAME, with_indexes=True)
print("Embedding loaded from disks.")

Loading embedding from disks...
Embedding loaded from disks.


In [15]:
# List the English embeddings chakin knows about; the number in the left-most
# column of the listing is the value to use for CHAKIN_INDEX.
chakin.search(lang='English')

                   Name  Dimension                     Corpus VocabularySize  \
2          fastText(en)        300                  Wikipedia           2.5M   
11         GloVe.6B.50d         50  Wikipedia+Gigaword 5 (6B)           400K   
12        GloVe.6B.100d        100  Wikipedia+Gigaword 5 (6B)           400K   
13        GloVe.6B.200d        200  Wikipedia+Gigaword 5 (6B)           400K   
14        GloVe.6B.300d        300  Wikipedia+Gigaword 5 (6B)           400K   
15       GloVe.42B.300d        300          Common Crawl(42B)           1.9M   
16      GloVe.840B.300d        300         Common Crawl(840B)           2.2M   
17    GloVe.Twitter.25d         25               Twitter(27B)           1.2M   
18    GloVe.Twitter.50d         50               Twitter(27B)           1.2M   
19   GloVe.Twitter.100d        100               Twitter(27B)           1.2M   
20   GloVe.Twitter.200d        200               Twitter(27B)           1.2M   
21  word2vec.GoogleNews        300      

In [14]:
# Sanity check: (number of words + 1 row for unknown words, embedding dimension).
# NOTE(review): must run before the later cell that does `del index_to_embedding`.
index_to_embedding.shape

(400001, 200)

In [None]:
# NOTE(review): neither `padding` nor `data` is defined in the visible cells;
# presumably they come from one of the star imports or an earlier kernel
# session — confirm before relying on Restart & Run All.
input_data = padding(data)

In [None]:
# Classification trunk: GloVe embedding -> BiGRU -> self-attention -> BiGRU -> sigmoid.
from tensorflow.keras.layers import (Input, Embedding, Bidirectional, Attention, Dense,
                                     Concatenate, Dropout, Lambda)
from tensorflow.keras.layers import GRU
#from tensorflow.keras.layers import CuDNNGRU as GRU

GRU_DIM = 200

# The Embedding layer is initialised from the GloVe matrix, so its size is
# dictated by that matrix rather than chosen independently.
vocab_size, embedding_dim = index_to_embedding.shape

# NOTE(review): `padded_length` is not defined in the visible cells — confirm
# where it comes from (config star import?) before running.
sentence_input = Input(shape=(padded_length, ), name='input_ids')
padding_mask_input = Input(shape=(padded_length, ), name='input_mask')
# Attention expects boolean masks where False marks a padding position.
attention_mask = Lambda(lambda x: tf.cast(x, tf.bool))(padding_mask_input)

embedding = Embedding(vocab_size, embedding_dim, weights=[index_to_embedding])(sentence_input)
gru1_seq = Bidirectional(layer=GRU(GRU_DIM, return_sequences=True), merge_mode='concat')(embedding)
# Self-attention over the first BiGRU's outputs: inputs are [query, value, key],
# masks are [query_mask, value_mask]. The layer is built first, then called.
attention = Attention()([gru1_seq, gru1_seq, gru1_seq], mask=[attention_mask, attention_mask])
# With return_state=True a bidirectional GRU returns the output sequence
# followed by the final forward and backward states.
gru2_seq, gru2_fwd_state, gru2_bwd_state = Bidirectional(
    layer=GRU(GRU_DIM, return_sequences=True, return_state=True),
    merge_mode='concat')(attention)
# Sentence-level summary used by the classifier: both final states side by side.
gru2_cls = Concatenate(axis=-1)([gru2_fwd_state, gru2_bwd_state])
# `name` belongs to the layer constructor, not to the layer call.
cls_output = Dense(1, activation='sigmoid', name='cls_output')(Dropout(rate=0.05)(gru2_cls))

# Explanation (rationale) head built on top of the per-token sequence output.
# Two variants are drafted: a per-token sigmoid ('gru') and a start/end
# interval distribution ('rnr').
#
# NOTE(review): this draft cannot run as written —
#   * `EXP_OUTPUT` is not defined in the visible cells; the argparse cell
#     stores the same 'gru'/'rnr' choice as `exp_structure` — confirm which
#     name is intended.
#   * `CuDNNGRU`, `CuDNNLSTM`, `Reshape`, `Multiply` and `Dot` are not imported
#     by name in the visible cells (they may come from a star import — verify).
#   * `bert_exp_output`, `NUM_INTERVAL_LSTM_WIDTH`, `MAX_SEQ_LENGTH` and
#     `outputs` are not defined in the visible cells; `bert_exp_output` looks
#     like a leftover from a BERT-based version of this model — presumably it
#     should be the GRU sequence output here, TODO confirm.
#   * the first statement of the 'rnr' branch has an unbalanced parenthesis,
#     so the whole cell is a SyntaxError.
if EXP_OUTPUT == 'gru':
    gru = CuDNNGRU(GRU_DIM, return_sequences=True)(gru2_seq)
    exp = Dense(1, activation='sigmoid')(gru)
    # Zero out the predictions at padding positions.
    # NOTE(review): the mask is cast to int32 while `exp` is a sigmoid output;
    # the element-wise product may need matching dtypes — verify.
    padding_mask_output = Lambda(lambda x: tf.cast(x, tf.int32))(padding_mask_input)
    output_mask = Reshape((padded_length, 1))(padding_mask_output)
    exp_outputs = Multiply(name='exp_output')([output_mask, exp])
elif EXP_OUTPUT == 'rnr':
    # NOTE(review): the call below is missing its closing parenthesis, and the
    # Concatenate is given a single-element list.
    M1 = Bidirectional(layer=CuDNNLSTM(NUM_INTERVAL_LSTM_WIDTH, return_sequences=True),
                       merge_mode='concat')(Concatenate(axis=-1)([bert_exp_output, ])
    p_starts = Dense(1, activation='sigmoid')(Concatenate(axis=-1)([bert_exp_output, M1]))

    # Weight M1 by the start scores, then repeat the result along the sequence axis.
    m1_tilde = Dot(axes=-2)([p_starts, M1])
    M1_tilde = Lambda(lambda x: tf.tile(x, (1, MAX_SEQ_LENGTH, 1)))(m1_tilde)
    x = Multiply()([M1, M1_tilde])
    M2 = Bidirectional(layer=CuDNNLSTM(NUM_INTERVAL_LSTM_WIDTH, return_sequences=True),
                       merge_mode='concat')(Concatenate(axis=-1)([bert_exp_output, M1, M1_tilde, x]))
    p_end_given_start = Dense(MAX_SEQ_LENGTH, activation='softmax')(Concatenate(axis=-1)([bert_exp_output, M2]))
    # band_part(x, 0, -1) keeps the upper triangle of the last two dimensions,
    # i.e. only entries whose column index is >= the row index.
    p_end_given_start = Lambda(lambda x: tf.linalg.band_part(x, 0, -1))(p_end_given_start)
    exp_outputs = Concatenate(axis=-1, name='exp_output')([p_starts, p_end_given_start])
    #exp_outputs = Lambda(lambda x: tf.reduce_sum(x, axis=-1, keepdims=True), name='exp_output')(p_dist)
outputs.append(exp_outputs)

# NOTE(review): the lines below look like a fragment pasted from the body of
# an encoder class: they reference `self` at module level and the two methods
# are indented without an enclosing `class` header, so they cannot be parsed
# in this position. Presumably kept only as a reference for how a GRU with
# `return_state=True` is used — confirm, then either wrap in a class or remove.
gru = tf.keras.layers.GRU(self.enc_units,
                                   return_sequences=True,
                                   return_state=True,
                                   recurrent_initializer='glorot_uniform')

       def call(self, x, hidden):
           x = self.embedding(x)
           output, state = self.gru(x, initial_state = hidden)
           return output, state

       def initialize_hidden_state(self):
           return tf.zeros((self.batch_sz, self.enc_units))

In [7]:
# Create a CPU-only TF1-style session and the graph nodes for embedding lookup.
batch_size = 2  # Any size is accepted

import os
# Hide every GPU from TensorFlow. The variable name has to be spelled exactly
# CUDA_VISIBLE_DEVICES; the previous misspelling was silently ignored.
# Set it before the session below is created so it is in place when the
# devices are first initialised.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.gpu_options.visible_device_list = ""
config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1

# Start from an empty graph so that re-running this cell does not pile up
# duplicate variables, and only then take the handle: a handle taken before
# the reset would point at the discarded graph.
tf.reset_default_graph()
graph = tf.get_default_graph()
sess = tf.InteractiveSession(config=config)  # sess = tf.Session()

# Define the variable that will hold the embedding:
# it is created zero-filled with the matrix's shape and frozen
# (trainable=False); the values are assigned in a separate step.
tf_embedding = tf.Variable(
    tf.constant(0.0, shape=index_to_embedding.shape),
    trainable=False,
    name="Embedding"
)

# One word id per batch element.
tf_word_ids = tf.placeholder(tf.int32, shape=[batch_size])

# Row lookup: each id selects the matching row of the embedding variable.
tf_word_representation_layer = tf.nn.embedding_lookup(
    params=tf_embedding,
    ids=tf_word_ids
)

In [8]:
# Copy the numpy embedding matrix into the TensorFlow variable by feeding it
# through a placeholder into an assign op.
tf_embedding_placeholder = tf.placeholder(tf.float32, shape=index_to_embedding.shape)
tf_embedding_init = tf_embedding.assign(tf_embedding_placeholder)
_ = sess.run(
    tf_embedding_init, 
    feed_dict={
        tf_embedding_placeholder: index_to_embedding
    }
)

print("Embedding now stored in TensorFlow. Can delete numpy array to clear some CPU RAM.")
# NOTE(review): after this `del` the cell cannot be re-run, and any other cell
# that reads `index_to_embedding` must have been executed before this one.
del index_to_embedding

Embedding now stored in TensorFlow. Can delete numpy array to clear some CPU RAM.


In [11]:

# Look up the embeddings of a small batch of words.
# The batch must contain exactly `batch_size` words because `tf_word_ids` has
# a fixed shape. `batch_of_words` was previously not defined anywhere in the
# notebook; the value below is the one shown in the recorded output.
batch_of_words = ["Hello", "!"]

# Words are lower-cased before the vocabulary lookup; words missing from the
# vocabulary fall back to the index of the all-zero vector.
batch_indexes = [word_to_index[w.lower()] for w in batch_of_words]

embedding_from_batch_lookup = sess.run(
    tf_word_representation_layer,
    feed_dict={
        tf_word_ids: batch_indexes
    }
)

print("\nRepresentations for {}:".format(batch_of_words))
print(embedding_from_batch_lookup)


Representations for ['Hello', '!']:
[[ 0.26609    0.21821   -0.10996   -0.48408   -0.11181   -0.09882
  -0.45315    0.44198   -0.034614   0.10541   -0.29537   -0.10881
   0.20916    0.52484   -0.17985   -0.31187   -0.25724    0.65267
   0.217      0.86503    0.47239   -0.078582   0.31035   -0.12155
  -0.12502   -0.40418    0.53803   -0.57842   -0.63668   -0.13502
  -0.040484   0.41378   -0.63201   -0.38847   -0.43767   -0.19706
   0.2878     0.36039   -0.032893  -0.20361   -0.34918    0.95923
  -0.51221   -0.19035    0.1567     0.17704    0.55302    0.27636
  -0.13707    0.91361    0.25948   -0.30107    0.48343   -0.046869
  -0.2796    -0.040385  -0.45773    0.2768    -0.14468    0.036539
   0.36018   -0.54939    0.19359   -0.38263   -0.29661   -0.18938
   0.095681   0.46646    0.3366     0.78351    0.49517   -0.82418
   0.34402   -0.50038   -0.71074   -0.25711   -0.36619    0.61746
  -0.31281   -0.042413   0.37915   -0.62383    0.27208    0.32852
  -0.23045   -0.12469    0.29898   -0.

In [None]:
# Persist the embedding variable as a TF checkpoint and the vocabulary as JSON.
# `json` is not imported by name anywhere else in the notebook, so import it here.
import json

prefix = SUBFOLDER_NAME + "." + str(NUMBER_OF_DIMENSIONS) + "d"
TF_EMBEDDINGS_FILE_NAME = os.path.join(DATA_FOLDER, prefix + ".ckpt")
DICT_WORD_TO_INDEX_FILE_NAME = os.path.join(DATA_FOLDER, prefix + ".json")

# Only the embedding variable goes into the checkpoint.
variables_to_save = [tf_embedding]
embedding_saver = tf.train.Saver(variables_to_save)
embedding_saver.save(sess, save_path=TF_EMBEDDINGS_FILE_NAME)
print("TF embeddings saved to '{}'.".format(TF_EMBEDDINGS_FILE_NAME))
# The session is not needed once the checkpoint is written.
sess.close()

# The JSON file stores the plain word -> index mapping; the default index for
# unknown words is not part of it and has to be re-created after loading.
with open(DICT_WORD_TO_INDEX_FILE_NAME, 'w') as f:
    json.dump(word_to_index, f)
print("word_to_index dict saved to '{}'.".format(DICT_WORD_TO_INDEX_FILE_NAME))