In [1]:
# Load Hyperparameters and global variables

In [2]:
import tensorflow as tf
import numpy as np
import argparse
import tqdm
import logging
import pprint # pretty print python objects
import sys
import os
import re 
import tempfile

# new additions
import tensorflow.keras
import tensorflow.keras.backend as K
from tensorflow.keras import Model, utils
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import concatenate, Dense, Input, Dropout, TimeDistributed, LSTM
from tensorflow.keras.layers import Embedding, BatchNormalization, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.regularizers import l2

# Central configuration for the notebook: dataset / embedding locations and
# training hyperparameters, looked up by key in the cells below.
FLAGS = {
    # SNLI corpus: download URL, local archive name, and the three split files extracted from it.
    "snli_link": "https://nlp.stanford.edu/projects/snli/snli_1.0.zip",
    "snli_zipfilename": "snli_1.0.zip",
    "snli_trainfilename": "snli_1.0_train.txt",
    "snli_validatefilename": "snli_1.0_dev.txt",
    "snli_testfilename": "snli_1.0_test.txt",
    # GloVe vectors: download URL, local archive name, and the member file that is extracted and parsed.
    "word_embeddings_link": "http://nlp.stanford.edu/data/glove.6B.zip",
    "word_embeddings_zipfilename": "glove.6B.zip",
    "word_embeddings_txtfilename": "glove.6B.50d.txt",
    # Target passed to np.save for the computed embedding matrix (np.save appends ".npy").
    "glove_store": "precomputed_glove.weights",
    # NOTE(review): "buffer_size", "max_data_items", "hidden_length" and "layers" are not read by any
    # live code in the cells visible here ("max_data_items" and "layers" appear only in commented-out lines).
    "buffer_size": 10000,
    "max_data_items": 100000,
    "batch_size": 512,
    "hidden_length": 64,
    "layers": 1,
    # Passed as `trainable` to the Embedding layer; False keeps the GloVe weights frozen.
    "train_embed": False,
    "embedding_size": 50, # must match the dimensionality of the GloVe file above (50d)
    # Upper bound on training epochs handed to model.fit.
    "num_epochs": 42,
}

def create_logger():
    """Configure the root logger for DEBUG-level stream output and return it.

    The function is idempotent: the stream handler it installs is tagged, and a
    second call (e.g. when the notebook cell is re-run) reuses it instead of
    stacking another handler, which would emit every log record multiple times.

    Returns:
        logging.Logger: the root logger.
    """
    log = logging.getLogger() # root logger
    log.setLevel(logging.DEBUG)

    # Look for the handler a previous call installed (identified by a private marker attribute).
    already_installed = any(
        getattr(existing, "_notebook_stream_handler", False) for existing in log.handlers
    )
    if not already_installed:
        handler = logging.StreamHandler()
        handler.setFormatter(logging.Formatter(fmt="%(asctime)s : %(levelname)s %(message)s"))
        handler._notebook_stream_handler = True
        log.addHandler(handler)
    return log

# Notebook-wide helpers: a pretty-printer, the root logger, and the word -> vector
# dictionary that load_glove_embeddings() fills in a later cell.
pp = pprint.PrettyPrinter(indent=2)
logger = create_logger()
glove_wordmap = {}

# Echo the effective configuration so it is recorded in the cell output.
print(pp.pformat(FLAGS))

{ 'batch_size': 512,
  'buffer_size': 10000,
  'embedding_size': 50,
  'glove_store': 'precomputed_glove.weights',
  'hidden_length': 64,
  'layers': 1,
  'max_data_items': 100000,
  'num_epochs': 42,
  'snli_link': 'https://nlp.stanford.edu/projects/snli/snli_1.0.zip',
  'snli_testfilename': 'snli_1.0_test.txt',
  'snli_trainfilename': 'snli_1.0_train.txt',
  'snli_validatefilename': 'snli_1.0_dev.txt',
  'snli_zipfilename': 'snli_1.0.zip',
  'train_embed': False,
  'word_embeddings_link': 'http://nlp.stanford.edu/data/glove.6B.zip',
  'word_embeddings_txtfilename': 'glove.6B.50d.txt',
  'word_embeddings_zipfilename': 'glove.6B.zip'}


In [3]:
# Load dataset manually
def prepare_snli_corpus():
    """Make sure the SNLI archive and its train / dev / test text files exist locally.

    The archive named by FLAGS['snli_zipfilename'] is downloaded from
    FLAGS['snli_link'] when it is not on disk. Each split file listed in FLAGS
    that is still missing is then extracted from the archive via
    unzip_single_file(). Progress is reported with print(); nothing is returned.
    """
    snli_link = FLAGS['snli_link']
    snli_zip_file = FLAGS['snli_zipfilename']
    split_files = (
        FLAGS['snli_trainfilename'],
        FLAGS['snli_validatefilename'],
        FLAGS['snli_testfilename'],
    )

    if os.path.isfile(snli_zip_file):
        print("Snli corpus file already downloaded. Extracting...")
    else:
        print("Snli corpus not found. Downloading from site...")
        import urllib.request
        # download snli zip file
        urllib.request.urlretrieve(snli_link, snli_zip_file)
        print("Snli corpus file downloaded. Extracting...")

    # extract train, validate and test files, always reporting the file actually handled
    for split_file in split_files:
        if os.path.isfile(split_file):
            print("{} already extracted.\n".format(split_file))
            continue
        unzip_single_file(snli_zip_file, split_file)
        print("Extracted {}\n".format(split_file))
    return

def prepare_glove_embeddings():
    """Make sure the GloVe text file named in FLAGS exists locally.

    Nothing is done when FLAGS['word_embeddings_txtfilename'] is already on disk.
    Otherwise the archive FLAGS['word_embeddings_zipfilename'] is downloaded from
    FLAGS['word_embeddings_link'] if it is missing, and the text file is then
    extracted from it. Extraction also happens when the archive was already
    present but the text file was not. Returns None.
    """
    glove_link = FLAGS['word_embeddings_link']
    glove_zip_file = FLAGS['word_embeddings_zipfilename']
    glove_text_file = FLAGS['word_embeddings_txtfilename']

    if os.path.isfile(glove_text_file):
        # embeddings already extracted, nothing to do
        return

    if not os.path.isfile(glove_zip_file):
        print("Glove embeddings not found. Downloading from site...")
        import urllib.request
        # download glove zip file
        urllib.request.urlretrieve(glove_link, glove_zip_file)
        print("Glove embeddings file downloaded.")

    # extract zip to text file (runs for a fresh download and for a pre-existing archive alike)
    unzip_single_file(glove_zip_file, glove_text_file)
    return

def unzip_single_file(zip_file_name, output_file_name):
    """
    Extract one member of a zip archive to `output_file_name`.

    If the output file already exists it is left untouched. Otherwise the archive
    is searched for a non-directory member whose path contains `output_file_name`.
    A member whose base name equals the requested base name is preferred over a
    mere substring match. The member is streamed to a temporary ".part" file that
    is renamed into place only after the copy completes, so neither a missing
    member nor an interrupted copy leaves behind an empty or truncated output file.

    Args:
        zip_file_name: path of the zip archive to read.
        output_file_name: file to create; also the name searched for in the archive.

    Returns:
        None. When no member matches, a message is printed and no file is created.
    """
    if os.path.isfile(output_file_name):
        return

    import shutil
    import zipfile

    print("Unzipping {} from {}..".format(output_file_name, zip_file_name))
    with zipfile.ZipFile(zip_file_name) as zipped:
        candidates = [
            info for info in zipped.infolist()
            if output_file_name in info.filename and not info.is_dir()
        ]
        if not candidates:
            print("{} not found inside {}; nothing extracted.".format(output_file_name, zip_file_name))
            return

        wanted_name = os.path.basename(output_file_name)
        member = next(
            (info for info in candidates if os.path.basename(info.filename) == wanted_name),
            candidates[0],
        )

        partial_name = output_file_name + ".part"
        try:
            with zipped.open(member) as requested_file, open(partial_name, "wb") as out_file:
                # stream in chunks instead of loading the whole member into memory
                shutil.copyfileobj(requested_file, out_file)
            os.replace(partial_name, output_file_name)
        finally:
            # only still present when the copy or the rename failed
            if os.path.exists(partial_name):
                os.remove(partial_name)
    print("Unzipped to {}".format(output_file_name))
    return

# Fetch / extract the SNLI splits and the GloVe text file before any cell tries to read them.
prepare_snli_corpus()
prepare_glove_embeddings()

Snli corpus file already downloaded. Extracting...
snli_1.0_train.txt already extracted.

snli_1.0_dev.txt already extracted.

snli_1.0_dev.txt already extracted.



In [4]:
# def sentence2sequence(sentence):
#     '''
#     Turns an input sentence into a (n, d) matrix. 
#     n is the number of tokens in the sentence.
#     d is the number of dimensions each word vector has.
#     '''
#     tokens = None

#     try:
#         tokens = sentence.decode().lower().split(" ")
#     except AttributeError: # not byte-encoded
#         tokens = sentence.lower().split(" ")
#     rows = []
#     words = []
#     for token in tokens: # each token is a word in the sentence
#         i = len(token)
#         while len(token) > 0 and i > 0:
#             word = token[:i]
#             if word in glove_wordmap:
#                 rows.append(glove_wordmap[word])
#                 words.append(word)
#                 token = token[i:]
#                 i = len(token)
#             else:
#                 # no such word, add keep reducing until we find a word
#                 i = i - 1
#     return { "words": words, "rows": rows }

# def sentence_score_setup(row):
#     convert_dict = {
#         'contradiction': 0,
#         'neutral': 1,
#         'entailment': 2
#     }
#     score = np.zeros((3,1))
#     for x in range(1,6):
#         tag = row["label"+str(x)]
#         if tag in convert_dict: 
#             score[convert_dict[tag]] += 1
#     return score / (1.0 * np.sum(score)) # return normalised np array

def fit_to_size(matrix, shape):
    """Return a zero-filled array of `shape` holding the overlapping corner of `matrix`.

    Along every axis the first min(matrix.shape[axis], shape[axis]) entries are
    copied; longer axes are cropped and shorter ones are zero-padded at the end.

    Args:
        matrix: numpy array with at most len(shape) dimensions.
        shape: target shape (tuple of ints).

    Returns:
        numpy.ndarray of `shape` (np.zeros default dtype, float64).
    """
    res = np.zeros(shape)
    # The index must be a tuple: a *list* of slices is not a valid multidimensional
    # index on current NumPy releases.
    overlap = tuple(slice(0, min(dim, shape[axis])) for axis, dim in enumerate(matrix.shape))
    res[overlap] = matrix[overlap]
    return res

# "Hello, there." -> "Hello , there . "  (a trailing space is left after final punctuation)
def surround_punc_w_space(string):
    """Pad the punctuation marks . , ! ? ( ) with spaces so they split off as separate tokens.

    After padding, every run of two or more whitespace characters is collapsed
    into a single space. Leading / trailing single spaces are not stripped.

    Args:
        string: text to process.

    Returns:
        str: the text with spaced-out punctuation.
    """
    string = re.sub(r'([.,!?()])', r' \1 ', string)
    # raw string: '\s' in a plain literal is an invalid escape sequence
    string = re.sub(r'\s{2,}', ' ', string)
    return string

def load_snli_data(filename, max_items=None):
    """Parse one SNLI tab-separated split into premises, hypotheses and one-hot labels.

    Rows whose gold label is "-" are skipped. Both sentences are lower-cased and
    passed through surround_punc_w_space().

    Args:
        filename: path of the SNLI .txt file (tab-separated with a header row).
        max_items: optional cap on the number of data rows read from the file
            (counted before the "-" rows are dropped). None reads everything.

    Returns:
        tuple: (premise_sentences, hypothesis_sentences, y), where the first two
        are lists of str and y is the result of utils.to_categorical with one
        column per label (contradiction=0, neutral=1, entailment=2).

    Raises:
        FileNotFoundError: if `filename` does not exist. The previous behaviour of
            printing a message and returning None only deferred the failure to
            the first caller that indexed the result.
    """
    if not os.path.isfile(filename):
        raise FileNotFoundError("ERROR: FILE NOT FOUND. EXITING... ({})".format(filename))

    print("Preprocessing {} & parsing to arrays...\n".format(filename))
    import csv
    import itertools

    convert_dict = { 'contradiction': 0, 'neutral': 1, 'entailment': 2 }
    premise_sentences = []
    hypothesis_sentences = []
    # class indices, turned into a binary class matrix (e.g. [1.0, 0.0, 0.0]) on return
    y = []
    with open(filename, "r", encoding='utf-8') as data:
        # The file is plain TSV, not quoted CSV: treat double quotes inside sentences as
        # ordinary characters so a sentence starting with '"' cannot swallow tabs or lines.
        reader = csv.DictReader(data, delimiter='\t', quoting=csv.QUOTE_NONE)
        for row in tqdm.tqdm(iterable=itertools.islice(reader, max_items)):
            if row["gold_label"] == "-":
                continue
            premise_sentences.append(surround_punc_w_space(row["sentence1"].lower()))
            hypothesis_sentences.append(surround_punc_w_space(row["sentence2"].lower()))
            y.append(convert_dict[row["gold_label"]])
    # Fix the width explicitly so a (sub)set lacking the highest class still yields 3 columns.
    labels = utils.to_categorical(np.array(y), num_classes=len(convert_dict))
    return (premise_sentences, hypothesis_sentences, labels)

# Load the three SNLI splits; no row limit is applied here, so every labelled row is kept.
# Each result is a (premise sentences, hypothesis sentences, one-hot labels) tuple.
training = load_snli_data(FLAGS['snli_trainfilename'])
validation = load_snli_data(FLAGS['snli_validatefilename'])
test = load_snli_data(FLAGS['snli_testfilename'])
# print("PREMISES {} \nHYPOTHESIS {} \nY {}".format(pp.pformat(training[0]), pp.pformat(training[1]), pp.pformat(training[2])))

0it [00:00, ?it/s]

Preprocessing snli_1.0_train.txt & parsing to arrays...



550152it [00:34, 15896.75it/s]
1296it [00:00, 12952.88it/s]

Preprocessing snli_1.0_dev.txt & parsing to arrays...



10000it [00:00, 13944.30it/s]
1616it [00:00, 16151.51it/s]

Preprocessing snli_1.0_test.txt & parsing to arrays...



10000it [00:00, 14092.39it/s]


In [5]:
# Vocabulary is built from the training split only (premises followed by hypotheses).
# The sentences were already lower-cased and punctuation-spaced when loaded, so the
# tokenizer is told not to lower-case or filter anything itself.
tokenizer = Tokenizer(filters='', lower=False)
tokenizer.fit_on_texts(training[0] + training[1])

# Tokenizer indices start at 1; one extra slot accounts for index 0.
VOCAB = 1 + len(tokenizer.word_counts)
LABELS = dict(contradiction=0, neutral=1, entailment=2)

# Sentence encoder factory; None makes the model cell fall back to summing embeddings.
# RNN = lambda *args, **kwargs: Bidirectional(LSTM(*args, **kwargs))
RNN = None
# LAYERS = FLAGS['layers']

# Values taken from the shared configuration.
TRAIN_EMBED = FLAGS['train_embed']
EMBED_HIDDEN_SIZE = FLAGS['embedding_size']
BATCH_SIZE = FLAGS['batch_size']
MAX_EPOCHS = FLAGS['num_epochs']

# Values fixed in this cell.
SENT_HIDDEN_SIZE = 50
PATIENCE = 4       # handed to EarlyStopping(patience=...) in the training cell
MAX_LEN = 42       # sequence length after padding / truncation, chosen arbitrarily
DP = 0.2           # dropout rate
L2 = 4e-6
ACTIVATION = 'relu'
OPTIMIZER = 'rmsprop'
embed = None       # embedding layer, assigned by load_glove_embeddings_matrix()

print("RNN / Embed / Sent = {}, {}, {}".format(RNN, EMBED_HIDDEN_SIZE, SENT_HIDDEN_SIZE))
print("Trainable word embeddings = {}".format(TRAIN_EMBED))


def to_seq(X):
    """Turn a list of sentences into an integer matrix padded / truncated to MAX_LEN."""
    sequences = tokenizer.texts_to_sequences(X)
    return pad_sequences(sequences, maxlen=MAX_LEN)


def prepare_data(data):
    """Convert a (premises, hypotheses, labels) tuple of text into model-ready arrays."""
    premises, hypotheses, labels = data[0], data[1], data[2]
    return (to_seq(premises), to_seq(hypotheses), labels)


# NOTE: the names are rebound from text to integer arrays, so this cell is not
# idempotent; re-run the loading cell before running it again.
training = prepare_data(training)
validation = prepare_data(validation)
test = prepare_data(test)

RNN / Embed / Sent = None, 50, 50
Trainable word embeddings = False


In [6]:
# Progress banner plus the vocabulary size (VOCAB, computed in the tokenizer cell).
print("Build mode...")
print("Vocab size=", VOCAB)

def load_glove_embeddings():
    """Fill the module-level `glove_wordmap` from the GloVe text file named in FLAGS.

    Every line is split on single spaces: the first field is the word and the
    remaining fields are converted to a float32 numpy vector. The first entry is
    printed as a sample, and the number of vectors held in the map is printed at
    the end. The dictionary is updated in place; nothing is returned.
    """
    with open(FLAGS['word_embeddings_txtfilename'], "r", encoding='utf-8') as glove:
        for line_number, line in enumerate(glove):
            word, *components = line.split(' ')
            # tensorflow only accepts arrays, not python lists
            vector = np.asarray(components, dtype='float32')
            if line_number == 0:
                # show one word with its feature vector as a sanity check
                print("Sample word \"{}\" with features {}".format(word, pp.pformat(vector)))
            glove_wordmap[word] = vector
    print("Glove wordmap populated, found %s vectors\n" % len(glove_wordmap))

def load_glove_embeddings_matrix():
    """Build the (VOCAB, EMBED_HIDDEN_SIZE) embedding matrix and the shared Embedding layer.

    Row i holds the GloVe vector of the tokenizer word with index i. Words absent
    from `glove_wordmap` keep an all-zero row. The matrix is saved with np.save
    under FLAGS['glove_store'] and the module-level `embed` is set to an Embedding
    layer initialised with it; `trainable` follows TRAIN_EMBED.

    Only a short sample of the missing words is printed, followed by a count of
    the rest, instead of one line per word. Returns None.
    """
    global embed

    GLOVE_STORE = FLAGS["glove_store"]
    # if not os.path.exists(GLOVE_STORE + ".npy"):
    print("Computing glove embedding matrix")

    # prepare embedding matrix
    embedding_matrix = np.zeros((VOCAB, EMBED_HIDDEN_SIZE))
    missing_words = []
    for word, i in tokenizer.word_index.items():
        embedding_vector = glove_wordmap.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
        else:
            # word not found in the GloVe map: its row stays all-zeros
            missing_words.append((i, word))

    # Report a bounded sample so the cell output is not flooded with thousands of lines.
    max_reported = 20
    for i, word in missing_words[:max_reported]:
        print("{} - missing from GloVe: {}".format(i, word))
    if len(missing_words) > max_reported:
        print("... and {} more words missing from GloVe".format(len(missing_words) - max_reported))

    # .npy appended to filename (first var)
    np.save(GLOVE_STORE, embedding_matrix)
    # print("Loading GloVe")
    # embedding_matrix = np.load(GLOVE_STORE + ".npy")
    print("Total number of null word embeddings:")
    # Count rows that are entirely zero (this includes the unused index-0 row). Testing the
    # row *sum* against 0 would also count non-zero vectors whose components cancel out.
    print(np.sum(~embedding_matrix.any(axis=1)))
    # the layer is frozen unless TRAIN_EMBED is truthy
    embed = Embedding(VOCAB, EMBED_HIDDEN_SIZE, weights=[embedding_matrix], input_length=MAX_LEN, trainable=TRAIN_EMBED)
    return

# Order matters: the word -> vector map has to be filled before the embedding
# matrix and the Embedding layer are built from it.
load_glove_embeddings()
load_glove_embeddings_matrix()
# enumerate_glove(glove_wordmap)

Build mode...
Vocab size= 38700
Sample word "the" with features array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)
Glove wordmap populated, found 400000 vectors

Computing glove embedding matrix
654 - missing from GloVe: man's
699 - missing from GloVe: blond-hair
851 - missing from GloVe: woman's
895 - missi

In [7]:
# Model graph: both sentences go through the same embedding, projection and encoder
# layers, are batch-normalised separately, concatenated, and classified by a small MLP.
# NOTE(review): the L2 constant and the imported `l2` regularizer are not used in this cell.

# Keyword arguments for the recurrent encoder; only consumed when RNN is not None.
rnn_kwargs = dict(units=SENT_HIDDEN_SIZE, dropout=DP, recurrent_dropout=DP)
# Fallback encoder: sums over axis 1. In the recorded model summary the tensor reaching
# this layer is (None, 42, 50), i.e. axis 1 is the MAX_LEN token axis.
SumEmbeddings = tf.keras.layers.Lambda(lambda x: K.sum(x, axis=1), output_shape=(SENT_HIDDEN_SIZE, ))

# One Dense projection, applied per time step and shared by premise and hypothesis.
translate = TimeDistributed(Dense(SENT_HIDDEN_SIZE, activation=ACTIVATION))
premise = Input(shape=(MAX_LEN,), dtype='int32')
hypothesis = Input(shape=(MAX_LEN,), dtype='int32')

# `embed` is the single Embedding layer created by load_glove_embeddings_matrix().
prem = embed(premise)
hypo = embed(hypothesis)

prem = translate(prem)
hypo = translate(hypo)

# With RNN = None this selects SumEmbeddings; the same encoder instance handles both inputs.
rnn = SumEmbeddings if not RNN else RNN(return_sequences=False, **rnn_kwargs)
prem = rnn(prem)
hypo = rnn(hypo)
# Two separate BatchNormalization instances, so normalisation statistics are not shared.
prem = BatchNormalization()(prem)
hypo = BatchNormalization()(hypo)

joint = concatenate([prem, hypo])
joint = Dropout(DP)(joint)
# Three identical Dense -> Dropout -> BatchNormalization blocks of width 2 * SENT_HIDDEN_SIZE.
for i in range(3):
    joint = Dense(2 * SENT_HIDDEN_SIZE, activation=ACTIVATION)(joint)
    joint = Dropout(DP)(joint)
    joint = BatchNormalization()(joint)
    
# Softmax over the entries of LABELS (contradiction / neutral / entailment).
pred = Dense(len(LABELS), activation='softmax')(joint)

In [None]:
# Assemble the two-input classifier and configure it for one-hot targets.
model = Model(inputs=[premise, hypothesis], outputs=pred)
model.compile(optimizer=OPTIMIZER, loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

print("Training")

# mkstemp returns an already-open OS-level descriptor along with the path; only the
# path is needed as the checkpoint target, so close the descriptor instead of leaking it.
checkpoint_fd, temp_filename = tempfile.mkstemp()
os.close(checkpoint_fd)

# Save the best model during validation and bail out of training early if we're not improving
callbacks = [EarlyStopping(patience=PATIENCE), ModelCheckpoint(temp_filename, save_best_only=True, save_weights_only=True)]

# The callbacks have to be handed to fit(); without `callbacks=` they are never invoked
# and training always runs for the full MAX_EPOCHS without writing a checkpoint.
model.fit(
    [training[0], training[1]],
    training[2],
    batch_size=BATCH_SIZE,
    epochs=MAX_EPOCHS,
    validation_data=([validation[0], validation[1]], validation[2]),
    callbacks=callbacks,
)


W0731 16:34:46.543843 139705335535424 deprecation.py:323] From /home/challenger2/anaconda3/envs/venv/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 42)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 42)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 42, 50)       1935000     input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
time_distributed (TimeDistribut (None, 42, 50)       2550        embedding[0][0]              