In [1]:
# Load Hyperparameters and global variables

In [2]:
import tensorflow as tf
import numpy as np
import argparse
import tqdm
import logging
import pprint # pretty print python objects
import sys
import os

FLAGS = {
    "config_file": "snli.config",
    "buffer_size": 10000,
    "max_data_items": 50000,
    "snli_link": "https://nlp.stanford.edu/projects/snli/snli_1.0.zip",
    "snli_zipfilename": "snli_1.0.zip",
    "snli_trainfilename": "snli_1.0_train.txt",
    "snli_validatefilename": "snli_1.0_dev.txt",
    "snli_testfilename": "snli_1.0_test.txt",
    "word_embeddings_link": "http://nlp.stanford.edu/data/glove.6B.zip",
    "word_embeddings_zipfilename": "glove.6B.zip",
    "word_embeddings_txtfilename": "glove.6B.50d.txt",
    "max_premise_length": 30,
    "max_hypothesis_length": 30,
    "batch_size": 128,
    "hidden_length": 64,
    "embedding_size": 50, # 50 dim embeddings
    "max_features": 50000,
    "num_epochs": 5
}

def create_logger():
    log = logging.getLogger() # root logger
    log.setLevel(logging.DEBUG)
    formatter = logging.Formatter(fmt="%(asctime)s : %(levelname)s %(message)s")
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    log.addHandler(handler)
    return logging.getLogger()

pp = pprint.PrettyPrinter(indent=2)
logger = create_logger()
glove_wordmap = {}
glove_wordmap_size = 0

print(pp.pformat(FLAGS))

{ 'batch_size': 128,
  'buffer_size': 10000,
  'config_file': 'snli.config',
  'embedding_size': 50,
  'hidden_length': 64,
  'max_data_items': 50000,
  'max_features': 50000,
  'max_hypothesis_length': 30,
  'max_premise_length': 30,
  'num_epochs': 5,
  'snli_link': 'https://nlp.stanford.edu/projects/snli/snli_1.0.zip',
  'snli_testfilename': 'snli_1.0_test.txt',
  'snli_trainfilename': 'snli_1.0_train.txt',
  'snli_validatefilename': 'snli_1.0_dev.txt',
  'snli_zipfilename': 'snli_1.0.zip',
  'word_embeddings_link': 'http://nlp.stanford.edu/data/glove.6B.zip',
  'word_embeddings_txtfilename': 'glove.6B.50d.txt',
  'word_embeddings_zipfilename': 'glove.6B.zip'}


In [3]:
# Load dataset manually
def prepare_snli_corpus():
    snli_link = FLAGS['snli_link']
    snli_zip_file = FLAGS['snli_zipfilename']
    snli_train_file = FLAGS['snli_trainfilename']
    snli_validate_file = FLAGS['snli_validatefilename']
    snli_test_file = FLAGS['snli_testfilename']
    
    if (not os.path.isfile(snli_zip_file)):
        print("Snli corpus not found. Downloading from site...")
        import urllib.request
        # download glove zip file
        urllib.request.urlretrieve(snli_link, snli_zip_file)
    print("Snli corpus file already downloaded. Extracting...")
    # extract train, validate and test files
    if (not os.path.isfile(snli_train_file)):
        unzip_single_file(snli_zip_file, snli_train_file)
        print("Extracted {}\n".format(snli_validate_file))
    else:
        print("{} already extracted.\n".format(snli_train_file))
    if (not os.path.isfile(snli_validate_file)):
        unzip_single_file(snli_zip_file, snli_validate_file)
        print("Extracted {}\n".format(snli_validate_file))
    else:
        print("{} already extracted.\n".format(snli_validate_file))
    if (not os.path.isfile(snli_test_file)):
        unzip_single_file(snli_zip_file, snli_test_file)
        print("Extracted {}\n".format(snli_test_file))
    else:
        print("{} already extracted.\n".format(snli_validate_file))
    return

def prepare_glove_embeddings():
    glove_link = FLAGS['word_embeddings_link']
    glove_zip_file = FLAGS['word_embeddings_zipfilename']
    glove_text_file = FLAGS['word_embeddings_txtfilename']
    
    if (not os.path.isfile(glove_zip_file) and not os.path.isfile(glove_text_file)):
        print("Glove embeddings not found. Downloading from site...")
        import urllib.request
        # download glove zip file
        urllib.request.urlretrieve(glove_link, glove_zip_file)
        print("Glove embeddings file downloaded.")
        # extract zip to text file
        unzip_single_file(glove_zip_file, glove_text_file)
    return

def unzip_single_file(zip_file_name, output_file_name):
    """
    If the outfile exists, don't recreate, else create from zipfile
    """
    if not os.path.isfile(output_file_name):
        import zipfile
        print("Unzipping glove embeddings {}..".format(zip_file_name))
        with open(output_file_name, "wb") as out_file:
            with zipfile.ZipFile(zip_file_name) as zipped:
                for info in zipped.infolist():
                    if output_file_name in info.filename:
                        with zipped.open(info) as requested_file:
                            out_file.write(requested_file.read())
                            print("Glove embeddings unzipped to {}".format(output_file_name))
                            return
    return

prepare_snli_corpus()
prepare_glove_embeddings()

Snli corpus file already downloaded. Extracting...
snli_1.0_train.txt already extracted.

snli_1.0_dev.txt already extracted.

snli_1.0_dev.txt already extracted.



In [4]:
def sentence2sequence(sentence):
    '''
    Turns an input sentence into a (n, d) matrix. 
    n is the number of tokens in the sentence.
    d is the number of dimensions each word vector has.
    '''
    tokens = None

    try:
        tokens = sentence.decode().lower().split(" ")
    except AttributeError: # not byte-encoded
        tokens = sentence.lower().split(" ")
    rows = []
    words = []
    for token in tokens: # each token is a word in the sentence
        i = len(token)
        while len(token) > 0 and i > 0:
            word = token[:i]
            if word in glove_wordmap:
                rows.append(glove_wordmap[word])
                words.append(word)
                token = token[i:]
                i = len(token)
            else:
                # no such word, add keep reducing until we find a word
                i = i - 1
    return { "words": words, "rows": rows }

def sentence_score_setup(row):
    convert_dict = {
        'entailment': 0,
        'neutral': 1,
        'contradiction': 2
    }
    score = np.zeros((3,1))
    for x in range(1,6):
        tag = row["label"+str(x)]
        if tag in convert_dict: 
            score[convert_dict[tag]] += 1
    return score / (1.0 * np.sum(score)) # return normalised np array

def fit_to_size(matrix, shape):
    res = np.zeros(shape)
    #print("Before: {}".format(pp.pformat(matrix.shape)))
    slices = [slice(0, min(dim, shape[e])) for e, dim in enumerate(matrix.shape)]
    res[slices] = matrix[slices]
    #print("After: {}".format(pp.pformat(res.shape)))
    return res

def load_glove_embeddings():
    global glove_wordmap
    global glove_wordmap_size

    glove_text_file = FLAGS['word_embeddings_txtfilename']
    printOne = True    

    with open(glove_text_file, "r", encoding='utf-8') as glove:
        for line in glove:
            values = line.split()
            word = values[0]
            # tensorflow only accepts arrays, not python lists
            featuresMatrix = np.asarray(values[1:], dtype='float32')
            # print a sample word with feature matrix
            if printOne:
                printOne = False
                print("Sample word \"{}\" with features {}".format(word, pp.pformat(featuresMatrix)))
            glove_wordmap[word] = featuresMatrix
    glove_wordmap_size = len(glove_wordmap)
    print("Glove wordmap populated, found %s vectors\n" % glove_wordmap_size)
    
def load_snli_data(filename):
    if not os.path.isfile(filename):
        print("ERROR: FILE NOT FOUND. EXITING...")
    else:
        print("Preprocessing {} & parsing to arrays...\n".format(filename))
        import csv
        with open(filename, "r", encoding='utf-8') as data:
            train = csv.DictReader(data, delimiter='\t')
            premise_embeds = []
            hypothesis_embeds = []
            labels = []
            scores = []
            i = 0
            for row in tqdm.tqdm(iterable=train):
                i += 1
                if i > FLAGS['max_data_items']:
                    break
                premise_embeds.append(
                    np.vstack(sentence2sequence(row["sentence1"].lower())["rows"]))
                hypothesis_embeds.append(
                    np.vstack(sentence2sequence(row["sentence2"].lower())["rows"]))
                labels.append(row["gold_label"])
                scores.append(sentence_score_setup(row))
                # print("Sample data piece: {}".format(pp.pformat(row)))
                # print(pp.pformat(sentence2sequence(row['sentence1'].lower())['rows']))
                # print(np.vstack(pp.pformat(sentence2sequence(row['sentence1'].lower())['rows'])))
                # print(pp.pformat(sentence_score_setup(row)))
            print("Padding premise embeddings...")
            max_premise_length = FLAGS['max_premise_length']
            max_hypothesis_length = FLAGS['max_hypothesis_length']
            embedding_size = FLAGS['embedding_size']
            premise_embeds_pad = np.stack([fit_to_size(x, (max_premise_length, embedding_size))
                                      for x in tqdm.tqdm(iterable=premise_embeds)])
#             premise_embeds_pad = np.array([fit_to_size(x, (max_premise_length, embedding_size))
#                                       for x in tqdm.tqdm(iterable=premise_embeds)])
            del premise_embeds
            print("Premise with shape {} Padding hypothesis embeddings...".format(premise_embeds_pad.shape))
            hypothesis_embeds_pad = np.stack([fit_to_size(x, (max_hypothesis_length, embedding_size))
                                         for x in tqdm.tqdm(iterable=hypothesis_embeds)])
#             hypothesis_embeds_pad = np.array([fit_to_size(x, (max_hypothesis_length, embedding_size))
#                                                      for x in tqdm.tqdm(iterable=hypothesis_embeds)])
            del hypothesis_embeds
            print("Hypothesis with shape {}.".format(hypothesis_embeds_pad.shape))
            return (premise_embeds_pad, hypothesis_embeds_pad), labels, np.array(scores)        
    
load_glove_embeddings()
# max_data_items rows processed
data_features_tuple, labels, scores = load_snli_data(FLAGS['snli_trainfilename'])
v_features_tuple, v_labels, v_scores = load_snli_data(FLAGS['snli_validatefilename'])

Sample word "the" with features array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)


387it [00:00, 3866.17it/s]

Glove wordmap populated, found 400000 vectors

Preprocessing snli_1.0_train.txt & parsing to arrays...



49649it [00:08, 7048.69it/s]
 16%|█▋        | 8140/50000 [00:00<00:00, 81392.51it/s]

Padding premise embeddings...


100%|██████████| 50000/50000 [00:00<00:00, 58440.94it/s]
 22%|██▏       | 11182/50000 [00:00<00:00, 111815.04it/s]

Premise with shape (50000, 30, 50) Padding hypothesis embeddings...


100%|██████████| 50000/50000 [00:00<00:00, 82349.60it/s]
0it [00:00, ?it/s]

Hypothesis with shape (50000, 30, 50).
Preprocessing snli_1.0_dev.txt & parsing to arrays...



10000it [00:01, 5107.61it/s]
100%|██████████| 10000/10000 [00:00<00:00, 66564.53it/s]


Padding premise embeddings...


100%|██████████| 10000/10000 [00:00<00:00, 64776.80it/s]

Premise with shape (10000, 30, 50) Padding hypothesis embeddings...





Hypothesis with shape (10000, 30, 50).


In [5]:
# a = tf.Variable(tf.ones(shape=(128,30,50)), name="a")
# b = tf.Variable(tf.zeros(shape=(128,30,50)), name="b")
# c = tf.concat([a, b], 1)
# d = tf.transpose(c, [1,0,2])
# print(d)
# e = tf.reshape(d, [-1, 50])
# print(e)
# f = tf.split(e, 60)
# print(pp.pformat(f))
# print(tf.equal(tf.Variable(True), tf.Variable(True)))
# print(np.array([[1,2,3], [1,2,3]]).shape[0])
# print(data_features_tuple[0].shape)
# print(np.random.randint(data_features_tuple[0].shape[0], size=128))

In [6]:
from tensorflow import keras
from tensorflow.keras import layers

In [7]:
input_length = FLAGS['max_premise_length'] + FLAGS['max_hypothesis_length'] # first layer length
batch_size = FLAGS['batch_size']
premise_len = FLAGS['max_premise_length']
hypothesis_len = FLAGS['max_hypothesis_length']
dim = FLAGS['embedding_size']
hidden_len = FLAGS['hidden_length']
num_epochs = FLAGS['num_epochs']
max_data_items = FLAGS['max_data_items']

LABELS = {
    'entailment': 0,
    'neutral': 1,
    'contradiction': 2
}
# print(data_features_tuple[0].shape)
# premise_embeds_tensor = tf.Variable(data_features_tuple[0]) 
# hypothesis_embeds_tensor = tf.Variable(data_features_tuple[1])
# print(premise_embeds_tensor.shape)
# x = tf.keras.layers.concatenate([premise_embeds_tensor, hypothesis_embeds_tensor], axis=1)
# print(x.shape)

# (max_inputs, 60, 50)
# input_tensor = layers.concatenate([data_features_tuple[0], data_features_tuple[1]], axis=1)
input_array = np.concatenate((data_features_tuple[0], data_features_tuple[1]), axis=1)
# validate_tensor = layers.concatenate([v_features_tuple[0], v_features_tuple[1]], axis=1)
validate_array = np.concatenate((v_features_tuple[0], v_features_tuple[1]), axis=1)

model = keras.Sequential()
model.add(layers.Bidirectional(layers.LSTM(input_length, return_sequences=True), input_shape=(60, dim)))
model.add(layers.Dense(hidden_len, activation="relu"))
model.add(layers.Dense(len(LABELS), activation="softmax"))
model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=['accuracy'])
model.summary()
print(len(input_array))
print(len(scores))

model.fit(x=input_array, y=scores, batch_size=128, epochs=num_epochs, validation_data=(validate_array, v_scores), verbose=2)

W0730 17:10:18.333141 140424002889536 deprecation.py:323] From /home/challenger2/anaconda3/envs/venv/lib/python3.6/site-packages/tensorflow/python/ops/math_grad.py:1250: add_dispatch_support.<locals>.wrapper (from tensorflow.python.ops.array_ops) is deprecated and will be removed in a future version.
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bidirectional (Bidirectional (None, 60, 120)           53280     
_________________________________________________________________
dense (Dense)                (None, 60, 64)            7744      
_________________________________________________________________
dense_1 (Dense)              (None, 60, 3)             195       
Total params: 61,219
Trainable params: 61,219
Non-trainable params: 0
_________________________________________________________________
50000
50000
Train on 50000 samples, validate on 10000 samples
Epoch 1/5


InvalidArgumentError:  logits and labels must have the same first dimension, got logits shape [7680,3] and labels shape [384]
	 [[node loss/dense_1_loss/SparseSoftmaxCrossEntropyWithLogits/SparseSoftmaxCrossEntropyWithLogits (defined at <ipython-input-7-b6650a068cb3>:37) ]] [Op:__inference_keras_scratch_graph_3518]

Function call stack:
keras_scratch_graph
