In [None]:
# Load Hyperparameters and global variables

In [None]:
import tensorflow as tf
import numpy as np
import argparse
import tqdm
import logging
import pprint # pretty print python objects
import sys
import os

# new additions


# Notebook-wide configuration. The cells below read every file name, URL and
# size limit from this dict instead of hard-coding them.
FLAGS = {
    "config_file": "snli.config",
    "buffer_size": 10000,
    # Upper bound on the number of rows load_snli_data reads from one split.
    "max_data_items": 50000,
    # SNLI corpus: download URL, archive name, and the three split files in it.
    "snli_link": "https://nlp.stanford.edu/projects/snli/snli_1.0.zip",
    "snli_zipfilename": "snli_1.0.zip",
    "snli_trainfilename": "snli_1.0_train.txt",
    "snli_validatefilename": "snli_1.0_dev.txt",
    "snli_testfilename": "snli_1.0_test.txt",
    # GloVe vectors: fetched over HTTPS, consistent with the SNLI link above,
    # so the archive is not transferred over an unauthenticated connection.
    "word_embeddings_link": "https://nlp.stanford.edu/data/glove.6B.zip",
    "word_embeddings_zipfilename": "glove.6B.zip",
    "word_embeddings_txtfilename": "glove.6B.50d.txt",
    # Sequence lengths and model sizes (presumably token counts per sentence
    # and layer widths -- confirm against the model cell).
    "max_premise_length": 30,
    "max_hypothesis_length": 30,
    "batch_size": 128,
    "hidden_length": 64,
    "embedding_size": 50, # 50 dim embeddings, matches glove.6B.50d.txt
    "max_features": 50000,
    "num_epochs": 5
}

def create_logger():
    """Configure the root logger for DEBUG output and return it.

    A single stream handler with a timestamped format is attached to the
    root logger. The handler carries a fixed name so that calling this
    function again (for example when the notebook cell is re-run) does not
    attach a second handler and duplicate every log line.

    Returns:
        logging.Logger: the root logger.
    """
    log = logging.getLogger() # root logger
    log.setLevel(logging.DEBUG)

    handler_name = "snli_notebook_stream"
    already_attached = any(
        getattr(existing, "name", None) == handler_name for existing in log.handlers
    )
    if not already_attached:
        handler = logging.StreamHandler()
        handler.set_name(handler_name)
        handler.setFormatter(
            logging.Formatter(fmt="%(asctime)s : %(levelname)s %(message)s")
        )
        log.addHandler(handler)
    return log

# Shared notebook state used by the cells below.
# glove_wordmap maps a word to its embedding vector once load_glove_embeddings
# has run; glove_wordmap_size caches the number of entries.
glove_wordmap, glove_wordmap_size = {}, 0
logger = create_logger()
pp = pprint.PrettyPrinter(indent=2)

# Echo the active configuration so each run records the settings it used.
print(pp.pformat(FLAGS))

In [None]:
# Load dataset manually
def prepare_snli_corpus():
    """Make sure the SNLI train, validate and test files exist on disk.

    File names and the download URL are read from FLAGS. The archive is
    downloaded only when it is absent and at least one split still has to be
    extracted; splits that already exist are left untouched. Each status
    message names the file it is actually about.
    """
    snli_link = FLAGS['snli_link']
    snli_zip_file = FLAGS['snli_zipfilename']
    split_files = (
        FLAGS['snli_trainfilename'],
        FLAGS['snli_validatefilename'],
        FLAGS['snli_testfilename'],
    )

    missing_splits = [name for name in split_files if not os.path.isfile(name)]

    if os.path.isfile(snli_zip_file):
        print("Snli corpus file already downloaded. Extracting...")
    elif missing_splits:
        print("Snli corpus not found. Downloading from site...")
        import urllib.request
        # download snli zip file
        urllib.request.urlretrieve(snli_link, snli_zip_file)
        print("Snli corpus file downloaded. Extracting...")

    # extract train, validate and test files
    for split_file in split_files:
        if os.path.isfile(split_file):
            print("{} already extracted.\n".format(split_file))
        else:
            unzip_single_file(snli_zip_file, split_file)
            print("Extracted {}\n".format(split_file))
    return

def prepare_glove_embeddings():
    """Make sure the GloVe text file named in FLAGS exists on disk.

    Nothing happens when the text file is already present. Otherwise the
    archive is downloaded if it is missing, and the text file is extracted
    from it. An archive that is already on disk is therefore still unpacked
    when the text file has not been extracted yet.
    """
    glove_link = FLAGS['word_embeddings_link']
    glove_zip_file = FLAGS['word_embeddings_zipfilename']
    glove_text_file = FLAGS['word_embeddings_txtfilename']

    if os.path.isfile(glove_text_file):
        return

    if not os.path.isfile(glove_zip_file):
        print("Glove embeddings not found. Downloading from site...")
        import urllib.request
        # download glove zip file
        urllib.request.urlretrieve(glove_link, glove_zip_file)
        print("Glove embeddings file downloaded.")

    # extract zip to text file
    unzip_single_file(glove_zip_file, glove_text_file)
    return

def unzip_single_file(zip_file_name, output_file_name):
    """
    Extract a single member of a zip archive to output_file_name.

    If the output file already exists it is left untouched. Otherwise the
    archive is searched for a member whose base name equals the base name of
    output_file_name; if there is none, the first member whose path contains
    output_file_name is used instead. The exact match is preferred so that
    look-alike entries such as "__MACOSX/.../._name" are not picked up.

    The member is streamed to a temporary "<output>.part" file and renamed
    into place only after the copy succeeds, so neither a missing member nor
    an interrupted copy leaves behind a file that later runs would mistake
    for a completed extraction.

    Args:
        zip_file_name: path of the zip archive to read.
        output_file_name: path of the file to create.

    Returns:
        None. A message is printed if no matching member is found.
    """
    if os.path.isfile(output_file_name):
        return

    import shutil
    import zipfile

    print("Unzipping {} from {}..".format(output_file_name, zip_file_name))
    wanted_name = os.path.basename(output_file_name)

    with zipfile.ZipFile(zip_file_name) as zipped:
        exact_match = None
        loose_match = None
        for info in zipped.infolist():
            if info.is_dir():
                continue
            # Zip member paths always use "/" as the separator.
            if info.filename.rsplit("/", 1)[-1] == wanted_name:
                exact_match = info
                break
            if loose_match is None and output_file_name in info.filename:
                loose_match = info
        member = exact_match if exact_match is not None else loose_match

        if member is None:
            print("{} not found in {}; nothing extracted.".format(output_file_name, zip_file_name))
            return

        partial_name = output_file_name + ".part"
        try:
            with zipped.open(member) as requested_file:
                with open(partial_name, "wb") as out_file:
                    # Stream in chunks instead of loading the member into memory.
                    shutil.copyfileobj(requested_file, out_file)
            os.replace(partial_name, output_file_name)
        finally:
            # Only present here if the copy or the rename failed.
            if os.path.exists(partial_name):
                os.remove(partial_name)

    print("{} unzipped to {}".format(member.filename, output_file_name))
    return

# Run both preparation steps when this cell executes. Each helper reads its
# URL and file names from FLAGS and checks the disk before downloading.
prepare_snli_corpus()
prepare_glove_embeddings()

In [None]:
def sentence2sequence(sentence):
    '''
    Break a sentence into known vocabulary pieces and look up their vectors.

    The sentence (str, or bytes that are decoded first) is lower-cased and
    split on single spaces. Each token is consumed greedily from the left:
    the longest prefix found in glove_wordmap is taken, then matching
    restarts on what is left of the token. As soon as no prefix of the
    remainder is known, the rest of that token is discarded.

    Returns a dict with two parallel lists:
      "words": the matched pieces, in order.
      "rows":  the glove_wordmap entry for each matched piece.
    '''
    try:
        tokens = sentence.decode().lower().split(" ")
    except AttributeError: # not byte-encoded
        tokens = sentence.lower().split(" ")

    matched_words = []
    matched_rows = []
    for token in tokens:
        remainder = token
        while remainder:
            # Try the longest candidate prefix first, shrinking by one char.
            for end in range(len(remainder), 0, -1):
                piece = remainder[:end]
                if piece in glove_wordmap:
                    matched_rows.append(glove_wordmap[piece])
                    matched_words.append(piece)
                    remainder = remainder[end:]
                    break
            else:
                # No prefix of the remainder is known: drop it.
                break
    return { "words": matched_words, "rows": matched_rows }

def sentence_score_setup(row):
    """Turn the five annotator labels of a row into a class distribution.

    Reads row["label1"] .. row["label5"], counts how often each of
    'contradiction' (index 0), 'neutral' (index 1) and 'entailment' (index 2)
    occurs, and divides the counts by the number of recognised labels.
    Unrecognised values (for example empty strings) are ignored.

    Args:
        row: mapping that contains the keys "label1" to "label5".

    Returns:
        np.ndarray of shape (3, 1). The entries sum to 1 when at least one
        label is recognised; when none is, the all-zero array is returned
        instead of dividing by zero and producing NaNs.
    """
    convert_dict = {
        'contradiction': 0,
        'neutral': 1,
        'entailment': 2
    }
    score = np.zeros((3,1))
    for x in range(1,6):
        tag = row["label"+str(x)]
        if tag in convert_dict:
            score[convert_dict[tag]] += 1
    total = np.sum(score)
    if total == 0:
        # No usable vote: nothing to normalise.
        return score
    return score / (1.0 * total) # return normalised np array

def fit_to_size(matrix, shape):
    """Crop or zero-pad matrix so the result has exactly the given shape.

    Along every axis the leading min(matrix dim, target dim) entries are
    copied into a zero-filled array of the target shape; anything beyond the
    target size is dropped and any remaining space stays zero.

    Args:
        matrix: NumPy array to fit.
        shape: target shape; needs at least as many entries as matrix has
            axes.

    Returns:
        np.ndarray of the requested shape (np.zeros default dtype).
    """
    res = np.zeros(shape)
    # The index must be a tuple of slices: NumPy no longer accepts a list of
    # slices as a multi-dimensional index.
    slices = tuple(slice(0, min(dim, shape[e])) for e, dim in enumerate(matrix.shape))
    res[slices] = matrix[slices]
    return res

def load_glove_embeddings():
    """Read the GloVe text file named in FLAGS into the module-level word map.

    Each line is split on whitespace: the first field is the word, the
    remaining fields become a float32 NumPy vector stored under that word in
    glove_wordmap. The first word read is printed as a sample. Afterwards
    glove_wordmap_size is set to the number of entries and reported.
    """
    global glove_wordmap
    global glove_wordmap_size

    source_path = FLAGS['word_embeddings_txtfilename']
    sample_shown = False

    with open(source_path, "r", encoding='utf-8') as handle:
        for raw_line in handle:
            fields = raw_line.split()
            token = fields[0]
            # Stored as a NumPy array rather than a Python list.
            vector = np.asarray(fields[1:], dtype='float32')
            if not sample_shown:
                # Show one entry so the reader can see the data layout.
                print("Sample word \"{}\" with features {}".format(token, pp.pformat(vector)))
                sample_shown = True
            glove_wordmap[token] = vector

    glove_wordmap_size = len(glove_wordmap)
    print("Glove wordmap populated, found %s vectors\n" % glove_wordmap_size)
    
def _embed_sentence(sentence, max_length, embedding_size):
    """Return a (max_length, embedding_size) float32 matrix for one sentence.

    The vectors found by sentence2sequence are stacked row by row, cut off
    after max_length rows and zero-padded below; a sentence without any known
    word yields an all-zero matrix.
    """
    padded = np.zeros((max_length, embedding_size), dtype=np.float32)
    rows = sentence2sequence(sentence)["rows"][:max_length]
    if rows:
        stacked = np.vstack(rows)
        width = min(embedding_size, stacked.shape[1])
        padded[:stacked.shape[0], :width] = stacked[:, :width]
    return padded


def load_snli_data(filename):
    """Parse one SNLI split into padded embedding arrays and labels.

    At most FLAGS['max_data_items'] rows are read from the tab-separated
    file. Rows whose gold_label is not one of 'contradiction', 'neutral' or
    'entailment' (SNLI uses '-' when annotators did not agree) are skipped so
    that features and labels stay aligned. glove_wordmap must already be
    populated, because sentence2sequence looks every word up in it.

    Args:
        filename: path of an SNLI .txt split with the columns "gold_label",
            "sentence1" and "sentence2".

    Returns:
        A tuple ((premises, hypotheses), labels, class_ids):
          premises:   float32 array (rows, max_premise_length, embedding_size)
          hypotheses: float32 array (rows, max_hypothesis_length, embedding_size)
          labels:     list with the gold label string of every kept row
          class_ids:  int array, contradiction=0, neutral=1, entailment=2

    Raises:
        FileNotFoundError: if filename does not exist. Previously None was
            returned, which only failed later when the caller unpacked it.
    """
    if not os.path.isfile(filename):
        print("ERROR: FILE NOT FOUND. EXITING...")
        raise FileNotFoundError(filename)

    print("Preprocessing {} & parsing to arrays...\n".format(filename))
    import csv

    convert_dict = { 'contradiction': 0, 'neutral': 1, 'entailment': 2 }
    max_items = FLAGS['max_data_items']
    premise_length = FLAGS['max_premise_length']
    hypothesis_length = FLAGS['max_hypothesis_length']
    embedding_size = FLAGS['embedding_size']

    premise_embeds = []
    hypothesis_embeds = []
    labels = []
    # integer class id per kept row, e.g. 2 for 'entailment'
    y = []

    with open(filename, "r", encoding='utf-8', newline='') as data:
        # QUOTE_NONE: the fields are plain tab-separated text, so a '"' inside
        # a sentence must be kept as an ordinary character, not a CSV quote.
        reader = csv.DictReader(data, delimiter='\t', quoting=csv.QUOTE_NONE)
        for index, row in enumerate(tqdm.tqdm(iterable=reader)):
            if index >= max_items:
                break
            gold_label = row["gold_label"]
            if gold_label not in convert_dict:
                continue
            premise = (row["sentence1"] or "").lower()
            hypothesis = (row["sentence2"] or "").lower()
            premise_embeds.append(_embed_sentence(premise, premise_length, embedding_size))
            hypothesis_embeds.append(_embed_sentence(hypothesis, hypothesis_length, embedding_size))
            labels.append(gold_label)
            y.append(convert_dict[gold_label])

    if premise_embeds:
        premise_embeds_pad = np.stack(premise_embeds)
        hypothesis_embeds_pad = np.stack(hypothesis_embeds)
    else:
        # np.stack rejects an empty list; keep the rank callers expect.
        premise_embeds_pad = np.zeros((0, premise_length, embedding_size), dtype=np.float32)
        hypothesis_embeds_pad = np.zeros((0, hypothesis_length, embedding_size), dtype=np.float32)
    return (premise_embeds_pad, hypothesis_embeds_pad), labels, np.array(y, dtype=np.int64)
    
# Populate the module-level glove_wordmap before any SNLI split is loaded.
load_glove_embeddings()
# Load the train and validate splits; load_snli_data stops reading each file
# after FLAGS['max_data_items'] rows.
data_features_tuple, labels, scores = load_snli_data(FLAGS['snli_trainfilename'])
v_features_tuple, v_labels, v_scores = load_snli_data(FLAGS['snli_validatefilename'])

In [None]:
from tensorflow import keras
from tensorflow.keras import layers

In [None]:
# a = tf.Variable(tf.ones(shape=(128,30,50)), name="a")
# b = tf.Variable(tf.zeros(shape=(128,30,50)), name="b")
# c = tf.concat([a, b], 1)
# d = tf.transpose(c, [1,0,2])
# print(d)
# e = tf.reshape(d, [-1, 50])
# print(e)
# f = tf.split(e, 60)
# print(pp.pformat(f))
# print(tf.equal(tf.Variable(True), tf.Variable(True)))
# print(np.array([[1,2,3], [1,2,3]]).shape[0])
# print(data_features_tuple[0].shape)
# print(np.random.randint(data_features_tuple[0].shape[0], size=128))

# input_length = FLAGS['max_premise_length'] + FLAGS['max_hypothesis_length'] # first layer length
# batch_size = FLAGS['batch_size']
# premise_len = FLAGS['max_premise_length']
# hypothesis_len = FLAGS['max_hypothesis_length']
# dim = FLAGS['embedding_size']
# hidden_len = FLAGS['hidden_length']
# num_epochs = FLAGS['num_epochs']
# max_data_items = FLAGS['max_data_items']

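# NOTE: this class order is the reverse of convert_dict in load_snli_data
# (contradiction=0, neutral=1, entailment=2); only len(LABELS) is used below.
# LABELS = {
#     'entailment': 0,
#     'neutral': 1,
#     'contradiction': 2
# }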
# print(data_features_tuple[0].shape)
# premise_embeds_tensor = tf.Variable(data_features_tuple[0]) 
# hypothesis_embeds_tensor = tf.Variable(data_features_tuple[1])
# print(premise_embeds_tensor.shape)
# x = tf.keras.layers.concatenate([premise_embeds_tensor, hypothesis_embeds_tensor], axis=1)
# print(x.shape)

# (max_inputs, 60, 50)
# input_tensor = layers.concatenate([data_features_tuple[0], data_features_tuple[1]], axis=1)
# input_array = np.concatenate((data_features_tuple[0], data_features_tuple[1]), axis=1)
# validate_tensor = layers.concatenate([v_features_tuple[0], v_features_tuple[1]], axis=1)
# validate_array = np.concatenate((v_features_tuple[0], v_features_tuple[1]), axis=1)

# model = keras.Sequential()
# model.add(layers.Bidirectional(layers.LSTM(input_length, return_sequences=True), input_shape=(60, dim)))
# model.add(layers.Dense(hidden_len, activation="relu"))
# model.add(layers.Dense(len(LABELS), activation="softmax"))
# model.compile(optimizer="rmsprop", loss="sparse_categorical_crossentropy", metrics=['accuracy'])
# model.summary()
# print(len(input_array))
# print(len(scores))

# model.fit(x=input_array, y=scores, batch_size=128, epochs=num_epochs, validation_data=(validate_array, v_scores), verbose=2)