In [1]:
# Load Hyperparameters and global variables

In [2]:
import tensorflow as tf
import numpy as np
import argparse
import tqdm
import logging
import pprint # pretty print python objects
import sys
import os

FLAGS = {
    "config_file": "snli.config",
    "buffer_size": 10000,
    "snli_link": "https://nlp.stanford.edu/projects/snli/snli_1.0.zip",
    "snli_zipfilename": "snli_1.0.zip",
    "snli_trainfilename": "snli_1.0_train.txt",
    "snli_validatefilename": "snli_1.0_validate.txt",
    "snli_testfilename": "snli_1.0_test.txt",
    "word_embeddings_link": "http://nlp.stanford.edu/data/glove.6B.zip",
    "word_embeddings_zipfilename": "glove.6B.zip",
    "word_embeddings_txtfilename": "glove.6B.50d.txt",
    "max_premise_length": 30,
    "max_hypothesis_length": 30,
    "batch_size": 128,
    "hidden_length": 64,
    "embedding_size": 50, # 50 dim embeddings
    "max_features": 50000,
    "num_epochs": 5
}

def create_logger():
    log = logging.getLogger() # root logger
    log.setLevel(logging.DEBUG)
    formatter = logging.Formatter(fmt="%(asctime)s : %(levelname)s %(message)s")
    handler = logging.StreamHandler()
    handler.setFormatter(formatter)
    log.addHandler(handler)
    return logging.getLogger()

pp = pprint.PrettyPrinter(indent=2)
logger = create_logger()
glove_wordmap = {}
glove_wordmap_size = 0

print(pp.pformat(FLAGS))

{ 'batch_size': 128,
  'buffer_size': 10000,
  'config_file': 'snli.config',
  'embedding_size': 50,
  'hidden_length': 64,
  'max_features': 50000,
  'max_hypothesis_length': 30,
  'max_premise_length': 30,
  'num_epochs': 5,
  'snli_link': 'https://nlp.stanford.edu/projects/snli/snli_1.0.zip',
  'snli_testfilename': 'snli_1.0_test.txt',
  'snli_trainfilename': 'snli_1.0_train.txt',
  'snli_validatefilename': 'snli_1.0_validate.txt',
  'snli_zipfilename': 'snli_1.0.zip',
  'word_embeddings_link': 'http://nlp.stanford.edu/data/glove.6B.zip',
  'word_embeddings_txtfilename': 'glove.6B.50d.txt',
  'word_embeddings_zipfilename': 'glove.6B.zip'}


In [3]:
# Load dataset manually
def prepare_snli_corpus():
    snli_link = FLAGS['snli_link']
    snli_zip_file = FLAGS['snli_zipfilename']
    snli_train_file = FLAGS['snli_trainfilename']
    snli_validate_file = FLAGS['snli_validatefilename']
    snli_test_file = FLAGS['snli_testfilename']
    
    if (not os.path.isfile(snli_zip_file)):
        print("Snli corpus not found. Downloading from site...")
        import urllib.request
        # download glove zip file
        urllib.request.urlretrieve(snli_link, snli_zip_file)
    print("Snli corpus file already downloaded. Extracting...")
    # extract train, validate and test files
    if (not os.path.isfile(snli_train_file)):
        unzip_single_file(snli_zip_file, snli_train_file)
        print("Extracted {}\n".format(snli_validate_file))
    else:
        print("{} already extracted.\n".format(snli_train_file))
    if (not os.path.isfile(snli_validate_file)):
        unzip_single_file(snli_zip_file, snli_validate_file)
        print("Extracted {}\n".format(snli_validate_file))
    else:
        print("{} already extracted.\n".format(snli_validate_file))
    if (not os.path.isfile(snli_test_file)):
        unzip_single_file(snli_zip_file, snli_test_file)
        print("Extracted {}\n".format(snli_test_file))
    else:
        print("{} already extracted.\n".format(snli_validate_file))
    return

def prepare_glove_embeddings():
    glove_link = FLAGS['word_embeddings_link']
    glove_zip_file = FLAGS['word_embeddings_zipfilename']
    glove_text_file = FLAGS['word_embeddings_txtfilename']
    
    if (not os.path.isfile(glove_zip_file) and not os.path.isfile(glove_text_file)):
        print("Glove embeddings not found. Downloading from site...")
        import urllib.request
        # download glove zip file
        urllib.request.urlretrieve(glove_link, glove_zip_file)
        print("Glove embeddings file downloaded.")
        # extract zip to text file
        unzip_single_file(glove_zip_file, glove_text_file)
    return

def unzip_single_file(zip_file_name, output_file_name):
    """
    If the outfile exists, don't recreate, else create from zipfile
    """
    if not os.path.isfile(output_file_name):
        import zipfile
        print("Unzipping glove embeddings {}..".format(zip_file_name))
        with open(output_file_name, "wb") as out_file:
            with zipfile.ZipFile(zip_file_name) as zipped:
                for info in zipped.infolist():
                    if output_file_name in info.filename:
                        with zipped.open(info) as requested_file:
                            out_file.write(requested_file.read())
                            print("Glove embeddings unzipped to {}".format(output_file_name))
                            return
    return

prepare_snli_corpus()
prepare_glove_embeddings()

Snli corpus file already downloaded. Extracting...
snli_1.0_train.txt already extracted.

snli_1.0_validate.txt already extracted.

snli_1.0_validate.txt already extracted.



In [15]:
def sentence2sequence(sentence):
    '''
    Turns an input sentence into a (n, d) matrix. 
    n is the number of tokens in the sentence.
    d is the number of dimensions each word vector has.
    '''
    tokens = None

    try:
        tokens = sentence.decode().lower().split(" ")
    except AttributeError: # not byte-encoded
        tokens = sentence.lower().split(" ")
    rows = []
    words = []
    for token in tokens: # each token is a word in the sentence
        i = len(token)
        while len(token) > 0 and i > 0:
            word = token[:i]
            if word in glove_wordmap:
                rows.append(glove_wordmap[word])
                words.append(word)
                token = token[i:]
                i = len(token)
            else:
                # no such word, add keep reducing until we find a word
                i = i - 1
    return { "words": words, "rows": rows }

def sentence_score_setup(row):
    convert_dict = {
        'entailment': 0,
        'neutral': 1,
        'contradiction': 2
    }
    score = np.zeros((3,1))
    for x in range(1,6):
        tag = row["label"+str(x)]
        if tag in convert_dict: 
            score[convert_dict[tag]] += 1
    return score / (1.0 * np.sum(score)) # return normalised np array

def fit_to_size(matrix, shape):
    res = np.zeros(shape)
    #print("Before: {}".format(pp.pformat(matrix.shape)))
    slices = [slice(0, min(dim, shape[e])) for e, dim in enumerate(matrix.shape)]
    res[slices] = matrix[slices]
    #print("After: {}".format(pp.pformat(res.shape)))
    return res

def load_glove_embeddings():
    global glove_wordmap
    global glove_wordmap_size

    glove_text_file = FLAGS['word_embeddings_txtfilename']
    printOne = True    

    with open(glove_text_file, "r", encoding='utf-8') as glove:
        for line in glove:
            values = line.split()
            word = values[0]
            # tensorflow only accepts arrays, not python lists
            featuresMatrix = np.asarray(values[1:], dtype='float32')
            # print a sample word with feature matrix
            if printOne:
                printOne = False
                print("Sample word \"{}\" with features {}".format(word, pp.pformat(featuresMatrix)))
            glove_wordmap[word] = featuresMatrix
    glove_wordmap_size = len(glove_wordmap)
    print("Glove wordmap populated, found %s vectors\n" % glove_wordmap_size)
    
def load_snli_data(filename):
    if not os.path.isfile(filename):
        print("ERROR: FILE NOT FOUND. EXITING...")
    else:
        print("Preprocessing snli data & parsing to arrays...")
        import csv
        with open(filename, "r", encoding='utf-8') as data:
            train = csv.DictReader(data, delimiter='\t')
            premise_embeds = []
            hypothesis_embeds = []
            labels = []
            scores = []
            for row in tqdm.tqdm(iterable=train):
                premise_embeds.append(
                    np.vstack(sentence2sequence(row["sentence1"].lower())["rows"]))
                hypothesis_embeds.append(
                    np.vstack(sentence2sequence(row["sentence2"].lower())["rows"]))
                labels.append(row["gold_label"])
                scores.append(sentence_score_setup(row))
                print("Sample data piece: {}".format(pp.pformat(row)))
                # print(pp.pformat(sentence2sequence(row['sentence1'].lower())['rows']))
                # print(np.vstack(pp.pformat(sentence2sequence(row['sentence1'].lower())['rows'])))
                # print(pp.pformat(sentence_score_setup(row)))
                break
            premise_embeds = np.stack([fit_to_size(x, (FLAGS['max_premise_length'], FLAGS['embedding_size']))
                                      for x in premise_embeds])
            hypothesis_embeds = np.stack([fit_to_size(x, (FLAGS['max_hypothesis_length'], FLAGS['embedding_size']))
                                         for x in hypothesis_embeds])
            
            return (np.array(premise_embeds), np.array(hypothesis_embeds)), labels, scores        
    
load_glove_embeddings()
# 550152 rows processed
data_features_tuple, labels, scores = load_snli_data(FLAGS['snli_trainfilename'])

Sample word "the" with features array([ 4.1800e-01,  2.4968e-01, -4.1242e-01,  1.2170e-01,  3.4527e-01,
       -4.4457e-02, -4.9688e-01, -1.7862e-01, -6.6023e-04, -6.5660e-01,
        2.7843e-01, -1.4767e-01, -5.5677e-01,  1.4658e-01, -9.5095e-03,
        1.1658e-02,  1.0204e-01, -1.2792e-01, -8.4430e-01, -1.2181e-01,
       -1.6801e-02, -3.3279e-01, -1.5520e-01, -2.3131e-01, -1.9181e-01,
       -1.8823e+00, -7.6746e-01,  9.9051e-02, -4.2125e-01, -1.9526e-01,
        4.0071e+00, -1.8594e-01, -5.2287e-01, -3.1681e-01,  5.9213e-04,
        7.4449e-03,  1.7778e-01, -1.5897e-01,  1.2041e-02, -5.4223e-02,
       -2.9871e-01, -1.5749e-01, -3.4758e-01, -4.5637e-02, -4.4251e-01,
        1.8785e-01,  2.7849e-03, -1.8411e-01, -1.1514e-01, -7.8581e-01],
      dtype=float32)
Glove wordmap populated, found 400000 vectors

Preprocessing snli data & parsing to arrays...


0it [00:00, ?it/s]

Sample data piece: OrderedDict([ ('gold_label', 'neutral'),
              ( 'sentence1_binary_parse',
                '( ( ( A person ) ( on ( a horse ) ) ) ( ( jumps ( over ( a ( '
                'broken ( down airplane ) ) ) ) ) . ) )'),
              ( 'sentence2_binary_parse',
                '( ( A person ) ( ( is ( ( training ( his horse ) ) ( for ( a '
                'competition ) ) ) ) . ) )'),
              ( 'sentence1_parse',
                '(ROOT (S (NP (NP (DT A) (NN person)) (PP (IN on) (NP (DT a) '
                '(NN horse)))) (VP (VBZ jumps) (PP (IN over) (NP (DT a) (JJ '
                'broken) (JJ down) (NN airplane)))) (. .)))'),
              ( 'sentence2_parse',
                '(ROOT (S (NP (DT A) (NN person)) (VP (VBZ is) (VP (VBG '
                'training) (NP (PRP$ his) (NN horse)) (PP (IN for) (NP (DT a) '
                '(NN competition))))) (. .)))'),
              ( 'sentence1',
                'A person on a horse jumps over a broken down airplan




Before: (12, 50)




After: (30, 50)
Before: (10, 50)
After: (30, 50)


In [5]:
input_length = FLAGS['max_premise_length'] + FLAGS['max_hypothesis_length'] # first layer length


In [8]:
a = tf.Variable(tf.ones(shape=(128,30,50)), name="a")
b = tf.Variable(tf.zeros(shape=(128,30,50)), name="b")
c = tf.concat([a, b], 1)
d = tf.transpose(c, [1,0,2])
# print(d)
e = tf.reshape(d, [-1, 50])
# print(e)
f = tf.split(e, 60)
#print(pp.pformat(f))
# print(tf.equal(tf.Variable(True), tf.Variable(True)))
# print(np.array([[1,2,3], [1,2,3]]).shape[0])
print(data_features_tuple[0].shape)
print(np.random.randint(data_features_tuple[0].shape[0], size=128))

(550152,)
[196009 304376 231330 304232 421940 124056  51227   1164 386737 421021
  83404  17689  35619 488592 362493 237388 134576  53548  34074 312369
 443143 193800 513020  65164 531701  32125 245461 496864 352922 284100
 407511 239138 397509 224205 363856 339736 143113 152375 238305  47237
 140212 297843 349415 368655 124533 460977 408040 313762 183325 456792
 455842 198291 353704 254073 440335   9509 336486 209511 175368 148314
 386784  78051 191574 433823 500165 244361 403903  90774 371095  79094
 102464 474293 108214 519147  14697 217546 275235  74941 137053 439646
 371747 413758  85852 133124 404831  83049 517217 402674 206020 100245
 481365 309041  74696 259249 353265 309733 152715 498222 258519 261915
 392926 491256  15952 168110 354946 410538 264103 394077 101536 165094
 402456 470900 205137 127180 402022 401769 508720 216047   9677 151715
 189836   3574 151599 462690 179618 226335 234097 189862]


In [7]:
batch_size = FLAGS.batch_size
premise_len = FLAGS.max_premise_length
hypothesis_len = FLAGS.max_hypothesis_length
dimensions = FLAGS.embedding_size

input_layer = tf.keras.Input()

AttributeError: 'dict' object has no attribute 'batch_size'