In [28]:
import tensorflow as tf
import numpy as np
from functools import partial
import csv


In [4]:
def data_generator(data):
    """Lazily yield every element of ``data`` in iteration order.

    Wrapped with ``functools.partial`` elsewhere in this notebook so that
    ``tf.data.Dataset.from_generator`` receives a zero-argument callable.
    """
    yield from data
        

In [5]:
def process_data(row):
    """Convert one session row into normalized graph inputs.

    The last element of ``row`` is taken as the label; everything before it is
    the item sequence. A directed graph is built over the *unique* items of the
    sequence, with an edge for every pair of consecutive items.

    Args:
        row: 1-D integer tensor ``[item_0, ..., item_k, label]``.

    Returns:
        A tuple ``(A_in, A_out, alias_inputs, items, mask, labels)`` where
        ``A_in`` is the adjacency matrix with every column divided by its
        column sum, ``A_out`` is the transposed adjacency matrix with every
        column divided by the corresponding row sum of the adjacency matrix,
        ``alias_inputs`` maps each sequence position to its index in ``items``,
        ``items`` holds the unique items in order of first appearance,
        ``mask`` is a tensor of ones shaped like the item sequence, and
        ``labels`` is the last element of ``row``.
    """
    features = row[:-1]
    labels = row[-1]
    items, alias_inputs = tf.unique(features)

    vector_length = tf.shape(features)[0]
    n_nodes = tf.shape(items)[0]
    # Row 0 holds alias_inputs[:-1], row 1 holds alias_inputs[1:]; column j is
    # therefore the (source, target) node pair of the j-th transition.
    indices = tf.gather(alias_inputs, tf.stack([tf.range(vector_length - 1), tf.range(vector_length - 1) + 1],
                                               axis=0))  # Stack and stagger values
    # Encode each (source, target) pair as a single integer so duplicate edges
    # can be dropped with a 1-D unique. Node ids are < vector_length, so the
    # base (vector_length + 1) makes the encoding collision-free.
    unique_indices, _ = tf.unique(indices[0] * (vector_length + 1) + indices[1])  # unique(a*x + b)
    # Ascending codes decode to row-major ordered index pairs.
    unique_indices = tf.sort(unique_indices)  # Sort ascending
    unique_indices = tf.stack(
        [tf.floor_div(unique_indices, (vector_length + 1)), tf.floormod(unique_indices, (vector_length + 1))],
        axis=1)  # Ungroup and stack
    unique_indices = tf.cast(unique_indices, tf.int64)

    values = tf.ones(tf.shape(unique_indices, out_type=tf.int64)[0], dtype=tf.int64)
    dense_shape = tf.cast([n_nodes, n_nodes], tf.int64)

    adj = tf.SparseTensor(indices=unique_indices, values=values, dense_shape=dense_shape)
    adj = tf.sparse.to_dense(adj)

    # Degrees of 0 are raised to 1 so the divisions below never divide by zero.
    # tf.maximum is used instead of clip_by_value(x, 1, reduce_max(x)): for a
    # session without transitions the maximum degree is 0, which would make the
    # lower clip bound larger than the upper one.
    one = tf.constant(1, dtype=adj.dtype)

    u_sum_in_tf = tf.math.reduce_sum(adj, 0)
    u_sum_in_tf = tf.maximum(u_sum_in_tf, one)
    # Integer operands: tf.math.divide performs true division, so the
    # normalized matrices are floating point.
    A_in = tf.math.divide(adj, u_sum_in_tf)

    u_sum_out_tf = tf.math.reduce_sum(adj, 1)
    u_sum_out_tf = tf.maximum(u_sum_out_tf, one)
    A_out = tf.math.divide(tf.transpose(adj), u_sum_out_tf)

    mask = tf.fill(tf.shape(features), 1)

    return A_in, A_out, alias_inputs, items, mask, labels


In [6]:
def train_input_fn(batch_size, data_path="datasets/thg/processed/train.csv", shuffle_buffer_size=100000):
    """Build the shuffled, padded and batched training dataset.

    Args:
        batch_size: number of sessions per batch.
        data_path: CSV file with one session per line, all fields integers.
            Defaults to the training split used throughout this notebook.
        shuffle_buffer_size: buffer size passed to ``Dataset.shuffle``.

    Returns:
        A ``tf.data.Dataset`` whose elements are batches of
        ``(A_in, A_out, alias_inputs, items, mask, labels)`` as produced by
        ``process_data``, padded to the largest session in the file.

    Raises:
        ValueError: if the file contains no sessions.
    """
    with open(data_path, "r") as data_file:
        # csv.reader yields an empty list for blank lines (e.g. a trailing
        # newline); such rows have no label and would break process_data.
        data = [list(map(int, rec)) for rec in csv.reader(data_file, delimiter=',') if rec]
    if not data:
        raise ValueError("no sessions found in {!r}".format(data_path))

    # Both maxima are taken over whole rows (label included), so they are upper
    # bounds for the sequence length / node count that process_data produces.
    max_seq = max(len(rec) for rec in data)
    max_n_node = max(len(np.unique(rec)) for rec in data)

    dataset = tf.data.Dataset.from_generator(partial(data_generator, data), output_types=tf.int32)
    dataset = dataset.map(process_data)
    dataset = dataset.shuffle(shuffle_buffer_size)

    # One padded shape per element returned by process_data, in order:
    # A_in, A_out, alias_inputs, items, mask, labels (scalar).
    dataset = dataset.padded_batch(batch_size=batch_size, padded_shapes=(
        [max_n_node, max_n_node],
        [max_n_node, max_n_node],
        [max_seq],
        [max_seq],
        [max_seq],
        []))

    dataset = dataset.prefetch(batch_size)
    return dataset


In [7]:
def eval_input_fn(batch_size, data_path="datasets/thg/processed/test.csv"):
    """Build the padded and batched evaluation dataset (no shuffling).

    Args:
        batch_size: number of sessions per batch.
        data_path: CSV file with one session per line, all fields integers.
            Defaults to the test split used throughout this notebook.

    Returns:
        A ``tf.data.Dataset`` whose elements are batches of
        ``(A_in, A_out, alias_inputs, items, mask, labels)`` as produced by
        ``process_data``, padded to the largest session in the file.

    Raises:
        ValueError: if the file contains no sessions.
    """
    with open(data_path, "r") as data_file:
        # csv.reader yields an empty list for blank lines (e.g. a trailing
        # newline); such rows have no label and would break process_data.
        data = [list(map(int, rec)) for rec in csv.reader(data_file, delimiter=',') if rec]
    if not data:
        raise ValueError("no sessions found in {!r}".format(data_path))

    # Both maxima are taken over whole rows (label included), so they are upper
    # bounds for the sequence length / node count that process_data produces.
    max_seq = max(len(rec) for rec in data)
    max_n_node = max(len(np.unique(rec)) for rec in data)

    dataset = tf.data.Dataset.from_generator(partial(data_generator, data), output_types=tf.int32)
    dataset = dataset.map(process_data)

    # One padded shape per element returned by process_data, in order:
    # A_in, A_out, alias_inputs, items, mask, labels (scalar).
    dataset = dataset.padded_batch(batch_size=batch_size, padded_shapes=(
        [max_n_node, max_n_node],
        [max_n_node, max_n_node],
        [max_seq],
        [max_seq],
        [max_seq],
        []))

    dataset = dataset.prefetch(batch_size)
    return dataset


In [8]:
# Training pipeline with a single session per batch, used below to inspect
# what one element of the dataset looks like.
dataset = train_input_fn(batch_size=1)


W0309 23:09:05.743708 4412249536 deprecation.py:323] From /Users/vladjiman/miniconda3/envs/SR-GNN/lib/python3.6/site-packages/tensorflow/python/data/ops/dataset_ops.py:410: py_func (from tensorflow.python.ops.script_ops) is deprecated and will be removed in a future version.
Instructions for updating:
tf.py_func is deprecated in TF V2. Instead, there are two
    options available in V2.
    - tf.py_function takes a python function which manipulates tf eager
    tensors instead of numpy arrays. It's easy to convert a tf eager tensor to
    an ndarray (just call tensor.numpy()) but having access to eager tensors
    means `tf.py_function`s can use accelerators such as GPUs as well as
    being differentiable using a gradient tape.
    - tf.numpy_function maintains the semantics of the deprecated tf.py_func
    (it is not differentiable, and manipulates numpy arrays). It drops the
    stateful argument making all functions stateful.
    


In [9]:
# Pull one batch out of the pipeline and unpack it into the six tensors that
# process_data returns; the names stay bound after the loop for inspection.
for i in dataset.take(1):
    A_in, A_out, alias_inputs, items, mask, labels = i
# Cell result: the shape of the batched, padded incoming adjacency matrix,
# i.e. (batch_size, max_n_node, max_n_node).
tf.shape(A_in)


<tf.Tensor: id=186, shape=(3,), dtype=int32, numpy=array([ 1, 89, 89], dtype=int32)>

In [22]:
# Load the raw training sessions into memory as lists of ints, one list per
# CSV line.
with open("datasets/thg/processed/train.csv", "r") as data_file:
    data = [[int(field) for field in rec] for rec in csv.reader(data_file, delimiter=',')]

In [28]:
%%time
# Largest integer value appearing anywhere in the loaded sessions.
np.max([np.max(session) for session in data])



CPU times: user 4.44 s, sys: 85.6 ms, total: 4.53 s
Wall time: 4.5 s


1183