In [1]:
import numpy as np

In [3]:
# Load the pre-tokenized inputs (token ids, attention masks) and the labels
# from their .npy files.
# allow_pickle=False: the arrays hold plain integers, so pickle support is not
# needed, and leaving it enabled would let a tampered file run arbitrary code
# during loading.
Xids = np.load('disaster-xids.npy', allow_pickle=False)
Xmask = np.load('disaster-xmask.npy', allow_pickle=False)
labels = np.load('disaster-labels.npy', allow_pickle=False)

# The three arrays are sliced together row by row later on, so they must
# contain the same number of examples.
assert len(Xids) == len(Xmask) == len(labels), "input arrays have mismatched lengths"

In [5]:
# Check the dimensions of the token-id array
# (presumably (num_examples, sequence_length) -- confirm against the tokenizer step).
Xids.shape

(7613, 512)

In [14]:
# Peek at the label array; NumPy elides the middle of long arrays in its repr.
labels

array([1, 1, 1, ..., 1, 1, 1])

In [10]:
import tensorflow as tf

In [11]:
# Slice the three arrays along their first axis so that each dataset element
# is one (ids, mask, label) triple.
dataset = tf.data.Dataset.from_tensor_slices(
    tensors=(Xids, Xmask, labels)
)

In [12]:
# Display the repr of a one-element sub-dataset; this shows the element_spec
# (shapes and dtypes) without materializing the element itself.
dataset.take(1)

<_TakeDataset element_spec=(TensorSpec(shape=(512,), dtype=tf.int64, name=None), TensorSpec(shape=(512,), dtype=tf.int64, name=None), TensorSpec(shape=(), dtype=tf.int64, name=None))>

In [16]:
# map the previous format to new format
def map_func(input_ids, masks, labels):
    """Repack one (ids, mask, label) triple as a (features, label) pair.

    Args:
        input_ids: Token ids for one element.
        masks: Attention mask for one element.
        labels: Label for one element.

    Returns:
        A 2-tuple whose first item is a dict with the keys ``'input_ids'``
        and ``'attention_mask'`` and whose second item is ``labels``,
        passed through unchanged.
    """
    features = dict(input_ids=input_ids, attention_mask=masks)
    return features, labels

In [17]:
# Convert every element from an (ids, mask, label) triple to the
# ({'input_ids', 'attention_mask'}, label) layout produced by map_func.
dataset = dataset.map(map_func=map_func)

In [18]:
# Display the element_spec after mapping: a (features dict, label) pair.
# Only the dataset repr is shown; no element is materialized.
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(512,), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(512,), dtype=tf.int64, name=None)}, TensorSpec(shape=(), dtype=tf.int64, name=None))>

# Shuffle and Batch

In [19]:
# Number of examples per batch; also used when computing the train/validation split size.
batch_size = 16

In [20]:
# Shuffle once with a fixed seed, then group into fixed-size batches
# (drop_remainder=True discards the final partial batch).
#
# reshuffle_each_iteration=False is essential here: the train/validation split
# is built with take()/skip() on this dataset, and those two datasets are
# iterated separately. With the default behaviour (a new shuffle order on every
# iteration) the two passes would see different orders, so validation batches
# could contain examples that were also used for training.
#
# The 10000-element buffer is larger than the 7613 examples, so the whole
# dataset is shuffled uniformly. The seed makes the split reproducible across
# kernel restarts.
dataset = dataset.shuffle(
    10000, seed=42, reshuffle_each_iteration=False
).batch(batch_size, drop_remainder=True)

In [22]:
# Display the element_spec after batching: every tensor gains a leading batch dimension.
# Only the dataset repr is shown; no batch is materialized.
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(16,), dtype=tf.int64, name=None))>

In [23]:
# Fraction of the batches assigned to training (90% training / 10% validation).
split = 0.9

In [28]:
# Number of batches that go to the training split: the (fractional) batch
# count times the training share, truncated to a whole number of batches.
batches_total = Xids.shape[0] / batch_size
size = int(batches_total * split)

In [29]:
# The first `size` batches form the training split; everything after them is
# the validation split.
# NOTE: take() and skip() each iterate `dataset` independently, so the two
# splits are disjoint only if `dataset` yields the same order on every pass.
train_ds, val_ds = dataset.take(size), dataset.skip(size)

In [32]:
# Save the training and validation splits to disk.
# Dataset.save replaces the deprecated tf.data.experimental.save.
train_ds.save('train')
val_ds.save('val')

In [33]:
# Inspect the element spec of the training split; the same spec is passed
# when reloading the saved dataset.
train_ds.element_spec

({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int64, name=None),
  'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int64, name=None)},
 TensorSpec(shape=(16,), dtype=tf.int64, name=None))

In [34]:
# Confirm that the validation split has the same element spec as the training split.
val_ds.element_spec

({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int64, name=None),
  'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int64, name=None)},
 TensorSpec(shape=(16,), dtype=tf.int64, name=None))

In [35]:
# Reload the saved training split to verify that it round-trips.
# tf.data.Dataset.load replaces the deprecated tf.data.experimental.load.
ds = tf.data.Dataset.load('train', element_spec=train_ds.element_spec)

Instructions for updating:
Use `tf.data.Dataset.load(...)` instead.


In [36]:
# Display the element_spec of the reloaded dataset to compare it with the spec
# of the split that was saved. Only the dataset repr is shown.
ds.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 512), dtype=tf.int64, name=None), 'attention_mask': TensorSpec(shape=(16, 512), dtype=tf.int64, name=None)}, TensorSpec(shape=(16,), dtype=tf.int64, name=None))>