In [1]:
import numpy as np
import random
import requests
import string
import tarfile
import tensorflow as tf

**Load Numpy Arrays**

Build a data pipeline over numpy arrays.

In [5]:
# Create a toy dataset
evens = np.arange(0, 100, step=2, dtype=np.int32)
evens_label = np.zeros(50, dtype=np.int32)
odds = np.arange(1, 100, step=2, dtype=np.int32)
odds_label = np.ones(50, dtype=np.int32)
# Concatenate arrays
features = np.concatenate([evens, odds])
labels = np.concatenate([evens_label, odds_label])

# Load a numpy array using tf data api with `from_tensor_slices`.
data = tf.data.Dataset.from_tensor_slices((features, labels))
# Refill data indefinitely.
data = data.repeat()
# Shuffle data
data = data.shuffle(buffer_size=100)
# Batch data
data = data.batch(batch_size=4)
# Prefetch batch (pre-load batch for faster consumption).
data = data.prefetch(buffer_size=1)

In [7]:
for batch_x, batch_y in data.take(5):
    print(batch_x, batch_y)

tf.Tensor([75 68 35 67], shape=(4,), dtype=int32) tf.Tensor([1 0 1 1], shape=(4,), dtype=int32)
tf.Tensor([11 51 78 77], shape=(4,), dtype=int32) tf.Tensor([1 1 0 1], shape=(4,), dtype=int32)
tf.Tensor([18 80 81 97], shape=(4,), dtype=int32) tf.Tensor([0 0 1 1], shape=(4,), dtype=int32)
tf.Tensor([15 43 38 22], shape=(4,), dtype=int32) tf.Tensor([1 1 0 0], shape=(4,), dtype=int32)
tf.Tensor([ 1  4 34 61], shape=(4,), dtype=int32) tf.Tensor([1 0 0 1], shape=(4,), dtype=int32)


In [8]:
# Note: If you are planning on calling multiple time,
# you can user the iterator way:
ite_data = iter(data)
for i in range(5):
    batch_x, batch_y = next(ite_data)
    print(batch_x, batch_y)

for i in range(5):
    batch_x, batch_y = next(ite_data)
    print(batch_x, batch_y)


tf.Tensor([33 46  5 93], shape=(4,), dtype=int32) tf.Tensor([1 0 1 1], shape=(4,), dtype=int32)
tf.Tensor([24  6 57 74], shape=(4,), dtype=int32) tf.Tensor([0 0 1 0], shape=(4,), dtype=int32)
tf.Tensor([54 14 53 25], shape=(4,), dtype=int32) tf.Tensor([0 0 1 1], shape=(4,), dtype=int32)
tf.Tensor([12 84 37 71], shape=(4,), dtype=int32) tf.Tensor([0 0 1 1], shape=(4,), dtype=int32)
tf.Tensor([64 94 65 29], shape=(4,), dtype=int32) tf.Tensor([0 0 1 1], shape=(4,), dtype=int32)
tf.Tensor([14 66 15  8], shape=(4,), dtype=int32) tf.Tensor([0 0 1 0], shape=(4,), dtype=int32)
tf.Tensor([79 43 76  9], shape=(4,), dtype=int32) tf.Tensor([1 1 0 1], shape=(4,), dtype=int32)
tf.Tensor([52 92 20 22], shape=(4,), dtype=int32) tf.Tensor([0 0 0 0], shape=(4,), dtype=int32)
tf.Tensor([19 26 72 48], shape=(4,), dtype=int32) tf.Tensor([1 0 0 0], shape=(4,), dtype=int32)
tf.Tensor([96 31 91 73], shape=(4,), dtype=int32) tf.Tensor([0 1 1 1], shape=(4,), dtype=int32)


**Load CSV files**

In [9]:
# download Titanic dataset (in csv format).
d = requests.get("https://raw.githubusercontent.com/tflearn/tflearn.github.io/master/resources/titanic_dataset.csv")
with open("titanic_dataset.csv", "wb") as f:
    f.write(d.content)

In [11]:
# Load Titanic dataset.
# Original features: survived,pclass,name,sex,age,sibsp,parch,ticket,fare
# Select specific columns: survived,pclass,name,sex,age,fare
column_to_use = [0, 1, 2, 3, 4, 8]
record_defaults = [tf.int32, tf.int32, tf.string, tf.string, tf.float32, tf.float32]

# Load the whole dataset file, and slice each line.
data = tf.data.experimental.CsvDataset("titanic_dataset.csv", record_defaults, header=True, select_cols=column_to_use)
# Refill data indefinitely.
data = data.repeat()
# Shuffle data
data = data.shuffle(buffer_size=1000)
# Batch data (aggregate records together).
data = data.batch(batch_size=2)
# Prefetcb batch (pre-load batch for faster consumption).
data = data.prefetch(buffer_size=1)

In [12]:
for survived, pclass, name, sex, age, fare in data.take(1):
    print(survived.numpy())
    print(pclass.numpy())
    print(name.numpy())
    print(sex.numpy())
    print(age.numpy())
    print(fare.numpy())

[0 0]
[3 3]
[b'Lennon, Mr. Denis' b'Elias, Mr. Dibo']
[b'male' b'male']
[0. 0.]
[15.5    7.225]


**Load Images**

In [13]:
# Download Oxford 17 flowers dataset
d = requests.get("http://www.robots.ox.ac.uk/~vgg/data/flowers/17/17flowers.tgz")
with open("17flowers.tgz", "wb") as f:
    f.write(d.content)
# Extract archive.
with tarfile.open("17flowers.tgz") as t:
    t.extractall()

In [14]:
with open('jpg/dataset.csv', 'w') as f:
    c = 0
    for i in range(1360):
        f.write("jpg/image_%04i.jpg,%i\n" % (i+1, c))
        if (i+1) % 80 == 0:
            c += 1

In [15]:
# Load Images
with open("jpg/dataset.csv") as f:
    dataset_file = f.read().splitlines()

# Load the whole dataset file, and slice each line.
data = tf.data.Dataset.from_tensor_slices(dataset_file)
# Refill data Indefinitely.
data = data.repeat()
# Shuffle data.
data = data.shuffle(buffer_size=1000)

# Load and pre-process images.
def load_image(path):
    # Read image from path.
    image = tf.io.read_file(path)
    # Decode the jpeg image to array [0, 255].
    image = tf.image.decode_jpeg(image)
    # Resize images to a common size of 256x256
    image = tf.image.resize(image, [256, 256])
    # Rescale values to [-1, 1].
    image = 1. - image / 127.5
    return image

# Decode each line from the dataset file.
def parse_records(line):
    # File is in csv format: "image_path, label_id".
    # TensorFlow requires a default value, but it will never be used.
    image_path, image_label = tf.io.decode_csv(line, ["", 0])
    # Apply the function to load images.
    image = load_image(image_path)
    return image, image_label

# Use 'map' to apply the above functions in parallel.
data = data.map(parse_records, num_parallel_calls=4)

# Batch data (aggregate images-array together).
data = data.batch(batch_size=2)
# Prefetch batch (pre-load batch for faster consumption).
data = data.prefetch(buffer_size=1)

In [16]:
for batch_x, batch_y in data.take(1):
    print(batch_x, batch_y)

tf.Tensor(
[[[[ 0.76464176  0.83523     0.8117006 ]
   [ 0.7647059   0.8352941   0.8117647 ]
   [ 0.77254903  0.84313726  0.81960785]
   ...
   [ 0.41360295  0.31464463  0.6754289 ]
   [ 0.4240809   0.2896446   0.6627451 ]
   [ 0.42745095  0.2862745   0.6627451 ]]

  [[ 0.7721814   0.8427696   0.8192402 ]
   [ 0.77254903  0.84313726  0.81960785]
   [ 0.78016526  0.8507535   0.8272241 ]
   ...
   [ 0.43390298  0.31742013  0.68304515]
   [ 0.4240809   0.2896446   0.6627451 ]
   [ 0.42745095  0.2862745   0.6627451 ]]

  [[ 0.76815164  0.85442615  0.8230536 ]
   [ 0.77254903  0.85882354  0.827451  ]
   [ 0.78039217  0.8666667   0.8352941 ]
   ...
   [ 0.45159316  0.31788164  0.690737  ]
   [ 0.43192405  0.28180146  0.6627451 ]
   [ 0.4352941   0.27843136  0.6627451 ]]

  ...

  [[ 0.62352943  0.6313726   0.67058825]
   [ 0.6384622   0.6388662   0.6880151 ]
   [ 0.69644606  0.6639093   0.7435049 ]
   ...
   [ 0.5265213   0.50191003  0.69460785]
   [ 0.38658088  0.35520834  0.56697303]
   [ 

**Load data from a Generator**

In [17]:
# Create a dummy generator.
def generate_features():
    # Function to generate a random string.
    def random_string(length):
        return ''.join(random.choice(string.ascii_letters) for m in range(length))
    # Return a random string, a random vector, and a random int.
    yield random_string(4), np.random.uniform(size=4), random.randint(0, 10)

In [19]:
# Load a numpy array using tf data api with `from_tensor_slices`.
data = tf.data.Dataset.from_generator(generate_features, output_types=(tf.string, tf.float32, tf.int32))
# Refill data indefinitely.
data = data.repeat()
# Shuffle data.
data = data.shuffle(buffer_size=100)
# Batch data.
data = data.batch(batch_size=4)
# Prefetch data.
data = data.prefetch(buffer_size=1)

In [21]:
# Display data.
for batch_str, batch_vector, batch_int in data.take(5):
    print(batch_str, batch_vector, batch_int)

tf.Tensor([b'kIlI' b'RAzR' b'XNhx' b'Juny'], shape=(4,), dtype=string) tf.Tensor(
[[0.836966   0.29700842 0.24015798 0.86873   ]
 [0.12320157 0.93517476 0.51339155 0.59283787]
 [0.26115897 0.8187252  0.90030366 0.23011203]
 [0.45327082 0.9455417  0.16996107 0.5389088 ]], shape=(4, 4), dtype=float32) tf.Tensor([ 5  1  6 10], shape=(4,), dtype=int32)
tf.Tensor([b'GNHj' b'gsSM' b'gIJp' b'NJeB'], shape=(4,), dtype=string) tf.Tensor(
[[0.1047591  0.99539644 0.29704782 0.606994  ]
 [0.13766076 0.07080046 0.32486567 0.41179022]
 [0.68911415 0.9707358  0.41976437 0.351048  ]
 [0.45072424 0.35818252 0.60431564 0.5289074 ]], shape=(4, 4), dtype=float32) tf.Tensor([7 3 2 2], shape=(4,), dtype=int32)
tf.Tensor([b'IMgP' b'iJjU' b'NlyM' b'RjLR'], shape=(4,), dtype=string) tf.Tensor(
[[0.23942423 0.0825363  0.6748568  0.16108687]
 [0.4157596  0.8442947  0.37160802 0.9812635 ]
 [0.49186072 0.5348949  0.33804983 0.38073397]
 [0.02785444 0.7672291  0.78736955 0.74145055]], shape=(4, 4), dtype=float32) t