In [116]:
import os

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [24]:
# Build a dataset with one element per entry of `x` (the scalars 0..4).
x = tf.range(5)
dataset = tf.data.Dataset.from_tensor_slices(x)
# Bare last expression so the notebook displays the dataset's element spec.
dataset

<TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [25]:
# Iterating a dataset yields its elements one tensor at a time.
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)


batch

In [26]:
# Group consecutive elements in pairs; the final batch holds the remainder.
dataset = dataset.batch(batch_size=2)
for item in dataset:
    print(item)

tf.Tensor([0 1], shape=(2,), dtype=int32)
tf.Tensor([2 3], shape=(2,), dtype=int32)
tf.Tensor([4], shape=(1,), dtype=int32)


map function

In [27]:
# Double every value; the function receives one dataset element per call.
dataset = dataset.map(lambda element: element * 2)
for item in dataset:
    print(item)

tf.Tensor([0 2], shape=(2,), dtype=int32)
tf.Tensor([4 6], shape=(2,), dtype=int32)
tf.Tensor([8], shape=(1,), dtype=int32)


unbatch

In [28]:
# Flatten the batches back into individual elements.
dataset = dataset.unbatch()
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)


In [34]:
# Keep only the elements strictly greater than 3.
dataset = dataset.filter(lambda value: value > 3)
# Look at just the first two elements that survive the filter.
for item in dataset.take(2):
    print(item)

tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)


In [105]:
# Shuffle 0..19 through a small 4-element buffer (seeded so the order is
# repeatable), then group the shuffled stream into batches of six.
dataset = (
    tf.data.Dataset.range(20)
    .shuffle(buffer_size=4, seed=42)
    .batch(6)
)
for item in dataset:
    print(item)

tf.Tensor([1 4 2 3 5 0], shape=(6,), dtype=int64)
tf.Tensor([ 6  9  8 12 10 13], shape=(6,), dtype=int64)
tf.Tensor([11 14 15  7 19 16], shape=(6,), dtype=int64)
tf.Tensor([17 18], shape=(2,), dtype=int64)


## Splitting the Data into CSV Files

In [107]:
# Load the California housing data and hold out a test set, then carve a
# validation set out of the remaining rows. The target is reshaped to a
# column vector (one row per sample) before splitting.
housing = fetch_california_housing()
X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data, housing.target.reshape(-1, 1), random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full, y_train_full, random_state=42)

# Fit the scaler on the training split only and keep the per-feature
# mean and scale it learned.
scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

> For a very large dataset that does not fit in memory, split the data into several CSV files so that it can be read back piece by piece.

In [156]:
def save_to_csv(data, name_prefix, header=None, n_parts=5):
    """Split ``data`` row-wise into ``n_parts`` CSV files and return their paths.

    Files are written to ``dataset/housing`` (created if missing) and named
    ``_<name_prefix>_<NN>.csv``, where ``NN`` is the zero-padded part index.
    Row indices are distributed with ``np.array_split``, so the parts are
    contiguous and as equal in size as possible.

    Args:
        data: Indexable collection of rows (e.g. a 2-D NumPy array); each row
            must be iterable over its column values.
        name_prefix: Label embedded in every file name, e.g. ``"train"``.
        header: Optional header line (without trailing newline) written at the
            top of every part. Skipped when ``None`` or empty.
        n_parts: Number of files to spread the rows across.

    Returns:
        List of the written file paths, in part order.
    """
    data_dir = os.path.join('dataset', 'housing')
    os.makedirs(data_dir, exist_ok=True)
    path_format = os.path.join(data_dir, '_{}_{:02d}.csv')

    def format_cell(value):
        # NumPy >= 2.0 reprs scalars as e.g. "np.float64(1.5)", which would
        # corrupt the CSV. str() yields the plain numeric text instead.
        if isinstance(value, np.generic):
            return str(value)
        return repr(value)

    filepaths = []
    row_index_chunks = np.array_split(np.arange(len(data)), n_parts)
    for file_id, row_indices in enumerate(row_index_chunks):
        part_csv = path_format.format(name_prefix, file_id)
        filepaths.append(part_csv)

        with open(part_csv, 'wt', encoding='utf-8') as file:
            if header:
                file.write(header)
                file.write('\n')
            for row_id in row_indices:
                file.write(','.join(format_cell(column) for column in data[row_id]))
                file.write('\n')
    return filepaths

In [157]:
# Put the target next to the features so every CSV row carries both.
train_data = np.concatenate((X_train, y_train), axis=1)
valid_data = np.concatenate((X_valid, y_valid), axis=1)
test_data = np.concatenate((X_test, y_test), axis=1)

# Column names: the feature names followed by the target column.
header_cols = housing.feature_names + ['MedianHouseValue']
header = ','.join(header_cols)

# Write each split across several part files and keep the resulting paths.
train_filepaths = save_to_csv(train_data, "train", header, n_parts=5)
valid_filepaths = save_to_csv(valid_data, "valid", header, n_parts=3)
test_filepaths = save_to_csv(test_data, "test", header, n_parts=3)

Okay, now let's take a peek at the first few lines of one of these CSV files:

In [160]:
# Show the first rows of the first training part to check the header and
# column layout of the written files.
pd.read_csv(train_filepaths[0]).head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,3.5214,15.0,3.049945,1.106548,1447.0,1.605993,37.63,-122.43,1.442
1,5.3275,5.0,6.49006,0.991054,3464.0,3.44334,33.69,-117.39,1.687
2,3.1,29.0,7.542373,1.591525,1328.0,2.250847,38.44,-122.98,1.621
3,7.1736,12.0,6.289003,0.997442,1054.0,2.695652,33.55,-117.7,2.621
4,2.0549,13.0,5.312457,1.085092,3297.0,2.244384,33.93,-116.93,0.956


## Building an Input Pipeline

In [162]:
# Dataset of the training file paths; list_files shuffles them by default,
# and the seed keeps that order repeatable.
filepath_dataset = tf.data.Dataset.list_files(
    file_pattern=train_filepaths,
    seed=42,
)
for filepath in filepath_dataset:
    print(filepath)

tf.Tensor(b'dataset\\housing\\_train_00.csv', shape=(), dtype=string)
tf.Tensor(b'dataset\\housing\\_train_04.csv', shape=(), dtype=string)
tf.Tensor(b'dataset\\housing\\_train_01.csv', shape=(), dtype=string)
tf.Tensor(b'dataset\\housing\\_train_03.csv', shape=(), dtype=string)
tf.Tensor(b'dataset\\housing\\_train_02.csv', shape=(), dtype=string)


In [200]:
# Number of files that are read from at the same time.
n_reader = 2
# Turn each file path into a dataset of its text lines (minus the header
# row) and interleave the lines of `n_reader` files at a time.
dataset = filepath_dataset.interleave(
    lambda csv_path: tf.data.TextLineDataset(csv_path).skip(1),
    cycle_length=n_reader,
)

In [201]:
# Print the first ten interleaved CSV lines as raw byte strings.
for line in dataset.take(10):
    print(line.numpy())

b'4.163,49.0,4.71830985915493,0.9894366197183099,660.0,2.323943661971831,37.54,-122.31,3.938'
b'2.0134,23.0,4.046153846153846,1.1692307692307693,245.0,1.8846153846153846,38.3,-122.28,1.425'
b'4.5293,19.0,6.920661157024793,1.1305785123966943,1827.0,3.0198347107438015,38.34,-122.31,2.104'
b'2.3319,15.0,5.488745980707395,1.0707395498392283,2097.0,3.3713826366559485,32.68,-115.48,0.912'
b'4.2708,45.0,5.121387283236994,0.953757225433526,492.0,2.8439306358381504,37.48,-122.19,2.67'
b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-118.31,3.215'
b'4.7426,19.0,5.871428571428571,1.022857142857143,1064.0,3.04,37.93,-121.66,2.631'
b'4.4474,25.0,6.342776203966006,1.0226628895184136,928.0,2.6288951841359776,38.03,-122.26,2.037'
b'2.3912,34.0,5.652960526315789,1.144736842105263,1783.0,2.932565789473684,35.16,-119.45,0.529'
b'2.6591,24.0,5.244299674267101,1.0,748.0,2.436482084690554,40.57,-122.34,0.828'
