In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import os

In [2]:
# Load the California housing data as a feature matrix X and target vector y.
dataset = fetch_california_housing()
X, y = dataset.data, dataset.target

# Hold out a test set, then split the remainder into train and validation.
# A fixed random_state keeps the splits (and everything derived from them:
# the scaler statistics below and any CSV files written from these arrays)
# identical on every Restart & Run All.
X_train_full, X_test, y_train_full, y_test = train_test_split(
    X, y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(
    X_train_full, y_train_full, random_state=42)

# Per-feature mean and standard deviation, computed on the training split
# only; `preprocess` uses them to standardise the features it parses.
scaler = StandardScaler()
scaler.fit(X_train)
X_mean, X_std = scaler.mean_, scaler.scale_

In [3]:
def split_csv_files(data, header, name_prefix, n_parts=10):
    """Write the rows of `data` into `n_parts` CSV files and return their paths.

    The files are created in ``datasets/housing/test`` relative to the current
    working directory (the directory is created if it does not exist) and are
    named ``my_<name_prefix>_<NN>.csv``, where ``NN`` is the two-digit part
    index. Row indices are partitioned in order with ``np.array_split``.

    Args:
        data: Indexable sequence of rows (e.g. a 2-D NumPy array); each row is
            an iterable of field values.
        header: Header line written first in every file, or ``None`` to write
            no header line.
        name_prefix: Tag inserted into each file name.
        n_parts: Number of files to write.

    Returns:
        The list of written file paths, in part order.
    """
    def format_field(value):
        # On NumPy >= 2 the repr() of a NumPy scalar looks like
        # "np.float64(1.5)", which is not a parseable CSV field. str() yields
        # the bare value text on every NumPy version. Non-NumPy values keep
        # the original repr() formatting.
        if isinstance(value, np.generic):
            return str(value)
        return repr(value)

    dir_path = os.path.join("datasets", "housing", "test")
    os.makedirs(dir_path, exist_ok=True)
    path_format = os.path.join(dir_path, "my_{}_{:02d}.csv")

    file_paths = []
    row_indices = np.arange(len(data))
    for index, part_indices in enumerate(np.array_split(row_indices, n_parts)):
        part_csv = path_format.format(name_prefix, index)
        file_paths.append(part_csv)

        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for part_index in part_indices:
                f.write(",".join(format_field(col) for col in data[part_index]))
                f.write("\n")
    return file_paths

In [4]:
# Append the target as the last column of each split, so every CSV row holds
# the features followed by the label. This matches the header built below and
# the field layout that `preprocess` decodes.
train_data = np.c_[X_train, y_train]
test_data = np.c_[X_test, y_test]
val_data = np.c_[X_val, y_val]

header_cols = dataset.feature_names + ["MedianHouseValues"]
header_cols = ','.join(header_cols)

# Write the combined (features + target) arrays. Writing X_* alone would drop
# the label column that the header declares.
train_filepath = split_csv_files(train_data, header_cols, "train", 20)
test_filepath = split_csv_files(test_data, header_cols, "test")
val_filepath = split_csv_files(val_data, header_cols, "valid")

In [5]:
def preprocess(line, n_inputs=8):
    """Decode one CSV line into a standardised feature vector and its target.

    The line is parsed as `n_inputs` float feature fields (each defaulting to
    0.0 when empty) followed by one float32 target field that has no default.
    The features are standardised with the module-level `X_mean` and `X_std`.

    Args:
        line: String tensor holding a single CSV record.
        n_inputs: Number of feature fields that precede the target field.

    Returns:
        A `(features, target)` pair: the standardised features stacked into
        one tensor, and the target stacked into a one-element tensor.
    """
    feature_defaults = [0.] * n_inputs
    # An empty constant as a record default marks the target field as required.
    target_default = tf.constant([], dtype=tf.float32)
    columns = tf.io.decode_csv(
        line, record_defaults=feature_defaults + [target_default])

    features = tf.stack(columns[:-1])
    target = tf.stack(columns[-1:])
    standardised = (features - X_mean) / X_std
    return standardised, target

In [6]:
def csv_reader_dataset(filepath, n_cycles, n_threads, n_batch=32):
    """Build a shuffled, batched `tf.data` pipeline over CSV files.

    File names matching `filepath` are listed, their lines are interleaved
    (the first line of every file is skipped as a header), shuffled with a
    10000-element buffer, parsed with `preprocess`, batched and prefetched.

    Args:
        filepath: File path(s) or glob pattern(s) passed to
            `tf.data.Dataset.list_files`.
        n_cycles: `cycle_length` for the interleave, i.e. how many files are
            read from concurrently.
        n_threads: `num_parallel_calls` used for both the interleave and the
            `preprocess` map.
        n_batch: Batch size.

    Returns:
        A `tf.data.Dataset` yielding batches of whatever `preprocess` returns,
        with one batch prefetched.
    """
    dataset = tf.data.Dataset.list_files(filepath).repeat(1)
    # interleave must be called on the file-list dataset instance. Calling it
    # on the Dataset class binds the lambda as `self` and drops the file list.
    dataset = dataset.interleave(
        lambda path: tf.data.TextLineDataset(path).skip(1),
        cycle_length=n_cycles,
        num_parallel_calls=n_threads)
    dataset = dataset.shuffle(10000)
    dataset = dataset.map(preprocess, num_parallel_calls=n_threads)
    dataset = dataset.batch(n_batch)

    return dataset.prefetch(1)


In [44]:
keras.backend.clear_session()
# Build the paths with os.path.join instead of backslash literals: "\h" and
# "\m" are invalid escape sequences, and hard-coded backslashes only work on
# Windows. On Windows the joined paths are identical to the old literals.
# NOTE(review): the ".scv" extension is kept exactly as it was; confirm that
# the toy files on disk really use it and that it is not a typo for ".csv".
file_paths = [
    os.path.join("datasets", "housing", "my_test_{:02d}.scv".format(part))
    for part in range(4)
]
dataset = tf.data.Dataset.list_files(file_paths).repeat(1)
# Read lines from two files at a time, skipping the first line of each file.
dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=2)
dataset = dataset.shuffle(20)
dataset = dataset.batch(2)
# Each element is a batch of up to two raw CSV lines (string tensor).
for batch in dataset:
    print(batch)
# for epoch in range(5):
#     print(epoch)
#     for index, file in enumerate(dataset):
#         print(file)

tf.Tensor([b'2,2,2,2,2,2,2,2' b'3,3,3,3,3,3,3,3'], shape=(2,), dtype=string)
tf.Tensor([b'4,4,4,4,4,4,4,4' b'1,1,1,1,1,1,1,1'], shape=(2,), dtype=string)
tf.Tensor([b'1,1,1,1,1,1,1,1' b'1,1,1,1,1,1,1,1'], shape=(2,), dtype=string)
tf.Tensor([b'2,2,2,2,2,2,2,2' b'2,2,2,2,2,2,2,2'], shape=(2,), dtype=string)
tf.Tensor([b'4,4,4,4,4,4,4,4' b'3,3,3,3,3,3,3,3'], shape=(2,), dtype=string)
tf.Tensor([b'2,2,2,2,2,2,2,2' b'2,2,2,2,2,2,2,2'], shape=(2,), dtype=string)
tf.Tensor([b'4,4,4,4,4,4,4,4' b'1,1,1,1,1,1,1,1'], shape=(2,), dtype=string)
tf.Tensor([b'3,3,3,3,3,3,3,3' b'4,4,4,4,4,4,4,4'], shape=(2,), dtype=string)
tf.Tensor([b'1,1,1,1,1,1,1,1' b'4,4,4,4,4,4,4,4'], shape=(2,), dtype=string)
tf.Tensor([b'3,3,3,3,3,3,3,3' b'3,3,3,3,3,3,3,3'], shape=(2,), dtype=string)
