In [3]:
import tensorflow as tf
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import numpy as np
import os
import pandas as pd
from tensorflow import keras
from matplotlib import pyplot as plt

In [4]:
# Load the California housing data, then split twice: first into
# (train_full, test), then train_full into (train, valid).
# The target vector is reshaped to a single column so that it stays 2-D
# alongside the feature matrix.
housing = fetch_california_housing()

X_train_full, X_test, y_train_full, y_test = train_test_split(
    housing.data,
    housing.target.reshape(-1, 1),
    random_state=42,
)
X_train, X_valid, y_train, y_valid = train_test_split(
    X_train_full,
    y_train_full,
    random_state=42,
)

In [5]:
# Sanity check: (rows, feature columns) of the training split produced above.
X_train.shape 

(11610, 8)

In [6]:
 scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

In [7]:
def save_to_multiple_csv_files(data, name_prefix, header=None, n_parts=10):
    """Split ``data`` row-wise into ``n_parts`` CSV files and return their paths.

    Files are written to ``./datasets/housing/my_<name_prefix>_<NN>.csv``
    (the directory is created if missing). Rows are distributed with
    ``np.array_split``, so parts differ in length by at most one row.

    Args:
        data: Indexable 2-D collection (e.g. a NumPy array); ``data[i]`` must
            be iterable over the fields of row ``i``.
        name_prefix: Label embedded in each file name (e.g. ``"train"``).
        header: Optional header line (without trailing newline) written at
            the top of every part file.
        n_parts: Number of files to produce.

    Returns:
        List of the file paths written, in part order.
    """

    def format_field(value):
        # NumPy >= 2 changed scalar repr() to e.g. "np.float64(3.5)", which
        # would corrupt the CSV. str() still yields the bare literal for
        # numeric/bool scalars and matches the old repr() output for them.
        if isinstance(value, (np.number, np.bool_)):
            return str(value)
        return repr(value)

    housing_dir = os.path.join(".", "datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.csv")

    filepaths = []
    row_chunks = np.array_split(np.arange(len(data)), n_parts)
    for file_idx, row_indices in enumerate(row_chunks):
        part_csv = path_format.format(name_prefix, file_idx)
        filepaths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(",".join(format_field(col) for col in data[row_idx]))
                f.write("\n")
    return filepaths

In [8]:
# Append the (single-column) target to the right of each feature matrix so
# every CSV row carries the features followed by the label.
train_data = np.concatenate([X_train, y_train], axis=1)
valid_data = np.concatenate([X_valid, y_valid], axis=1)
test_data = np.concatenate([X_test, y_test], axis=1)

# Header line: the dataset's feature names plus a name for the target column.
header_cols = [*housing.feature_names, "MedianHouseValue"]
header = ",".join(header_cols)

In [9]:
# Write each split to disk as several CSV shards: 20 for training,
# 10 each for validation and test. The generator is consumed in order by
# the tuple unpacking, so files are written train -> valid -> test.
train_filepaths, valid_filepaths, test_filepaths = (
    save_to_multiple_csv_files(split_data, split_name, header, n_parts=shard_count)
    for split_data, split_name, shard_count in (
        (train_data, "train", 20),
        (valid_data, "valid", 10),
        (test_data, "test", 10),
    )
)

In [10]:
# Read the first training shard back with pandas and preview its first rows
# to confirm the header and column layout were written as intended.
pd.read_csv(train_filepaths[0]).head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,3.5214,15.0,3.049945,1.106548,1447.0,1.605993,37.63,-122.43,1.442
1,5.3275,5.0,6.49006,0.991054,3464.0,3.44334,33.69,-117.39,1.687
2,3.1,29.0,7.542373,1.591525,1328.0,2.250847,38.44,-122.98,1.621
3,7.1736,12.0,6.289003,0.997442,1054.0,2.695652,33.55,-117.7,2.621
4,2.0549,13.0,5.312457,1.085092,3297.0,2.244384,33.93,-116.93,0.956


In [11]:
# Show the raw text of the first training shard (header + first 4 rows).
# The encoding is given explicitly to match the utf-8 used when the file was
# written; otherwise open() falls back to the platform's locale encoding.
with open(train_filepaths[0], encoding="utf-8") as f:
    for i in range(5):
        print(f.readline(), end="")

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442
5.3275,5.0,6.490059642147117,0.9910536779324056,3464.0,3.4433399602385686,33.69,-117.39,1.687
3.1,29.0,7.5423728813559325,1.5915254237288134,1328.0,2.2508474576271187,38.44,-122.98,1.621
7.1736,12.0,6.289002557544757,0.9974424552429667,1054.0,2.6956521739130435,33.55,-117.7,2.621


In [12]:
# Display the paths of the training shards returned by save_to_multiple_csv_files.
train_filepaths 

['.\\datasets\\housing\\my_train_00.csv',
 '.\\datasets\\housing\\my_train_01.csv',
 '.\\datasets\\housing\\my_train_02.csv',
 '.\\datasets\\housing\\my_train_03.csv',
 '.\\datasets\\housing\\my_train_04.csv',
 '.\\datasets\\housing\\my_train_05.csv',
 '.\\datasets\\housing\\my_train_06.csv',
 '.\\datasets\\housing\\my_train_07.csv',
 '.\\datasets\\housing\\my_train_08.csv',
 '.\\datasets\\housing\\my_train_09.csv',
 '.\\datasets\\housing\\my_train_10.csv',
 '.\\datasets\\housing\\my_train_11.csv',
 '.\\datasets\\housing\\my_train_12.csv',
 '.\\datasets\\housing\\my_train_13.csv',
 '.\\datasets\\housing\\my_train_14.csv',
 '.\\datasets\\housing\\my_train_15.csv',
 '.\\datasets\\housing\\my_train_16.csv',
 '.\\datasets\\housing\\my_train_17.csv',
 '.\\datasets\\housing\\my_train_18.csv',
 '.\\datasets\\housing\\my_train_19.csv']

In [13]:
# Build a tf.data dataset whose elements are the training-shard file paths.
# NOTE(review): per the TF docs list_files shuffles the paths by default;
# the fixed seed is presumably there to make that order reproducible.
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [18]:
# Iterate the dataset eagerly to inspect the order in which the file paths
# are yielded (each element is a scalar string tensor).
for filepath in filepath_dataset:
    print(filepath)

tf.Tensor(b'.\\datasets\\housing\\my_train_09.csv', shape=(), dtype=string)
tf.Tensor(b'.\\datasets\\housing\\my_train_02.csv', shape=(), dtype=string)
tf.Tensor(b'.\\datasets\\housing\\my_train_15.csv', shape=(), dtype=string)
tf.Tensor(b'.\\datasets\\housing\\my_train_05.csv', shape=(), dtype=string)
tf.Tensor(b'.\\datasets\\housing\\my_train_03.csv', shape=(), dtype=string)
tf.Tensor(b'.\\datasets\\housing\\my_train_12.csv', shape=(), dtype=string)
tf.Tensor(b'.\\datasets\\housing\\my_train_18.csv', shape=(), dtype=string)
tf.Tensor(b'.\\datasets\\housing\\my_train_19.csv', shape=(), dtype=string)
tf.Tensor(b'.\\datasets\\housing\\my_train_00.csv', shape=(), dtype=string)
tf.Tensor(b'.\\datasets\\housing\\my_train_11.csv', shape=(), dtype=string)
tf.Tensor(b'.\\datasets\\housing\\my_train_17.csv', shape=(), dtype=string)
tf.Tensor(b'.\\datasets\\housing\\my_train_08.csv', shape=(), dtype=string)
tf.Tensor(b'.\\datasets\\housing\\my_train_14.csv', shape=(), dtype=string)
tf.Tensor(b'

In [19]:
# Turn the dataset of file paths into a dataset of CSV text lines.
# Each path is mapped to a line-by-line reader with its first line (the
# header) skipped, and `cycle_length` sets how many of those per-file
# readers are interleaved at a time.
n_readers = 5
dataset = filepath_dataset.interleave(
    lambda csv_path: tf.data.TextLineDataset(csv_path).skip(1),
    cycle_length=n_readers,
)

In [20]:
# Peek at the first five interleaved CSV lines; .numpy() exposes each scalar
# string tensor as a Python bytes object.
for line in dataset.take(5):
    print(line.numpy())

b'8.72,44.0,6.163179916317992,1.0460251046025104,668.0,2.794979079497908,34.2,-118.18,4.159'
b'5.9522,26.0,6.196521739130435,1.0069565217391305,1479.0,2.5721739130434784,34.5,-119.75,4.384'
b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-118.31,3.215'
b'3.0217,22.0,4.983870967741935,1.1008064516129032,615.0,2.4798387096774195,38.76,-120.6,1.069'
b'2.4792,24.0,3.4547038327526134,1.1341463414634145,2251.0,3.921602787456446,34.18,-118.38,2.0'


In [22]:
# Number of feature columns in each CSV row (the target is one extra column).
n_inputs = 8


def preprocess(line):
    """Parse one CSV text line into a (standardized features, target) pair.

    Args:
        line: Scalar string tensor holding ``n_inputs`` feature values followed
            by one target value, comma-separated.

    Returns:
        Tuple ``(x, y)`` where ``x`` stacks the ``n_inputs`` features after
        subtracting ``X_mean`` and dividing by ``X_std``, and ``y`` is the
        target stacked into a length-1 tensor.
    """
    # Features default to 0.0 when a field is empty. The target's default is
    # an empty float32 constant (per the tf.io.decode_csv docs, an empty
    # default marks the column as required).
    feature_defaults = [0.] * n_inputs
    target_default = tf.constant([], dtype=tf.float32)
    columns = tf.io.decode_csv(
        line, record_defaults=feature_defaults + [target_default])

    features = tf.stack(columns[:-1])
    target = tf.stack(columns[-1:])
    standardized = (features - X_mean) / X_std
    return standardized, target