In [1]:
import numpy as np
import tensorflow as tf
from tensorflow import keras

In [2]:
X = tf.range(10)
for item in X:
    print(item)
dataset = tf.data.Dataset.from_tensor_slices(X)
dataset

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)
tf.Tensor(8, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)


<_TensorSliceDataset element_spec=TensorSpec(shape=(), dtype=tf.int32, name=None)>

In [3]:
def sqrt(x):
    return x ** 2

In [4]:
# dataset = dataset.map(lambda x: sqrt(x))
# dataset = dataset.map(lambda x: x ** 2)
dataset = dataset.map(sqrt)
for item in dataset:
    print(item)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(9, shape=(), dtype=int32)
tf.Tensor(16, shape=(), dtype=int32)
tf.Tensor(25, shape=(), dtype=int32)
tf.Tensor(36, shape=(), dtype=int32)
tf.Tensor(49, shape=(), dtype=int32)
tf.Tensor(64, shape=(), dtype=int32)
tf.Tensor(81, shape=(), dtype=int32)


In [5]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

In [6]:
dataset = fetch_california_housing()
X, y = dataset.data, dataset.target.reshape(-1, 1)
X_train_full, X_test, y_train_full, y_test = train_test_split(X, y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train_full, y_train_full, random_state=42)

scaler = StandardScaler()
scaler.fit(X_train)
X_mean = scaler.mean_
X_std = scaler.scale_

In [7]:
def split_dataset_to_csv(data, name_prefix, header=None, n_parts=10):
    housing_dir = os.path.join("datasets", "housing")
    os.makedirs(housing_dir, exist_ok=True)
    path_format = os.path.join(housing_dir, "my_{}_{:02d}.scv")
    
    file_paths = []
    
    m = len(data)
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx)
        file_paths.append(part_csv)
        with open(part_csv, "wt", encoding="utf-8") as f:
            if header is not None:
                f.write(header)
                f.write("\n")
            for row_idx in row_indices:
                f.write(','.join([repr(col) for col in data[row_idx]]))
                f.write("\n")
        
    return file_paths

In [8]:
train_data = np.c_[X_train, y_train]
test_data = np.c_[X_test, y_test]
val_data = np.c_[X_val, y_val]
header_cols = dataset.feature_names + ["MedianHouseValue"]
header = ','.join(header_cols)

train_file_paths = split_dataset_to_csv(train_data, "train", header, n_parts=20)
valid_file_paths = split_dataset_to_csv(val_data, "valid", header)
test_file_paths = split_dataset_to_csv(test_data, "test", header)

In [9]:
import pandas as pd

In [10]:
pd.read_csv(train_file_paths[0]).head()

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedianHouseValue
0,3.5214,15.0,3.049945,1.106548,1447.0,1.605993,37.63,-122.43,1.442
1,5.3275,5.0,6.49006,0.991054,3464.0,3.44334,33.69,-117.39,1.687
2,3.1,29.0,7.542373,1.591525,1328.0,2.250847,38.44,-122.98,1.621
3,7.1736,12.0,6.289003,0.997442,1054.0,2.695652,33.55,-117.7,2.621
4,2.0549,13.0,5.312457,1.085092,3297.0,2.244384,33.93,-116.93,0.956


In [11]:
train_file_paths[0]

'datasets\\housing\\my_train_00.scv'

In [12]:
filepath_dataset = tf.data.Dataset.list_files(train_file_paths)
for file in filepath_dataset:
    print(file)

tf.Tensor(b'datasets\\housing\\my_train_11.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_15.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_12.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_13.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_17.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_02.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_16.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_03.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_18.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_19.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_04.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_07.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_14.scv', shape=(), dtype=string)
tf.Tensor(b'datasets\\housing\\my_train_08.scv', sh

In [13]:
dataset = filepath_dataset.interleave(
    lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
    cycle_length=5)

In [14]:
for line in dataset.take(5):
    print(line)

tf.Tensor(b'3.226,52.0,5.372469635627531,0.9473684210526315,1157.0,2.3421052631578947,37.96,-121.31,1.076', shape=(), dtype=string)
tf.Tensor(b'3.9688,41.0,5.259786476868327,0.9715302491103203,916.0,3.2597864768683276,33.98,-118.07,1.698', shape=(), dtype=string)
tf.Tensor(b'3.0217,22.0,4.983870967741935,1.1008064516129032,615.0,2.4798387096774195,38.76,-120.6,1.069', shape=(), dtype=string)
tf.Tensor(b'8.72,44.0,6.163179916317992,1.0460251046025104,668.0,2.794979079497908,34.2,-118.18,4.159', shape=(), dtype=string)
tf.Tensor(b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-118.31,3.215', shape=(), dtype=string)


In [15]:
n_inputs = 8
def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return (x - X_mean) / X_std, y
preprocess(b'4.5909,16.0,5.475877192982456,1.0964912280701755,1357.0,2.9758771929824563,33.63,-117.71,2.418')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([ 0.36618188, -0.998705  ,  0.00781878, -0.00675364, -0.06140145,
         0.0072037 , -0.94465536,  0.9367464 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([2.418], dtype=float32)>)

In [16]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths).repeat(repeat)
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    dataset = dataset.shuffle(shuffle_buffer_size)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.batch(batch_size)
    return dataset.prefetch(1)

In [21]:
train_set = csv_reader_dataset(train_file_paths)
test_set = csv_reader_dataset(test_file_paths)
val_set = csv_reader_dataset(valid_file_paths)

In [18]:
keras.backend.clear_session()
np.random.seed(42)
tf.random.set_seed(42)

model = keras.models.Sequential([
    keras.layers.Dense(30, activation="relu", input_shape=[8,]),
    keras.layers.Dense(1)
    
])

In [19]:
print(len(X_train))
model.compile(loss="mse", optimizer=keras.optimizers.SGD(learning_rate=1e-3))
batch_size = 32
model.fit(train_set, steps_per_epoch=len(X_train) // batch_size, epochs=10,
          validation_data=val_set)

11610
Epoch 1/10
Epoch 2/10


<keras.callbacks.History at 0x154e7c96a30>