In [1]:
import numpy as np
import pandas as pd

In [2]:
import tensorflow as tf
from tensorflow import keras

In [3]:
from sklearn.datasets import fetch_california_housing

dataset = fetch_california_housing()

In [16]:
data = np.c_[dataset.data, dataset.target]

In [24]:
from sklearn.model_selection import train_test_split

train_full, test = train_test_split(data, test_size=0.1, random_state=42)
train, valid = train_test_split(train_full, test_size=0.1, random_state=42)

In [30]:
df_train = pd.DataFrame(train, columns=dataset.feature_names + dataset.target_names)
df_valid = pd.DataFrame(valid, columns=dataset.feature_names + dataset.target_names)
df_test = pd.DataFrame(test, columns=dataset.feature_names + dataset.target_names)

In [52]:
import os
if "ANN" in os.path.abspath(os.curdir): os.chdir("..")
BASE_DIR = os.path.abspath(os.curdir)
BASE_DIR

'D:\\TheCompleteML\\projects'

In [88]:
def split_and_save(df, split_count, target_dir, prefix):
    for i in range(split_count):
        df_ = df[i*int(df.shape[0]//15):(i+1)*int(df.shape[0]//15)]
        os.makedirs(target_dir, exist_ok=True)
        df_.to_csv(os.path.join(target_dir, "{}_{}.csv".format(prefix, i+1)), index=False)

In [111]:
for prefix in ["train", "valid", "test"]:
    split_count = 15
    target_dir = os.path.join(BASE_DIR, "datasets", "ann", prefix)
    split_and_save(df_valid, split_count, target_dir, prefix)

In [174]:
train_filepaths = [f'{os.path.join(BASE_DIR, "datasets", "ann", "train")}\\train_{i+1}.csv' for i in range(15)]
test_filepaths = [f'{os.path.join(BASE_DIR, "datasets", "ann", "test")}\\test_{i+1}.csv' for i in range(15)]
valid_filepaths = [f'{os.path.join(BASE_DIR, "datasets", "ann", "valid")}\\valid_{i+1}.csv' for i in range(15)]

In [175]:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed=42)

In [176]:
for i in filepath_dataset:
    print(i)

tf.Tensor(b'D:\\TheCompleteML\\projects\\datasets\\ann\\train\\train_1.csv', shape=(), dtype=string)
tf.Tensor(b'D:\\TheCompleteML\\projects\\datasets\\ann\\train\\train_13.csv', shape=(), dtype=string)
tf.Tensor(b'D:\\TheCompleteML\\projects\\datasets\\ann\\train\\train_7.csv', shape=(), dtype=string)
tf.Tensor(b'D:\\TheCompleteML\\projects\\datasets\\ann\\train\\train_2.csv', shape=(), dtype=string)
tf.Tensor(b'D:\\TheCompleteML\\projects\\datasets\\ann\\train\\train_6.csv', shape=(), dtype=string)
tf.Tensor(b'D:\\TheCompleteML\\projects\\datasets\\ann\\train\\train_4.csv', shape=(), dtype=string)
tf.Tensor(b'D:\\TheCompleteML\\projects\\datasets\\ann\\train\\train_9.csv', shape=(), dtype=string)
tf.Tensor(b'D:\\TheCompleteML\\projects\\datasets\\ann\\train\\train_14.csv', shape=(), dtype=string)
tf.Tensor(b'D:\\TheCompleteML\\projects\\datasets\\ann\\train\\train_10.csv', shape=(), dtype=string)
tf.Tensor(b'D:\\TheCompleteML\\projects\\datasets\\ann\\train\\train_11.csv', shape=(), 

In [129]:
n_readers = 5
dataset = filepath_dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1), 
                                      cycle_length=n_readers)

In [131]:
for item in dataset.take(3):
    print(item)

tf.Tensor(b'5.1482,12.0,6.781582054309327,1.244391971664699,2104.0,2.4840613931523023,33.38,-117.63,5.00001', shape=(), dtype=string)
tf.Tensor(b'9.2327,19.0,8.118279569892474,1.014336917562724,904.0,3.240143369175627,33.88,-117.81,4.613', shape=(), dtype=string)
tf.Tensor(b'6.2242,13.0,6.121320890165111,0.9791816223977028,4597.0,3.3000717875089736,33.65,-117.66,2.379', shape=(), dtype=string)


In [146]:
n_inputs = 8
X_mean = np.mean(train[:-1])
X_std = np.std(train[:-1])
X_mean, X_std

(154.2774108932893, 592.7775189615072)

### Loading, shuffling and preprocessing

Here main focus is on shufflig the data and also little bit of preprocessing

In [151]:
def preprocess(line):
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)
    X = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    return (X - X_mean) / X_std, y

In [152]:
preprocess(b'5.1482,12.0,6.781582054309327,1.244391971664699,2104.0,2.4840613931523023,33.38,-117.63,5.00001')

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([-0.25157702, -0.24001822, -0.24882154, -0.25816265,  3.2891304 ,
        -0.25607136, -0.20395072, -0.4587006 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([5.00001], dtype=float32)>)

In [157]:
def read_csv_dataset(filepaths, repeat=1, n_readers=5, n_read_threads=None, 
                     shuffle_buffer_size=10000, n_parse_threads=5, batch_size=32):
    dataset = tf.data.Dataset.list_files(filepaths)
    dataset = dataset.interleave(lambda filepath: tf.data.TextLineDataset(filepath).skip(1), 
                                 cycle_length=n_readers,
                                 num_parallel_calls=n_read_threads)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    return dataset.batch(batch_size).prefetch(1)

#### Demo to use train, valid and test set in keras API's

In [177]:
train_set = read_csv_dataset(train_filepaths)
valid_set = read_csv_dataset(valid_filepaths)
test_set = read_csv_dataset(test_filepaths)

In [178]:
model = keras.models.Sequential()
model.add(keras.layers.Flatten(input_shape=[8]))
model.add(keras.layers.Dense(100, activation="relu"))
model.add(keras.layers.Dense(1))

In [179]:
model.compile(loss=keras.losses.mean_squared_error,
             optimizer="sgd")

In [180]:
model.fit(train_set, epochs=3, validation_data=valid_set)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1ea52f65040>

### Preprocessing the input features

Here focus is to create a layer that is responsible from preprocessing

In [190]:
class Standardization(keras.layers.Layer):
    def adapt(self, data_sample):
        self.mean = np.mean(data_sample, axis=0, keepdims=True)
        self.std = np.std(data_sample, axis=0, keepdims=True)
        self.eps = keras.backend.epsilon()
    
    def call(self, inputs):
        return (inputs - self.mean) / (self.std + self.eps)

In [191]:
data_sample = np.array(pd.read_csv(train_filepaths[0]).iloc[:, :-1])
std_layer = Standardization()
std_layer.adapt(data_sample)

In [196]:
keras.backend.clear_session()
tf.random.set_seed(42)
np.random.seed(42)

model = keras.models.Sequential()
model.add(std_layer) # our preprocessing layer
model.add(keras.layers.Flatten(input_shape=[8]))
model.add(keras.layers.Dense(100, activation="relu"))
model.add(keras.layers.Dense(1))

model.compile(loss=keras.losses.mean_squared_error,
             optimizer="sgd")

model.fit(train_set, epochs=3, validation_data=valid_set)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x1ea51cd1f40>

### Encoding categorical to one-hot-encoding

In [199]:
vocab = ["< 1H OCEAN", "INLAND", "NEAR OCEAN", "NEAR BAY", "ISLAND"]
indices = tf.range(len(vocab), dtype=tf.int64)

table_init = tf.lookup.KeyValueTensorInitializer(vocab, indices)
num_oov_buckets = 2
table = tf.lookup.StaticVocabularyTable(table_init, num_oov_buckets)

In [200]:
categories = tf.constant(["NEAR BAY", "DESERT", "INLAND", "INLAND"])
categories

<tf.Tensor: shape=(4,), dtype=string, numpy=array([b'NEAR BAY', b'DESERT', b'INLAND', b'INLAND'], dtype=object)>

In [201]:
cat_indices = table.lookup(categories)
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1], dtype=int64)>

In [204]:
cat_one_hot = tf.one_hot(cat_indices, depth=len(vocab) + num_oov_buckets)
cat_one_hot

<tf.Tensor: shape=(4, 7), dtype=float32, numpy=
array([[0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0.]], dtype=float32)>

### Encoding categorical to embeddings

In [207]:
embedding_dim = 2
embed_init = tf.random.uniform([len(vocab) + num_oov_buckets, embedding_dim])
embedding_matrix = tf.Variable(embed_init)

In [209]:
embedding_matrix

<tf.Variable 'Variable:0' shape=(7, 2) dtype=float32, numpy=
array([[0.7413678 , 0.62854624],
       [0.01738465, 0.3431449 ],
       [0.51063764, 0.3777541 ],
       [0.07321596, 0.02137029],
       [0.2871771 , 0.4710616 ],
       [0.6936141 , 0.07321334],
       [0.93251204, 0.20843053]], dtype=float32)>

In [211]:
cat_indices

<tf.Tensor: shape=(4,), dtype=int64, numpy=array([3, 5, 1, 1], dtype=int64)>

In [210]:
tf.nn.embedding_lookup(embedding_matrix, cat_indices)

<tf.Tensor: shape=(4, 2), dtype=float32, numpy=
array([[0.07321596, 0.02137029],
       [0.6936141 , 0.07321334],
       [0.01738465, 0.3431449 ],
       [0.01738465, 0.3431449 ]], dtype=float32)>

#### Using keras embedding layer

In [212]:
regular_inputs = keras.layers.Input(shape=[8], dtype=tf.float32)
categorical_inputs = keras.layers.Input(shape=[], dtype=tf.string)
cat_indices = keras.layers.Lambda(lambda cats: table.lookup(cats))(categorical_inputs)
cat_embed = keras.layers.Embedding(input_dim=6, output_dim=2)(cat_indices)

encoded_inputs = keras.layers.concatenate([regular_inputs, cat_embed])
output = keras.layers.Dense(1)(encoded_inputs)

model = keras.models.Model(inputs=[regular_inputs, categorical_inputs], outputs=[output])