## Importing libraries

In [53]:
import tensorflow as tf
import numpy as np
import pandas as pd
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import os

## Shuffling the data

In [2]:
# Fix TensorFlow's global random seed so the randomized steps below are repeatable.
tf.random.set_seed(42)

In [3]:
dataset = tf.data.Dataset.range(10).repeat(3) # The integers 0-9, repeated 3 times (30 elements in total)

In [4]:
# Shuffle through a 3-element buffer, then group the shuffled elements into batches of 7.
# NOTE: this reassigns `dataset`, so re-running the cell stacks another shuffle/batch on top;
# re-run the cell that creates the dataset first.
dataset = dataset.shuffle(buffer_size = 3, seed = 42).batch(7) # Shuffles and batches the data

In [5]:
# Each item yielded by the batched dataset is one batch tensor (the last batch may be smaller).
for item in dataset:
    print(item)

tf.Tensor([1 3 0 4 2 5 6], shape=(7,), dtype=int64)
tf.Tensor([8 7 1 0 3 2 5], shape=(7,), dtype=int64)
tf.Tensor([4 6 9 8 9 7 0], shape=(7,), dtype=int64)
tf.Tensor([3 1 4 5 2 8 7], shape=(7,), dtype=int64)
tf.Tensor([6 9], shape=(2,), dtype=int64)


- If we call `repeat()` on a shuffled dataset, it will by default create a new order at every iteration. If we prefer to use the same order in each iteration, pass `reshuffle_each_iteration = False` to `shuffle()`.

## Shuffling the California housing dataset

- First we need to load the dataset, split it into train, validation and test sets, and scale it.

In [7]:
# Load the California housing dataset through scikit-learn.
housing = fetch_california_housing()

In [8]:
housing

{'data': array([[   8.3252    ,   41.        ,    6.98412698, ...,    2.55555556,
           37.88      , -122.23      ],
        [   8.3014    ,   21.        ,    6.23813708, ...,    2.10984183,
           37.86      , -122.22      ],
        [   7.2574    ,   52.        ,    8.28813559, ...,    2.80225989,
           37.85      , -122.24      ],
        ...,
        [   1.7       ,   17.        ,    5.20554273, ...,    2.3256351 ,
           39.43      , -121.22      ],
        [   1.8672    ,   18.        ,    5.32951289, ...,    2.12320917,
           39.43      , -121.32      ],
        [   2.3886    ,   16.        ,    5.25471698, ...,    2.61698113,
           39.37      , -121.24      ]]),
 'target': array([4.526, 3.585, 3.521, ..., 0.923, 0.847, 0.894]),
 'frame': None,
 'target_names': ['MedHouseVal'],
 'feature_names': ['MedInc',
  'HouseAge',
  'AveRooms',
  'AveBedrms',
  'Population',
  'AveOccup',
  'Latitude',
  'Longitude'],
 'DESCR': '.. _california_housing_dataset:\n

In [16]:
# Feature matrix: one column per name in housing.feature_names.
x = housing.data

In [17]:
# Reshape the 1-D target array into a single column, i.e. shape (n_samples, 1).
y = housing.target.reshape(-1, 1)

In [18]:
# Hold out a test set. No test_size is given, so train_test_split uses its default (0.25).
x_train_full, x_test, y_train_full, y_test = train_test_split(x, y, random_state = 42)

In [19]:
# Split the remaining data once more into the final training set and a validation set.
x_train, x_valid, y_train, y_valid = train_test_split(x_train_full, y_train_full, random_state = 42)

In [20]:
# Standardizer whose statistics are learned in the next cell.
scaler = StandardScaler()

In [21]:
# Fit on the training features only, so validation/test statistics do not leak into the scaling.
scaler.fit(x_train)

StandardScaler()

In [22]:
# Keep the per-feature mean and scale (standard deviation) learned from the training set.
x_mean, x_std = scaler.mean_, scaler.scale_

- Now we will split the train, validation and test data into parts and store them in multiple CSV files.

In [31]:
# Preview how np.array_split divides the training row indices into 10 contiguous chunks.
for file_idx, row_indices in enumerate(np.array_split(np.arange(len(x_train)), 10)):
    print(file_idx, row_indices)

0 [   0    1    2 ... 1158 1159 1160]
1 [1161 1162 1163 ... 2319 2320 2321]
2 [2322 2323 2324 ... 3480 3481 3482]
3 [3483 3484 3485 ... 4641 4642 4643]
4 [4644 4645 4646 ... 5802 5803 5804]
5 [5805 5806 5807 ... 6963 6964 6965]
6 [6966 6967 6968 ... 8124 8125 8126]
7 [8127 8128 8129 ... 9285 9286 9287]
8 [ 9288  9289  9290 ... 10446 10447 10448]
9 [10449 10450 10451 ... 11607 11608 11609]


In [39]:
def _csv_field(value):
    """Return the text written to the CSV for a single value."""
    # NumPy >= 2.0 reprs scalars as e.g. 'np.float64(3.5)', which would corrupt the CSV.
    # Converting to the native Python scalar first gives a plain, round-trippable '3.5'.
    if isinstance(value, np.generic):
        value = value.item()
    return repr(value)


def save_to_mutiple_csv_files(data, name_prefix, header = None, n_parts = 10):
    """Split ``data`` row-wise into ``n_parts`` CSV files under ``datasets/housing``.

    The row indices are divided into contiguous, nearly equal chunks with
    ``np.array_split``; chunk ``i`` is written to
    ``datasets/housing/<name_prefix>_<i as two digits>.csv``.

    Parameters
    ----------
    data : 2-D array-like
        Must support ``len(data)`` and integer row indexing (``data[i]``).
    name_prefix : str
        Prefix of each file name (e.g. ``'train'``).
    header : str or None
        Header line written at the top of every file; omitted when ``None``.
    n_parts : int
        Number of files to create.

    Returns
    -------
    list of str
        Paths of the files written, in part order.
    """
    housing_dir = os.path.join('datasets', 'housing')
    os.makedirs(housing_dir, exist_ok = True) # Creating base directories
    path_format = os.path.join(housing_dir, '{}_{:02d}.csv') # Path for each part of the set.
    filepaths = []
    m = len(data)
    # Divide the row indices into the required number of parts; file_idx is the part number.
    for file_idx, row_indices in enumerate(np.array_split(np.arange(m), n_parts)):
        part_csv = path_format.format(name_prefix, file_idx) # Adding the name of the set and the file index to path
        filepaths.append(part_csv)
        with open(part_csv, 'wt', encoding = 'utf-8') as f: # Opening the created csv file for writing
            if header is not None:
                f.write(header) # Writing the header if available
                f.write('\n')
            for row_idx in row_indices:
                # One line per instance, values separated by commas.
                f.write(','.join(_csv_field(col) for col in data[row_idx]))
                f.write('\n')
    return filepaths

In [43]:
x_train.shape

(11610, 8)

In [42]:
np.c_[x_train, y_train].shape

(11610, 9)

In [44]:
# Append each label column to its feature matrix so every row holds the features followed by the target.
train_data, valid_data, test_data = [
    np.c_[features, labels]
    for features, labels in ((x_train, y_train), (x_valid, y_valid), (x_test, y_test))
]

In [45]:
housing.feature_names

['MedInc',
 'HouseAge',
 'AveRooms',
 'AveBedrms',
 'Population',
 'AveOccup',
 'Latitude',
 'Longitude']

In [46]:
housing.target_names

['MedHouseVal']

In [50]:
header_cols = housing.feature_names + housing.target_names # Column names for the csv files: the features followed by the target

In [51]:
# Join the column names into the single comma-separated header line written to each file.
header = ','.join(header_cols)

In [52]:
# Splitting the datasets into multiple csv files (20 parts for training, 10 each for validation and test)
train_filepaths = save_to_mutiple_csv_files(train_data, 'train', header, n_parts = 20)
valid_filepaths = save_to_mutiple_csv_files(valid_data, 'validation', header, n_parts = 10)
test_filepaths = save_to_mutiple_csv_files(test_data, 'test', header, n_parts = 10)
test_fileaths = test_filepaths # Old misspelled name, kept as an alias so existing references keep working

In [55]:
pd.read_csv(train_filepaths[0]).head() # First 5 lines of the first part of training data

Unnamed: 0,MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal
0,3.5214,15.0,3.049945,1.106548,1447.0,1.605993,37.63,-122.43,1.442
1,5.3275,5.0,6.49006,0.991054,3464.0,3.44334,33.69,-117.39,1.687
2,3.1,29.0,7.542373,1.591525,1328.0,2.250847,38.44,-122.98,1.621
3,7.1736,12.0,6.289003,0.997442,1054.0,2.695652,33.55,-117.7,2.621
4,2.0549,13.0,5.312457,1.085092,3297.0,2.244384,33.93,-116.93,0.956


In [61]:
# Show the first 5 raw lines (header + 4 rows), reading with the same encoding the files were written with.
with open(train_filepaths[0], encoding = 'utf-8') as f:
    for i in range(5):
        print(f.readline()) # readline() keeps the trailing '\n', so print() leaves a blank line after each row

MedInc,HouseAge,AveRooms,AveBedrms,Population,AveOccup,Latitude,Longitude,MedHouseVal

3.5214,15.0,3.0499445061043287,1.106548279689234,1447.0,1.6059933407325193,37.63,-122.43,1.442

5.3275,5.0,6.490059642147117,0.9910536779324056,3464.0,3.4433399602385686,33.69,-117.39,1.687

3.1,29.0,7.5423728813559325,1.5915254237288134,1328.0,2.2508474576271187,38.44,-122.98,1.621

7.1736,12.0,6.289002557544757,0.9974424552429667,1054.0,2.6956521739130435,33.55,-117.7,2.621



In [62]:
train_filepaths

['datasets/housing/train_00.csv',
 'datasets/housing/train_01.csv',
 'datasets/housing/train_02.csv',
 'datasets/housing/train_03.csv',
 'datasets/housing/train_04.csv',
 'datasets/housing/train_05.csv',
 'datasets/housing/train_06.csv',
 'datasets/housing/train_07.csv',
 'datasets/housing/train_08.csv',
 'datasets/housing/train_09.csv',
 'datasets/housing/train_10.csv',
 'datasets/housing/train_11.csv',
 'datasets/housing/train_12.csv',
 'datasets/housing/train_13.csv',
 'datasets/housing/train_14.csv',
 'datasets/housing/train_15.csv',
 'datasets/housing/train_16.csv',
 'datasets/housing/train_17.csv',
 'datasets/housing/train_18.csv',
 'datasets/housing/train_19.csv']

## Building an input pipeline

In [63]:
filepath_dataset = tf.data.Dataset.list_files(train_filepaths, seed = 42) # Dataset of the training file paths, shuffled (seeded for repeatability)

In [64]:
# Each element is a scalar string tensor holding one file path, in shuffled order.
for filepath in filepath_dataset:
    print(filepath)

tf.Tensor(b'datasets/housing/train_15.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_08.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_03.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_01.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_10.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_05.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_19.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_16.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_02.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_09.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_00.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_07.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_12.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_04.csv', shape=(), dtype=string)
tf.Tensor(b'datasets/housing/train_17.csv', shap

- We can set `shuffle = False` in the `list_files()` method if we don't want the file paths to be shuffled.

In [65]:
n_readers = 5 # Number of files interleaved at a time (the cycle length)
# Map every file path to a line-by-line dataset; skip(1) drops the header row written to each csv file.
# NOTE: this reuses the name `dataset`, replacing the earlier range/shuffle example.
dataset = filepath_dataset.interleave(lambda filepath : tf.data.TextLineDataset(filepath).skip(1), cycle_length = n_readers)

- The `interleave()` method will create a dataset pulling out 5 filepaths from the `filepath_dataset` and to each one it will apply the lambda function given to create a new dataset using `TextLineDataset()` method.
- At this stage there will be 7 datasets in total: the `filepath_dataset`, the `interleave()` dataset, and 5 `TextLineDataset`s.
- When we iterate over the interleave dataset, it will cycle through these 5 `TextLineDataset`s and read 1 line at a time from each of them until all of them are out of items. Then it will take another 5 file paths and repeat the same until it runs out of file paths.
- For interleave to work properly it is good to have files of identical length, otherwise the ends of the longest files will not be interleaved.
- By default `interleave()` does not use parallelism. If we want it to read multiple files in parallel, set `num_parallel_calls` to the number of threads we want.

In [67]:
# Peek at the first 5 interleaved lines; .numpy() exposes each string tensor as raw bytes.
for line in dataset.take(5):
    print(line.numpy())

b'4.1812,52.0,5.701388888888889,0.9965277777777778,692.0,2.4027777777777777,33.73,-118.31,3.215'
b'1.6571,34.0,4.454976303317536,1.0876777251184835,1358.0,3.2180094786729856,37.94,-122.35,1.052'
b'3.8456,35.0,5.461346633416459,0.9576059850374065,1154.0,2.8778054862842892,37.96,-122.05,1.598'
b'5.9522,26.0,6.196521739130435,1.0069565217391305,1479.0,2.5721739130434784,34.5,-119.75,4.384'
b'3.226,52.0,5.372469635627531,0.9473684210526315,1157.0,2.3421052631578947,37.96,-121.31,1.076'


- These are just byte strings. We need to parse them and scale the data.