# TensorFlow crash course
### **PART 9.2**

## Interleaving lines from multiple files

### Get file paths

In [1]:
import tensorflow as tf 

# Glob pattern that matches every training CSV file
train_filepath = "dataset/train/my_train_*.csv"
# Dataset of the matching file paths; the seed is forwarded to list_files so its ordering is reproducible
filepath_dataset = tf.data.Dataset.list_files(file_pattern=train_filepath, seed=42)

### Interleave
#### In this case we'll read from 5 files at a time and interleave their lines, skipping each file's header row with the `skip()` method

In [2]:
# Read from n_readers files at a time, drop each file's first (header) line, then interleave the remaining lines
n_readers = 5
dataset = filepath_dataset.interleave(
    map_func=lambda path: tf.data.TextLineDataset(path).skip(1),
    cycle_length=n_readers,
)

In [3]:
# Let's check the result: print the first five interleaved lines
for line in dataset.take(count=5):
    print(line.numpy())

b'-121.72,37.98,5.0,7105.0,1143.0,3523.0,1088.0,5.0468,168800.0'
b'-121.55,38.55,10.0,6227.0,1164.0,2909.0,1077.0,4.106,115900.0'
b'-118.47,34.0,42.0,1271.0,301.0,574.0,312.0,3.1304,340500.0'
b'-120.43,34.87,21.0,2131.0,329.0,1094.0,353.0,4.6648,193000.0'
b'-118.29,34.18,52.0,1602.0,265.0,667.0,251.0,5.0489999999999995,323500.0'


### Preprocess

In [4]:
import numpy as np 

n_inputs = 8  # number of feature columns parsed from each CSV line

# Load the pre-computed mean and std, which are stored together in one array
train_distr = np.load("dataset/train_distr.npy")
# Split into two equal sections: the first is used as the mean, the second as the std
X_mean, X_std = np.split(train_distr, indices_or_sections=2)

def preprocess(line):
    """Parse one CSV line into a standardized feature vector and its target.

    Args:
        line: A single CSV record (string/bytes scalar) containing
            ``n_inputs`` feature values followed by one target value.

    Returns:
        A tuple ``(x, y)``. ``x`` is the stacked feature tensor after
        subtracting ``X_mean`` and dividing by ``X_std``; ``y`` is a
        one-element tensor holding the target value.
    """
    # Feature columns fall back to 0.0 when missing; the empty float32
    # default marks the target column as required.
    defs = [0.] * n_inputs + [tf.constant([], dtype=tf.float32)]
    fields = tf.io.decode_csv(line, record_defaults=defs)  # Parse each line of the csv data
    x = tf.stack(fields[:-1])
    y = tf.stack(fields[-1:])
    # Slice with n_inputs rather than a literal 8 so the normalization always
    # matches the number of feature columns declared in `defs`.
    return (x - X_mean[:n_inputs]) / X_std[:n_inputs], y


# Let's test the created function
preprocess(b"-121.78,37.68,17.0,3112.0,872.0,1392.0,680.0,3.0222,172500.0")

(<tf.Tensor: shape=(8,), dtype=float32, numpy=
 array([1.0353299 , 0.99768853, 0.15421276, 0.36362094, 0.8554898 ,
        0.1831137 , 0.60330486, 0.2901311 ], dtype=float32)>,
 <tf.Tensor: shape=(1,), dtype=float32, numpy=array([172500.], dtype=float32)>)

### Helper function
This helper function builds the whole input pipeline for us by applying the following steps:
1. Data loading
2. Data splitting
3. Data preprocessing
4. Data shuffling
5. Data repeating
6. Data batch processing

In [5]:
def csv_reader_dataset(filepaths, repeat=1, n_readers=5,
                       n_read_threads=None, shuffle_buffer_size=10000,
                       n_parse_threads=5, batch_size=32):
    """Build a shuffled, repeated, batched and prefetched dataset from CSV files.

    Args:
        filepaths: File path(s) or glob pattern passed to
            ``tf.data.Dataset.list_files``.
        repeat: Number of times the shuffled dataset is repeated; passed to
            ``Dataset.repeat`` (``None`` repeats indefinitely).
        n_readers: Number of files read and interleaved at the same time
            (``cycle_length`` of the interleave).
        n_read_threads: ``num_parallel_calls`` used when reading files.
        shuffle_buffer_size: Size of the shuffle buffer.
        n_parse_threads: ``num_parallel_calls`` used when mapping ``preprocess``.
        batch_size: Number of records per batch.

    Returns:
        A ``tf.data.Dataset`` yielding batches of the ``(x, y)`` pairs produced
        by ``preprocess``.
    """
    dataset = tf.data.Dataset.list_files(filepaths)
    # Read several files at once, dropping each file's header line
    dataset = dataset.interleave(
        lambda filepath: tf.data.TextLineDataset(filepath).skip(1),
        cycle_length=n_readers, num_parallel_calls=n_read_threads)
    dataset = dataset.map(preprocess, num_parallel_calls=n_parse_threads)
    # Shuffle first, then repeat, so the `repeat` argument actually takes effect
    dataset = dataset.shuffle(shuffle_buffer_size).repeat(repeat)
    # Note: with prefetching, the CPU and the GPU work in parallel: as the GPU
    # works on one batch, the CPU works on the next.
    return dataset.batch(batch_size).prefetch(1)  # This tells TensorFlow to be always one batch ahead