# Building Pipelines in Pytorch and Tensorflow

When a dataset is small enough to fit into memory, we can train on it in a single pass.
Usually, however, datasets are so large that they must be split into smaller parts (batches) that fit into memory.

A typical pipeline also includes preprocessing steps such as normalization, text vectorization, bucketizing, encoding, adding noise and data augmentation.

In [1]:
import numpy as np
import torch
import tensorflow as tf

## Pytorch

In PyTorch, data loading, shuffling and batching are performed with the `DataLoader` and `Dataset` classes.

https://pytorch.org/docs/stable/data.html#dataset-types

In [2]:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset

# A DataLoader can wrap a NumPy array, a torch tensor or a list.
# It serves the data in batches of the desired size; with no batch_size
# given, it yields one sample per batch.
t = torch.arange(0, 6, dtype=torch.float32)

d_loader = DataLoader(dataset=t)


In [3]:
# Walk the loader: every iteration hands back one batch (a one-element tensor here)

for single_batch in d_loader:
    print(single_batch)

tensor([0.])
tensor([1.])
tensor([2.])
tensor([3.])
tensor([4.])
tensor([5.])


In [4]:
# Eight samples served two at a time. drop_last=False keeps a trailing partial
# batch if there is one; 8 divides evenly by 2, so all four batches are full.
t2 = torch.arange(0, 8, dtype=torch.float32)

data_load = DataLoader(dataset=t2, batch_size=2, drop_last=False)
for batch_no, pair in enumerate(data_load, start=0):
    print(batch_no, ' ', pair)

0   tensor([0., 1.])
1   tensor([2., 3.])
2   tensor([4., 5.])
3   tensor([6., 7.])


In [5]:
# Fix the global torch seed so the random tensors below are reproducible
torch.manual_seed(42)

# Features: 4 samples x 3 values. Targets: 4 samples x 2 values.
# tx is drawn before ty; swapping the order would change both tensors.
tx = torch.rand(4, 3, dtype=torch.float32)
ty = torch.rand(4, 2, dtype=torch.float32)
''' A map-style dataset is one that implements the __getitem__() and __len__() protocols, 
and represents a map from (possibly non-integral) indices/keys to data samples.

For example, such a dataset, when accessed with dataset[idx], 
could read the idx-th image and its corresponding label from a folder on the disk.'''

class JointDataset(Dataset):
    '''
    Map-style dataset that pairs a collection of features with a collection
    of targets, sample by sample.

    A custom Dataset class must implement the following methods to be used
    in the dataloader
    __init__()    to store the data
    __len__()     to report the number of samples
    __getitem__() to return the corresponding sample to the given index
    '''

    def __init__(self, x, y):
        '''
        x: indexable, sized collection of features (one entry per sample)
        y: indexable, sized collection of targets, same length as x

        Raises ValueError if x and y do not hold the same number of samples.
        '''
        # __len__ only looks at x, so a shorter y would fail with an IndexError
        # in the middle of an epoch and a longer y would be silently truncated.
        # Reject the mismatch up front instead.
        if len(x) != len(y):
            raise ValueError(
                f'x and y must have the same number of samples, '
                f'got {len(x)} and {len(y)}'
            )
        self.x = x
        self.y = y

    def __len__(self):
        '''Number of (x, y) samples in the dataset.'''
        return len(self.x)

    def __getitem__(self, index):
        '''Return the (features, target) pair stored at the given index.'''
        return self.x[index], self.y[index]


In [6]:
# Pair each feature row of tx with the target row of ty at the same index
jointDataset = JointDataset(x=tx, y=ty)

# Every sample is an (x, y) tuple, so it can be unpacked directly
for features, target in jointDataset:
    print(' x: ', features, ' y: ', target)

 x:  tensor([0.8823, 0.9150, 0.3829])  y:  tensor([0.8694, 0.5677])
 x:  tensor([0.9593, 0.3904, 0.6009])  y:  tensor([0.7411, 0.4294])
 x:  tensor([0.2566, 0.7936, 0.9408])  y:  tensor([0.8854, 0.5739])
 x:  tensor([0.1332, 0.9346, 0.5936])  y:  tensor([0.2666, 0.6274])


In [7]:
from torch.utils.data import ConcatDataset

# NOTE: ConcatDataset chains its inputs end to end; it does NOT pair features
# with targets. The result has len(tx) + len(ty) = 8 samples: the 4 rows of tx
# followed by the 4 rows of ty. The ' x: ' / ' y: ' labels below therefore show
# only the first two entries of each row, not a feature/target pair.
# Use JointDataset above to pair features with targets.
concatDataset = ConcatDataset([tx, ty])
for row in concatDataset:
    first_entry, second_entry = row[0], row[1]
    print(' x: ', first_entry, ' y: ', second_entry)

 x:  tensor(0.8823)  y:  tensor(0.9150)
 x:  tensor(0.9593)  y:  tensor(0.3904)
 x:  tensor(0.2566)  y:  tensor(0.7936)
 x:  tensor(0.1332)  y:  tensor(0.9346)
 x:  tensor(0.8694)  y:  tensor(0.5677)
 x:  tensor(0.7411)  y:  tensor(0.4294)
 x:  tensor(0.8854)  y:  tensor(0.5739)
 x:  tensor(0.2666)  y:  tensor(0.6274)


In [8]:
# Shuffle + batch + repeat in Pytorch

# Shuffling and batching are both configured directly on the DataLoader
data_load = DataLoader(dataset=jointDataset, batch_size=2, shuffle=True)

# Each batch arrives as an [x_batch, y_batch] pair; count batches from 1
for batch_no, (batch_x, batch_y) in enumerate(data_load, start=1):
    print(f'batch {batch_no}:', 'x:', batch_x, 'y:', batch_y)


batch 1: x: tensor([[0.8823, 0.9150, 0.3829],
        [0.2566, 0.7936, 0.9408]]) y: tensor([[0.8694, 0.5677],
        [0.8854, 0.5739]])
batch 2: x: tensor([[0.1332, 0.9346, 0.5936],
        [0.9593, 0.3904, 0.6009]]) y: tensor([[0.2666, 0.6274],
        [0.7411, 0.4294]])


In [9]:
# Each fresh pass over the DataLoader is one epoch; because it was built with
# shuffle=True, the samples are re-shuffled and re-batched on every pass.
for epoch_no in range(1, 3):
    print(f'epoch {epoch_no}')
    for batch_no, (batch_x, batch_y) in enumerate(data_load, start=1):
        print(f'batch {batch_no}:', 'x:', batch_x, 'y:', batch_y)

epoch 1
batch 1: x: tensor([[0.8823, 0.9150, 0.3829],
        [0.2566, 0.7936, 0.9408]]) y: tensor([[0.8694, 0.5677],
        [0.8854, 0.5739]])
batch 2: x: tensor([[0.1332, 0.9346, 0.5936],
        [0.9593, 0.3904, 0.6009]]) y: tensor([[0.2666, 0.6274],
        [0.7411, 0.4294]])
epoch 2
batch 1: x: tensor([[0.9593, 0.3904, 0.6009],
        [0.2566, 0.7936, 0.9408]]) y: tensor([[0.7411, 0.4294],
        [0.8854, 0.5739]])
batch 2: x: tensor([[0.1332, 0.9346, 0.5936],
        [0.8823, 0.9150, 0.3829]]) y: tensor([[0.2666, 0.6274],
        [0.8694, 0.5677]])


## Tensorflow

In TensorFlow, input pipelines are built with the `tf.data.Dataset` class, for example through its `from_tensor_slices` method.

https://www.tensorflow.org/api_docs/python/tf/data

- tf.data: Build TensorFlow input pipelines

https://www.tensorflow.org/guide/data

In [10]:
# from_tensor_slices slices the tensor along its first axis:
# one dataset element per entry of X
X = tf.range(0, 6)  # any data tensor
dataset_tf = tf.data.Dataset.from_tensor_slices(tensors=X)

# Each element comes back as a scalar tf.Tensor

for element in dataset_tf:
    print(element)

tf.Tensor(0, shape=(), dtype=int32)
tf.Tensor(1, shape=(), dtype=int32)
tf.Tensor(2, shape=(), dtype=int32)
tf.Tensor(3, shape=(), dtype=int32)
tf.Tensor(4, shape=(), dtype=int32)
tf.Tensor(5, shape=(), dtype=int32)


In [11]:
# The integers 0..9 as a dataset
base_ds = tf.data.Dataset.from_tensor_slices(tf.range(10))

# TensorFlow datasets offer repeat, batch and shuffle as chainable methods.
# repeat(3) replays the ten values three times (30 elements); batch(7) then
# groups them in sevens, so batches straddle the repeat boundaries and the
# last batch holds only the 2 leftover elements.
repeated_ds = base_ds.repeat(3)
dataset = repeated_ds.batch(7)
for batch in dataset:
    print(batch)

tf.Tensor([0 1 2 3 4 5 6], shape=(7,), dtype=int32)
tf.Tensor([7 8 9 0 1 2 3], shape=(7,), dtype=int32)
tf.Tensor([4 5 6 7 8 9 0], shape=(7,), dtype=int32)
tf.Tensor([1 2 3 4 5 6 7], shape=(7,), dtype=int32)
tf.Tensor([8 9], shape=(2,), dtype=int32)


In [12]:
# 0..9 replayed twice -> shuffled through a 4-element buffer -> batches of 3.
# The buffer is smaller than the dataset, so elements only move a short
# distance from their original position.
dataset = (
    tf.data.Dataset.range(10)
    .repeat(2)
    .shuffle(buffer_size=4, seed=42)
    .batch(3)
)
for batch in dataset:
    print(batch)

tf.Tensor([1 4 2], shape=(3,), dtype=int64)
tf.Tensor([3 5 0], shape=(3,), dtype=int64)
tf.Tensor([6 9 8], shape=(3,), dtype=int64)
tf.Tensor([2 0 3], shape=(3,), dtype=int64)
tf.Tensor([1 4 5], shape=(3,), dtype=int64)
tf.Tensor([7 9 6], shape=(3,), dtype=int64)
tf.Tensor([7 8], shape=(2,), dtype=int64)


In [13]:
# take(2) keeps only the first two batches. This is a new pass over the
# shuffled dataset, and shuffle() reshuffles on each iteration by default
# (reshuffle_each_iteration), so these batches differ from the first two
# printed by the previous cell.
first_two = dataset.take(2)
for batch in first_two:
    print(batch)

tf.Tensor([3 4 2], shape=(3,), dtype=int64)
tf.Tensor([0 1 8], shape=(3,), dtype=int64)


In [14]:
# Seed TensorFlow's global RNG so the random features are reproducible
tf.random.set_seed(1)

t_x = tf.random.uniform(shape=[4, 3], dtype=tf.float32)  # 4 samples x 3 features
t_y = tf.range(4)                                        # labels 0..3

ds_x = tf.data.Dataset.from_tensor_slices(t_x)
ds_y = tf.data.Dataset.from_tensor_slices(t_y)

print(' Joined datasets in TF Method 1 with zip')
# zip pairs the i-th feature row with the i-th label
ds_joint = tf.data.Dataset.zip((ds_x, ds_y))

## Order 1: shuffle -> batch -> repeat
# Individual samples are shuffled before batching, so a batch can contain
# any two labels, and each of the 3 repeats is shuffled again.
shuffled = ds_joint.shuffle(4)
ds = shuffled.batch(2).repeat(3)

for step, (batch_x, batch_y) in enumerate(ds):
    print(step, batch_x.shape, batch_y.numpy())

 Joined datasets in TF Method 1 with zip
0 (2, 3) [1 2]
1 (2, 3) [3 0]
2 (2, 3) [1 3]
3 (2, 3) [0 2]
4 (2, 3) [2 1]
5 (2, 3) [3 0]


In [15]:
## Order 2: batch -> shuffle -> repeat
# Batching first freezes the pairs ([0 1] and [2 3]); shuffle then only
# reorders whole batches, so the samples inside a batch never mix.
batched = ds_joint.batch(2)
ds = batched.shuffle(4).repeat(3)

for step, (batch_x, batch_y) in enumerate(ds):
    print(step, batch_x.shape, batch_y.numpy())

0 (2, 3) [0 1]
1 (2, 3) [2 3]
2 (2, 3) [0 1]
3 (2, 3) [2 3]
4 (2, 3) [2 3]
5 (2, 3) [0 1]


In [16]:
# Two ways to join a feature dataset and a label dataset in TF

# Seed the global RNG so t_x is reproducible
tf.random.set_seed(1)

t_x = tf.random.uniform(shape=[4, 3], dtype=tf.float32)  # 4 samples x 3 features
t_y = tf.range(4)                                        # labels 0..3

# Method 1: build one dataset per tensor, then zip them element-wise

ds_x = tf.data.Dataset.from_tensor_slices(t_x)
ds_y = tf.data.Dataset.from_tensor_slices(t_y)

print(' Joined datasets in TF Method 1 with zip')
ds_joint = tf.data.Dataset.zip((ds_x, ds_y))

for features, label in ds_joint:
    print('  x: ', features.numpy(),
          '  y: ', label.numpy())


# Method 2: pass both tensors as a tuple; slicing pairs them directly
# and yields the same (x, y) elements as Method 1
print(' Joined datasets in TF Method 2')
ds_joint = tf.data.Dataset.from_tensor_slices((t_x, t_y))

for features, label in ds_joint:
    print('  x: ', features.numpy(),
          '  y: ', label.numpy())

 Joined datasets in TF Method 1 with zip
  x:  [0.16513085 0.9014813  0.6309742 ]   y:  0
  x:  [0.4345461  0.29193902 0.64250207]   y:  1
  x:  [0.9757855  0.43509948 0.6601019 ]   y:  2
  x:  [0.60489583 0.6366315  0.6144488 ]   y:  3
 Joined datasets in TF Method 2
  x:  [0.16513085 0.9014813  0.6309742 ]   y:  0
  x:  [0.4345461  0.29193902 0.64250207]   y:  1
  x:  [0.9757855  0.43509948 0.6601019 ]   y:  2
  x:  [0.60489583 0.6366315  0.6144488 ]   y:  3
