## Data generators

In [1]:
import numpy as np
import random as rnd

In [2]:
a = [1, 2, 3, 4]  # source values to draw from
b = [0] * 10  # preallocated output list with 10 slots, all initially 0

a_size = len(a)
b_size = len(b)

In [3]:
lines_index = [*range(a_size)]
lines_index

[0, 1, 2, 3]

In [4]:
lines_index = list(range(a_size))
lines_index

[0, 1, 2, 3]

In [5]:
# Fill every slot of b by cycling through a in the order given by lines_index.
index = 0  # position within lines_index (a is only accessed through lines_index)
for i in range(b_size):
    # Once every entry of lines_index has been used, wrap back to its start.
    if index >= a_size:
        index = 0
    b[i] = a[lines_index[index]]
    index += 1

print(b)

[1, 2, 3, 4, 1, 2, 3, 4, 1, 2]


## Shuffling data order

In [16]:
a = [1, 2, 3, 4]
b = []  # starts empty here, unlike the preallocated list in the first example

a_size = len(a)
b_size = 10  # used below as the number of values to collect into b

# One index per element of a, initially in ascending order.
lines_index = list(range(a_size))

print('Original order of index: {}'.format(lines_index))

Original order of index: [0, 1, 2, 3]


In [17]:
[a[i] for i in lines_index]

[1, 2, 3, 4]

In [18]:
rnd.shuffle(lines_index)

print('Shuffled order of index: {}'.format(lines_index))

Shuffled order of index: [3, 1, 0, 2]


In [19]:
[a[i] for i in lines_index]

[4, 2, 1, 3]

In [20]:
# Show the values of a in the current (already shuffled) order of lines_index.
print('New value order for First batch: {}'.format([a[i] for i in lines_index]))

batch_counter = 1  # counts complete passes over lines_index, starting at 1
index = 0  # position within lines_index

# Collect b_size values, re-shuffling lines_index each time it is exhausted.
for i in range(b_size):
    if index >= a_size:
        # Every index has been used: restart from the beginning with a new
        # random order, and report the order chosen for the next pass.
        index = 0
        batch_counter += 1
        rnd.shuffle(lines_index)
        print('\nShuffle index for Batch No.{}: {}'.format(batch_counter, lines_index))
        print('Values for Batch No.{}: {}'.format(batch_counter, [a[i] for i in lines_index]))
    
    # a itself is never reordered; only the index list is shuffled.
    b.append(a[lines_index[index]])
    index += 1

New value order for First batch: [4, 2, 1, 3]

Shuffle index for Batch No.2: [1, 0, 3, 2]
Values for Batch No.2: [2, 1, 4, 3]

Shuffle index for Batch No.3: [1, 3, 0, 2]
Values for Batch No.3: [2, 4, 1, 3]


In [21]:
print('Values of b: {}'.format(b))

Values of b: [4, 2, 1, 3, 2, 1, 4, 3, 2, 4]


There are three slightly out of the ordinary features. 

1. The first is the use of a list of a predefined size to store the data for each batch. Using a list of predefined size reduces the computation time if the elements are of a fixed size, like numbers. If the elements are of different sizes, it is better to start with an empty list and append one element at a time during the loop.

2. The second is tracking the current location in the incoming lists of samples. Generator variables hold their values between invocations, so we create an `index` variable, initialize it to zero, and increment it by one for each sample included in a batch. However, we do not use `index` to access the positions of the list of samples directly. Instead, we use it to select one index from a list of indexes. In this way, we can change the order in which we traverse the original list while leaving the original list itself untouched.

3. The third relates to wrapping. Because `batch_size` and the length of the input lists may not be aligned, gathering a group of `batch_size` inputs may involve wrapping back to the beginning of the input list. In our approach, it is enough to reset `index` to 0. At that point we can also re-shuffle the list of indexes to produce different batches each time.

In [28]:
def data_generator(x, y, batch_size, shuffle=True):
    """Endlessly yield (samples, labels) batches drawn from two parallel lists.

    Input:
        x - list containing samples
        y - list containing labels; y[k] is read together with x[k]
        batch_size - integer number of elements per batch
        shuffle - when true, the visiting order is shuffled before the
            first batch and again every time all of x has been visited
    Output:
        A generator that never terminates. Each item is a tuple (X, Y):
        X - list of batch_size samples
        Y - list of batch_size labels, aligned element-wise with X
    """
    n_samples = len(x)

    # x and y are never reordered; only this list of positions is shuffled.
    order = [*range(n_samples)]
    if shuffle:
        rnd.shuffle(order)

    # Position within `order`. It is kept across yields, so a batch continues
    # where the previous one stopped.
    index = 0

    while True:
        # Preallocate both output lists and fill them slot by slot.
        batch_x = [0] * batch_size
        batch_y = [0] * batch_size

        for slot in range(batch_size):
            if index >= n_samples:
                # Every position has been used: wrap around, optionally
                # in a new random order.
                index = 0
                if shuffle:
                    rnd.shuffle(order)

            source = order[index]
            batch_x[slot] = x[source]
            batch_y[slot] = y[source]
            index += 1

        yield batch_x, batch_y

In [23]:
x = [1, 2, 3, 4]
y = [j**2 for j in x]

In [24]:
print(x)
print(y)

[1, 2, 3, 4]
[1, 4, 9, 16]


In [29]:
generator = data_generator(x, y, 3, shuffle=True)

In [30]:
next(generator)

([1, 2, 3], [1, 4, 9])

In [31]:
next(generator)

([4, 2, 1], [16, 4, 1])

In [32]:
next(generator)

([3, 4, 2], [9, 16, 4])

In [33]:
next(generator)

([4, 1, 3], [16, 1, 9])

In [34]:
next(generator)

([2, 1, 4], [4, 1, 16])