In [1]:
import tensorflow as tf
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers,optimizers
import pandas as pd

In [2]:
hidden_size=300
output_size=25000

# Data pipe line

> Of course, you can use your customized iterable or generatic object whenever train your model

> But, for training efficiently, you should use functions operated by tensorflow. usually, most of function related with Data Pipe-line might be in tf.data. By using this module, we can reduce idle when between data pipe-line and model training


In [3]:
init=np.array(np.random.random([1000,50,300]),dtype=np.float32)

In [4]:
# simple example 1

# convert data to tf.Tensor 
# you can see that the GPU memory usage might be slightly increased
data1=tf.data.Dataset.from_tensor_slices(init)

# Data.from_tensor_slices 

In [5]:

# return tf.Dataset type.

# convert whole data to Tensordata
data1=tf.data.Dataset.from_tensor_slices(init)
data1=data1.batch(10)



In [6]:
for d in data1:
    print(d)

tf.Tensor(
[[[0.6677899  0.39611724 0.8758192  ... 0.7056666  0.6687067  0.9442015 ]
  [0.76737905 0.05751672 0.47104576 ... 0.04946665 0.30389622 0.6740789 ]
  [0.08362012 0.5589503  0.73827296 ... 0.65596527 0.17570876 0.7269968 ]
  ...
  [0.59927416 0.905612   0.1394997  ... 0.23201202 0.3192664  0.10209844]
  [0.9090567  0.05648753 0.68425673 ... 0.96876055 0.77549416 0.7695592 ]
  [0.7569096  0.10668972 0.6869494  ... 0.83939326 0.5659216  0.8429779 ]]

 [[0.10804183 0.1213812  0.31186196 ... 0.45609072 0.11082473 0.0654253 ]
  [0.33100152 0.46150368 0.09109472 ... 0.64980316 0.7237123  0.01082573]
  [0.09322093 0.3585907  0.49829286 ... 0.2973191  0.38112524 0.44444022]
  ...
  [0.8192913  0.67561823 0.90299046 ... 0.78134555 0.08296702 0.9636733 ]
  [0.32222807 0.80919224 0.10237583 ... 0.45205724 0.9637313  0.9863834 ]
  [0.16479047 0.40451613 0.8969054  ... 0.2262471  0.10567314 0.93691564]]

 [[0.50713176 0.4430209  0.62861675 ... 0.847513   0.4702953  0.19525842]
  [0.523780

In [7]:
# limitations: convert whole data

# require fixed length data like array (cannot considef of variable length) **

# but simple and easy way to convert data

In [8]:
# list example

import random

num_sample=300
sample_data=[]
for _ in range(num_sample):
    tmp_data=[]
    for i in range(random.randint(1,10)):
        tmp_data.append(random.randint(0,100))
    sample_data.append(tmp_data)

In [9]:

sample_data[:3]

[[77, 7, 87, 53, 25, 37, 47, 23, 17, 51],
 [9, 77, 4, 72, 94, 41, 49, 3, 35, 97],
 [95, 8, 17]]

In [10]:
# keras.preprocessing.sequence.pad_sequences
# can pad with fixed lengths.

padded_data=tf.keras.preprocessing.sequence.pad_sequences(sample_data,maxlen=10,dtype="int32",padding="post")

In [11]:
ds=tf.data.Dataset.from_tensor_slices(padded_data).batch(10).shuffle(100)

In [12]:
for d in ds:
    print(d)

tf.Tensor(
[[ 6 55 81 66 24  3 86  0  0  0]
 [69 47 53 16 98 28  0  0  0  0]
 [12 29 34 32  2  0  0  0  0  0]
 [16 57 91 71  0  0  0  0  0  0]
 [74 97 62  0  0  0  0  0  0  0]
 [79 42 48  0  0  0  0  0  0  0]
 [92  9 20  8 13 78 60  0  0  0]
 [83 38  4 40 68 25  0  0  0  0]
 [87 72 80 89 10 83 25 38 62  0]
 [55 50  0  0  0  0  0  0  0  0]], shape=(10, 10), dtype=int32)
tf.Tensor(
[[31 43 63 70 89  0  0  0  0  0]
 [ 6 18 90 90 20 10  0  0  0  0]
 [57 46 61 68 97 17 67 46  0  0]
 [67 14 89 69 69 10 32 22  0  0]
 [87 91 35 64 95 96 93 49 83 52]
 [47 31 34 99 80  0  0  0  0  0]
 [11 51  0  0  0  0  0  0  0  0]
 [37 18 24 83 49  0  0  0  0  0]
 [96 53 33  4 69 66 10 32  0  0]
 [23  2 31 33 47 74 66 68  0  0]], shape=(10, 10), dtype=int32)
tf.Tensor(
[[70 24 79 95 44 57 77  0  0  0]
 [52 47 19 89  0  0  0  0  0  0]
 [72 28 59 37 25 96  0  0  0  0]
 [56 33 17  0  0  0  0  0  0  0]
 [75  8 18 32 74 22 17 85 48  0]
 [11 26 68 67 67 71 27 73 73 87]
 [94 58 37 85 87 21  0  0  0  0]
 [23 68 56 11 

# Data.from_generator

In [13]:
# convert data to generator object

# By using generator, memory efficiently (highly recommended!)

# and whenever generate data, we can pad data dynamically
def make_generator(data):
    for d in data:
        yield d

In [14]:
gen=make_generator(init)
for g in gen:
    print(g)

[[0.6677899  0.39611724 0.8758192  ... 0.7056666  0.6687067  0.9442015 ]
 [0.76737905 0.05751672 0.47104576 ... 0.04946665 0.30389622 0.6740789 ]
 [0.08362012 0.5589503  0.73827296 ... 0.65596527 0.17570876 0.7269968 ]
 ...
 [0.59927416 0.905612   0.1394997  ... 0.23201202 0.3192664  0.10209844]
 [0.9090567  0.05648753 0.68425673 ... 0.96876055 0.77549416 0.7695592 ]
 [0.7569096  0.10668972 0.6869494  ... 0.83939326 0.5659216  0.8429779 ]]
[[0.10804183 0.1213812  0.31186196 ... 0.45609072 0.11082473 0.0654253 ]
 [0.33100152 0.46150368 0.09109472 ... 0.64980316 0.7237123  0.01082573]
 [0.09322093 0.3585907  0.49829286 ... 0.2973191  0.38112524 0.44444022]
 ...
 [0.8192913  0.67561823 0.90299046 ... 0.78134555 0.08296702 0.9636733 ]
 [0.32222807 0.80919224 0.10237583 ... 0.45205724 0.9637313  0.9863834 ]
 [0.16479047 0.40451613 0.8969054  ... 0.2262471  0.10567314 0.93691564]]
[[0.50713176 0.4430209  0.62861675 ... 0.847513   0.4702953  0.19525842]
 [0.52378017 0.12415262 0.33571613 ... 

[[0.14961524 0.59605587 0.5827606  ... 0.641217   0.3890996  0.9198329 ]
 [0.7316885  0.68213224 0.60662115 ... 0.12975252 0.8300498  0.03003834]
 [0.5046526  0.5345574  0.93662196 ... 0.31328955 0.09386247 0.45408946]
 ...
 [0.13609824 0.46333793 0.48278686 ... 0.9771735  0.8056433  0.7453888 ]
 [0.4041376  0.63013524 0.7224617  ... 0.84764606 0.18407942 0.03148131]
 [0.90662974 0.75347924 0.5048419  ... 0.80351245 0.14440587 0.24983911]]
[[0.17410302 0.41519177 0.7181977  ... 0.07496745 0.84358275 0.0075148 ]
 [0.18321222 0.04400444 0.7312185  ... 0.3822292  0.5481827  0.20013145]
 [0.5385989  0.31269887 0.5177077  ... 0.17346017 0.97306216 0.959448  ]
 ...
 [0.37328124 0.54155076 0.05890768 ... 0.6008114  0.14116439 0.12575635]
 [0.55426663 0.23998214 0.5470143  ... 0.28824452 0.30581087 0.3009922 ]
 [0.18185507 0.0125333  0.26868412 ... 0.33636814 0.41956022 0.39935586]]
[[0.8587809  0.7012818  0.36649814 ... 0.41411877 0.9847158  0.07094431]
 [0.89732844 0.06998793 0.39283365 ... 

In [15]:
# don't generate after one iteration
for g in gen:
    print(g)

In [16]:
# specify data type,output shape.

def dynamic_sequence(data):

    gen=tf.data.Dataset.from_generator(make_generator,args=[data],output_types=tf.int32)
    return gen.padded_batch(batch_size=30,padded_shapes=([None,None]))    

In [17]:
init.shape

(1000, 50, 300)

In [18]:
array_sample=np.array(sample_data)

In [19]:
def gen(pd_sample):
    for _,i in pd_sample.iterrows():
        yield i[0]
def func_gen(sample):
    dataset=tf.data.Dataset.from_generator(gen,args=[sample],output_types=(tf.int64))
    return dataset.padded_batch(batch_size=3,padded_shapes=([],[None]))

In [20]:
df=pd.DataFrame(np.array(sample_data))

In [21]:
size=3
num_buckets=len(df)//size+1
dfs = []
for bucket in range(num_buckets):
    new_df = df.loc[bucket * size: (bucket + 1) * size - 1]
    new_df = new_df.sample(frac=1).reset_index(drop=True)
    dfs.append(new_df)
random.shuffle(dfs)
df = pd.concat(dfs)


In [22]:
# iterator와 Dataset.from_generator, padded_batch 연계해서 사용하기.

#
def iterator():
    for i in sample_data:
        yield i # 나와야 하는 variable의 개수

def tf_data():
    
    dataset = tf.data.Dataset.from_generator(iterator,(tf.int64),(None,)) # map_function,dtype, shapes
    
    # 들어가는 feature의 개수 --> 나중에 multitask learning이나 3개 이상의 feature가 들어갈 수 있기에

    return dataset.padded_batch(batch_size=size,padded_shapes=([None,]))


In [23]:
gen=tf_data()

In [24]:
for g in gen:
    print(g)

tf.Tensor(
[[77  7 87 53 25 37 47 23 17 51]
 [ 9 77  4 72 94 41 49  3 35 97]
 [95  8 17  0  0  0  0  0  0  0]], shape=(3, 10), dtype=int64)
tf.Tensor(
[[36 64 50 36 62 93]
 [20 65  0  0  0  0]
 [80  0  0  0  0  0]], shape=(3, 6), dtype=int64)
tf.Tensor(
[[ 9 44 70 27 90  0  0  0  0]
 [97 65  4 56  0  0  0  0  0]
 [55 25  7 24 69 64 70 78  1]], shape=(3, 9), dtype=int64)
tf.Tensor(
[[99 95  0  0  0  0]
 [21 58  0  0  0  0]
 [ 5 64 27  1 89 53]], shape=(3, 6), dtype=int64)
tf.Tensor(
[[70 41  0  0  0  0  0]
 [54 79 54 64  0  0  0]
 [ 7 82 72 71 19 51 13]], shape=(3, 7), dtype=int64)
tf.Tensor(
[[52 75  2 17 11 92]
 [87 94 73 63 90  0]
 [91  0  0  0  0  0]], shape=(3, 6), dtype=int64)
tf.Tensor(
[[60 27 25 81 97 42  0  0  0]
 [ 0 18 11 90 18 39 60 53 49]
 [70 24 79 95 44 57 77  0  0]], shape=(3, 9), dtype=int64)
tf.Tensor(
[[52 47 19 89  0  0]
 [72 28 59 37 25 96]
 [56 33 17  0  0  0]], shape=(3, 6), dtype=int64)
tf.Tensor(
[[75  8 18 32 74 22 17 85 48  0]
 [11 26 68 67 67 71 27 73 73 87]

In [25]:


# make simple batch generator for language modeling
class BatchGenerator(object):
    def __init__(self,data:list,batch_size=3):
        """      
        
        """
        self.data=data
        self.batch_size=batch_size
        
        
    def bucketing(self):# sort by value length
        lens=[len(d) for d in self.data]

        self.data=[data for data,_  in sorted(zip(self.data,lens), key=lambda pair: pair[1],reverse=True)]
        
    def iterator(self):
        for x in self.data:
            yield x[:-1], x[1:]
            
            
    def dynamic_batch(self,bucketing=False):
        if bucketing:
            self.bucketing()

        dataset=tf.data.Dataset.from_generator(generator=self.iterator,output_types=(tf.int32,tf.int32),\
                                               output_shapes=([None],[None]))
        return dataset.padded_batch(batch_size=self.batch_size,padded_shapes=([None],[None])) 
    
    # padded_batch can make batch simultaneously while do train at GPU

In [26]:
# sample

import random

num_sample=30
X_train=[]
y_train=[]
for _ in range(num_sample):
    sequence=[]
    for i in range(random.randint(3,10)):
        sequence.append(random.randint(1,100))
    X_train.append(sequence)
    
    y_train.append(random.randint(0,1))

In [27]:
ds=BatchGenerator(data=X_train,batch_size=3)

In [28]:
batch_gen=ds.dynamic_batch(bucketing=True) 

In [29]:
for batch_x,batch_y in batch_gen:
    print(batch_x)
    print(batch_y)
    print()

tf.Tensor(
[[65 24 34 90 17 54 88 97 26]
 [98 90 68 40  7 62 23 65 51]
 [90 57 51 31 36 28 87 39 78]], shape=(3, 9), dtype=int32)
tf.Tensor(
[[24 34 90 17 54 88 97 26 14]
 [90 68 40  7 62 23 65 51 16]
 [57 51 31 36 28 87 39 78 84]], shape=(3, 9), dtype=int32)

tf.Tensor(
[[58 32  6 28 26 32 21 20 43]
 [96 60 38 59 17 37  8 32  0]
 [39 85 13  5 13 46 38 33  0]], shape=(3, 9), dtype=int32)
tf.Tensor(
[[32  6 28 26 32 21 20 43 63]
 [60 38 59 17 37  8 32 48  0]
 [85 13  5 13 46 38 33 67  0]], shape=(3, 9), dtype=int32)

tf.Tensor(
[[73 33 64 86 71 84 44]
 [46 62 69 25 89 41 88]
 [ 5 45 55 77 52 75 39]], shape=(3, 7), dtype=int32)
tf.Tensor(
[[33 64 86 71 84 44 58]
 [62 69 25 89 41 88 48]
 [45 55 77 52 75 39 34]], shape=(3, 7), dtype=int32)

tf.Tensor(
[[35 59 70 14 54 22 99]
 [93  2 37 99 80 19 74]
 [73 72 49 19 88 41  0]], shape=(3, 7), dtype=int32)
tf.Tensor(
[[59 70 14 54 22 99 78]
 [ 2 37 99 80 19 74 81]
 [72 49 19 88 41 10  0]], shape=(3, 7), dtype=int32)

tf.Tensor(
[[72 88 23 43 51 