In [1]:
%pip install einops

Collecting einops
  Downloading einops-0.6.0-py3-none-any.whl (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m41.6/41.6 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: einops
Successfully installed einops-0.6.0
[0mNote: you may need to restart the kernel to use updated packages.


In [2]:
import numpy as np
import tensorflow as tf
from einops import rearrange, repeat
import matplotlib.pyplot as plt

# **Parallel distribution**

In [3]:
strategy = tf.distribute.MirroredStrategy()
print('Number of devices: {}'.format(strategy.num_replicas_in_sync))

Number of devices: 2


In [4]:
AUTO = tf.data.AUTOTUNE
DATA_LENGTH = 94477
GLOBAL_BATCH_SIZE = 32
BATCH_SIZE_PER_REPLICA = 16

# **Loading Data**

In [5]:
##### Read TFRecord file
def _parse_tfr_element(element):
    parse_dic = {
    'b_feature': tf.io.FixedLenFeature([], tf.string), # Note that it is tf.string, not tf.float32
    'b_label': tf.io.FixedLenFeature([], tf.string),
    }
    example_message = tf.io.parse_single_example(element, parse_dic)

    b_feature = example_message['b_feature'] # get byte string
    b_label = example_message['b_label']
    
    feature = tf.io.parse_tensor(b_feature, out_type=tf.float32) # restore 2D array from byte string
    label = tf.io.parse_tensor(b_label, out_type=tf.int32)
    return (feature, label)

filenames = [f'/kaggle/input/tfrecords-sequences-for-asl/data{i}.tfrecords' for i in range(20)]
dataset = tf.data.TFRecordDataset(filenames) 
dataset = dataset.map(_parse_tfr_element, num_parallel_calls=AUTO).prefetch(AUTO)

In [6]:
def set_shapes(feature, label):
    feature.set_shape((50, 115, 3))
    label.set_shape([])
    return feature, label

dataset = dataset.map(set_shapes)

def load_data_out():
    feature = []
    label = []
    for i,j in enumerate(dataset):
        feature.append(j[0])
        label.append(j[1])
    return tf.stack(feature,0), tf.stack(label,0)

X,y = load_data_out()

# **Model + Training**

In [7]:
# https://arxiv.org/pdf/2010.11929v2.pdf flattened 2D patch
def patching(input_tensor,pad_dim):
    #assert input_tensor.shape == (50,115,3), "input tensor must be of shape (50,115,3)"
    assert 50 % pad_dim == 0, "padding should be divisible by 50"
    return tf.reshape(input_tensor,shape=(input_tensor.shape[0],int((input_tensor.shape[1]*input_tensor.shape[2])//(pad_dim*input_tensor.shape[2])),int(pad_dim*input_tensor.shape[2]*input_tensor.shape[3])))

def positional_embedding(d_model,length):
    lines = np.arange(1,length+1,1)[:,np.newaxis] # (1,length)
    temp = np.arange(1,(d_model+1)/2,1)           
    # 1,2,3,4 becomes 1,1,2,2,3,3,4,4
    columns = np.asarray([value for twin in zip(temp,temp) for value in twin])[np.newaxis,:] # (depth,1)
    # apply to the previous vector the "famous" omega minuscule
    columns = 1/np.power(10000,2*columns/d_model)
    # create the matrix and apply sin, cos to pair and odd element of the vector
    matrice = lines*columns                                                          # (depth,length)
    matrice[:,0::2] = np.sin(matrice[:,0::2])
    matrice[:,1::2] = np.cos(matrice[:,1::2])
    return matrice

In [8]:
class LPFP(tf.keras.layers.Layer):
    def __init__(self, pad_dim, d_model):
        super(LPFP,self).__init__()
        self.pad_dim = pad_dim
        self.d_model = d_model
        self.dense = tf.keras.layers.Dense(d_model, use_bias=False)

    def call(self,sample):
        patched_input = patching(sample,self.pad_dim)
        #print(patched_input.shape,'patched input')
        output = self.dense(patched_input)
        #print(patched_input.shape,'patched input dense')
        return output

class CLSTokenHandler(tf.keras.layers.Layer):
    def __init__(self, d_model):
        super(CLSTokenHandler,self).__init__()
        self.text_vectorizer = tf.keras.layers.TextVectorization()
        self.embedding = tf.keras.layers.Embedding(2,d_model)
    
    def call(self,tokenvalue):
        self.text_vectorizer.set_vocabulary(vocabulary=[tokenvalue])
        text_vector = self.text_vectorizer(tokenvalue)
        embedding = self.embedding(text_vector)
        return embedding
    
class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self,d_model,length):
        super(PositionalEmbedding,self).__init__()
        self.d_model = d_model
        self.length = length
        self.positionalencoding = positional_embedding(self.d_model,self.length)
        
    def call(self,matrix):
        # add positional embedding taking care of batch dimension (expand dim and repeat)
        # matrix.shape[0] contains the batch dimension
        matrix += tf.cast(tf.repeat(tf.expand_dims(self.positionalencoding,0),matrix.shape[0],axis=0),"float32")
        return matrix
    
class InitBlock(tf.keras.layers.Layer):
    def  __init__(self,pad_dim=5,d_model=768):
        super(InitBlock,self).__init__()
        self.pad_dim = pad_dim
        self.d_model = d_model
        self.LPFP = LPFP(pad_dim,d_model)
        self.CLSTokenHandler = CLSTokenHandler(d_model)
        # divided by 50 since we only accept inputs of size (50,115,3) 
        # 50 being the static length of the input
        # length of positional embeding should be of the same size as input length
        # for example 50,115,3 patched by 5 produces an input of shape 10x768 after linear
        # length (50//5) should also be equal to 10, hence the length of the input is the same
        # as the positional size since they are added to each other. 
        # I'm writing all the text if you're going to update the code to work with any input size.
        self.PositionalEmbedding = PositionalEmbedding(d_model,50//pad_dim)
        
    def call(self,input_):
        #print(input_.shape,'before lpfp')
        input_matrix = self.LPFP(input_)
        #print(input_matrix.shape,'before posemb after lpfp')
        input_matrix = self.PositionalEmbedding(input_matrix)
        #print(input_matrix.shape,'after posemb')
        token_embedding = self.CLSTokenHandler('[class]')
        
        
        token_embedding = tf.repeat(tf.expand_dims(token_embedding,0),input_.shape[0],axis=0)
        #token_embedding = tf.ensure_shape(token_embedding,shape=(32,1,768))
        token_embedding.set_shape((None,1,self.d_model))
        #print(token_embedding.shape,'token embedding')
        #print(token_embedding,'token embedding')
        z0 = tf.concat([input_matrix,token_embedding],1) 
        #print(z0.shape,'after cls added')
        return z0

In [9]:
# h = n_heads
class MultiHeadAttention(tf.keras.layers.Layer):
    def __init__(self,d_model,h=12):
        super().__init__()
        assert d_model%h == 0 , "n_heads (h) must be a divisor of d_model"
        self.h = h
        self.d_model = d_model
        self.Wq = tf.keras.layers.Dense(d_model,use_bias=False)
        self.Wk = tf.keras.layers.Dense(d_model,use_bias=False)
        self.Wv = tf.keras.layers.Dense(d_model,use_bias=False)
        self.dense = tf.keras.layers.Dense(d_model)

    def call(self,input_):

        Q = self.Wq(input_)
        K = self.Wk(input_)
        V = self.Wv(input_)

        Q = rearrange(Q, 'b p (d h) -> b h p d',h=self.h)
        K = rearrange(K, 'b p (d h) -> b h p d',h=self.h)
        V = rearrange(V, 'b p (d h) -> b h d p',h=self.h) # transposed for matmul
        
        matmul = tf.matmul(K,V)
        mat_scaled = matmul/tf.cast(np.sqrt(self.d_model/self.h),'float32')
        softm = tf.nn.softmax(mat_scaled)
        out = tf.matmul(softm,Q)
        
        out = rearrange(out, 'b h p d -> b p (d h)')
        
        out = self.dense(out)
        
        return out

In [10]:
class MLP(tf.keras.layers.Layer):
    def __init__(self,dense_size):
        assert isinstance(dense_size,list) , "layer_size must be a list"
        super().__init__()
        self.dense_size = dense_size
        self.num_layer = len(dense_size)
        self.dense_array = []
        for i in range(self.num_layer):
            self.dense_array.append(tf.keras.layers.Dense(dense_size[i]))
            
    def call(self,input_):
        for dense in self.dense_array:
            input_ = dense(input_)
        return input_

class TransformerBlock(tf.keras.layers.Layer):
    def __init__(self,n_head=12, dense_size=[768], d_model=768):
        assert isinstance(dense_size,list) , "dense_size must be a list"
        super().__init__()
        self.nhead = n_head
        self.dense_size = dense_size
        self.batchnorm = tf.keras.layers.BatchNormalization()
        self.mha = MultiHeadAttention(d_model,n_head)
        self.mlp = MLP(dense_size)
    
    def call(self,input_):
        batch_n = self.batchnorm(input_)
        mha_out = self.mha(batch_n)

        added = tf.keras.layers.Add()([input_,mha_out])

        batch_n = self.batchnorm(added)
        mlp_out = self.mlp(batch_n)

        added = tf.keras.layers.Add()([added,mlp_out])

        return added

In [11]:
class Network(tf.keras.Model):
    def __init__(self,Layers=12,pad_dim=5,n_head=12, dense_size=[768], d_model=768, output_dense_size=768):
        super().__init__()
        self.Layers = Layers
        self.init_block = InitBlock(pad_dim=pad_dim,d_model=d_model)
        self.transformer_block = TransformerBlock(n_head=n_head, dense_size=dense_size, d_model=d_model)
        self.dense_layer1 = tf.keras.layers.Dense(output_dense_size,activation='gelu')
        self.dense_layer2 = tf.keras.layers.Dense(250,activation='gelu')
        self.flattener = tf.keras.layers.Flatten()
    
    def call(self,input_):
        print(input_.shape,'before init block')
        input_ = self.init_block(input_)
        print(input_.shape,'after init block')
        for _ in range(self.Layers):
            input_ = self.transformer_block(input_)
        
        #print(input_.shape)
        x = rearrange(input_,'b (h l) c -> b h (l c)', h=1)
        #print(x.shape)
        x = self.dense_layer1(x)
        x = self.dense_layer2(x)
        
        return x
        

In [12]:
train_size = int(0.9*DATA_LENGTH)
test_size = int(0.1*DATA_LENGTH)

dataset = dataset.shuffle(DATA_LENGTH)
train_dataset = dataset.take(train_size)
test_dataset = dataset.skip(train_size)

In [13]:
# ----------- Scope definition ---------------
with strategy.scope():
    model = Network(12,5,8,[256],256,256)
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3)
    loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction=tf.keras.losses.Reduction.NONE)
    test_loss = tf.keras.metrics.Mean(name='test_loss')
    train_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    val_accuracy = tf.keras.metrics.SparseCategoricalAccuracy()
    
    def compute_loss(labels, predictions):
        per_example_loss = loss(labels,predictions)
        loss_val = tf.nn.compute_average_loss(per_example_loss,global_batch_size=GLOBAL_BATCH_SIZE)
        return loss_val

# ----------- Train and Val Step ---------------
def train_step(inputs):
    feature, label = inputs
    
    # forward and backward pass
    with tf.GradientTape() as tape:
        logits = model(feature,training=True)
        loss_val = compute_loss(label,logits)
    grads = tape.gradient(loss_val, model.trainable_weights)
    optimizer.apply_gradients(zip(grads,model.trainable_weights))
    
    # process accuracy
    train_accuracy.update_state(label,logits)
        
    return loss_val, train_accuracy

def val_step(inputs):
    feature, label = inputs
    
    val_logits = model(feature,training=False)
    val_loss = loss(label,val_logits)
    
    test_loss.update_state(val_loss)
    val_accuracy.update_state(label,val_logits)
    
    return val_accuracy,test_loss,val_loss

# ----------- Distribution Wrapper ---------------
@tf.function
def distributed_train_step(train_ds):
    per_replica_losses_and_acc = strategy.run(train_step,args=(train_ds,))
    return strategy.reduce(tf.distribute.ReduceOp.SUM,per_replica_losses_and_acc[0],axis=None), per_replica_losses_and_acc[1].result()

@tf.function
def distributed_test_step(test_ds):
    return strategy.run(test_step, args=(test_ds,))

In [14]:
# -------------- Training ----------------
epochs = 100
cost_val = 0

history = {}
history['train_accuracy'] = []
history['test_accuracy'] = []
history['train_loss'] = []
history['test_loss'] = []
history['mean_loss'] = []

for epoch in range(epochs):
    
    pb=tf.keras.utils.Progbar(target=85056)
    
    
    for x in strategy.experimental_distribute_dataset(train_dataset.batch(GLOBAL_BATCH_SIZE)):
        loss_val, acc = distributed_train_step(x)
        cost_val += loss_val
        
        #history['train_accuracy'].append(acc)
        #history['train_loss'].append(loss_val)
        
        pb.add(GLOBAL_BATCH_SIZE, values=[('acc',acc),('loss',loss_val)])
        
acc.reset_state()

# Process validation data at the end of each epoch
for x in strategy.experimental_distribute_dataset(test_dataset):
    val_accuracy, test_loss,val_loss = distributed_test_step(x)
    #history['test_accuracy'].append(val_accuracy.result())
    #history['test_loss'].append(val_loss)
    #history['mean_loss'].append(test_loss)
val_accuracy.reset_state()
test_loss.reset_state()

(16, 50, 115, 3) before init block
(16, 11, 256) after init block
(16, 50, 115, 3) before init block
(16, 11, 256) after init block
(16, 50, 115, 3) before init block
(16, 11, 256) after init block
(16, 50, 115, 3) before init block
(16, 11, 256) after init block
(5, 11, 256) after init block
(0, 50, 115, 3) before init block
(0, 11, 256) after init block


plt.plot(history['train_accuracy'])
plt.plot(history['test_accuracy'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

plt.plot(history['test_loss'])
plt.plot(history['train_loss'])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
model.save('/kaggle/working/my_model.h5')