In [1]:
import os
# Lower TensorFlow's C++ log verbosity ('2' hides INFO and WARNING messages).
# This has to be set before the import below, because the native library reads
# the variable while it is being loaded.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'

import tensorflow as tf

"""
Everything in TensorFlow is based on Tensor operations.
Tensors are (kind of) like np.arrays.
All tensors are immutable: you can never update the contents of a
tensor, only create a new one.

 - nd-arrays (1d, 2d, or even 3d and higher)
 - GPU support
 - Computational graph / Track gradients / Backpropagation
 - Immutable!
"""

### 1 create tensor

In [2]:
# 1. create tensors
# scalar, rank-0 tensor
# A bare Python int becomes an int32 tensor with an empty shape `()`.
x = tf.constant(4)
print(x)

tf.Tensor(4, shape=(), dtype=int32)


In [3]:
# The scalar 4 is filled into an explicit (1, 1) shape and stored as float32.
x = tf.constant(4, shape=(1,1), dtype=tf.float32)
print(x)

tf.Tensor([[4.]], shape=(1, 1), dtype=float32)


In [4]:
# vector, rank-1
# A flat Python list becomes a 1-D tensor; its length gives the shape (3,).
x = tf.constant([1,2,3])
print(x)

tf.Tensor([1 2 3], shape=(3,), dtype=int32)


In [5]:
# matrix, rank-2
# Two nested rows of three values each give a tensor of shape (2, 3).
x = tf.constant([[1,2,3], [4,5,6]])
print(x)

tf.Tensor(
[[1 2 3]
 [4 5 6]], shape=(2, 3), dtype=int32)


### 2 zeros, ones

In [6]:
# Tensor of the requested shape filled with ones (float32, as the output shows).
x = tf.ones((3,3))
print(x)

# Same shape filled with zeros; x is rebound, the previous tensor is not modified.
x = tf.zeros((3,3))
print(x)

tf.Tensor(
[[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]], shape=(3, 3), dtype=float32)
tf.Tensor(
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]], shape=(3, 3), dtype=float32)


In [7]:
# 3x3 identity matrix: ones on the main diagonal, zeros everywhere else.
x = tf.eye(3)
print(x)

tf.Tensor(
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]], shape=(3, 3), dtype=float32)


### 3 random tensors (normal and uniform distributions), ranges and casting

In [8]:
# 3x3 sample from a normal distribution with the given mean and standard deviation.
# No seed is set, so the printed values change on every run.
x = tf.random.normal((3,3), mean=0, stddev=1)
print(x)

tf.Tensor(
[[-0.23364508 -1.0022961   0.7207432 ]
 [-0.8189681  -2.3865094  -1.219961  ]
 [ 1.549602    1.8618294   0.48249286]], shape=(3, 3), dtype=float32)


In [9]:
# 3x3 sample from a uniform distribution over [minval, maxval): here 0 <= value < 1.
# No seed is set, so the printed values change on every run.
x = tf.random.uniform((3,3), minval=0, maxval=1)
print(x)

tf.Tensor(
[[0.71256196 0.76354194 0.75633585]
 [0.40745318 0.31332183 0.7819538 ]
 [0.963189   0.83648896 0.37974346]], shape=(3, 3), dtype=float32)


In [10]:
# Integers 0..9 — the limit (10) itself is excluded.
x = tf.range(10)
print(x)

tf.Tensor([0 1 2 3 4 5 6 7 8 9], shape=(10,), dtype=int32)


In [11]:
# Start at 1 and step by 3 while staying below 15: 1, 4, 7, 10, 13 (as float32).
x = tf.range(start=1, limit=15, delta=3, dtype=tf.float32)
print(x)

tf.Tensor([ 1.  4.  7. 10. 13.], shape=(5,), dtype=float32)


In [13]:
x = tf.range(start=1, limit=15, delta=3, dtype=tf.float32)
# tf.cast returns a new tensor with the requested dtype; x is rebound to it.
x = tf.cast(x, dtype=tf.float16)
print(x)

tf.Tensor([ 1.  4.  7. 10. 13.], shape=(5,), dtype=float16)


### 4 mathematical operations

In [14]:
# Two equal-length integer vectors; the arithmetic cells that follow read these
# names from the kernel, so this cell has to run first.
x = tf.constant([1,2,3])
y = tf.constant([4,5,6])

In [15]:
# Element-wise addition; the `+` operator below is the equivalent shorthand.
z = tf.add(x,y)
# z = x + y
print(z)

tf.Tensor([5 7 9], shape=(3,), dtype=int32)


In [16]:
# Element-wise subtraction; tf.subtract is the function form of the `-` operator.
# z = tf.subtract(x,y)
z = x - y
print(z)

tf.Tensor([-3 -3 -3], shape=(3,), dtype=int32)


In [17]:
# Element-wise true division: the int32 inputs give a float64 result (see output).
z = tf.divide(x,y)
# z = x / y
print(z)

tf.Tensor([0.25 0.4  0.5 ], shape=(3,), dtype=float64)


In [18]:
# Element-wise (Hadamard) product — not a matrix product.
z = tf.multiply(x,y)
# z = x * y
print(z)

tf.Tensor([ 4 10 18], shape=(3,), dtype=int32)


### 5 tensordot

In [21]:
x = tf.constant([1,2,3])
y = tf.constant([4,5,6])

print(x)
print(y)
# axes=1 contracts the last axis of x with the first axis of y. For two vectors
# that is the ordinary dot product: multiply element-wise, then sum.
z = tf.tensordot(x,y, axes=1)
print(z)                            ## (1*4) + (2*5) + (3*6) = 32

tf.Tensor([1 2 3], shape=(3,), dtype=int32)
tf.Tensor([4 5 6], shape=(3,), dtype=int32)
tf.Tensor(32, shape=(), dtype=int32)


In [29]:
x = tf.random.uniform(shape=(2,3), minval=1, maxval=100)
y = tf.random.uniform(shape=(2,3), minval=1, maxval=100)
print(x)
print()
print(y)
print()

# axes=0 contracts no axes at all, so this is the outer (tensor) product rather
# than a dot product: every single element of x multiplies the whole of y.
#
#   x => [[ x11, x12, x13 ]            y => [[ y11, y12, y13 ]
#         [ x21, x22, x23 ]]                 [ y21, y22, y23 ]]
#
# Both inputs have shape (2, 3); the result has shape (2, 3, 2, 3) with
#
#   z[i, j, k, l] = x[i, j] * y[k, l]
#
# For the first row of x this gives three (2, 3) blocks:
#
#   a1 = z[0, 0] = [[ (x11 * y11), (x11 * y12), (x11 * y13) ]
#                   [ (x11 * y21), (x11 * y22), (x11 * y23) ]]
#
#   a2 = z[0, 1] = [[ (x12 * y11), (x12 * y12), (x12 * y13) ]
#                   [ (x12 * y21), (x12 * y22), (x12 * y23) ]]
#
#   a3 = z[0, 2] = [[ (x13 * y11), (x13 * y12), (x13 * y13) ]
#                   [ (x13 * y21), (x13 * y22), (x13 * y23) ]]
#
# Stacking a1, a2, a3 gives a (3, 2, 3) tensor for the first row of x. The
# second row of x produces another (3, 2, 3) tensor, and stacking the two
# yields the final (2, 3, 2, 3) shape.
z = tf.tensordot(x,y, axes=0)
print(z)

tf.Tensor(
[[15.475281  1.328076 83.7622  ]
 [75.37005  27.678991 34.7625  ]], shape=(2, 3), dtype=float32)

tf.Tensor(
[[20.479328 58.241833 59.625076]
 [85.806725 96.346695 73.575   ]], shape=(2, 3), dtype=float32)

tf.Tensor(
[[[[ 316.92334   901.3087    922.7148  ]
   [1327.8832   1490.9922   1138.5938  ]]

  [[  27.198105   77.34958    79.18663 ]
   [ 113.957855  127.955734   97.71319 ]]

  [[1715.3936   4878.464    4994.3276  ]
   [7187.36     8070.211    6162.8037  ]]]


 [[[1543.528    4389.69     4493.945   ]
   [6467.257    7261.6553   5545.351   ]]

  [[ 566.84717  1612.0752   1650.3619  ]
   [2375.0437   2666.7793   2036.4817  ]]

  [[ 711.91266  2024.6317   2072.7168  ]
   [2982.8564   3349.252    2557.651   ]]]], shape=(2, 3, 2, 3), dtype=float32)


### 6 reduce_sum , reduce_max , reduce_mean

In [45]:
x = tf.constant([1,2,3])
y = tf.constant([4,5,6])

print(x)
print(y)
# Each reduce_* op collapses the given axis to a single value. Only the mean is
# executed here; the two lines above it are alternatives to try by un-commenting.
# z = tf.reduce_sum(x*y, axis=0)       ## element-wise product, then sum  -> same value as the dot product
# z = tf.reduce_max(x+y, axis=0)       ## largest entry of the element-wise sum
# x + y is [5, 7, 9]; its mean is 7 and the result stays int32 (see output).
z = tf.reduce_mean(x+y, axis=0)
print(z)

tf.Tensor([1 2 3], shape=(3,), dtype=int32)
tf.Tensor([4 5 6], shape=(3,), dtype=int32)
tf.Tensor(7, shape=(), dtype=int32)


In [35]:
x = tf.constant([1,2,3])
# elementwise exponentiate: every entry is cubed independently (1, 8, 27)
z = x ** 3
print(z)

tf.Tensor([ 1  8 27], shape=(3,), dtype=int32)


### 7 matrix multiplication
Matrix multiplication (the shapes must be compatible: the number of columns of A must equal the number of rows of B).

In [47]:
# (2, 3) @ (3, 4): the inner dimensions (3 and 3) match, so the product is (2, 4).
x = tf.random.normal((2,3))
y = tf.random.normal((3,4))

z = tf.matmul(x,y)
    ## or, equivalently, with the matrix-multiplication operator:
# z = x @ y

print(z)

tf.Tensor(
[[ 2.3663712 -3.5711892  1.4013865 -0.2188822]
 [ 1.7917428 -3.320992   3.9433231 -0.9963043]], shape=(2, 4), dtype=float32)


### 8 indexing, slicing

In [48]:
# Indexing follows NumPy conventions: the first index selects the row, the second the column.
x = tf.constant([[1,2,3,4],[5,6,7,8]])
print(x[0])    # row 0 (a single index applies to the first axis)
print(x[:, 0]) # all rows, column 0
print(x[1, :]) # row 1, all columns
print(x[1,1]) # element at 1, 1

tf.Tensor([1 2 3 4], shape=(4,), dtype=int32)
tf.Tensor([1 5], shape=(2,), dtype=int32)
tf.Tensor([5 6 7 8], shape=(4,), dtype=int32)
tf.Tensor(6, shape=(), dtype=int32)


In [60]:
x = tf.constant([1,2,3,4,5,6,7,8,9])

# Plain slicing works as on Python lists:
# print(x[::-1])      ## elements in reverse order
# print(x[1:3])       ## elements at positions 1 and 2

## Pick values at arbitrary positions.
# tf.gather returns the entries of x at the listed indices; the index tensor
# defined here is the one actually passed to the call (a plain Python list such
# as [0, 3, 5] would be accepted as well).
list_of_indexes = tf.constant([0, 3, 5])
print(tf.gather(x, indices=list_of_indexes))

tf.Tensor([1 4 6], shape=(3,), dtype=int32)


In [65]:
x = tf.random.uniform(shape=(4,4), minval=1, maxval=100)
print(x)
print()
# print(x[::-1])      ## reverse order of the rows
# Rows 1 and 2 (the stop index 3 is exclusive) with all columns. This is the
# slice shown in the cell's recorded output, so it must actually be printed.
print(x[1:3, :])

tf.Tensor(
[[60.59735  31.790888 97.06744  32.855957]
 [46.455185 60.288548 95.49846   4.691118]
 [69.06895  42.845722 50.542732 29.319006]
 [34.29755  19.225727 36.389027 12.523647]], shape=(4, 4), dtype=float32)

tf.Tensor(
[[46.455185 60.288548 95.49846   4.691118]
 [69.06895  42.845722 50.542732 29.319006]], shape=(2, 4), dtype=float32)


### 9 reshape

In [50]:
x = tf.random.normal((2,3))
print(x.shape)
# Reshape keeps the 6 values in the same order and only regroups them into 3 rows of 2.
x = tf.reshape(x, (3,2))
print(x)

(2, 3)
tf.Tensor(
[[ 1.1189082   0.5278398 ]
 [-1.5588489   0.75679934]
 [-1.3474648  -0.22449367]], shape=(3, 2), dtype=float32)


In [51]:
# -1 asks TensorFlow to infer that dimension from the element count: 6 / 2 = 3 rows.
# Reads the x left in the kernel by the previous reshape cell.
x = tf.reshape(x, (-1,2))
print(x)

tf.Tensor(
[[ 1.1189082   0.5278398 ]
 [-1.5588489   0.75679934]
 [-1.3474648  -0.22449367]], shape=(3, 2), dtype=float32)


In [52]:
# Flatten to one dimension. The shape is written as the 1-element tuple (6,):
# `(6)` without the comma is just the integer 6, not a shape tuple.
x = tf.reshape(x, (6,))
print(x)

tf.Tensor([ 1.1189082   0.5278398  -1.5588489   0.75679934 -1.3474648  -0.22449367], shape=(6,), dtype=float32)


### 10 transpose

In [70]:
x = tf.random.uniform(shape=(4,4), minval=1, maxval=100)
print(x)
print()
# `perm` lists, for every output axis, which input axis it is taken from.
# perm=[1,0] swaps the two axes, turning rows into columns and columns into rows:
# a matrix of shape (3, 4) would come out with shape (4, 3).
# (perm=[0,1] would keep the axes in place and return the matrix unchanged.)
print(tf.transpose(x, perm=[1,0]))

tf.Tensor(
[[ 5.92245   22.345451  36.214386  81.14195  ]
 [77.403366  43.368362  67.66347   45.653587 ]
 [ 5.3935204 57.316578  27.601301   3.370151 ]
 [53.748318  52.02648    3.3957963 81.215614 ]], shape=(4, 4), dtype=float32)

tf.Tensor(
[[ 5.92245   77.403366   5.3935204 53.748318 ]
 [22.345451  43.368362  57.316578  52.02648  ]
 [36.214386  67.66347   27.601301   3.3957963]
 [81.14195   45.653587   3.370151  81.215614 ]], shape=(4, 4), dtype=float32)
tf.Tensor(
[[ 5.92245   22.345451  36.214386  81.14195  ]
 [77.403366  43.368362  67.66347   45.653587 ]
 [ 5.3935204 57.316578  27.601301   3.370151 ]
 [53.748318  52.02648    3.3957963 81.215614 ]], shape=(4, 4), dtype=float32)


In [74]:
x = tf.random.uniform(shape=(2,5,4), minval=1, maxval=100)
print(x)
print()
# `perm` lists, for every output axis, which input axis it is taken from.
# With perm=[2,0,1] the output axes are (input axis 2, input axis 0, input axis 1),
# so the shape (2, 5, 4) becomes (4, 2, 5) and
#
#   out[a, b, c] = x[b, c, a]
#
# Before the transpose x is a stack of two (5, 4) matrices; afterwards it is a
# stack of four (2, 5) matrices.
print(tf.transpose(x, perm=[2,0,1]))

tf.Tensor(
[[[36.09559    1.8213997 28.157858  81.84591  ]
  [31.362295  58.8386    68.57423   47.697845 ]
  [88.42637   43.03055   66.372696  78.609764 ]
  [71.64769   43.65539    5.654315  42.290417 ]
  [55.471344  41.993793  63.49531   13.460185 ]]

 [[80.150024  78.15693    4.6433916 85.59229  ]
  [88.83932   31.578775  28.680048  89.34331  ]
  [34.975086  56.58408   64.980515  78.35124  ]
  [41.07825   97.173744  15.388102  70.12429  ]
  [70.03381   71.39263   54.687664  90.60505  ]]], shape=(2, 5, 4), dtype=float32)

tf.Tensor(
[[[36.09559   31.362295  88.42637   71.64769   55.471344 ]
  [80.150024  88.83932   34.975086  41.07825   70.03381  ]]

 [[ 1.8213997 58.8386    43.03055   43.65539   41.993793 ]
  [78.15693   31.578775  56.58408   97.173744  71.39263  ]]

 [[28.157858  68.57423   66.372696   5.654315  63.49531  ]
  [ 4.6433916 28.680048  64.980515  15.388102  54.687664 ]]

 [[81.84591   47.697845  78.609764  42.290417  13.460185 ]
  [85.59229   89.34331   78.35124   70.12

### 11 numpy to tensor

In [53]:
# .numpy() converts the eager tensor currently bound to x (left over from an
# earlier cell) into a NumPy ndarray.
x = x.numpy()
print(type(x))

<class 'numpy.ndarray'>


In [54]:
# Wrap the NumPy array produced by the previous cell back into a TensorFlow tensor.
x = tf.convert_to_tensor(x)
print(type(x))
# -> eager tensor = evaluates operations immediately
# without building graphs

<class 'tensorflow.python.framework.ops.EagerTensor'>


### 12 string tensor

In [55]:
## string tensor
# A Python str becomes a scalar tensor of dtype string; values print as bytes (b'...').
x = tf.constant("alpha")
print(x)

# A list of strings becomes a 1-D string tensor with one entry per element.
x = tf.constant(["alpha", "beta", "gamma"])
print(x)

tf.Tensor(b'alpha', shape=(), dtype=string)
tf.Tensor([b'alpha' b'beta' b'gamma'], shape=(3,), dtype=string)
<tf.Variable 'Variable:0' shape=(1, 3) dtype=float32, numpy=array([[1., 2., 3.]], dtype=float32)>
<class 'tensorflow.python.ops.resource_variable_ops.ResourceVariable'>


### 13 variable

In [56]:
# Variable
# A tf.Variable represents a tensor whose value can be
# changed by running ops on it
# Used to represent shared, persistent state your program manipulates
# Higher level libraries like tf.keras use tf.Variable to store model parameters.
# The nested list gives the variable shape (1, 3) and dtype float32 (see output).
b = tf.Variable([[1.0, 2.0, 3.0]])
print(b)
print(type(b))

<tf.Variable 'Variable:0' shape=(1, 3) dtype=float32, numpy=array([[1., 2., 3.]], dtype=float32)>
<class 'tensorflow.python.ops.resource_variable_ops.ResourceVariable'>


In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

class ScaledDotProductAttention(layers.Layer):
    """Scaled dot-product attention: ``softmax(Q K^T / sqrt(d_k)) V``.

    The layer owns no weights; it only combines the tensors it is given.
    """

    def __init__(self):
        super().__init__()

    def call(self, queries, keys, values, mask):
        """Attend over ``values`` using the similarity of ``queries`` and ``keys``.

        Args:
            queries: query tensor; its last axis must match the last axis of ``keys``.
            keys: key tensor; the size of its last axis is used as ``d_k``.
            values: value tensor, combined according to the attention weights.
            mask: ``None`` for no masking, otherwise a float tensor broadcastable
                to the score tensor in which 1 marks a position that may be
                attended to and 0 marks a position to hide. This is the
                convention of ``Transformer.make_src_mask`` in this file, which
                emits 1 for every non-padding token.

        Returns:
            A tuple ``(output, attention_weights)``.
        """
        # Dividing by sqrt(d_k) keeps the dot products from growing with the key size.
        d_k = tf.cast(tf.shape(keys)[-1], tf.float32)
        scores = tf.matmul(queries, keys, transpose_b=True) / tf.sqrt(d_k)

        if mask is not None:
            # The large negative penalty has to land on the positions where the
            # mask is 0. Adding `mask * -1e9` directly would do the opposite:
            # silence every real token and leave only the padding visible.
            scores += (1.0 - mask) * -1e9

        # Normalise over the key axis so each query's weights sum to 1.
        attention_weights = tf.nn.softmax(scores, axis=-1)
        output = tf.matmul(attention_weights, values)
        return output, attention_weights

class MultiHeadAttention(layers.Layer):
    """Multi-head attention built from four dense projections and one attention core.

    Args:
        embed_size: width of the projected queries/keys/values and of the output.
        num_heads: number of attention heads; must divide ``embed_size`` evenly.
    """

    def __init__(self, embed_size, num_heads):
        super().__init__()
        assert embed_size % num_heads == 0, "Embedding size must be divisible by num_heads"

        self.num_heads = num_heads
        self.head_dim = embed_size // num_heads

        self.query_dense = layers.Dense(embed_size)
        self.key_dense = layers.Dense(embed_size)
        self.value_dense = layers.Dense(embed_size)
        self.final_dense = layers.Dense(embed_size)

        # Built once here instead of on every forward pass: the attention core
        # is stateless, so a single instance can serve every call.
        self.attention = ScaledDotProductAttention()

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to ``(batch_size, -1, num_heads, head_dim)`` and move the head axis to position 1."""
        # Split the last dimension into (num_heads, head_dim)
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.head_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, queries, keys, values, mask):
        """Project the inputs, attend per head, then merge the heads again.

        Args:
            queries, keys, values: input tensors; axis 0 of ``queries`` is read
                as the batch size.
            mask: passed through unchanged to the attention core (may be ``None``).

        Returns:
            A tuple ``(output, attention_weights)`` where ``output`` has
            ``num_heads * head_dim`` features on its last axis.
        """
        batch_size = tf.shape(queries)[0]

        queries = self.split_heads(self.query_dense(queries), batch_size)
        keys = self.split_heads(self.key_dense(keys), batch_size)
        values = self.split_heads(self.value_dense(values), batch_size)

        attention_output, attention_weights = self.attention(queries, keys, values, mask)
        # Undo the axis swap done in split_heads so the head axis sits next to head_dim again.
        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])

        # Merge (num_heads, head_dim) back into a single feature axis.
        concat_attention = tf.reshape(attention_output, (batch_size, -1, self.num_heads * self.head_dim))
        output = self.final_dense(concat_attention)

        return output, attention_weights

class PositionWiseFeedForward(layers.Layer):
    """Two stacked dense layers: a ReLU expansion followed by a linear projection.

    Args:
        embed_size: output width of the second dense layer.
        forward_expansion: multiplier applied to ``embed_size`` to obtain the
            width of the first (hidden) dense layer.
    """

    def __init__(self, embed_size, forward_expansion):
        super().__init__()
        hidden_units = forward_expansion * embed_size
        self.fc1 = layers.Dense(hidden_units, activation='relu')
        self.fc2 = layers.Dense(embed_size)

    def call(self, x):
        """Run ``x`` through the hidden layer and then through the output layer."""
        hidden = self.fc1(x)
        return self.fc2(hidden)

class EncoderLayer(layers.Layer):
    """One encoder block: self-attention and a feed-forward net, each wrapped in
    dropout, a residual connection and layer normalisation.

    Args:
        embed_size: feature width handed to the attention and feed-forward sub-layers.
        num_heads: number of attention heads.
        forward_expansion: hidden-width multiplier of the feed-forward sub-layer.
        dropout: rate used by both dropout layers.
    """

    def __init__(self, embed_size, num_heads, forward_expansion, dropout):
        super().__init__()
        self.mha = MultiHeadAttention(embed_size, num_heads)
        self.ffn = PositionWiseFeedForward(embed_size, forward_expansion)

        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout)
        self.dropout2 = layers.Dropout(dropout)

    def call(self, x, mask):
        """Apply the block to ``x``; ``mask`` is forwarded to the self-attention."""
        # Self-attention: x serves as queries, keys and values; the weights are not needed here.
        attended, _ = self.mha(x, x, x, mask)
        x = self.layernorm1(x + self.dropout1(attended))

        # Feed-forward sub-layer with its own residual connection.
        transformed = self.ffn(x)
        return self.layernorm2(x + self.dropout2(transformed))

class DecoderLayer(layers.Layer):
    """One decoder block: self-attention, attention over the encoder output and
    a feed-forward net, each followed by dropout, a residual connection and
    layer normalisation.

    Args:
        embed_size: feature width handed to the attention and feed-forward sub-layers.
        num_heads: number of attention heads.
        forward_expansion: hidden-width multiplier of the feed-forward sub-layer.
        dropout: rate used by all three dropout layers.
    """

    def __init__(self, embed_size, num_heads, forward_expansion, dropout):
        super().__init__()
        self.mha1 = MultiHeadAttention(embed_size, num_heads)
        self.mha2 = MultiHeadAttention(embed_size, num_heads)
        self.ffn = PositionWiseFeedForward(embed_size, forward_expansion)

        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(dropout)
        self.dropout2 = layers.Dropout(dropout)
        self.dropout3 = layers.Dropout(dropout)

    def call(self, x, enc_output, src_mask, trg_mask):
        """Apply the block to ``x``.

        ``trg_mask`` is forwarded to the self-attention, ``src_mask`` to the
        attention over ``enc_output``.
        """
        # 1) Self-attention over the decoder input.
        self_attended, _ = self.mha1(x, x, x, trg_mask)
        x = self.layernorm1(x + self.dropout1(self_attended))

        # 2) Attention with decoder queries against encoder keys/values.
        cross_attended, _ = self.mha2(x, enc_output, enc_output, src_mask)
        x = self.layernorm2(x + self.dropout2(cross_attended))

        # 3) Feed-forward sub-layer.
        transformed = self.ffn(x)
        return self.layernorm3(x + self.dropout3(transformed))

class Encoder(layers.Layer):
    """Token embedding plus positional encoding followed by a stack of ``EncoderLayer`` blocks.

    Args:
        src_vocab_size: number of rows of the embedding table.
        embed_size: embedding / feature width.
        num_layers: number of ``EncoderLayer`` blocks.
        num_heads: attention heads per block.
        forward_expansion: hidden-width multiplier of each feed-forward sub-layer.
        dropout: dropout rate used on the embeddings and inside every block.
        max_length: number of positions for which the encoding table is precomputed.
    """

    def __init__(self, src_vocab_size, embed_size, num_layers, num_heads, forward_expansion, dropout, max_length):
        super().__init__()
        self.embed_size = embed_size
        self.embedding = layers.Embedding(src_vocab_size, embed_size)
        self.pos_encoding = self.positional_encoding(max_length, embed_size)
        self.layers = [EncoderLayer(embed_size, num_heads, forward_expansion, dropout) for _ in range(num_layers)]
        self.dropout = layers.Dropout(dropout)

    def positional_encoding(self, max_length, embed_size):
        """Build the sinusoidal position table.

        Returns:
            A float32 tensor of shape ``(1, max_length, embed_size)``. The sine
            terms (computed from the even feature indices) fill the first part
            of the last axis and the cosine terms (odd indices) the rest — the
            two groups are concatenated, not interleaved.
        """
        # tf.range yields int32; TensorFlow does not promote int32 to float32
        # implicitly, so positions and feature indices are cast up front.
        # Without the casts the `/` and `*` below raise a dtype-mismatch error.
        pos = tf.cast(tf.range(max_length)[:, tf.newaxis], tf.float32)
        i = tf.cast(tf.range(embed_size)[tf.newaxis, :], tf.float32)

        # Each (even, odd) pair of feature indices shares one frequency: 2 * floor(i / 2).
        exponent = (2.0 * tf.floor(i / 2.0)) / tf.cast(embed_size, tf.float32)
        angle_rates = 1.0 / tf.pow(10000.0, exponent)
        angle_rads = pos * angle_rates

        # Apply sin to even indices, cos to odd indices
        sines = tf.sin(angle_rads[:, 0::2])
        cosines = tf.cos(angle_rads[:, 1::2])

        pos_encoding = tf.concat([sines, cosines], axis=-1)
        # Leading axis of size 1 so the table broadcasts over the batch.
        pos_encoding = pos_encoding[tf.newaxis, ...]
        return tf.cast(pos_encoding, dtype=tf.float32)

    def call(self, x, mask):
        """Embed the token ids in ``x``, add positions and run every encoder block.

        Args:
            x: token-id tensor; axis 1 is read as the sequence length.
            mask: forwarded unchanged to every ``EncoderLayer``.
        """
        seq_length = tf.shape(x)[1]
        # Scale the embeddings by sqrt(embed_size) before the positions are added.
        x = self.embedding(x) * tf.math.sqrt(tf.cast(self.embed_size, tf.float32))
        # Only the first seq_length rows of the precomputed table are needed.
        x += self.pos_encoding[:, :seq_length, :]
        x = self.dropout(x)

        for layer in self.layers:
            x = layer(x, mask)

        return x

class Decoder(layers.Layer):
    """Token embedding plus positional encoding, a stack of ``DecoderLayer``
    blocks and a final dense projection onto the target vocabulary.

    Args:
        trg_vocab_size: number of rows of the embedding table and width of the output projection.
        embed_size: embedding / feature width.
        num_layers: number of ``DecoderLayer`` blocks.
        num_heads: attention heads per block.
        forward_expansion: hidden-width multiplier of each feed-forward sub-layer.
        dropout: dropout rate used on the embeddings and inside every block.
        max_length: number of positions for which the encoding table is precomputed.
    """

    def __init__(self, trg_vocab_size, embed_size, num_layers, num_heads, forward_expansion, dropout, max_length):
        super().__init__()
        self.embed_size = embed_size
        self.embedding = layers.Embedding(trg_vocab_size, embed_size)
        self.pos_encoding = self.positional_encoding(max_length, embed_size)
        self.layers = [
            DecoderLayer(embed_size, num_heads, forward_expansion, dropout)
            for _ in range(num_layers)
        ]
        self.fc_out = layers.Dense(trg_vocab_size)
        self.dropout = layers.Dropout(dropout)

    def positional_encoding(self, max_length, embed_size):
        """Delegate to ``Encoder.positional_encoding`` so both stacks share one table definition."""
        # Called through the class with this instance passed explicitly as `self`.
        return Encoder.positional_encoding(self, max_length, embed_size)

    def call(self, x, enc_output, src_mask, trg_mask):
        """Embed the token ids in ``x``, run every decoder block and project to vocabulary size.

        ``enc_output``, ``src_mask`` and ``trg_mask`` are forwarded unchanged to
        every ``DecoderLayer``.
        """
        seq_length = tf.shape(x)[1]
        scale = tf.math.sqrt(tf.cast(self.embed_size, tf.float32))
        x = self.embedding(x) * scale
        # Only the first seq_length rows of the precomputed table are needed.
        x += self.pos_encoding[:, :seq_length, :]
        x = self.dropout(x)

        for block in self.layers:
            x = block(x, enc_output, src_mask, trg_mask)

        logits = self.fc_out(x)
        return logits

class Transformer(keras.Model):
    def __init__(self, src_vocab_size, trg_vocab_size, src_pad_idx, trg_pad_idx, embed_size=512, num_layers=6, 
                 num_heads=8, forward_expansion=4, dropout=0.1, max_length=100):
        super(Transformer, self).__init__()
        self.encoder = Encoder(src_vocab_size, embed_size, num_layers, num_heads, forward_expansion, dropout, max_length)
        self.decoder = Decoder(trg_vocab_size, embed_size, num_layers, num_heads, forward_expansion, dropout, max_length)
        self.src_pad_idx = src_pad_idx
        self.trg_pad_idx = trg_pad_idx

    def make_src_mask(self, src):
        mask = tf.cast(tf.math.not_equal(src, self.src_pad_idx), dtype=tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, src_len)

    def make_trg_mask(self, trg):
        seq_len = tf.shape(trg)[1]
        trg_mask = tf.linalg.band_part(tf.ones
