In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

from tensorflow.keras.preprocessing.text import one_hot 
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import tensorflow as tf

In [None]:
# Location of the CSV inside the Kaggle input directory.
file_path = "/kaggle/input/dataset/data.csv"
# Read the whole file into a DataFrame and preview the first rows.
df = pd.read_csv(file_path)
df.head()

In [None]:
# (rows, columns) of the raw frame, before sampling and cleaning.
df.shape

In [None]:
# Work on a reproducible random subset to keep memory usage manageable.
# The sample size is capped at the number of available rows: asking
# `DataFrame.sample` for more rows than exist (without replacement) raises
# a ValueError.
df = df.sample(n=min(80000, len(df)), random_state=42)

In [None]:
# Data-quality report: frame shape, missing values per column, duplicate rows.
missing_per_column = df.isnull().sum()
duplicate_rows = df.duplicated().sum()

print(f"\n{type(df).__name__} shape: {df.shape}")
print(f'\nMissing Data: \n{missing_per_column}')
print(f'\nDuplicates: {duplicate_rows}')

In [None]:
# Discard every row that has at least one missing value.
df = df.dropna()

In [None]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

We can see that the median text length is 272 words, so half of the texts contain at least 272 words, and the median summary length is 37 words, so half of the summaries contain at least 37 words.
We will therefore use the following thresholds (the third quartiles, Q3) to filter the data before tokenization.

* Q3(text)=450
* Q3(summary)=63

Since our dataset is large (186,446 rows) and would exhaust the available memory, we need to reduce its size. The preprocessing and data analysis gave us an idea of the length distribution, so that we can reduce the data in a meaningful and clean way.

In [None]:
# Define the maximum allowed lengths (Q3 of the text / summary lengths)
max_text_length = 450
max_headline_length = 63

# The filter below needs per-row length columns. They are not created anywhere
# earlier in this notebook, so derive them when the CSV did not provide them;
# otherwise a fresh "Restart & Run All" fails with a KeyError.
# NOTE(review): lengths are assumed to be whitespace-separated word counts,
# matching the "words" wording of the surrounding markdown - confirm.
if 'text_length' not in df.columns:
    df = df.assign(text_length=df['text'].str.split().str.len())
if 'headline_length' not in df.columns:
    df = df.assign(headline_length=df['headline'].str.split().str.len())

# Drop rows based on conditions: keep only rows within both limits
within_limits = (df['text_length'] <= max_text_length) & (df['headline_length'] <= max_headline_length)
df = df[within_limits]

In [None]:
# Remove one specific row, identified by its index label.
# After random sampling and length filtering that label is not guaranteed to
# still be present, so a missing label is ignored instead of raising KeyError.
index_to_drop = 16189
df = df.drop(index=index_to_drop, errors='ignore')

In [None]:
# Preview the frame after length filtering.
df.head()

In [None]:
# (rows, columns) remaining after sampling, cleaning and length filtering.
df.shape

In [None]:
# The helper length columns were only needed for filtering; remove them.
columns_to_drop = ['text_length', 'headline_length']
df = df.drop(columns_to_drop, axis=1)

In [None]:
# Preview the frame once the helper length columns are gone.
df.head()

In [None]:
# Same data-quality report as before, now on the cleaned and filtered frame.
null_counts = df.isnull().sum()
duplicate_count = df.duplicated().sum()

print(f"\n{type(df).__name__} shape: {df.shape}")
print(f'\nMissing Data: \n{null_counts}')
print(f'\nDuplicates: {duplicate_count}')

 Adding special tokens such as `<go>` (start of sequence) and `<stop>` (end of sequence) is typically done on the target sequence (the summary) in sequence-to-sequence tasks, including text summarization. These tokens explicitly mark the beginning and end of the target sequence, which helps the model during training and decoding. 

In [None]:
# Wrap every headline in the start / end markers used by the decoder.
df['headline'] = '<go> ' + df['headline'] + ' <stop>'

In [None]:
# Check that the headlines now carry the <go> ... <stop> markers.
df.head()

In [None]:
# Hold out 10% of the rows for validation; the fixed seed keeps the split reproducible.
df_train, df_val = train_test_split(df, random_state=42, test_size=0.1)

# Report how many rows ended up in each split.
for split_name, split_frame in (("Training", df_train), ("Validation", df_val)):
    print(f"{split_name} set size:", len(split_frame))

In [None]:
# Preview the training split.
df_train.head()

# **<span style="color:darkblue">  Tokenization and Padding**

Now for the tokenization, we split the columns: the `text` column is the input of our encoder, and the `headline` column is the summary, i.e. the target output we are trying to generate.

A tokenizer builds the vocabulary and converts a word sequence to an integer sequence. Go ahead and build tokenizers for text and summary:

### **<span style="color:darkred">Train Set and Validation Set**

In [None]:
# Training split: encoder input (article text) and decoder target (headline).
# Both are pandas Series that keep the original row labels of `df_train`.
text = df_train['text']
summary = df_train['headline']

In [None]:
# Validation split: same two columns as for training.
text_val = df_val['text']
summary_val = df_val['headline']

In [None]:
# Characters the summary tokenizer strips before splitting into words.
# The set deliberately contains neither '<' nor '>', so the '<go>' and
# '<stop>' markers added to the headlines survive tokenization as whole tokens.
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'

In [None]:
# One vocabulary per side, both fitted on the training split only.
# Words unseen during fitting are mapped to the '<unk>' token.
text_tokenizer = Tokenizer(oov_token='<unk>')
text_tokenizer.fit_on_texts(text)

# The summary side uses the custom filter set defined above.
summary_tokenizer = Tokenizer(oov_token='<unk>', filters=filters)
summary_tokenizer.fit_on_texts(summary)


In [None]:
# Vocabulary sizes: one more than the number of known words, leaving room for
# id 0, which this notebook treats as padding (see `create_padding_mask`).
encoder_vocab_size = 1 + len(text_tokenizer.word_index)
decoder_vocab_size = 1 + len(summary_tokenizer.word_index)

# Show both sizes.
(encoder_vocab_size, decoder_vocab_size)

In [None]:
# Convert every document / headline into a list of integer token ids.
# Training split:
inputs, targets = (
    text_tokenizer.texts_to_sequences(text),
    summary_tokenizer.texts_to_sequences(summary),
)

# Validation split (encoded with the tokenizers fitted on the training data):
inputs_val, targets_val = (
    text_tokenizer.texts_to_sequences(text_val),
    summary_tokenizer.texts_to_sequences(summary_val),
)

In [None]:
# Show one training headline next to its token ids.
# `targets` is a plain list (positional), whereas `summary` is a Series that
# still carries the original row labels. Indexing the Series with `[...]`
# would look up a *label*, which may be missing or point at a different row,
# so use `.iloc` to pick the same position in both.
example_index = 9897
print("Original Text Sequence:")
print(summary.iloc[example_index])

print("\nTokenized Text Sequence:")
print(targets[example_index])

In [None]:
# Fixed sequence lengths used for padding / truncation below.
# NOTE(review): 400 is lower than the 450 filtering threshold used earlier, so
# longer texts get truncated by `pad_sequences` - confirm this is intended.
encoder_maxlen = 400
decoder_maxlen = 75

In [None]:
# Bring every sequence to a fixed length: zeros are appended to short
# sequences and long ones are cut at the end.
pad_options = dict(padding='post', truncating='post')

inputs = pad_sequences(inputs, maxlen=encoder_maxlen, **pad_options)
targets = pad_sequences(targets, maxlen=decoder_maxlen, **pad_options)

inputs_val = pad_sequences(inputs_val, maxlen=encoder_maxlen, **pad_options)
targets_val = pad_sequences(targets_val, maxlen=decoder_maxlen, **pad_options)

Cast the padded sequences to `tf.int32` to ensure that the data types of the tensors match the types expected by the model.

In [None]:
# Turn the padded arrays into int32 tensors (training pair, then validation pair).
inputs, targets = (tf.cast(arr, tf.int32) for arr in (inputs, targets))
inputs_val, targets_val = (tf.cast(arr, tf.int32) for arr in (inputs_val, targets_val))

* batch size determines the number of samples used in each training iterations
* Shuffling is important during training to introduce randomness and prevent the model from learning the order of the data

In [None]:
# Number of elements held in the shuffle buffer of the tf.data pipeline.
BUFFER_SIZE = 10000
# Number of (input, target) pairs per training / validation batch.
BATCH_SIZE = 64

In [None]:
# Training pipeline: shuffle, then batch.
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, targets))
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

# Validation pipeline: batch only. Shuffling brings nothing for evaluation and,
# because the validation loss is averaged per batch, it would make the reported
# value vary slightly from run to run.
dataset_val = tf.data.Dataset.from_tensor_slices((inputs_val, targets_val)).batch(BATCH_SIZE)

# **<span style="color:darkblue">5.  Model Creation - <i>Transformer</i>**


In [None]:
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam

In [None]:
import matplotlib.pyplot as plt

def hist(history):
    """Plot the validation and training loss curves and print the best epoch.

    Args:
        history: dict with the keys 'val' and 'train'; each value is a list of
            ``(epoch, loss)`` tuples.

    The function draws both curves on one figure and prints the
    ``(epoch, loss)`` tuple with the smallest validation loss. When no
    validation entry has been recorded yet, a placeholder message is printed
    instead of failing on the empty list.
    """
    plt.title('Loss')

    # Validation first, then training, so the legend order below matches.
    for key, line_style in (('val', 'x-'), ('train', 'o-')):
        points = history[key]
        plt.plot([p[0] for p in points], [p[1] for p in points], line_style)

    plt.xlabel('epoch')
    plt.ylabel('loss')
    plt.legend(['validation', 'train'])
    plt.show()

    if history['val']:
        # min() returns the first minimum, like sorted(...)[0] would.
        print('smallest val loss:', min(history['val'], key=lambda x: x[1]))
    else:
        print('smallest val loss: n/a (no validation epochs recorded)')

#### Positional Encoding for adding notion of position among words

Positional Encoding is used for adding the notion of position among words.
They use wave frequencies to capture position information.

In [None]:
def get_angles(position, i, d_model):
    """Return the positional-encoding angles ``position / 10000^(2*(i//2)/d_model)``.

    Args:
        position: position index or array of position indices.
        i: embedding-dimension index or array of such indices.
        d_model: model dimensionality used to scale the exponent.

    Returns:
        ``position`` multiplied by the per-dimension angle rate (NumPy
        broadcasting applies when arrays are passed).
    """
    # Dimensions 2k and 2k+1 share the same exponent.
    exponent = (2 * (i // 2)) / np.float32(d_model)
    return position * (1 / np.power(10000, exponent))

In [None]:
def positional_encoding(position, d_model):
    """Build the sinusoidal positional-encoding table.

    Args:
        position: number of positions to generate (rows of the table).
        d_model: model dimensionality (columns of the table).

    Returns:
        A float32 tensor of shape ``(1, position, d_model)``: sine on the even
        columns, cosine on the odd columns.
    """
    rows = np.arange(position)[:, np.newaxis]
    cols = np.arange(d_model)[np.newaxis, :]
    table = get_angles(rows, cols, d_model)

    # Even columns (2i) carry the sine, odd columns (2i+1) the cosine.
    table[:, 0::2] = np.sin(table[:, 0::2])
    table[:, 1::2] = np.cos(table[:, 1::2])

    # Leading axis of size 1 so the table broadcasts over the batch.
    return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

### **<span style="color:darkred">Multi-Head Attention**


The term `d_model` refers to the total dimensionality or size of the model's representation for each element in the input sequence. It represents **the size of the hidden state**.
In the Transformer architecture, the input sequence is embedded into vectors of size `d_model` before being processed by the attention mechanism and feedforward neural networks.

**Multi-head attention** performs different parallel computations for the same word to achieve different results. These results are then connected to SoftMax to output the best suitable word.
* The number of heads **`(num_heads)`** represents how many attention heads will be used in parallel.

* The total dimension of the model **`(d_model)`** should be divisible by the number of heads.

* **Weight Matrices**: Three dense layers are created for linear transformations of the input **(query, key, and value)** using weight matrices (`self.wq`, `self.wk`, `self.wv`).

* **Output Transformation**: Another dense layer (self.dense) is used to transform the concatenated output of attention heads back to the original dimension (d_model)

<!-- Centering the image -->
<div style="text-align:center">
  <!-- Inserting the image with a URL or file path -->
  <img src="https://i.ibb.co/72q2k8Z/lalala.png" alt="decod" width="700" height="400">
</div>

In [None]:
def scaled_dot_product_attention(q, k, v, mask):
    """Compute ``softmax(q @ k^T / sqrt(d_k)) @ v``.

    Args:
        q: query tensor.
        k: key tensor; its last dimension gives ``d_k``.
        v: value tensor.
        mask: tensor broadcastable to the attention logits, with 1 at
            positions that must not be attended to, or ``None``.

    Returns:
        ``(output, attention_weights)``.
    """
    key_depth = tf.cast(tf.shape(k)[-1], tf.float32)
    logits = tf.matmul(q, k, transpose_b=True) / tf.math.sqrt(key_depth)

    if mask is not None:
        # A large negative value drives the softmax weight of masked slots to ~0.
        logits = logits + (mask * -1e9)

    attention_weights = tf.nn.softmax(logits, axis=-1)
    return tf.matmul(attention_weights, v), attention_weights

In [None]:
class MultiHeadAttention(tf.keras.layers.Layer):
    """Multi-head attention built on `scaled_dot_product_attention`.

    Queries, keys and values are projected to ``d_model`` dimensions, split
    into ``num_heads`` heads of size ``d_model // num_heads``, attended
    independently, concatenated again and passed through a final dense layer.
    """

    def __init__(self, d_model, num_heads):
        """Create the projection layers; ``d_model`` must be divisible by ``num_heads``."""
        super().__init__()
        self.num_heads = num_heads
        self.d_model = d_model

        assert d_model % self.num_heads == 0
        # Size of each individual head.
        self.depth = d_model // self.num_heads

        # Input projections for queries, keys and values.
        self.wq = tf.keras.layers.Dense(d_model)
        self.wk = tf.keras.layers.Dense(d_model)
        self.wv = tf.keras.layers.Dense(d_model)

        # Output projection applied to the concatenated heads.
        self.dense = tf.keras.layers.Dense(d_model)

    def split_heads(self, x, batch_size):
        """Reshape the last axis into (num_heads, depth) and move the head axis to position 1."""
        per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, v, k, q, mask):
        """Run attention; note the argument order (values, keys, queries, mask).

        Returns:
            ``(output, attention_weights)``.
        """
        batch_size = tf.shape(q)[0]

        # Project, then split into heads.
        q = self.split_heads(self.wq(q), batch_size)
        k = self.split_heads(self.wk(k), batch_size)
        v = self.split_heads(self.wv(v), batch_size)

        per_head_output, attention_weights = scaled_dot_product_attention(q, k, v, mask)

        # Move the head axis back next to depth and merge both into d_model.
        merged = tf.reshape(
            tf.transpose(per_head_output, perm=[0, 2, 1, 3]),
            (batch_size, -1, self.d_model),
        )

        return self.dense(merged), attention_weights

#### Feed Forward Network

The **Feed-Forward Network** consists of two fully connected (dense) layers with a ReLU activation in between. It takes the output of the attention mechanism and processes each position independently; the enclosing encoder/decoder layer then applies dropout, adds the result to the sub-layer's input (residual connection) and normalizes it.

In [None]:
def point_wise_feed_forward_network(d_model, dff):
    """Return a two-layer feed-forward block.

    Args:
        d_model: output width of the block.
        dff: width of the hidden ReLU layer.

    Returns:
        A `tf.keras.Sequential` of ``Dense(dff, relu)`` followed by ``Dense(d_model)``.
    """
    hidden = tf.keras.layers.Dense(dff, activation='relu')
    projection = tf.keras.layers.Dense(d_model)
    return tf.keras.Sequential([hidden, projection])

#### Masks

In [None]:
def create_masks(inp, tar):
    """Build the three attention masks used by the transformer.

    Args:
        inp: batch of encoder token ids (0 = padding).
        tar: batch of decoder-input token ids (0 = padding).

    Returns:
        ``(enc_padding_mask, combined_mask, dec_padding_mask)`` where
        ``combined_mask`` merges the target padding mask with the look-ahead
        mask, and both padding masks are derived from ``inp``.
    """
    # Both the encoder self-attention and the decoder's attention over the
    # encoder output hide the padded source positions.
    source_padding = create_padding_mask(inp)

    # A target position is hidden if it is padding OR lies in the future.
    future_positions = create_look_ahead_mask(tf.shape(tar)[1])
    combined_mask = tf.maximum(create_padding_mask(tar), future_positions)

    return source_padding, combined_mask, source_padding


def create_padding_mask(seq):
    """Return a float mask with 1.0 where ``seq`` equals 0 (padding), else 0.0.

    Two singleton axes are inserted after the first axis so the mask can
    broadcast against the attention logits.
    """
    is_padding = tf.math.equal(seq, 0)
    return tf.cast(is_padding, tf.float32)[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(size):
    """Return a ``(size, size)`` mask with 1 strictly above the diagonal, 0 elsewhere."""
    # band_part(..., -1, 0) keeps the lower triangle including the diagonal.
    visible = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
    return 1 - visible

### **<span style="color:darkred">Encoder**


The **Encoder** class represents the entire encoder of the transformer model, consisting of multiple encoder layers.It includes:
* **Embedding layer:** `(self.embedding)` to convert input tokens into vectors.
* **Positional encoding:** `(self.pos_encoding)` to provide positional information to the model.
* **EncoderLayer instances:** `(self.enc_layers)`.

The **EncoderLayer** class represents one layer within that encoder.
It takes as input the output of the previous layer (the embedded input sequence for the first layer) and contains two main sub-layers:
* **1. Multi-Head Self-Attention:** `(self.mha)` Processes the input sequence with attention to itself.

* **Dropout and Normalization:** `(out1)` : used to avoid overfitting

* **2. Feed-Forward Network :** `(ffn_output)` applies two dense layers to each position of the sequence independently.

* **Dropout and Normalization:** `(out2)` Normalization ensures that the mean of each feature is close to zero, and the standard deviation is close to one, and so for stability of values. 



<!-- Centering the image -->
<div style="text-align:center">
  <!-- Inserting the image with a URL or file path -->
  <img src="https://i.ibb.co/P18VGqw/wowow.png" alt="decod" width="300" height="200">
</div>

In [None]:
 class Encoder(tf.keras.layers.Layer):
    """Transformer encoder: token embedding + positional encoding + a stack of EncoderLayer.

    NOTE(review): the `class` line starts with a stray leading space; this
    presumably only runs because IPython strips a cell's leading indentation -
    confirm before moving the code into a plain .py module.
    """
    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
        """Create the embedding, the positional-encoding table and `num_layers` encoder layers.

        Args:
            num_layers: number of stacked EncoderLayer instances.
            d_model: embedding / hidden size.
            num_heads: attention heads per layer.
            dff: hidden width of each feed-forward block.
            input_vocab_size: number of rows of the embedding table.
            maximum_position_encoding: number of positions in the precomputed table.
            rate: dropout rate.
        """
        super(Encoder, self).__init__()

        self.d_model = d_model
        self.num_layers = num_layers
        # Initialization of the Embedding Layer
        self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
        
        # Initialization of the Positional Encoding
        self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)
        
        # Initialization of the EncoderLayer and loop through the different layers
        self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]

        self.dropout = tf.keras.layers.Dropout(rate)
        
    def call(self, x, training, mask):
        """Encode a batch of token ids.

        Args:
            x: token ids; axis 1 is read as the sequence length.
            training: forwarded to the dropout layers and the encoder layers.
            mask: attention mask forwarded to every encoder layer.

        Returns:
            The output of the last encoder layer.
        """
        seq_len = tf.shape(x)[1]

        x = self.embedding(x)
        # Scale the embeddings by sqrt(d_model) before adding the positions.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        # Only the first `seq_len` rows of the precomputed table are needed.
        x += self.pos_encoding[:, :seq_len, :]

        x = self.dropout(x, training=training)
    
        for i in range(self.num_layers):
            x = self.enc_layers[i](x, training, mask)
    
        return x

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention and a feed-forward network, each
    followed by dropout, a residual connection and layer normalization."""

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        """Create the sub-layers.

        Args:
            d_model: hidden size.
            num_heads: number of attention heads.
            dff: hidden width of the feed-forward block.
            rate: dropout rate.
        """
        super().__init__()

        # Self-attention and feed-forward sub-layers.
        self.mha = MultiHeadAttention(d_model, num_heads)
        self.ffn = point_wise_feed_forward_network(d_model, dff)

        # One normalization / dropout pair per sub-layer.
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)

    def call(self, x, training, mask):
        """Apply the block to ``x`` and return the normalized result."""
        # Sub-layer 1: self-attention (values, keys and queries are all x).
        attended, _ = self.mha(x, x, x, mask)
        out1 = self.layernorm1(x + self.dropout1(attended, training=training))

        # Sub-layer 2: position-wise feed-forward network.
        transformed = self.dropout2(self.ffn(out1), training=training)
        return self.layernorm2(out1 + transformed)

### **<span style="color:darkred">Decoder**

 The **Decoder** class represents the entire decoder of the transformer model, consisting of multiple decoder layers.It includes:
* **Embedding layer:** `(self.embedding)` to convert target tokens into vectors.
* **Positional encoding:** `(self.pos_encoding)` to provide positional information to the model.
* **DecoderLayer instances:** `(self.dec_layers)`.


The **DecoderLayer** class represents one layer within that decoder.
It takes as input the output of the encoder layer and it contains three main sub-layers:
* **1. Multi-Head Self-Attention:** `(self.mha1)` Processes the input sequence with attention to itself.

* **Dropout and Normalization:** `(out1)` : used to avoid overfitting

* **2. Multi-Head Attention with Encoder Output :** `(self.mha2)` lets each target position attend over the encoder output (queries come from the decoder, keys and values from the encoder).

* **Dropout and Normalization:** `(out2)` Normalization ensures that the mean of each feature is close to zero, and the standard deviation is close to one, and so for stability of values. 

* **3. Feed-Forward Network :** `(ffn_output)` applies two dense layers to each position of the sequence independently.

* **Dropout and Normalization:** `(out3)`

<!-- Centering the image -->
<div style="text-align:center">
  <!-- Inserting the image with a URL or file path -->
  <img src="https://i.ibb.co/tb7GGhY/out2.png" alt="decod" width="300" height="100">
</div>

In [None]:
class Decoder(tf.keras.layers.Layer):
    """Transformer decoder: token embedding + positional encoding + a stack of DecoderLayer."""

    def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1):
        """Create the embedding, the positional-encoding table and `num_layers` decoder layers.

        Args:
            num_layers: number of stacked DecoderLayer instances.
            d_model: embedding / hidden size.
            num_heads: attention heads per layer.
            dff: hidden width of each feed-forward block.
            target_vocab_size: number of rows of the embedding table.
            maximum_position_encoding: number of positions in the precomputed table.
            rate: dropout rate.
        """
        super().__init__()

        self.d_model = d_model
        self.num_layers = num_layers

        # Token embedding and precomputed positional table.
        self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
        self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

        # The stack of decoder blocks, followed by the input dropout layer.
        self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]
        self.dropout = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """Decode a batch of target token ids against the encoder output.

        Returns:
            ``(x, attention_weights)`` where ``attention_weights`` maps
            ``'decoder_layer{n}_block1'`` / ``'decoder_layer{n}_block2'``
            (n starting at 1) to the attention weights of each layer.
        """
        seq_len = tf.shape(x)[1]

        # Embed, scale by sqrt(d_model) and add the first `seq_len` positions.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x = self.embedding(x) * scale + self.pos_encoding[:, :seq_len, :]

        # Dropout for regularization.
        x = self.dropout(x, training=training)

        attention_weights = {}
        for idx in range(self.num_layers):
            # Every decoder layer receives the same encoder output.
            x, self_attn, cross_attn = self.dec_layers[idx](
                x, enc_output, training, look_ahead_mask, padding_mask)
            layer_no = idx + 1
            attention_weights['decoder_layer{}_block1'.format(layer_no)] = self_attn
            attention_weights['decoder_layer{}_block2'.format(layer_no)] = cross_attn

        return x, attention_weights

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    """One decoder block: masked self-attention, attention over the encoder
    output and a feed-forward network, each followed by dropout, a residual
    connection and layer normalization."""

    def __init__(self, d_model, num_heads, dff, rate=0.1):
        """Create the sub-layers.

        Args:
            d_model: hidden size.
            num_heads: number of attention heads.
            dff: hidden width of the feed-forward block.
            rate: dropout rate.
        """
        super().__init__()

        # mha1: self-attention over the target; mha2: attention over the encoder output.
        self.mha1 = MultiHeadAttention(d_model, num_heads)
        self.mha2 = MultiHeadAttention(d_model, num_heads)

        self.ffn = point_wise_feed_forward_network(d_model, dff)

        # One normalization / dropout pair per sub-layer.
        self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

        self.dropout1 = tf.keras.layers.Dropout(rate)
        self.dropout2 = tf.keras.layers.Dropout(rate)
        self.dropout3 = tf.keras.layers.Dropout(rate)

    def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
        """Apply the block.

        Returns:
            ``(out3, attn_weights_block1, attn_weights_block2)``.
        """
        # Sub-layer 1: self-attention restricted by the look-ahead mask.
        self_attended, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
        out1 = self.layernorm1(self.dropout1(self_attended, training=training) + x)

        # Sub-layer 2: values and keys from the encoder, queries from out1.
        cross_attended, attn_weights_block2 = self.mha2(enc_output, enc_output, out1, padding_mask)
        out2 = self.layernorm2(self.dropout2(cross_attended, training=training) + out1)

        # Sub-layer 3: position-wise feed-forward network.
        transformed = self.dropout3(self.ffn(out2), training=training)
        out3 = self.layernorm3(transformed + out2)

        return out3, attn_weights_block1, attn_weights_block2

<!-- Centering the image -->
<div style="text-align:center">
  <!-- Inserting the image with a URL or file path -->
  <img src="https://i.ibb.co/8PCxvvV/input-tokens.jpg" alt="decod" width="600" height="400">
</div>

### **<span style="color:darkred">Transformer**

 The **Transformer** class inherits from `tf.keras.Model`. The parameter for initializing an instance(object) of the class are the following:
* **`num_layers`:** Number of layers in both the encoder and decoder.
* **`d_model`:** Dimensionality of the model, which is the size of the embedding vectors and the expected output of each sub-layer.
* **`num_heads`:** Number of attention heads in the multi-head attention models.
* **`dff`:** Dimensionality of the feedforward network.
* **`input_vocab_size`:** Vocabulary size of the input.
* **`target_vocab_size`:** Vocabulary size of the target output.
* **`pe_input`:** Maximum sequence length for positional encoding in the input.
* **`pe_target`:** Maximum sequence length for positional encoding in the target.
* **`rate`:** Dropout rate (default is 0.1).
 
The **__init__** method is responsible for initializing the components, while the **call** method defines how these components are applied to input data during the forward pass.

In [None]:
class Transformer(tf.keras.Model):
    """Encoder-decoder transformer with a final dense projection onto the target vocabulary."""

    def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, target_vocab_size, pe_input, pe_target, rate=0.1):
        """Build the encoder, the decoder and the output projection.

        Args:
            num_layers: number of layers in both encoder and decoder.
            d_model: hidden size.
            num_heads: attention heads per layer.
            dff: hidden width of the feed-forward blocks.
            input_vocab_size: size of the source vocabulary.
            target_vocab_size: size of the target vocabulary.
            pe_input: number of positions in the encoder's positional table.
            pe_target: number of positions in the decoder's positional table.
            rate: dropout rate.
        """
        super().__init__()
        # Encoder stack for the source sequence.
        self.encoder = Encoder(num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)
        # Decoder stack for the target sequence.
        self.decoder = Decoder(num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)

        # Maps each decoder output vector to one logit per target-vocabulary entry.
        self.final_layer = tf.keras.layers.Dense(target_vocab_size)

    def call(self, inp, tar, training, enc_padding_mask, look_ahead_mask, dec_padding_mask):
        """Run a forward pass.

        Returns:
            ``(final_output, attention_weights)``: the logits of the final
            dense layer and the decoder's attention-weight dictionary.
        """
        # Encode the source sequence.
        encoded = self.encoder(inp, training, enc_padding_mask)
        # Decode the target sequence against the encoder output.
        decoded, attention_weights = self.decoder(tar, encoded, training, look_ahead_mask, dec_padding_mask)
        # Project to vocabulary logits.
        return self.final_layer(decoded), attention_weights

<!-- Centering the image -->
<div style="text-align:center">
  <!-- Inserting the image with a URL or file path -->
  <img src="https://d2mk45aasx86xg.cloudfront.net/Transformer_architecture_a1d5ffc1e9.webp" alt="decod" width="300" height="200">
</div>

### Training

#### Hyperparameters

In [None]:
# hyper-params
num_layers = 5
d_model = 128
dff = 512
num_heads = 8
EPOCHS = 15

#### Defining losses and other metrics

* Adam accelerates gradient descent by taking into account an exponentially weighted average of the gradients. Using these averages makes the algorithm converge towards the minimum at a faster pace. 

In [None]:
# determines the step size during optimization
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)

* SparseCategoricalCrossentropy is suitable for integer-encoded targets.
Each word in the vocabulary is assigned a unique integer index.

In [None]:
# The model outputs raw logits, hence from_logits=True. reduction='none' keeps
# one loss value per token so that `loss_function` can mask the padding.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

* This function calculates the loss for a batch of predictions (pred) given the true labels (real) using the **sparse categorical crossentropy loss**.

In [None]:
def loss_function(real, pred):
    """Mean cross-entropy over the non-padding target tokens.

    Args:
        real: integer target ids, where 0 marks padding.
        pred: logits predicted for each target position.

    Returns:
        Sum of the per-token losses at non-padding positions divided by the
        number of such positions.
    """
    # Unreduced loss: one value per target position.
    per_token_loss = loss_object(real, pred)

    # 1 for real tokens, 0 for padding, in the dtype of the loss.
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_token_loss.dtype)

    # Zero out padded positions and average over the remaining ones.
    return tf.reduce_sum(per_token_loss * keep) / tf.reduce_sum(keep)

In [None]:
# Per-epoch (epoch, loss) tuples, filled by the training loop and plotted by `hist`.
history={'val':[],'train':[]}
# Running means of the batch losses, reset by the training / validation code.
train_loss = tf.keras.metrics.Mean(name='train_loss')
val_loss = tf.keras.metrics.Mean(name='val_loss')

### Defining the Transformer for Training

In [None]:
# Instantiate the model.
# `pe_input` / `pe_target` size the positional-encoding tables, which are only
# ever sliced to the current sequence length. They therefore need to cover the
# padded sequence lengths, not the vocabulary sizes: passing the vocabulary
# sizes builds needlessly large tables and would fail if a vocabulary were
# smaller than the sequence length. The encodings of the positions actually
# used are unchanged by this.
transformer = Transformer(
    num_layers, 
    d_model, 
    num_heads, 
    dff,
    input_vocab_size=encoder_vocab_size, 
    target_vocab_size=decoder_vocab_size, 
    pe_input=encoder_maxlen, 
    pe_target=decoder_maxlen,
)

#### Training steps

In [None]:
@tf.function
def train_step(inp, tar):
    """Run one optimization step on a batch and record its loss in `train_loss`.

    Args:
        inp: batch of encoder token ids.
        tar: batch of target token ids.
    """
    # Teacher forcing: the decoder sees the target without its last token and
    # is trained to predict the target shifted by one position.
    decoder_input, decoder_target = tar[:, :-1], tar[:, 1:]

    # Masks for the encoder and the decoder.
    enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, decoder_input)

    # Record the forward pass so the loss can be differentiated.
    with tf.GradientTape() as tape:
        predictions, _ = transformer(
            inp, decoder_input,
            True,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )
        loss = loss_function(decoder_target, predictions)

    # Compute the gradients and let the optimizer update the weights.
    trainable = transformer.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, trainable), trainable))

    # Accumulate the batch loss into the running training mean.
    train_loss(loss)
  

In [None]:
def validate():
    """Evaluate the model on `dataset_val` and return the mean batch loss.

    The running `val_loss` metric is reset first, then updated with the loss
    of every validation batch (computed with ``training=False``).

    Returns:
        The metric result as a scalar tensor.
    """
    print('validation started ...')
    # `reset_state` replaces the deprecated `reset_states` spelling.
    val_loss.reset_state()
    for inp, tar in dataset_val:
        tar_inp = tar[:, :-1] # <go> ...
        tar_real = tar[:, 1:] #  ... <stop>

        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

        predictions, _ = transformer(
            inp, tar_inp,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask
        )

        loss = loss_function(tar_real, predictions)
        val_loss(loss)

    print('\n* Validation loss: {} '.format(val_loss.result()) )
    return val_loss.result()

In [None]:
import time

# Continue the epoch numbering from what `history` already holds. On a fresh
# kernel this starts at 0; when the cell is run again, the new epochs are
# appended after the existing ones instead of reusing the same epoch indices
# (which would make the loss curves in `hist` fold back on themselves).
first_epoch = len(history['train'])

for epoch in range(first_epoch, first_epoch + EPOCHS):
    start = time.time()

    # `reset_state` replaces the deprecated `reset_states` spelling.
    train_loss.reset_state()

    for (batch, (inp, tar)) in enumerate(dataset):
        # Computes gradients, updates the model parameters and accumulates the training loss.
        train_step(inp, tar)

        # Periodic progress report.
        if batch % 429 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, train_loss.result()))

    val_loss_ = validate().numpy()
    history['val'].append((epoch,val_loss_))

    print ('\n* Train Loss {:.4f}'.format(train_loss.result()))
    history['train'].append((epoch,train_loss.result().numpy()))

    print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))
    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

In [None]:
# Plot the loss curves recorded so far.
hist(history)

In [None]:
import time

# Second training round: train for EPOCHS more epochs.
# The epoch numbering continues from what `history` already holds; restarting
# at 0 would append duplicate epoch indices and make the loss curves drawn by
# `hist` fold back on themselves.
first_epoch = len(history['train'])

for epoch in range(first_epoch, first_epoch + EPOCHS):
    start = time.time()

    # `reset_state` replaces the deprecated `reset_states` spelling.
    train_loss.reset_state()

    for (batch, (inp, tar)) in enumerate(dataset):
        train_step(inp, tar)

        # Periodic progress report.
        if batch % 429 == 0:
            print ('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, train_loss.result()))

    val_loss_ = validate().numpy()
    history['val'].append((epoch,val_loss_))
    print ('\n* Train Loss {:.4f}'.format(train_loss.result()))
    history['train'].append((epoch,train_loss.result().numpy()))

    print ('Epoch {} Loss {:.4f}'.format(epoch + 1, train_loss.result()))
    print ('Time taken for 1 epoch: {} secs\n'.format(time.time() - start))

In [None]:
# Plot the loss curves again, now including the additional epochs.
hist(history)

### **<span style="color:darkred">Inference**

Predicting on unseen data

In [None]:
def evaluate(input_document):
    """Greedily decode a summary for one raw input document.

    The document is tokenized and padded like the training inputs. Starting
    from the ``<go>`` token, the model is called repeatedly; at each step the
    highest-scoring next token is appended, until ``<stop>`` is predicted or
    ``decoder_maxlen`` steps have run.

    Args:
        input_document: the text to summarize.

    Returns:
        ``(token_ids, attention_weights)``: the generated ids, starting with
        the ``<go>`` id and excluding ``<stop>``, and the attention weights of
        the last decoding step.
    """
    token_ids = text_tokenizer.texts_to_sequences([input_document])
    padded = tf.keras.preprocessing.sequence.pad_sequences(
        token_ids, maxlen=encoder_maxlen, padding='post', truncating='post')
    encoder_input = tf.expand_dims(padded[0], 0)

    # The decoder starts from a batch of one sequence holding only <go>.
    output = tf.expand_dims([summary_tokenizer.word_index["<go>"]], 0)

    for _ in range(decoder_maxlen):
        enc_padding_mask, combined_mask, dec_padding_mask = create_masks(encoder_input, output)

        predictions, attention_weights = transformer(
            encoder_input,
            output,
            False,
            enc_padding_mask,
            combined_mask,
            dec_padding_mask,
        )

        # Only the logits of the last position matter for the next token.
        last_step = predictions[:, -1:, :]
        predicted_id = tf.cast(tf.argmax(last_step, axis=-1), tf.int32)

        # Stop as soon as the end marker is produced (it is not appended).
        if predicted_id == summary_tokenizer.word_index["<stop>"]:
            break

        output = tf.concat([output, predicted_id], axis=-1)

    return tf.squeeze(output, axis=0), attention_weights


def summarize(input_document):
    """Return the generated summary of ``input_document`` as text.

    The attention weights returned by `evaluate` are ignored here; they could
    be used later to plot attention heatmaps.
    """
    generated_ids, _ = evaluate(input_document=input_document)

    # Drop the leading <go> id and add a batch axis for the tokenizer.
    without_start = np.expand_dims(generated_ids.numpy()[1:], 0)

    # There is exactly one decoded document in the batch.
    return summary_tokenizer.sequences_to_texts(without_start)[0]

Try n°1:

In [None]:
# Compare the reference headline with the model's summary of one article.
# NOTE(review): `i` is a row *label* of the original frame; it is only valid
# if that row ended up in the training split - confirm after resampling.
i=51560
print(text[i])
print()
print((summary[i]))
print()
# The model must summarize the article text, not the reference headline.
print(summarize(text[i]))

Try n°2:

In [None]:
# Summarize a free-form text. The pieces are joined by implicit string
# concatenation with an explicit trailing space on each line; a backslash
# continuation inside the literal would fuse the last word of one line with
# the first word of the next (e.g. "soundeffects").
summarize(
    "add music to the background during certain games and activities or allow sound "
    "effects to play every time a button or link is clicked on within the website when children "
    "interact with happy characters they may be influenced to revisit the website again in the "
    "future due to having a positive experience of their own use animations or photos of real people "
    "who are smiling and displaying positive nonthreatening body language "
)

## **<span style="color:darkblue">  Conclusion**

This is one of the most challenging NLP tasks as it requires a range of abilities, such as understanding long passages and generating coherent text that captures the main topics in a document. 
However, when done well, text summarization is a powerful tool that can speed up various business processes by relieving the burden of domain experts to read long documents in detail.

## **<span style="color:darkblue"> References**
    
* https://www.turing.com/kb/brief-introduction-to-transformers-and-their-power
* https://jalammar.github.io/illustrated-transformer/

<!-- Centering the image -->
<div style="text-align:center">
  <!-- Inserting the image with a URL or file path -->
  <img src="https://i.ibb.co/yWVQ6x8/bro.png" alt="decod" width="900" height="600">
</div>