## Import TensorFlow 2.0 and supporting packages

In [None]:
#The packages listed in requirements.txt have to be installed before running this Colab notebook.
#Import the essential packages.

#https://www.tensorflow.org/install/pip
import tensorflow as tf
#https://numpy.org/doc/stable/user/absolute_beginners.html
import numpy as np
#https://docs.python.org/3/library/unicodedata.html
import unicodedata
#https://docs.python.org/3/library/re.html
import re
#https://docs.python.org/3/library/time.html
import time
#https://pandas.pydata.org/pandas-docs/stable/user_guide/10min.html
import pandas as pd

## Load the dataset and select the training samples

In [None]:
#The dataset was taken from https://www.kaggle.com/sunnysai12345/news-summary
#Only the first 300 rows of the CSV file are read.
data_set = pd.read_csv(filepath_or_buffer="news_summary_more.csv", nrows=300)

In [None]:
#Only the first 25 rows are kept because of computation limitations:
#'text' holds the articles, 'headlines' holds their summaries.
raw_data, summary_data = (
    data_set[column].head(25) for column in ('text', 'headlines')
)

## Preprocessing

In [None]:
#"convert_uni_2_ascii" strips accents: the text is decomposed (NFD) and every
#combining mark (Unicode category 'Mn') is dropped. Other non-ASCII characters are kept.
def convert_uni_2_ascii(s):
    """Return `s` in NFD form with all non-spacing marks ('Mn') removed."""
    decomposed = unicodedata.normalize('NFD', s)
    kept = []
    for char in decomposed:
        if unicodedata.category(char) == 'Mn':
            continue
        kept.append(char)
    return ''.join(kept)

#Normalizing a string (data preprocessing).
def normalize(s):
    """Clean one text for tokenization.

    Steps, applied in order: strip accents with convert_uni_2_ascii, put a
    space in front of '!', '.' and '?', replace every run of characters other
    than letters and those three marks by one space, collapse whitespace runs.
    Leading/trailing spaces are not stripped.
    """
    cleaned = convert_uni_2_ascii(s)
    substitutions = (
        (r'([!.?])', r' \1'),
        (r'[^a-zA-Z.!?]+', r' '),
        (r'\s+', r' '),
    )
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned

#Turn the selected articles and headlines into plain Python lists.
summary_data = list(summary_data)
#Apply the preprocessing to every article.
raw_data = list(map(normalize, list(raw_data)))
#Decoder input/target pairs are both built from the headlines: despite its name,
#raw_data_in holds the headlines prefixed with "<start>" (fed to the decoder), and
#summary_data_out holds the same headlines suffixed with "<end>" (the targets).
raw_data_in = ['<start> ' + normalize(headline) for headline in summary_data]
summary_data_out = [normalize(headline) + ' <end>' for headline in summary_data]

## Tokenization

In [None]:
#Tokenizer for the articles. filters='' disables character filtering, so the
#punctuation tokens produced by normalize() stay in the vocabulary.
raw_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
raw_tokenizer.fit_on_texts(raw_data)
#Convert the articles to id sequences; padding='post' appends the padding so
#that all rows end up with the same length.
data_raw = tf.keras.preprocessing.sequence.pad_sequences(
    raw_tokenizer.texts_to_sequences(raw_data), padding='post')

#A single tokenizer is fitted on both the decoder inputs and the decoder targets,
#so "<start>" and "<end>" share one vocabulary with the headline words.
summary_tokenizer = tf.keras.preprocessing.text.Tokenizer(filters='')
summary_tokenizer.fit_on_texts(raw_data_in)
summary_tokenizer.fit_on_texts(summary_data_out)
#Decoder inputs ("<start> ..."), converted to ids and padded at the end.
data_summ_in = tf.keras.preprocessing.sequence.pad_sequences(
    summary_tokenizer.texts_to_sequences(raw_data_in), padding='post')
#Decoder targets ("... <end>"), converted to ids and padded at the end.
data_summ_out = tf.keras.preprocessing.sequence.pad_sequences(
    summary_tokenizer.texts_to_sequences(summary_data_out), padding='post')

## Create tf.data.Dataset object

In [None]:
#Slice the padded arrays into (article, decoder input, decoder target) triples
#and group them into batches to feed into the network.
size = 5
dataset = tf.data.Dataset.from_tensor_slices((data_raw, data_summ_in, data_summ_out))
#The shuffle buffer covers the whole (small) dataset: a buffer smaller than the
#dataset only shuffles locally, so early samples would always land in the first batches.
dataset = dataset.shuffle(max(len(data_raw), 1)).batch(size)

## Create the Positional Embedding

In [None]:
#Positional embedding, a basic building block of the ATTENTION mechanism.
#https://datascience.stackexchange.com/questions/51065/what-is-the-positional-encoding-in-the-transformer-model
def positional_embedding(pos, model_size):
    """Return the sinusoidal encoding of position `pos` as a (1, model_size) array.

    Even columns hold sin(pos / 10000 ** (i / model_size)); every odd column
    holds the cosine of the angle used by the even column just before it.
    """
    encoding = np.zeros((1, model_size))
    for dim in range(model_size):
        is_odd = dim % 2
        #odd columns reuse the exponent of the preceding even column.
        angle = pos / 10000 ** ((dim - is_odd) / model_size)
        encoding[:, dim] = np.cos(angle) if is_odd else np.sin(angle)
    return encoding
#Parameters: the table gets one row per position of the longest padded sequence
#(articles or decoder inputs); MODEL_SIZE is the width of every row.
max_length = max(len(data_raw[0]), len(data_summ_in[0]))
MODEL_SIZE = 128

#Stack the per-position encodings row by row and freeze them as a float32 constant.
pes = tf.constant(
    np.concatenate(
        [positional_embedding(position, MODEL_SIZE) for position in range(max_length)],
        axis=0),
    dtype=tf.float32)

In [None]:
#CrossHeadAttention implements multi-head scaled dot-product attention.
#https://towardsdatascience.com/transformers-explained-visually-part-3-multi-head-attention-deep-dive-1c1ff1024853
class CrossHeadAttention(tf.keras.Model):
    """Multi-head attention of a query sequence over a key/value sequence.

    Each of the `h` heads owns separate query, key and value projections, all of
    width `model_size`; the concatenated head outputs are projected back to
    `model_size` by `wo`.
    """

    def __init__(self, model_size, h):
        super().__init__()
        #every head projects to the full model width.
        self.query_size = model_size
        self.key_size = model_size
        self.value_size = model_size
        self.h = h
        self.wq = [tf.keras.layers.Dense(self.query_size) for _ in range(h)]
        self.wk = [tf.keras.layers.Dense(self.key_size) for _ in range(h)]
        self.wv = [tf.keras.layers.Dense(self.value_size) for _ in range(h)]
        self.wo = tf.keras.layers.Dense(model_size)

    def call(self, decoder_output, encoder_output):
        """Attend from `decoder_output` (queries) over `encoder_output` (keys and values).

        Both arguments are rank-3 tensors, presumably (batch, length, features);
        the softmax and the concatenation below operate on axis 2.
        """
        #scores are divided by sqrt(key_size) before the softmax.
        scale = tf.math.sqrt(tf.dtypes.cast(self.key_size, tf.float32))
        #per-head attention outputs, concatenated on the feature axis below.
        head_outputs = []
        for query_proj, key_proj, value_proj in zip(self.wq, self.wk, self.wv):
            score = tf.matmul(
                query_proj(decoder_output), key_proj(encoder_output),
                transpose_b=True) / scale
            weights = tf.nn.softmax(score, axis=2)
            head_outputs.append(tf.matmul(weights, value_proj(encoder_output)))
        return self.wo(tf.concat(head_outputs, axis=2))

## Create the Encoder

In [None]:
#Encoder: token embedding plus positional encoding, followed by a stack of
#self-attention and feed-forward sub-layers.
#https://jalammar.github.io/illustrated-transformer/
class Encoder(tf.keras.Model):
    """Transformer-style encoder over a batch of token-id sequences.

    Args:
        vocab_size: number of rows of the embedding table; must be larger than
            the largest token id fed to the encoder.
        model_size: width of the embeddings and of every sub-layer output.
        num_layers: number of (self-attention, feed-forward) layer pairs.
        h: number of attention heads in each attention block.
    """

    def __init__(self, vocab_size, model_size, num_layers, h):
        super().__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.h = h
        self.embedding = tf.keras.layers.Embedding(vocab_size, model_size)
        self.attention = [CrossHeadAttention(model_size, h) for _ in range(num_layers)]
        #Layer normalisation (as in the referenced Transformer write-up) is used for
        #both "add & normalise" steps; it needs no `training` flag, which the custom
        #training loop in this notebook never passes.
        self.attention_norm = [tf.keras.layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.dense_1 = [tf.keras.layers.Dense(512, activation='relu') for _ in range(num_layers)]
        self.dense_2 = [tf.keras.layers.Dense(model_size) for _ in range(num_layers)]
        self.ffn_norm = [tf.keras.layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]

    def call(self, sequence):
        """Encode `sequence` and return one `model_size` vector per input position.

        `sequence` is an integer tensor, assumed (batch, length), whose length
        must not exceed the number of rows of the module-level `pes` table.
        """
        seq_len = sequence.shape[1]
        #embed all positions at once and add the matching positional rows.
        sub_in = self.embedding(sequence) + pes[:seq_len, :]
        for i in range(self.num_layers):
            #unmasked self-attention: every position attends over the whole
            #sequence, so all queries can be processed in a single call.
            sub_out = self.attention[i](sub_in, sub_in)
            #residual connection followed by normalisation.
            sub_out = self.attention_norm[i](sub_in + sub_out)
            #position-wise feed-forward block with its own residual + normalisation.
            ffn_out = self.dense_2[i](self.dense_1[i](sub_out))
            sub_in = self.ffn_norm[i](sub_out + ffn_out)
        return sub_in

In [None]:
#Decoder: causal self-attention, attention over the encoder output and a
#feed-forward block per layer, followed by a projection onto the vocabulary.
class Decoder(tf.keras.Model):
    """Transformer-style decoder producing vocabulary logits for every position.

    Args:
        vocab_size: size of the output vocabulary; also the number of rows of
            the embedding table, so it must be larger than the largest token id.
        model_size: width of the embeddings and of every sub-layer output.
        num_layers: number of decoder layers.
        h: number of attention heads in each attention block.
    """

    def __init__(self, vocab_size, model_size, num_layers, h):
        super().__init__()
        self.model_size = model_size
        self.num_layers = num_layers
        self.h = h
        self.embedding = tf.keras.layers.Embedding(vocab_size, model_size)
        #Layer normalisation is used for every "add & normalise" step; it needs no
        #`training` flag, which the custom training loop in this notebook never passes.
        self.attention_bot = [CrossHeadAttention(model_size, h) for _ in range(num_layers)]
        self.attention_bot_norm = [tf.keras.layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.attention_mid = [CrossHeadAttention(model_size, h) for _ in range(num_layers)]
        self.attention_mid_norm = [tf.keras.layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.dense_1 = [tf.keras.layers.Dense(512, activation='relu') for _ in range(num_layers)]
        self.dense_2 = [tf.keras.layers.Dense(model_size) for _ in range(num_layers)]
        self.ffn_norm = [tf.keras.layers.LayerNormalization(epsilon=1e-6) for _ in range(num_layers)]
        self.dense = tf.keras.layers.Dense(vocab_size)

    def call(self, sequence, encoder_output):
        """Return logits over the vocabulary for every position of `sequence`.

        `sequence` is an integer tensor, assumed (batch, length), whose length
        must not exceed the number of rows of the module-level `pes` table.
        `encoder_output` is the tensor returned by the encoder for the same batch.
        """
        seq_len = sequence.shape[1]
        #embed all positions at once and add the matching positional rows.
        bot_sub_in = self.embedding(sequence) + pes[:seq_len, :]
        for i in range(self.num_layers):
            #causal self-attention, one query position at a time.
            bot_sub_out = []
            for j in range(seq_len):
                #position j may look at positions 0..j (itself included) but never
                #at later ones; the window is therefore never empty.
                values = bot_sub_in[:, :j + 1, :]
                attention = self.attention_bot[i](
                    tf.expand_dims(bot_sub_in[:, j, :], axis=1), values)
                bot_sub_out.append(attention)
            bot_sub_out = tf.concat(bot_sub_out, axis=1)
            bot_sub_out = self.attention_bot_norm[i](bot_sub_in + bot_sub_out)
            #attention over the encoder output is unmasked, so all decoder
            #positions are handled in a single call.
            mid_sub_out = self.attention_mid[i](bot_sub_out, encoder_output)
            mid_sub_out = self.attention_mid_norm[i](mid_sub_out + bot_sub_out)
            #position-wise feed-forward block with its own residual + normalisation.
            ffn_out = self.dense_2[i](self.dense_1[i](mid_sub_out))
            bot_sub_in = self.ffn_norm[i](ffn_out + mid_sub_out)
        #project every position onto the vocabulary.
        return self.dense(bot_sub_in)

In [None]:
#SparseCategoricalCrossentropy on raw logits is the training objective.
crossentropy = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
def loss_func(targets, logits):
    """Cross-entropy between `targets` and `logits`, ignoring target id 0.

    Positions whose target id equals 0 get a sample weight of 0 and all other
    positions a weight of 1 (id 0 is presumably the padding value written by
    pad_sequences - confirm if the padding value is ever changed).
    """
    is_token = tf.math.logical_not(tf.math.equal(targets, 0))
    weights = tf.cast(is_token, dtype=tf.int64)
    return crossentropy(targets, logits, sample_weight=weights)

#Adam optimizer with its default settings.
optimizer = tf.keras.optimizers.Adam()

In [None]:
#predict function: prints the generated summary for a query text.
def predict(test_source_text=None, max_words=14):
    """Greedily decode a summary for `test_source_text` and print it.

    Args:
        test_source_text: text to summarize; when None, a random entry of the
            module-level `raw_data` list is used.
        max_words: upper bound on the number of generated tokens.

    Prints the source text, its token ids and the generated words. Returns None.
    Relies on the module-level `raw_tokenizer`, `summary_tokenizer`, `encoder`
    and `decoder`.
    """
    if test_source_text is None:
        test_source_text = raw_data[np.random.choice(len(raw_data))]
    print(test_source_text)
    test_source_seq = raw_tokenizer.texts_to_sequences([test_source_text])
    print(test_source_seq)
    if not test_source_seq[0]:
        #none of the words is in the source vocabulary: nothing to encode.
        print('No known words in the input text; nothing to summarize.')
        return

    en_output = encoder(tf.constant(test_source_seq))

    #decoding starts from the "<start>" token of the summary vocabulary.
    de_input = tf.constant([[summary_tokenizer.word_index['<start>']]], dtype=tf.int64)

    out_words = []

    while len(out_words) < max_words:
        de_output = decoder(de_input, en_output)
        #greedy choice: most likely token at the last decoded position.
        new_word = tf.expand_dims(tf.argmax(de_output, -1)[:, -1], axis=1)
        word = summary_tokenizer.index_word.get(int(new_word.numpy()[0][0]))
        if word is None:
            #id 0 has no entry in index_word; treat it as the end of the
            #summary instead of raising KeyError.
            break
        out_words.append(word)
        if word == '<end>':
            break
        #feed the chosen token back in for the next step.
        de_input = tf.concat((de_input, new_word), axis=-1)

    print(' '.join(out_words))

In [None]:
#Training step
@tf.function
def train_step(source_seq, target_seq_in, target_seq_out):
    """Run one optimisation step on a batch and return its loss.

    The forward pass feeds `source_seq` to the module-level `encoder` and
    `target_seq_in` to the module-level `decoder`; the decoder logits are scored
    against `target_seq_out` with `loss_func`, and the module-level `optimizer`
    updates the trainable variables of both models.
    """
    with tf.GradientTape() as tape:
        logits = decoder(target_seq_in, encoder(source_seq))
        loss = loss_func(target_seq_out, logits)
    trainable = encoder.trainable_variables + decoder.trainable_variables
    optimizer.apply_gradients(zip(tape.gradient(loss, trainable), trainable))
    return loss

In [None]:
NUM_EPOCHS = 10

#Build the models if they do not exist yet (e.g. after Restart & Run All).
#Keras' Tokenizer numbers words from 1 and id 0 is used for padding, hence the +1.
#The models are deliberately not rebuilt when they already exist: train_step is a
#traced tf.function and keeps using the models it was first called with.
#NOTE(review): the NaN losses recorded below this cell are consistent with models
#sized for a different vocabulary - confirm with a fresh run.
if 'encoder' not in globals() or 'decoder' not in globals():
    H = 2
    NUM_LAYERS = 2
    encoder = Encoder(len(raw_tokenizer.word_index) + 1, MODEL_SIZE, NUM_LAYERS, H)
    decoder = Decoder(len(summary_tokenizer.word_index) + 1, MODEL_SIZE, NUM_LAYERS, H)

start_time = time.time()
for e in range(NUM_EPOCHS):
    for source_seq, target_seq_in, target_seq_out in dataset:
        loss = train_step(source_seq, target_seq_in,
                          target_seq_out)

    #the reported value is the loss of the last batch of the epoch.
    print('Epoch {} Loss {:.4f}'.format(
          e + 1, loss.numpy()))

#average wall-clock time per epoch.
print('Average elapsed time: {:.2f}s'.format(
      (time.time() - start_time) / max(NUM_EPOCHS, 1)))

Epoch 1 Loss nan
Epoch 2 Loss nan
Epoch 3 Loss nan
Epoch 4 Loss nan
Epoch 5 Loss nan
Epoch 6 Loss nan
Epoch 7 Loss nan
Epoch 8 Loss nan
Epoch 9 Loss nan
Epoch 10 Loss nan
Average elapsed time: 9.91s
Your idea is not entirely crazy .
[[24, 25, 6, 26, 27, 28, 1]]
0


In [None]:
#Summarize the first five articles; each text is passed through normalize()
#again before prediction.
for test_sent in raw_data[:5]:
    predict(normalize(test_sent))

Saurav Kant an alumnus of upGrad and IIIT B s PG Program in Machine learning and Artificial Intelligence was a Sr Systems Engineer at Infosys with almost years of work experience . The program and upGrad s degree career support helped him transition to a Data Scientist at Tech Mahindra with salary hike . upGrad s Online Power Learning has powered lakh careers .
[[37, 46, 30, 9, 46, 5, 37, 48, 1, 4, 46, 30, 39, 5, 1, 30, 1]]


KeyError: ignored