# Text Summarizer Model Building

### Setup

In [2]:
## Get the embeddings 
import os 
import numpy as np 
import pandas as pd
import tensorflow as tf
from tensorflow import keras 
import tensorflow_hub as hub

# Loading the tokenization class for bert
# The tokenization file is present in the directory of the ipynb
import tokenization 

### Tokenizer & BERT loading

In [3]:
# Calling the FullTokenizer
# Establishing the tokenizer attributes
FullTokenizer = tokenization.FullTokenizer
# URL for the 12-layered BERT embeddings 
# this is uncased - meaning all the words are converted to lower case 
BERT_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'

# Creating the BERT keras layer 
bert_layer = hub.KerasLayer(BERT_URL, trainable = True)
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
vocabulary_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
tokenizer = FullTokenizer(vocabulary_file, to_lower_case)

# URL for the 12-layered BERT embeddings 
# this is uncased - meaning all the words are converted to lower case 
BERT_URL = 'https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2'

# Creating the BERT keras layer 
bert_layer = hub.KerasLayer(BERT_URL, trainable = True)

### 40 words Extractive Model

In [4]:
# Inputs 
max_len = 40 # Need this as the argument 
input_ids = keras.layers.Input(shape = (max_len, ), 
                              dtype = tf.int32, name = "input_ids")
input_mask = keras.layers.Input(shape = (max_len, ), 
                              dtype = tf.int32, name = "input_mask")
input_segments= keras.layers.Input(shape = (max_len, ), 
                              dtype = tf.int32, name = "input_segment")


# Bert layer 
# Works with teh BERT_URL
BERT_PATH = "./bert_uncased/bert_cased_L-12_H-768_A-12_1.tar/bert_cased_L-12_H-768_A-12_1"
bert_layer = hub.KerasLayer(BERT_URL, trainable=True)
# Bert Outputs 
sentence_pooled_outputs, sentence_embeddings = bert_layer([input_ids, input_segments, input_mask])

# End Lambda Layer
lambda_layer = keras.layers.Lambda(lambda inputs: tf.reduce_mean(inputs, axis = 1))
out = lambda_layer(sentence_pooled_outputs)

# Model Building 
model = keras.models.Model(inputs = [input_ids, input_segments, input_mask], 
                          outputs = [out])

In [5]:
# Model Summary 
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_ids (InputLayer)          [(None, 40)]         0                                            
__________________________________________________________________________________________________
input_segment (InputLayer)      [(None, 40)]         0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 40)]         0                                            
__________________________________________________________________________________________________
keras_layer_2 (KerasLayer)      [(None, 768), (None, 109482241   input_ids[0][0]                  
                                                                 input_segment[0][0]   

In [None]:
# Saving the model 
save_path = "./models/"
model_name = "ExtractiveSummarizer_v1_40_words"
model.save(save_path, model_name)

### Embeddings Visualization

In [11]:
# Save file
vocab_file = "./vocab.txt"
with open(vocab_file, "r", encoding="utf-8") as fp:
    vocab_words = fp.readlines()

In [26]:
# Writing vocab files into metadata.tsv 
metadata_file = "./metadata.tsv"
with open(metadata_file, "w", encoding="utf-8") as fp:
    metadata_string = "".join(vocab_words)
    fp.write(metadata_string)

In [28]:
# Weights saving 
weights = tf.Variable(model.layers[3].get_weights()[0][1:])

In [36]:
# Checkpoints
log_dir = "./logs/"
checkpoint = tf.train.Checkpoint(embedding=weights)
checkpoint.save(os.path.join(log_dir, "embedding.ckpt"))

'./logs/embedding.ckpt-1'

In [37]:
# Embeddings 
from tensorboard.plugins import projector
config = projector.ProjectorConfig()
embedding = config.embeddings.add()

# The name of the tensor will be suffixed by `/.ATTRIBUTES/VARIABLE_VALUE`.
embedding.tensor_name = "embedding/.ATTRIBUTES/VARIABLE_VALUE"
embedding.metadata_path = 'metadata.tsv'
projector.visualize_embeddings(log_dir, config)