# Using a pretrained BERT Base model to create an embedding vector for each text

In [1]:
# %load_ext tensorboard

import tensorflow as tf
import datetime

# Clear any logs from previous runs
# !rm -rf ./logs/ 

In [2]:
#all imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import tensorflow_hub as hub
from tensorflow.keras.models import Model
from tqdm import  tqdm
import pickle
from sklearn.metrics import  accuracy_score

import warnings
warnings.filterwarnings("ignore")

In [3]:
pip install  sentencepiece

Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/f5/99/e0808cb947ba10f575839c43e8fafc9cc44e4a7a2c8f79c60db48220a577/sentencepiece-0.1.95-cp37-cp37m-manylinux2014_x86_64.whl (1.2MB)
[K     |████████████████████████████████| 1.2MB 9.8MB/s 
[?25hInstalling collected packages: sentencepiece
Successfully installed sentencepiece-0.1.95


<pre><font size=6> Preprocessing</font></pre>

In [5]:
from google.colab import drive

# Mount Google Drive; every data path used below lives under this mount point.
DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [6]:
# Load the preprocessed splits. Train and cv pickles hold (texts, labels);
# the test pickle holds texts only.
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
# `with` guarantees each file handle is closed once its content has been read.
with open("/content/drive/MyDrive/ Competitions/ Predict_the_news_category/ data/train_data_bert.pkl", 'rb') as f:
    X_train_p, y_train = pickle.load(f)
with open("/content/drive/MyDrive/ Competitions/ Predict_the_news_category/ data/test_data_bert.pkl", 'rb') as f:
    X_test_p = pickle.load(f)
with open("/content/drive/MyDrive/ Competitions/ Predict_the_news_category/ data/cv_data_bert.pkl", 'rb') as f:
    X_cv_p, y_cv = pickle.load(f)

<pre><font size=6>Creating BERT Model</font></pre>

In [7]:
## Build the three Keras inputs consumed by the pretrained TF-Hub BERT layer
tf.keras.backend.clear_session()

# Fixed length of every sequence fed to BERT (currently 512 tokens).
max_seq_length = 512


def _bert_int_input(layer_name):
    """Return an int32 Keras Input of shape (max_seq_length,) named `layer_name`."""
    return tf.keras.layers.Input(shape=(max_seq_length,), dtype=tf.int32, name=layer_name)


# BERT takes three aligned integer sequences:
#   input_word_ids - the words of the sequence represented as integers
#   input_mask     - mask vector, needed whenever the sequence is padded
#   segment_ids    - all 0 for a single sentence; for two sentences separated
#                    by [SEP], 0 for the first sentence and 1 for the second
input_word_ids = _bert_int_input("input_word_ids")
input_mask = _bert_int_input("input_mask")
segment_ids = _bert_int_input("segment_ids")


In [8]:
# Pretrained BERT encoder from TensorFlow Hub, loaded with frozen weights
BERT_HUB_URL = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/1"
bert_layer = hub.KerasLayer(BERT_HUB_URL, trainable=False)

bert_inputs = [input_word_ids, input_mask, segment_ids]
pooled_output, sequence_output = bert_layer(bert_inputs)

# Feature-extraction model: only the pooled output is exposed, the sequence
# output is not used. Background on the two outputs:
# https://www.kaggle.com/questions-and-answers/86510
bert_model = Model(inputs=bert_inputs, outputs=pooled_output)

In [9]:
# Print the layer-by-layer summary (output shapes and parameter counts) of bert_model
bert_model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_word_ids (InputLayer)     [(None, 512)]        0                                            
__________________________________________________________________________________________________
input_mask (InputLayer)         [(None, 512)]        0                                            
__________________________________________________________________________________________________
segment_ids (InputLayer)        [(None, 512)]        0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_word_ids[0][0]             
                                                                 input_mask[0][0]             

In [10]:
# Display the model's output tensor (the pooled output chosen when bert_model was built)
bert_model.output

<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'keras_layer')>

<pre><font size=6>Tokenization</font></pre>

In [12]:
# Read the tokenizer settings bundled with the loaded hub module:
# the vocabulary file path and the lower-casing flag.
hub_module = bert_layer.resolved_object
vocab_file = hub_module.vocab_file.asset_path.numpy()
do_lower_case = hub_module.do_lower_case.numpy()

In [13]:
import sys

# Drive folder added to the import path so the `tokenization` module can be
# imported (presumably where tokenization.py is stored - verify on your Drive).
TOKENIZATION_DIR = '/content/drive/My Drive/ Deep learning assignments/ NLP with Transfer Leraning'
sys.path.append(TOKENIZATION_DIR)

from tokenization import FullTokenizer

# Tokenizer built from the hub module's vocabulary file and lower-casing flag
tokenizer = FullTokenizer(vocab_file, do_lower_case)

In [16]:
def tokenize(X):
    """Encode every text in ``X`` into fixed-length BERT input arrays.

    Each text is split with the module-level ``tokenizer``, wrapped as
    ``[CLS] ... [SEP]`` and right-padded with ``[PAD]`` up to the
    module-level ``max_seq_length``. A text with more than
    ``max_seq_length - 2`` tokens keeps the first half and the last half of
    that budget; the middle of the article is dropped.

    Parameters
    ----------
    X : sequence of str
        Texts to encode, read as ``X[0] .. X[len(X) - 1]``.

    Returns
    -------
    tuple of np.ndarray
        ``(X_tokens, X_mask, X_segment)``, each of shape
        ``(len(X), max_seq_length)``:
        token ids; mask with 1 on real tokens (including ``[CLS]`` and
        ``[SEP]``) and 0 on padding; segment ids, all 0 because every text
        is encoded as a single segment.
    """
    budget = max_seq_length - 2      # room left once [CLS] and [SEP] are added
    head_len = budget // 2
    tail_len = budget - head_len

    n_texts = len(X)
    # Allocate the outputs once. Stacking one row at a time re-copies the whole
    # array on every iteration, which is quadratic in the number of texts.
    X_tokens = np.zeros((n_texts, max_seq_length), dtype=int)
    X_mask = np.zeros((n_texts, max_seq_length), dtype=int)
    X_segment = np.zeros((n_texts, max_seq_length), dtype=int)

    for row in tqdm(range(n_texts)):
        tokens = list(tokenizer.tokenize(X[row]))

        if len(tokens) > budget:
            # Keep the beginning AND the end of a long article. The previous
            # slicing joined [0:half] with [half:budget], i.e. only the head.
            tokens = tokens[:head_len] + tokens[len(tokens) - tail_len:]

        tokens = ['[CLS]', *tokens, '[SEP]']
        n_real = len(tokens)

        padded = tokens + ['[PAD]'] * (max_seq_length - n_real)
        X_tokens[row] = tokenizer.convert_tokens_to_ids(padded)
        X_mask[row, :n_real] = 1     # padding positions stay 0

    return (X_tokens, X_mask, X_segment)

In [17]:
# Encode each split (train, then test, then cv) into BERT input arrays
train_encoded = tokenize(X_train_p)
test_encoded = tokenize(X_test_p)
cv_encoded = tokenize(X_cv_p)

# Unpack every result into its (tokens, mask, segment) arrays
X_train_tokens, X_train_mask, X_train_segment = train_encoded
X_test_tokens, X_test_mask, X_test_segment = test_encoded
X_cv_tokens, X_cv_mask, X_cv_segment = cv_encoded

100%|██████████| 6712/6712 [01:30<00:00, 74.42it/s]
100%|██████████| 2748/2748 [00:15<00:00, 179.36it/s]
100%|██████████| 916/916 [00:02<00:00, 327.56it/s]


In [18]:
# (label, array) pairs grouped per split; a separator line is printed between groups
shape_report = (
    (('shape of X_train_tokens : ', X_train_tokens),
     ('shape of X_train_mask : ', X_train_mask),
     ('shape of X_train_segment : ', X_train_segment)),
    (('shape of X_test_tokens : ', X_test_tokens),
     ('shape of X_test_mask : ', X_test_mask),
     ('shape of X_test_segment : ', X_test_segment)),
    (('shape of X_cv_tokens : ', X_cv_tokens),
     ('shape of X_cv_mask : ', X_cv_mask),
     ('shape of X_cv_segment : ', X_cv_segment)),
)

for group_no, group in enumerate(shape_report):
    if group_no:
        print('='*50)
    for label, arr in group:
        print(label, arr.shape)

shape of X_train_tokens :  (6712, 512)
shape of X_train_mask :  (6712, 512)
shape of X_train_segment :  (6712, 512)
shape of X_test_tokens :  (2748, 512)
shape of X_test_mask :  (2748, 512)
shape of X_test_segment :  (2748, 512)
shape of X_cv_tokens :  (916, 512)
shape of X_cv_mask :  (916, 512)
shape of X_cv_segment :  (916, 512)


In [19]:
 
pickle.dump((X_train_tokens,X_train_mask,X_train_segment, y_train),open('/content/drive/MyDrive/ Competitions/ Predict_the_news_category/ data/train_bert.pkl','wb'))
pickle.dump((X_train_tokens,X_train_mask,X_train_segment),open('/content/drive/MyDrive/ Competitions/ Predict_the_news_category/ data/test_bert.pkl','wb'))
pickle.dump((X_train_tokens,X_train_mask,X_train_segment, y_cv),open('/content/drive/MyDrive/ Competitions/ Predict_the_news_category/ data/cv_bert.pkl','wb'))

In [20]:
X_train_tokens,X_train_mask,X_train_segment,y_train = pickle.load(open("/content/drive/MyDrive/ Competitions/ Predict_the_news_category/ data/train_bert.pkl", 'rb')) 
X_train_tokens,X_train_mask,X_train_segment = pickle.load(open("/content/drive/MyDrive/ Competitions/ Predict_the_news_category/ data/test_bert.pkl",'rb'))
X_train_tokens,X_train_mask,X_train_segment,y_cv =  pickle.load(open("/content/drive/MyDrive/ Competitions/ Predict_the_news_category/ data/cv_bert.pkl",'rb'))

In [21]:
# Sanity check after reloading: print each array's shape, one group per split,
# with a separator line between groups
reloaded_shape_report = (
    (('shape of X_train_tokens : ', X_train_tokens),
     ('shape of X_train_mask : ', X_train_mask),
     ('shape of X_train_segment : ', X_train_segment)),
    (('shape of X_test_tokens : ', X_test_tokens),
     ('shape of X_test_mask : ', X_test_mask),
     ('shape of X_test_segment : ', X_test_segment)),
    (('shape of X_cv_tokens : ', X_cv_tokens),
     ('shape of X_cv_mask : ', X_cv_mask),
     ('shape of X_cv_segment : ', X_cv_segment)),
)

for group_no, group in enumerate(reloaded_shape_report):
    if group_no:
        print('='*50)
    for label, arr in group:
        print(label, arr.shape)

shape of X_train_tokens :  (6712, 512)
shape of X_train_mask :  (6712, 512)
shape of X_train_segment :  (6712, 512)
shape of X_test_tokens :  (2748, 512)
shape of X_test_mask :  (2748, 512)
shape of X_test_segment :  (2748, 512)
shape of X_cv_tokens :  (916, 512)
shape of X_cv_mask :  (916, 512)
shape of X_cv_segment :  (916, 512)


In [22]:
# Show the encoded form of the first training example
first_example = (
    ('Token array : ', X_train_tokens[0]),
    ('Mask array  : ', X_train_mask[0]),
    ('Segment array : ', X_train_segment[0]),
)
for label, first_row in first_example:
    print(label, first_row)

Token array :  [  101  2250  9834  1998 29536  2850 14876  2638  2801  3613  2000  8292
  3207  2598  2000 17975 10147  2080  1010  2007  3006  3745  6409  1999
  2048  5486  7728  2005  2119  3316  1012  1006 12927  1007  2096  1996
  6393  1997 17975 10147  2080  2003  7058  5310  2918 13134  2038  9784
  1010  2009  2003  2145  4089  2041 19498  2075  2250  9834  1998 29536
  2850 14876  2638  2801 17975 10147  2080  2038  4227  3745  1999  3923
  2752  2750  2250  9834  1998 29536  2850 14876  2638  2801  1005  1055
  3579  2006  2122  4655   102     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0

<pre><font size=6> Getting Embeddings from BERT Model</font></pre>

In [23]:
# Display the model's input tensors before running prediction
bert_model.input

[<KerasTensor: shape=(None, 512) dtype=int32 (created by layer 'input_word_ids')>,
 <KerasTensor: shape=(None, 512) dtype=int32 (created by layer 'input_mask')>,
 <KerasTensor: shape=(None, 512) dtype=int32 (created by layer 'segment_ids')>]

In [24]:
# Display the model's output tensor before running prediction
bert_model.output

<KerasTensor: shape=(None, 768) dtype=float32 (created by layer 'keras_layer')>

In [25]:
# Run the training arrays through bert_model; its single output is kept
# in X_train_pooled_output
train_bert_inputs = [X_train_tokens, X_train_mask, X_train_segment]
X_train_pooled_output = bert_model.predict(train_bert_inputs)

In [26]:
# Run the cross-validation arrays through bert_model; its single output is
# kept in X_cv_pooled_output
cv_bert_inputs = [X_cv_tokens, X_cv_mask, X_cv_segment]
X_cv_pooled_output = bert_model.predict(cv_bert_inputs)

In [27]:
# Run the test arrays through bert_model; its single output is kept
# in X_test_pooled_output
test_bert_inputs = [X_test_tokens, X_test_mask, X_test_segment]
X_test_pooled_output = bert_model.predict(test_bert_inputs)

In [28]:
## Save the pooled outputs to disk so the BERT forward passes need not be re-run.
# Stored as one tuple in the order (train, test, cv); `with` makes sure the file
# is flushed and closed once the dump has finished.
with open('/content/drive/MyDrive/ Competitions/ Predict_the_news_category/ data/final_bert_output.pkl', 'wb') as f:
    pickle.dump((X_train_pooled_output, X_test_pooled_output, X_cv_pooled_output), f)

In [29]:
# Reload the cached pooled outputs, stored as one tuple in the order (train, test, cv).
# NOTE: pickle.load can execute arbitrary code - only load files you created yourself.
with open('/content/drive/MyDrive/ Competitions/ Predict_the_news_category/ data/final_bert_output.pkl', 'rb') as f:
    X_train_pooled_output, X_test_pooled_output, X_cv_pooled_output = pickle.load(f)

In [30]:
# Print the shape of each pooled-output array in the order train, test, cv
for pooled in (X_train_pooled_output, X_test_pooled_output, X_cv_pooled_output):
    print(pooled.shape)

(6712, 768)
(2748, 768)
(916, 768)
