### Exploring BERT, Sentence-BERT, and the PyTorch (pt) and TensorFlow (tf) BERT models

#### BERT

In [None]:
!pip install transformers

In [None]:
import torch
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel, BertModel

# One tokenizer serves both framework variants of the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Load the TensorFlow and PyTorch flavours of BERT; only the final layer's
# hidden states are requested (output_hidden_states=False).
model_tf = TFBertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=False,
)
model_pt = BertModel.from_pretrained(
    'bert-base-uncased',
    output_hidden_states=False,
)

# Inference only: put the PyTorch model in evaluation mode since it is never trained here.
model_pt.eval()

In [3]:
# Sentences 1 and 2 both praise a movie; sentence 3 states a different preference.
sample_sent1, sample_sent2, sample_sent3 = (
    'I like this movie very much.',
    'I love the movie, it is awesome.',
    'I actually prefer horror movie.',
)

In [5]:
# Tokenize each sample sentence into TensorFlow tensors (one sentence per call, so no padding is involved).
encoded_input_tf1, encoded_input_tf2, encoded_input_tf3 = (
    tokenizer(sentence, return_tensors='tf')
    for sentence in (sample_sent1, sample_sent2, sample_sent3)
)

# Forward each encoded sentence through the TensorFlow BERT model.
output_tf1, output_tf2, output_tf3 = (
    model_tf(encoded)
    for encoded in (encoded_input_tf1, encoded_input_tf2, encoded_input_tf3)
)

# PyTorch equivalent (not run here): tokenize with return_tensors='pt',
# then call model_pt(**encoded_input_pt).

In [34]:
# Sentence embedding = mean over axis 1 of the first model output (index 0).
sample_tf_emb1, sample_tf_emb2, sample_tf_emb3 = (
    tf.math.reduce_mean(model_output[0], axis=1)
    for model_output in (output_tf1, output_tf2, output_tf3)
)

# Alternative (not used): the pooled output at index 1, i.e. output_tf1[1],
# output_tf2[1], output_tf3[1]. It is the [CLS] token embedding passed through
# a dense layer (size 768, tanh activation), so it differs from every other
# embedding output and serves as a representation of the whole sentence
# regardless of padding, masking, etc.

#### Sentence BERT

In [None]:
!pip install sentence-transformers

In [None]:
from sentence_transformers import SentenceTransformer
# Load the pretrained Sentence-BERT checkpoint used for the comparison below.
sbert_model = SentenceTransformer(model_name_or_path='bert-base-nli-mean-tokens')

In [17]:
# Encode each sentence on its own, wrapped in a one-element list so that
# every result carries a leading batch axis.
sample_emb1, sample_emb2, sample_emb3 = (
    sbert_model.encode([sentence])
    for sentence in (sample_sent1, sample_sent2, sample_sent3)
)

In [21]:
# Sanity check: shape of the Sentence-BERT embedding array for a single sentence.
sample_emb1.shape

(1, 768)

In [35]:
from scipy.spatial.distance import cosine
# scipy's `cosine` is a distance, so similarity is 1 - cosine(u, v).
print('different model on same sentence')
for sbert_vec, bert_vec in (
    (sample_emb1, sample_tf_emb1),
    (sample_emb2, sample_tf_emb2),
    (sample_emb3, sample_tf_emb3),
):
    print(1 - cosine(sbert_vec[0], bert_vec[0]))
# Low similarity is expected: the 768 dimensions produced by different models
# do not share a meaning, so cross-model vectors cannot be compared directly.

different model on same sentence
0.29537123441696167
0.28814637660980225
0.3951141834259033


In [36]:
# For each model, compare sentence 1 against sentence 2 and against sentence 3
# using cosine similarity (1 - cosine distance).
print('model performance for each model')
for model_label, emb_first, emb_second, emb_third in (
    ('sentenceBERT', sample_emb1, sample_emb2, sample_emb3),
    ('BERT', sample_tf_emb1, sample_tf_emb2, sample_tf_emb3),
):
    print(model_label)
    print(sample_sent1, sample_sent2, 1 - cosine(emb_first[0], emb_second[0]))
    print(sample_sent1, sample_sent3, 1 - cosine(emb_first[0], emb_third[0]))

model performance for each model
sentenceBERT
I like this movie very much. I love the movie, it is awesome. 0.935429036617279
I like this movie very much. I actually prefer horror movie. 0.5741518139839172
BERT
I like this movie very much. I love the movie, it is awesome. 0.849561333656311
I like this movie very much. I actually prefer horror movie. 0.7852483987808228


In [None]:
# For sentence-level contextual similarity, Sentence-BERT appears to work better than mean-pooled BERT.

### Building the subsequent CNN model

In [37]:
import pandas as pd
import numpy as np

from keras import layers,Model
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, GlobalMaxPooling1D,Reshape, Dense, Dropout, Flatten, MaxPooling1D, Input, Concatenate
from keras.models import load_model

In [39]:
# Input dimensions for the CNN defined below: (DOC_PER_INSTANCE, EMB_SIZE).
EMB_SIZE = 768  # width of each embedding vector
DOC_PER_INSTANCE = 1_000  # presumably the number of document embeddings per instance; TODO confirm

In [40]:
# 1D CNN over a (DOC_PER_INSTANCE, EMB_SIZE) input: a single convolution,
# global max pooling over the sequence axis, then a 2-unit sigmoid head.
# NOTE(review): two sigmoid units with binary_crossentropy score each output
# independently; confirm the labels are 2-column targets rather than a single
# binary label.
model = Sequential([
    Conv1D(
        filters=1024,
        kernel_size=3,
        activation='relu',
        input_shape=(DOC_PER_INSTANCE, EMB_SIZE),
    ),
    GlobalMaxPooling1D(),
    Dense(units=2, activation='sigmoid'),
])
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 conv1d_1 (Conv1D)           (None, 998, 1024)         2360320   
                                                                 
 global_max_pooling1d_1 (Glo  (None, 1024)             0         
 balMaxPooling1D)                                                
                                                                 
 dense_1 (Dense)             (None, 2)                 2050      
                                                                 
Total params: 2,362,370
Trainable params: 2,362,370
Non-trainable params: 0
_________________________________________________________________
