In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub
import tensorflow_text as text
from sklearn import model_selection
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from official.nlp import optimization

In [2]:
def start_end(doc):
    split = doc.split(" ")
    if len(split) <= 512:
        return doc
    elif len(doc) <= 1024:
        start = " ".join(split[0:512])
        rest = len(split)- 512 
        end = " ".join(split[-rest])
        return start + end
    else:
        start = " ".join(split[0:512])
        end = " ".join(split[-512])
        return start + end

In [3]:
# Load in dataset
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Document","Sensitivity"]]
data["Split Text"] = data["Document"].apply(start_end)

train_x, test_x, train_y, test_y = model_selection.train_test_split(data['Split Text'],data['Sensitivity'],test_size=0.2,random_state=5)

In [4]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3'
tfhub_handle_preprocess =   'https://tfhub.dev/tensorflow/bert_en_cased_preprocess/3'

In [5]:
text_input = tf.keras.layers.Input(shape=(), dtype=tf.string, name='text')
preprocessing_layer = hub.KerasLayer(tfhub_handle_preprocess, name='preprocessing')
encoder_inputs = preprocessing_layer(text_input)
encoder = hub.KerasLayer(tfhub_handle_encoder, trainable=True, name='BERT_encoder')
outputs = encoder(encoder_inputs)
net = outputs['pooled_output']
net = tf.keras.layers.Dropout(0.1)(net)
net = tf.keras.layers.Dense(1,activation=None)(net)
model = tf.keras.Model(text_input, net)
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 text (InputLayer)              [(None,)]            0           []                               
                                                                                                  
 preprocessing (KerasLayer)     {'input_word_ids':   0           ['text[0][0]']                   
                                (None, 128),                                                      
                                 'input_type_ids':                                                
                                (None, 128),                                                      
                                 'input_mask': (Non                                               
                                e, 128)}                                                      

In [6]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.metrics.BinaryAccuracy()

epochs = 5
steps_per_epoch = 3300000000 / (256*512)
num_train_steps = steps_per_epoch * epochs
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')


model.compile(optimizer=optimizer,loss=loss,metrics=metrics)

In [7]:
model.fit(train_x,train_y,batch_size=32, epochs=epochs, verbose=2)

Epoch 1/5
95/95 - 1686s - loss: 0.5386 - binary_accuracy: 0.8648 - 1686s/epoch - 18s/step
Epoch 2/5
95/95 - 1675s - loss: 0.4419 - binary_accuracy: 0.8655 - 1675s/epoch - 18s/step
Epoch 3/5
95/95 - 1574s - loss: 0.3995 - binary_accuracy: 0.8658 - 1574s/epoch - 17s/step
Epoch 4/5
95/95 - 1564s - loss: 0.3974 - binary_accuracy: 0.8658 - 1564s/epoch - 16s/step
Epoch 5/5
95/95 - 1572s - loss: 0.3814 - binary_accuracy: 0.8658 - 1572s/epoch - 17s/step


<keras.callbacks.History at 0x229d0b76a60>

In [8]:
model.evaluate(test_x,test_y)



[0.3547478914260864, 0.8764783143997192]

In [12]:
results = tf.sigmoid(model(tf.constant(test_x)))
for i in range(len(results)):
    print(results[i][0])

tf.Tensor(0.12657157, shape=(), dtype=float32)
tf.Tensor(0.1826132, shape=(), dtype=float32)
tf.Tensor(0.069251925, shape=(), dtype=float32)
tf.Tensor(0.2400643, shape=(), dtype=float32)
tf.Tensor(0.13382244, shape=(), dtype=float32)
tf.Tensor(0.10512707, shape=(), dtype=float32)
tf.Tensor(0.124637574, shape=(), dtype=float32)
tf.Tensor(0.101462215, shape=(), dtype=float32)
tf.Tensor(0.09741455, shape=(), dtype=float32)
tf.Tensor(0.22783718, shape=(), dtype=float32)
tf.Tensor(0.13705257, shape=(), dtype=float32)
tf.Tensor(0.17475387, shape=(), dtype=float32)
tf.Tensor(0.20406651, shape=(), dtype=float32)
tf.Tensor(0.12693131, shape=(), dtype=float32)
tf.Tensor(0.07123074, shape=(), dtype=float32)
tf.Tensor(0.16072416, shape=(), dtype=float32)
tf.Tensor(0.13418049, shape=(), dtype=float32)
tf.Tensor(0.17211026, shape=(), dtype=float32)
tf.Tensor(0.22416821, shape=(), dtype=float32)
tf.Tensor(0.12226176, shape=(), dtype=float32)
tf.Tensor(0.10986805, shape=(), dtype=float32)
tf.Tensor(0.