In [1]:
# Imports 
import pandas as pd
import numpy as np
from sklearn import model_selection
import tensorflow_hub as hub
import tensorflow as tf
import tensorflow_text
from tensorflow.keras.optimizers import Adam

# Set random seed for reproducability
np.random.seed(500)

In [2]:
# Load in dataset
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Filename","Date","Sensitivity","Document"]]

# Train / Test split
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['Document'],data['Sensitivity'],test_size=0.2,random_state=5)

In [3]:
# Prepare training data for BERT

def get_split(text1):
  text1 = str(text1)
  l_total = []
  l_parcial = []
  if len(text1.split())//150 >0:
    n = len(text1.split())//150
  else: 
    n = 1
  for w in range(n):
    if w == 0:
      l_parcial = text1.split()[:200]
      l_total.append(" ".join(l_parcial))
    else:
      l_parcial = text1.split()[w*150:w*150 + 200]
      l_total.append(" ".join(l_parcial))
  return l_total

train_x = pd.DataFrame(train_x)
train_x["Text Split"] = train_x["Document"].apply(get_split)
train_x["Class"] = train_y

In [4]:
train_l = []
label_l = []
index_l =[]
for idx,row in train_x.iterrows():
  for l in row['Text Split']:
    train_l.append(l)
    label_l.append(row['Class'])
    index_l.append(idx)
len(train_l), len(label_l), len(index_l)

(18921, 18921, 18921)

In [5]:
# Prepare testing data for BERT
def get_split(text1):
  text1 = str(text1)
  l_total = []
  l_parcial = []
  if len(text1.split())//150 >0:
    n = len(text1.split())//150
  else: 
    n = 1
  for w in range(n):
    if w == 0:
      l_parcial = text1.split()[:200]
      l_total.append(" ".join(l_parcial))
    else:
      l_parcial = text1.split()[w*150:w*150 + 200]
      l_total.append(" ".join(l_parcial))
  return l_total

test_x = pd.DataFrame(test_x)
test_x["Text Split"] = test_x["Document"].apply(get_split)
test_x["Class"] = test_y

In [6]:
val_l = []
val_label_l = []
val_index_l = []
for idx,row in test_x.iterrows():
  for l in row['Text Split']:
    val_l.append(l)
    val_label_l.append(row['Document'])
    val_index_l.append(idx)
len(val_l), len(val_label_l), len(val_index_l)

(4656, 4656, 4656)

In [7]:
# Final datasets 
train_df = pd.DataFrame({"Document":train_l, "Class":label_l})
val_df = pd.DataFrame({"Document":val_l, "Class":val_label_l})

In [None]:
preprocessor = hub.load(
    "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")

# Step 1: tokenize batches of text inputs.
text_inputs = tf.keras.layers.Input(shape=(), dtype=tf.string)
tokenize = hub.KerasLayer(preprocessor.tokenize)
tokenized_inputs = [tokenize(segment) for segment in text_inputs]

seq_length = 200  # Your choice here.
bert_pack_inputs = hub.KerasLayer(
    preprocessor.bert_pack_inputs,
    arguments=dict(seq_length=seq_length))  # Optional argument.
encoder_inputs = bert_pack_inputs(tokenized_inputs)

encoder = hub.KerasLayer("https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-2_H-768_A-12/1", trainable=True)
outputs = encoder(encoder_inputs)
net = outputs['pooled_output']
reshaped = tf.reshape(net,[-1, 768, 1])
lstm = tf.keras.layers.LSTM(512,return_sequences=True)(reshaped)
dense = tf.keras.layers.Dense(1, activation=None, name='classifier')(net)
model = tf.keras.Model(text_inputs, dense)
model.summary()

In [None]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.keras.metrics.Precision()
optimizer = tf.keras.optimizers.Adam()

In [None]:
model.compile(optimizer=optimizer,loss=loss,metrics=metrics)

In [None]:
history = model.fit(np.array(train_x["Text Split"]),np.array(train_y),epochs=5)

In [8]:
preprocessor = hub.load("https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/3")

text_inputs = train_x["Text Split"]
tokenize = hub.KerasLayer(preprocessor.tokenize)
tokenized_inputs = [tokenize(segment,False) for segment in text_inputs]

len(tokenized_inputs)

3040

In [11]:

'''
seq_length = 3040  # Your choice here.
bert_pack_inputs = hub.KerasLayer(preprocessor.bert_pack_inputs,arguments=dict(seq_length=seq_length))
encoder_inputs = bert_pack_inputs(tokenized_inputs)
'''

for i in tokenized_inputs:
    print(i)]

AttributeError: 'list' object has no attribute 'to_list'