In [2]:
import pandas as pd
from transformers import TFBertForSequenceClassification, BertTokenizer
import torch
import tensorflow as tf
from tensorflow.nn import softmax
from tensorflow.math import reduce_mean
import numpy as np
from sklearn.model_selection import train_test_split
from datasets import Dataset
from datasets import flatten_nest_dict
import tensorflow_text as text
import tensorflow_hub as hub

In [None]:
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Document","Sensitivity"]]

test_data = data.sample(1)["Document"].values[0]
test_data_label = data.sample(1)["Sensitivity"].values[0]
test_data_label

In [4]:
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-cased",num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
tokens = tokenizer.encode_plus(test_data, add_special_tokens=False,return_tensors="pt")

In [None]:
input_ids_chunks = tokens["input_ids"][0].split(510)
mask_chunks = tokens["attention_mask"][0].split(510)

for tensor in input_ids_chunks:
    print(len(tensor))

In [None]:
chunksize = 512
input_ids_chunks = list(input_ids_chunks)
mask_chunks = list(mask_chunks)

for i in range(len(input_ids_chunks)):
    input_ids_chunks[i] = torch.cat([torch.Tensor([101]),input_ids_chunks[i],torch.Tensor([102])])
    mask_chunks[i] = torch.cat([torch.Tensor([1]),mask_chunks[i],torch.Tensor([1])])

    pad_len = chunksize - input_ids_chunks[i].shape[0]
    if pad_len > 0:
        input_ids_chunks[i] = torch.cat([input_ids_chunks[i], torch.Tensor([0] * pad_len)])
        mask_chunks[i] = torch.cat([mask_chunks[i], torch.Tensor([0] * pad_len)])

for tensor in input_ids_chunks:
    print(len(tensor))

tensor

In [None]:
input_ids = torch.stack(input_ids_chunks)
attention_mask = torch.stack(mask_chunks)

np_tensor = input_ids.numpy()
input_ids = tf.convert_to_tensor(np_tensor)
input_ids = tf.cast(input_ids, tf.int64)

np_tensor = attention_mask.numpy()
attention_mask = tf.convert_to_tensor(np_tensor)
attention_mask = tf.cast(attention_mask, tf.int32)

input_dict = {'input_ids' : input_ids,'attention_mask':attention_mask}

input_dict

In [None]:
outputs = model(**input_dict)
outputs

In [None]:
probs = tf.nn.softmax(outputs[0],axis=-1)
mean = tf.math.reduce_mean(probs,axis=0)

mean

In [None]:
######################################
## TRAIN MODEL ON THIS CHUNKED BERT ##
######################################

In [7]:
def chunked_tokenise(txt):
    tokens = tokenizer.encode_plus(txt, add_special_tokens=False,return_tensors="pt")

    input_ids_chunks = tokens["input_ids"][0].split(510)
    mask_chunks = tokens["attention_mask"][0].split(510)

    chunksize = 512
    input_ids_chunks = list(input_ids_chunks)
    mask_chunks = list(mask_chunks)

    for i in range(len(input_ids_chunks)):
        input_ids_chunks[i] = torch.cat([torch.Tensor([101]),input_ids_chunks[i],torch.Tensor([102])])
        mask_chunks[i] = torch.cat([torch.Tensor([1]),mask_chunks[i],torch.Tensor([1])])

        pad_len = chunksize - input_ids_chunks[i].shape[0]
        if pad_len > 0:
            input_ids_chunks[i] = torch.cat([input_ids_chunks[i], torch.Tensor([0] * pad_len)])
            mask_chunks[i] = torch.cat([mask_chunks[i], torch.Tensor([0] * pad_len)])

    input_ids = torch.stack(input_ids_chunks)
    attention_mask = torch.stack(mask_chunks)

    np_tensor = input_ids.numpy()
    input_ids = tf.convert_to_tensor(np_tensor)
    input_ids = tf.cast(input_ids, tf.int64)

    np_tensor = attention_mask.numpy()
    attention_mask = tf.convert_to_tensor(np_tensor)
    attention_mask = tf.cast(attention_mask, tf.int32)

    input_dict = {'input_ids' : input_ids,'attention_mask':attention_mask}
    return input_dict

In [None]:
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Document","Sensitivity"]]

train_x, test_x, train_y, test_y = train_test_split(data['Document'],data['Sensitivity'],test_size=0.2,random_state=5)
train_x = np.array(train_x)
train_y = np.array(train_y)

dataset = []
for i in range(len(train_x)):
    dataset.append({"label" : train_y[i], "text" : train_x[i]})

dataset = pd.DataFrame(dataset)
dataset = Dataset.from_pandas(dataset)

def preprocess_function(examples):
    return chunked_tokenise(examples["text"])

tokenised_dataset = dataset.map(preprocess_function, batched=False)
tokenised_dataset.features['input_ids'] = tokenised_dataset.features['input_ids'].feature
tokenised_dataset.features['attention_mask'] = tokenised_dataset.features['attention_mask'].feature
tokenised_dataset.features

In [None]:
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer, return_tensors="tf")

tf_train_dataset = tokenised_dataset.to_tf_dataset(
    columns=['attention_mask', 'input_ids', 'label'],
    label_cols=['label'],
    shuffle=True,
    batch_size=1,
    collate_fn=data_collator,
)

In [None]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 16
num_epochs = 5
batches_per_epoch = len(tf_train_dataset) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(
    init_lr=2e-5, 
    num_warmup_steps=0, 
    num_train_steps=total_train_steps
)

In [None]:
model.compile(loss="binary_crossentropy",optimizer=optimizer)
model.fit(tf_train_dataset,epochs=5)

In [None]:
##############################################
## ATTEMPT MANUAL MODEL USING POOLER OUTPUT ##
##############################################

In [5]:
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Document","Sensitivity"]]

train_x, test_x, train_y, test_y = train_test_split(data['Document'],data['Sensitivity'],test_size=0.2,random_state=5)
train_x = np.array(train_x)
train_y = np.array(train_y)
test_x = np.array(test_x)
test_y = np.array(test_y)

In [None]:
###################################
## UNDERSAMPLED BALANCED DATASET ##
###################################

data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Document","Sensitivity"]]

train_x, test_x, train_y, test_y = train_test_split(data['Document'],data['Sensitivity'],test_size=0.2,random_state=5)
train_x = pd.DataFrame([train_x,train_y]).T

classes_zero = train_x[train_x['Sensitivity'] == 0]
classes_one = train_x[train_x['Sensitivity'] == 1]

classes_zero = classes_zero.sample(len(classes_one))

undersampled_dataset = pd.concat([classes_zero,classes_one])
undersampled_dataset = undersampled_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

train_x = np.array(undersampled_dataset["Document"])
train_y = np.array(undersampled_dataset["Sensitivity"])
train_y = tf.convert_to_tensor(train_y,dtype=tf.float32)

In [None]:
###################################
## OVERSAMPLED BALANCED DATASET ##
###################################

data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Document","Sensitivity"]]

train_x, test_x, train_y, test_y = train_test_split(data['Document'],data['Sensitivity'],test_size=0.2,random_state=5)
train_x = pd.DataFrame([train_x,train_y]).T

classes_zero = train_x[train_x['Sensitivity'] == 0]
classes_one = train_x[train_x['Sensitivity'] == 1]
print(len(classes_zero),len(classes_one))

classes_one = classes_one.sample(len(classes_zero), replace=True)

oversampled_dataset = pd.concat([classes_zero,classes_one])
oversampled_dataset = oversampled_dataset.sample(frac=1, random_state=42).reset_index(drop=True)

train_x = np.array(oversampled_dataset["Document"])
train_y = np.array(oversampled_dataset["Sensitivity"])
train_y = tf.convert_to_tensor(train_y,dtype=tf.float32)

In [7]:
tfhub_handle_encoder = 'https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/3'
tokenizer = BertTokenizer.from_pretrained("bert-base-cased")
model = TFBertForSequenceClassification.from_pretrained("bert-base-cased",num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
#bert_model = hub.KerasLayer(tfhub_handle_encoder,trainable=False)
def get_pooled_outputs(txt):
    inputs = chunked_tokenise(txt)
    model_outputs = model(**inputs,output_hidden_states=True)
    hidden_states = model_outputs.hidden_states
    len_list= np.arange(0,len(hidden_states))
    pooled_output = tf.concat(tuple([hidden_states[i] for i in len_list]), axis=-1)
    pooled_output = pooled_output[:, 0, :]
    #pooled_output = tf.reduce_mean(pooled_output,axis=0)
    return pooled_output


pooled_outputs = []
i = 1
for sample in train_x:
    pooled_outputs.append(get_pooled_outputs(sample))
    print(str(i) + " Documents Processed..." + str( round( (i/len(train_x)) * 100, 2)) + "%")
    i += 1


In [None]:
tester = pooled_outputs
averaged_tester = []
for elt in tester:
    averaged_tester.append(np.mean(elt,axis=0))

averaged_tester = tf.convert_to_tensor(averaged_tester)
averaged_tester.shape

In [None]:
def elt_wise_mean(x):
    return tf.convert_to_tensor(np.mean(x,axis=0))

class Chunk(tf.keras.layers.Layer):
    def __init__(self, input_dim):
        super(Chunk, self).__init__()

    def call(self, inputs):
        return chunked_tokenise(inputs)

class Model_Call(tf.keras.layers.Layer):
    def __init__(self):
        super(Model_Call, self).__init__()

    def call(self, inputs):
        return model(**inputs)

def model_call(encoder_inputs):
    return model(**encoder_inputs)

pooled_input = tf.keras.layers.Input(shape=(9984,), dtype=tf.float32, name='Pooled Input')
dense = tf.keras.layers.Dense(2, activation='softmax', input_dim=1, name='Dense')(pooled_input)

model = tf.keras.Model(pooled_input, dense)
model.summary()

In [None]:
from transformers import create_optimizer
import tensorflow as tf

optimiser = tf.optimizers.Adam(learning_rate= 5e-5)
#BINARY CROSSENTROPY SHOULD BE HERE, NO?
model.compile(loss="sparse_categorical_crossentropy",optimizer=optimiser)

In [None]:
model.fit(averaged_tester,train_y,epochs=5,batch_size=1)

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer

test_pooled_outputs = []
i = 1
for sample in test_x:
    test_pooled_outputs.append(get_pooled_outputs(sample))
    print(str(i) + " Documents Processed..." + str( round( (i/len(test_x)) * 100, 2)) + "%")
    i += 1

In [None]:
test_samples = []
for elt in test_pooled_outputs:
    test_samples.append(np.mean(elt,axis=0))

test_samples = tf.convert_to_tensor(test_samples)

In [None]:
predictions = model.predict(test_samples)
bin_preds = []
for pred in predictions:
    if pred[1] > pred[0]:
        bin_preds.append(1)
    else:
        bin_preds.append(0)

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer

precision = precision_score(test_y, bin_preds)
bac = balanced_accuracy_score(test_y, bin_preds)
f2 = fbeta_score(test_y, bin_preds, beta=2.0)
print(precision,bac,f2)

In [None]:
######################################################
## TRY TO MAKE SURE USING ALL OF THE POOLED OUTPUTS ##
######################################################

In [8]:
def get_pooled_outputs_v2(txt):
    complete_pooled_outputs = []
    inputs = chunked_tokenise(txt)
    model_outputs = model(**inputs,output_hidden_states=True)
    hidden_states = model_outputs.hidden_states
    for i in range(len(hidden_states)):
        for j in range(len(hidden_states[i])):
            complete_pooled_outputs.append(hidden_states[i][j])

    complete_pooled_outputs = tf.convert_to_tensor(complete_pooled_outputs)
    return tf.reduce_mean(complete_pooled_outputs,axis=0)

pooled_outputs = []
i = 1
for sample in train_x:
    pooled_outputs.append(get_pooled_outputs_v2(sample))
    print(str(i) + " Documents Processed..." + str( round( (i/len(train_x)) * 100, 2)) + "%")
    i += 1

Token indices sequence length is longer than the specified maximum sequence length for this model (1238 > 512). Running this sequence through the model will result in indexing errors


1 Documents Processed...0.03%
2 Documents Processed...0.07%
3 Documents Processed...0.1%
4 Documents Processed...0.13%
5 Documents Processed...0.16%
6 Documents Processed...0.2%
7 Documents Processed...0.23%
8 Documents Processed...0.26%
9 Documents Processed...0.3%
10 Documents Processed...0.33%
11 Documents Processed...0.36%
12 Documents Processed...0.39%
13 Documents Processed...0.43%
14 Documents Processed...0.46%
15 Documents Processed...0.49%
16 Documents Processed...0.53%
17 Documents Processed...0.56%
18 Documents Processed...0.59%
19 Documents Processed...0.62%
20 Documents Processed...0.66%
21 Documents Processed...0.69%
22 Documents Processed...0.72%
23 Documents Processed...0.76%
24 Documents Processed...0.79%
25 Documents Processed...0.82%
26 Documents Processed...0.86%
27 Documents Processed...0.89%
28 Documents Processed...0.92%
29 Documents Processed...0.95%
30 Documents Processed...0.99%
31 Documents Processed...1.02%
32 Documents Processed...1.05%
33 Documents Process

In [13]:
averaged_tester = tf.convert_to_tensor(pooled_outputs)
averaged_tester.shape

TensorShape([3040, 512, 768])

In [51]:
pooled_input = tf.keras.layers.Input(shape=(512, 768), dtype=tf.float32, name='Pooled Input')
dropout = tf.keras.layers.Dropout(0.1)(pooled_input)
dense = tf.keras.layers.Dense(2, activation='softmax', input_dim=1, name='Dense')(dropout)

model = tf.keras.Model(pooled_input, dense)
model.summary()

from transformers import create_optimizer
import tensorflow as tf

optimiser = tf.optimizers.Adam(learning_rate= 5e-5)
#BINARY CROSSENTROPY SHOULD BE HERE, NO?
model.compile(loss="sparse_categorical_crossentropy",optimizer=optimiser)

model.fit(averaged_tester,train_y,epochs=5,batch_size=1)

Model: "model_34"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 Pooled Input (InputLayer)   [(None, 512, 768)]        0         
                                                                 
 dropout_41 (Dropout)        (None, 512, 768)          0         
                                                                 
 Dense (Dense)               (None, 512, 2)            1538      
                                                                 
Total params: 1,538
Trainable params: 1,538
Non-trainable params: 0
_________________________________________________________________
Epoch 1/5


ValueError: in user code:

    File "C:\Users\jack-\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\keras\engine\training.py", line 878, in train_function  *
        return step_function(self, iterator)
    File "C:\Users\jack-\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\keras\engine\training.py", line 867, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "C:\Users\jack-\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\keras\engine\training.py", line 860, in run_step  **
        outputs = model.train_step(data)
    File "C:\Users\jack-\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\keras\engine\training.py", line 809, in train_step
        loss = self.compiled_loss(
    File "C:\Users\jack-\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\keras\engine\compile_utils.py", line 201, in __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    File "C:\Users\jack-\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\keras\losses.py", line 141, in __call__
        losses = call_fn(y_true, y_pred)
    File "C:\Users\jack-\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\keras\losses.py", line 245, in call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    File "C:\Users\jack-\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\keras\losses.py", line 1737, in sparse_categorical_crossentropy
        return backend.sparse_categorical_crossentropy(
    File "C:\Users\jack-\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.9_qbz5n2kfra8p0\LocalCache\local-packages\Python39\site-packages\keras\backend.py", line 5113, in sparse_categorical_crossentropy
        res = tf.nn.sparse_softmax_cross_entropy_with_logits(

    ValueError: `labels.shape` must equal `logits.shape` except for the last dimension. Received: labels.shape=(1,) and logits.shape=(512, 2)


In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer

test_pooled_outputs = []
i = 1
for sample in test_x:
    test_pooled_outputs.append(get_pooled_outputs_v2(sample))
    print(str(i) + " Documents Processed..." + str( round( (i/len(test_x)) * 100, 2)) + "%")
    i += 1

In [None]:
predictions = model.predict(test_samples)
bin_preds = []
for pred in predictions:
    if pred[1] > pred[0]:
        bin_preds.append(1)
    else:
        bin_preds.append(0)

from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import make_scorer

precision = precision_score(test_y, bin_preds)
bac = balanced_accuracy_score(test_y, bin_preds)
f2 = fbeta_score(test_y, bin_preds, beta=2.0)
print(precision,bac,f2)