In [None]:
import pandas as pd
import numpy as np
from sklearn import model_selection
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from imblearn.over_sampling import RandomOverSampler
from official.nlp import optimization

In [None]:
# Read the sensitivity corpus, keeping only the text and label columns.
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")[["Document","Sensitivity"]]

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
  data['Document'],
  data['Sensitivity'],
  test_size=0.2,
  random_state=5,
)

In [None]:
# The vocabulary is learned from the training documents only.
tokenizer = Tokenizer(num_words=500_000)
tokenizer.fit_on_texts(train_x)

# Word -> integer id lookup produced by the fitted tokenizer.
words_to_index = tokenizer.word_index

In [None]:
def read_glove_vector(glove_vec):
  """Load word embeddings from a whitespace-separated GloVe text file.

  Each non-empty line is read as a token followed by its vector components.

  Args:
    glove_vec: Path of the GloVe text file to read (opened as UTF-8).

  Returns:
    Dict mapping each token to a 1-D ``np.float64`` array holding the
    components found on that token's line.
  """
  word_to_vec_map = {}
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    for line in f:
      w_line = line.split()
      if not w_line:
        # Blank line (e.g. a stray newline at the end of the file): skip it
        # instead of failing on w_line[0].
        continue
      word_to_vec_map[w_line[0]] = np.array(w_line[1:], dtype=np.float64)

  return word_to_vec_map

# Forward slashes avoid the invalid "\d" and "\g" escape sequences of a
# backslash-separated literal and resolve on Windows as well as POSIX systems.
word_to_vec_map = read_glove_vector("../deep_learning/glove.6B.300d.txt")

In [None]:
# Fixed sequence length fed to the model.
maxLen = 15000

# Keras' Tokenizer numbers words starting at 1 and leaves index 0 unused
# (pad_sequences fills with 0), so the embedding table needs one more row
# than there are words; otherwise the highest word id is out of range.
vocab_len = len(words_to_index) + 1
embed_vector_len = word_to_vec_map['moon'].shape[0]

# Row 0 and the rows of words that have no GloVe vector stay all-zero.
emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index, :] = embedding_vector

# Frozen embedding layer initialised with the pre-trained vectors.
embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)

In [None]:
# Padded token-id sequences go in; a single raw logit comes out.
x_indices = tf.keras.Input(shape=(maxLen,))
embeddings = embedding_layer(x_indices)

# Two sequence-returning LSTM stages, each followed by dropout.
x = embeddings
for _stage in range(2):
  x = tf.keras.layers.LSTM(128, return_sequences=True)(x)
  x = tf.keras.layers.Dropout(0.6)(x)

# The last LSTM returns only its final output; the Dense head has no activation.
x = tf.keras.layers.LSTM(128)(x)
x = tf.keras.layers.Dense(1, activation=None)(x)

model = tf.keras.Model(inputs=x_indices, outputs=x)

model.summary()

In [None]:
# Randomly duplicate minority-class rows until the classes are balanced.
over_sampler = RandomOverSampler(random_state=5)

# fit_resample validates X as a 2-D (n_samples, n_features) container, so the
# 1-D Series of documents is wrapped in a single-column frame first.
resampled_docs, balanced_y = over_sampler.fit_resample(train_x.to_frame(), train_y)

# Flatten back to a plain list of document strings for the tokenizer.
balanced_x = np.asarray(resampled_docs).ravel().tolist()

# Encode documents as integer id sequences, padded at the end up to maxLen.
train_x_indices = tokenizer.texts_to_sequences(balanced_x)
train_x_indices = pad_sequences(train_x_indices, maxlen=maxLen, padding='post')
train_x_indices.shape

In [None]:
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
# The loss is configured with from_logits=True, i.e. predictions are raw
# logits, so the class boundary is 0 rather than the metric's default 0.5.
metrics = tf.metrics.BinaryAccuracy(threshold=0.0)

epochs = 5
# Derive the schedule length from the actual training set so that warm-up and
# decay span the real number of optimizer steps.
batch_size = 32  # keep in sync with the batch_size passed to model.fit
steps_per_epoch = int(np.ceil(len(train_x_indices) / batch_size))
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of all training steps.
num_warmup_steps = int(0.1*num_train_steps)

init_lr = 3e-5
optimizer = optimization.create_optimizer(init_lr=init_lr,
                                          num_train_steps=num_train_steps,
                                          num_warmup_steps=num_warmup_steps,
                                          optimizer_type='adamw')


model.compile(optimizer=optimizer,loss=loss,metrics=metrics)

In [None]:
# Train on the oversampled, padded sequences; verbose=2 logs one line per epoch.
model.fit(
  train_x_indices,
  balanced_y,
  batch_size=32,
  epochs=epochs,
  verbose=2,
)

In [None]:
# Encode the held-out documents the same way as the training ones.
test_x_indices = pad_sequences(
  tokenizer.texts_to_sequences(test_x),
  maxlen=maxLen,
  padding='post',
)

# Raw model outputs, not yet thresholded into class labels.
preds_non_binary = model.predict(test_x_indices)