In [25]:
import pandas as pd
import numpy as np
from sklearn import model_selection
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
from tensorflow.keras.layers import TextVectorization
from tensorflow.keras.layers import Embedding
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score

In [2]:
# Load in dataset
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Filename","Date","Sensitivity","Document"]]

# Train / Test split
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['Document'],data['Sensitivity'],test_size=0.2,random_state=5)

In [3]:
vectoriser = TextVectorization(max_tokens=20000, output_sequence_length=200)
text_ds = tf.data.Dataset.from_tensor_slices(train_x).batch(128)
vectoriser.adapt(text_ds)

voc = vectoriser.get_vocabulary()
word_index = dict(zip(voc, range(len(voc))))

glove_path = "..\deep_learning\glove.6B.100d.txt"
embeddings_index = {}
with open(glove_path,encoding="utf8") as f:
    for line in f:
        word, coefs = line.split(maxsplit=1)
        coefs = np.fromstring(coefs,"f",sep= " ")
        embeddings_index[word] = coefs

num_tokens = len(voc) + 2
embedding_dim = 100
hits = 0
misses = 0

embedding_matrix = np.zeros((num_tokens,embedding_dim))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector
        hits += 1
    else:
        misses += 1


embedding_layer = Embedding(num_tokens,embedding_dim,embeddings_initializer=tf.keras.initializers.Constant(embedding_matrix),trainable=False)

In [4]:
int_sequences_input = tf.keras.Input(shape=(None,), dtype="int64")
embedded_sequences = embedding_layer(int_sequences_input)
lstm = tf.keras.layers.LSTM(512,return_sequences=True)(embedded_sequences)
dense = tf.keras.layers.Dense(1)(lstm)
glove_model = tf.keras.Model(int_sequences_input, dense)
glove_model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None)]            0         
_________________________________________________________________
embedding (Embedding)        (None, None, 100)         2000200   
_________________________________________________________________
lstm (LSTM)                  (None, None, 512)         1255424   
_________________________________________________________________
dense (Dense)                (None, None, 1)           513       
Total params: 3,256,137
Trainable params: 1,255,937
Non-trainable params: 2,000,200
_________________________________________________________________


In [77]:
string_input = tf.keras.Input(shape=(1,), dtype="string")
x = vectoriser(string_input)
preds = glove_model(x)
end_to_end_model = tf.keras.Model(string_input, preds)

end_to_end_model.summary()

Model: "model_4"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_6 (InputLayer)         [(None, 1)]               0         
_________________________________________________________________
text_vectorization (TextVect (None, 200)               0         
_________________________________________________________________
model (Functional)           (None, None, 1)           3256137   
Total params: 3,256,137
Trainable params: 1,255,937
Non-trainable params: 2,000,200
_________________________________________________________________


In [20]:
optimiser = tf.keras.optimizers.Adam()
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)
metrics = tf.keras.metrics.BinaryAccuracy()

glove_model.compile(optimizer=optimiser,loss='binary_crossentropy',metrics=metrics)

In [21]:
x_train = vectoriser(np.array([[s] for s in train_x])).numpy()
x_val = vectoriser(np.array([[s] for s in test_x])).numpy()

y_train = np.array(train_y)
y_val = np.array(test_y)

history = glove_model.fit(x=x_train,y=y_train,epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [90]:
predictions = glove_model.predict(x_val)

In [91]:
n = np.random.randint(0,700)
if predictions[n] > 0.5:
  print('predicted sentiment : positive')
else: 
  print('precicted sentiment : negative')

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()