In [1]:
import pandas as pd
import numpy as np
from sklearn import model_selection
import tensorflow as tf
import tensorflow_text as text
import tensorflow_hub as hub
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score

In [2]:
# Load in dataset
data = pd.read_csv("../../data/sensitivity_data/sensitivity_dataset.csv")
data = data[["Document","Sensitivity"]]

# Train / Test split
train_x, test_x, train_y, test_y = model_selection.train_test_split(data['Document'],data['Sensitivity'],test_size=0.2,random_state=5)

In [3]:
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_x)

words_to_index = tokenizer.word_index

In [4]:
def read_glove_vector(glove_vec):
  with open(glove_vec, 'r', encoding='UTF-8') as f:
    words = set()
    word_to_vec_map = {}
    for line in f:
      w_line = line.split()
      curr_word = w_line[0]
      word_to_vec_map[curr_word] = np.array(w_line[1:], dtype=np.float64)
      
  return word_to_vec_map

word_to_vec_map = read_glove_vector("..\deep_learning\glove.6B.300d.txt")


In [5]:
maxLen = 15000

vocab_len = len(words_to_index)
embed_vector_len = word_to_vec_map['moon'].shape[0]

emb_matrix = np.zeros((vocab_len, embed_vector_len))

for word, index in words_to_index.items():
  embedding_vector = word_to_vec_map.get(word)
  if embedding_vector is not None:
    emb_matrix[index, :] = embedding_vector

embedding_layer = Embedding(input_dim=vocab_len, output_dim=embed_vector_len, input_length=maxLen, weights = [emb_matrix], trainable=False)

In [6]:
x_indices = tf.keras.Input(shape=(maxLen,))
embeddings = embedding_layer(x_indices)
x = tf.keras.layers.LSTM(128, return_sequences=True)(embeddings)
x = tf.keras.layers.Dropout(0.6)(x)
x = tf.keras.layers.LSTM(128, return_sequences=True)(x)
x = tf.keras.layers.Dropout(0.6)(x)
x = tf.keras.layers.LSTM(128)(x)
x = tf.keras.layers.Dense(1, activation='sigmoid')(x)
model = tf.keras.Model(inputs=x_indices,outputs=x)

model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 15000)]           0         
_________________________________________________________________
embedding (Embedding)        (None, 15000, 300)        20621700  
_________________________________________________________________
lstm (LSTM)                  (None, 15000, 128)        219648    
_________________________________________________________________
dropout (Dropout)            (None, 15000, 128)        0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 15000, 128)        131584    
_________________________________________________________________
dropout_1 (Dropout)          (None, 15000, 128)        0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 128)               131584

In [7]:
train_x_indices = tokenizer.texts_to_sequences(train_x)
train_x_indices = pad_sequences(train_x_indices, maxlen=maxLen, padding='post')
train_x_indices.shape

(3040, 15000)

In [8]:
adam = tf.keras.optimizers.Adam(learning_rate = 0.0001)
model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

In [9]:
model.fit(train_x_indices,train_y,batch_size=64, epochs=15, verbose=2)

Epoch 1/15
48/48 - 4763s - loss: 0.5711 - accuracy: 0.8658
Epoch 2/15
48/48 - 4714s - loss: 0.3977 - accuracy: 0.8658
Epoch 3/15
48/48 - 4873s - loss: 0.3963 - accuracy: 0.8658
Epoch 4/15
48/48 - 4797s - loss: 0.3961 - accuracy: 0.8658
Epoch 5/15
48/48 - 4646s - loss: 0.3952 - accuracy: 0.8658
Epoch 6/15
48/48 - 4656s - loss: 0.3953 - accuracy: 0.8658
Epoch 7/15
48/48 - 4623s - loss: 0.3959 - accuracy: 0.8658
Epoch 8/15
48/48 - 4678s - loss: 0.3957 - accuracy: 0.8658
Epoch 9/15
48/48 - 4656s - loss: 0.3961 - accuracy: 0.8658
Epoch 10/15
48/48 - 4642s - loss: 0.3960 - accuracy: 0.8658
Epoch 11/15
48/48 - 4916s - loss: 0.3954 - accuracy: 0.8658
Epoch 12/15
48/48 - 5204s - loss: 0.3964 - accuracy: 0.8658
Epoch 13/15
48/48 - 5133s - loss: 0.3952 - accuracy: 0.8658
Epoch 14/15
48/48 - 5418s - loss: 0.3951 - accuracy: 0.8658
Epoch 15/15
48/48 - 6874s - loss: 0.3943 - accuracy: 0.8658


<keras.callbacks.History at 0x123c45bcbb0>

In [10]:
test_x_indices = tokenizer.texts_to_sequences(test_x)
test_x_indices = pad_sequences(test_x_indices,maxlen=maxLen,padding='post')

In [11]:
model.evaluate(test_x_indices,test_y)



[0.3807060122489929, 0.8764783143997192]

In [16]:
preds_non_binary = model.predict(test_x_indices)

sentiments = []
for sentiment in test_y:
    sentiments.append(sentiment)

preds_binary = []
for pred in preds_non_binary:
    if pred > 0.5:
        preds_binary.append(1)
    else:
        preds_binary.append(0)

print(sentiments,preds_non_binary)

[0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 

In [13]:
precision = precision_score(sentiments, preds_binary)
bac = balanced_accuracy_score(sentiments, preds_binary)
f2 = fbeta_score(sentiments, preds_binary, beta=2.0)

print(precision,bac,f2)

0.0 0.5 0.0


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
for i in range(len(preds_non_binary)):
    if(sentiments[i] == 1):
        if(preds_non_binary[i] == 1):
            print("Correctly classified")
        else:
            print("Incorrectly classified")

Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly classified
Incorrectly