In [0]:
%tensorflow_version 1.x
first_time = True

In [3]:
if(first_time):
  from google.colab import drive
  drive.mount('/content/drive')

import numpy as np
import pandas as pd
import datetime
from random import randint
from gensim.models import word2vec
from sklearn.model_selection import train_test_split
%tensorflow_version 1.x
import tensorflow as tf
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from gensim.models.fasttext import FastText

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


/content/drive/My Drive/University/FYP/Sentiment Analysis/Research by Isuru/Word_Embedding

In [0]:
folder_path = '/content/drive/My Drive/University/FYP/Sentiment Analysis/Implementation/'
fasttext_model_path = folder_path + 'word_embedding/fasttext/commen_docid_removed300_5'
dataset_path = folder_path + "corpus/analyzed/lankadeepa_tagged_2.csv"
data_vectors_path = folder_path + 'corpus/analyzed/vectors/'
RNN_path = "Sentiment Analysis/Sentiment_Tagger_Recreation/RNN/"
log_dir_path = folder_path + RNN_path + "logs/from_fasttext/"
model_save_path = folder_path + RNN_path + "models/from_fasttext/pretrained_lstm.ckpt"

num_features = 300
max_sentence_length = 50
batchSize = 24
lstmUnits = 64
numClasses = 2
iterations = 30000

labels = tf.placeholder(tf.int32, [batchSize, numClasses])
data = tf.placeholder(tf.float32, [batchSize, max_sentence_length, num_features])

In [0]:
def convert_to_vectors():
    comments = pd.read_csv(dataset_path, ";")
    train_data, test_data = train_test_split(comments, test_size=0.2, random_state=0)
    train_data_vectors, train_data_labels = comments_to_vectors(train_data)
    test_data_vectors, test_data_labels = comments_to_vectors(test_data)

    np.save(data_vectors_path + 'from_fasttext/train_data_vectors.npy', train_data_vectors)
    np.save(data_vectors_path + 'from_fasttext/train_data_labels.npy', train_data_labels)
    np.save(data_vectors_path + 'from_fasttext/test_data_vectors.npy', test_data_vectors)
    np.save(data_vectors_path + 'from_fasttext/test_data_labels.npy', test_data_labels)

In [0]:
def load_vectors():
    train_data_vectors = np.load(data_vectors_path + 'from_fasttext/train_data_vectors.npy')
    train_data_labels = np.load(data_vectors_path + 'from_fasttext/train_data_vectors.npy')
    test_data_vectors = np.load(data_vectors_path + 'from_fasttext/train_data_vectors.npy')
    test_data_labels = np.load(data_vectors_path + 'from_fasttext/train_data_vectors.npy')
    
    return train_data_vectors, train_data_labels, test_data_vectors, test_data_labels

In [0]:
def get_sentence_vector(model, sentence):
    sentence_vector = np.zeros([max_sentence_length, num_features])
    counter = 0
    index2word_set = set(model.wv.index2word)
    for word in sentence.split():
        if word in index2word_set:
            sentence_vector[counter] = model[word]
            counter += 1
            if (counter == max_sentence_length):
                break
        else:
            print("word not in word2vec model: " + word+"Counter : ",counter)
    return sentence_vector


In [0]:
def comments_to_vectors(data):
    model = word2vec.Word2Vec.load(fasttext_model_path)
    # model = FastText.load_fasttext_format(fasttext_model_name)
    comment_vectors = []
    comment_labels = []
    for comment in data["comment"]:
        comment_vectors.append(get_sentence_vector(model, comment))
    for label in data["label"]:
        if label == "POSITIVE":
            comment_labels.append([0, 1])
        else:
            comment_labels.append([1, 0])
    return np.array(comment_vectors), comment_labels

In [0]:
def get_batch(size, data, label):
    batch_data = np.empty((size, max_sentence_length, num_features), dtype=float)
    batch_label = []
    for i in range(size):
        random_int = randint(0, len(data) - 1)
        batch_data[i] = data[random_int]
        batch_label.append(label[random_int])
    return batch_data, batch_label

In [0]:
def get_batch_order(size, data, label, batch_no):
    batch_data = data[batch_no * size : (batch_no + 1) * size]
    batch_label = label[batch_no * size : (batch_no + 1) * size]
    return batch_data, batch_label

In [0]:
def neural_network_model():
    lstm_cell = tf.contrib.rnn.BasicLSTMCell(lstmUnits)

    lstm_cell = tf.contrib.rnn.DropoutWrapper(cell=lstm_cell, output_keep_prob=0.75)
    value, _ = tf.nn.dynamic_rnn(lstm_cell, data, dtype=tf.float32)

    weight = tf.Variable(tf.truncated_normal([lstmUnits, numClasses]))
    bias = tf.Variable(tf.constant(0.1, shape=[numClasses]))
    value = tf.transpose(value, [1, 0, 2])
    last = tf.gather(value, int(value.get_shape()[0]) - 1)
    prediction = (tf.matmul(last, weight) + bias)

    correct_prediction = tf.equal(tf.argmax(prediction,1), tf.argmax(labels,1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    prediction_values = tf.argmax(prediction, 1)

    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=labels))
    optimizer = tf.train.AdamOptimizer().minimize(loss)

    return loss, accuracy, prediction_values, optimizer

In [0]:
def train_neural_network(loss, accuracy, optimizer, train_data, train_labels):
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    sess.run(tf.global_variables_initializer())

    tf.summary.scalar('Loss', loss)
    tf.summary.scalar('Accuracy', accuracy)
    merged = tf.summary.merge_all()
    logdir = log_dir_path + datetime.datetime.now().strftime("%Y%m%d-%H%M%S") + "/"
    writer = tf.summary.FileWriter(logdir, sess.graph)

    for i in range(iterations):
        #Next Batch of reviews
        next_batch, next_batch_labels = get_batch(batchSize, train_data, train_labels)
        sess.run(optimizer, {data: next_batch, labels: next_batch_labels})

        #Write summary to Tensorboard
        if (i % 50 == 0):
            summary = sess.run(merged, {data: next_batch, labels: next_batch_labels})
            writer.add_summary(summary, i)

        #Save the network every 10,000 training iterations
        if (i % 9999 == 0 and i != 0):
            save_path = saver.save(sess, model_save_path , global_step=i)
            print("saved to %s" % save_path)
    writer.close()

In [0]:
def measure_neural_network(accuracy, prediction_values, test_data, test_labels):
    sess = tf.InteractiveSession()
    saver = tf.train.Saver()
    saver.restore(sess, tf.train.latest_checkpoint(model_save_path))

    overall_accuracy = 0
    all_predictions = []
    test_iterations = 80
    for i in range(test_iterations):
        next_batch, next_batch_labels = get_batch_order(batchSize, test_data, test_labels, i)
        accuracy_this_batch = (sess.run(accuracy, {data: next_batch, labels: next_batch_labels})) * 100
        predictions_this_batch = sess.run(prediction_values, {data: next_batch, labels: next_batch_labels})
        overall_accuracy = overall_accuracy + accuracy_this_batch
        all_predictions = all_predictions + predictions_this_batch.tolist()
        print("Accuracy for this batch:", accuracy_this_batch)

    true_labels = tf.argmax(test_labels, 1).eval()
    precision = precision_score(true_labels.tolist()[0:batchSize * test_iterations], all_predictions)
    f1 = f1_score(true_labels.tolist()[0:batchSize * test_iterations], all_predictions)
    recall = recall_score(true_labels.tolist()[0:batchSize * test_iterations], all_predictions)
    overall_accuracy = overall_accuracy / (test_iterations * 100)
    print(confusion_matrix(true_labels.tolist()[0:batchSize * test_iterations], all_predictions).ravel())

    return overall_accuracy, precision, recall, f1

# Main

In [0]:
# convert_to_vectors()
train_data_vectors, train_data_labels, test_data_vectors, test_data_labels = load_vectors()

In [15]:
print("Running tesnsorflow simulation.....")
loss, accuracy, prediction_values, optimizer = neural_network_model()

Running tesnsorflow simulation.....
The TensorFlow contrib module will not be included in TensorFlow 2.0.
For more information, please see:
  * https://github.com/tensorflow/community/blob/master/rfcs/20180907-contrib-sunset.md
  * https://github.com/tensorflow/addons
  * https://github.com/tensorflow/io (for I/O related ops)
If you depend on functionality not listed there, please file an issue.

Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.RNN(cell)`, which is equivalent to this API
Instructions for updating:
Please use `layer.add_weight` method instead.
Instructions for updating:
Call initializer instance with the dtype argument instead of passing it to the constructor
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits

In [16]:
train_neural_network(loss, accuracy, optimizer, train_data_vectors, train_data_labels)

ValueError: ignored

In [0]:
accuracy, precision, recall, f1 = measure_neural_network(accuracy, prediction_values, test_data_vectors, test_data_labels)
print("Accuracy: ", accuracy)
print("Precision: ", precision)
print("Recall: ", recall)
print("F1 Score: ", f1)