## Setup

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

import os

NEG_DIRECTORY_PATH = './review_polarity/txt_sentoken/neg'
POS_DIRECTORY_PATH = './review_polarity/txt_sentoken/pos'

## Load Text Data

1. iterate through negative and positive text files

2. concat all lines per file to a single string

3. create tensor dataset from list of strings

4. label tensor dataset with 0 - negative | 1 - positive

In [2]:
labeled_data_sets = []

neg_file_names = list(os.listdir(NEG_DIRECTORY_PATH))
pos_file_names = list(os.listdir(POS_DIRECTORY_PATH))

lines_list = []
for file_name in neg_file_names:
  file = open(os.path.join(NEG_DIRECTORY_PATH, file_name))
  lines = ''
  for line in file:
    lines += line.rstrip() + ' '
  lines_list.append(lines)
  file.close()

lines_dataset = tf.data.Dataset.from_tensor_slices(lines_list)
labeled_data_set = lines_dataset.map(lambda ex: (ex, 0))
labeled_data_sets.append(labeled_data_set)

lines_list = []
for file_name in pos_file_names:
  file = open(os.path.join(POS_DIRECTORY_PATH, file_name))
  lines = ''
  for line in file:
    lines += line.rstrip() + ' '
  lines_list.append(lines)
  file.close()

lines_dataset = tf.data.Dataset.from_tensor_slices(lines_list)
labeled_data_set = lines_dataset.map(lambda ex: (ex, 1))
labeled_data_sets.append(labeled_data_set)

## Prepare Data

1. Concat positive and negative reviews
2. Double check size of full dataset
3. Shuffle data

In [3]:
BUFFER_SIZE = 1000

neg_labeled_data = labeled_data_sets[0]
pos_labeled_data = labeled_data_sets[1]
print("Negative labeled data len:", len(list(neg_labeled_data)))
print("Negative data len:", len(neg_file_names))
print("Positive labeled data len:", len(list(pos_labeled_data)))
print("Positive data len:", len(pos_file_names))

pos_labeled_data = pos_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)
neg_labeled_data = neg_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

Negative labeled data len: 1000
Negative data len: 1000
Positive labeled data len: 1000
Positive data len: 1000


In [4]:
for ex in neg_labeled_data.take(1):
  print(ex)
print("--------------")
for ex in pos_labeled_data.take(1):
  print(ex)

(<tf.Tensor: shape=(), dtype=string, numpy=b' " mercury rising " has numerous flaws , but there is one that really stands out : its central plot device is unnecessary . that\'s right . the major aspect of the film , that which is supposed to make it different from other routine government conspiracy / action flicks could be dropped from the beginning , and the movie would turn out exactly the same , if not better . this central device is the fact that a nine-year-old boy is autistic . his name is simon , and an evil government bureaucrat named nicholas kudrow ( alec baldwin ) wants him dead because he unknowingly cracked a supersecret government code slipped into the back of a puzzle magazine by its programmers just to see if someone could beat it . simon is intended to be the heart and soul of the film , and we are supposed to feel for him because he is a poor handicapped child thrown into a violent , unfair world against his will , with only a renegade fbi agent played by bruce willi

## Tokenize and Encode Words

1. Get unique vocabulary set among data
2. Create encoder based on vocabulary set
3. Encode data text -> int using vocabulary as dictionary

In [5]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in neg_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

for text_tensor, _ in pos_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

39696

In [6]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [7]:
example_text = next(iter(pos_labeled_data))[0].numpy()
print(example_text)

b'allen , star of many a brian depalma movie in the early eighties , has a brief , throwaway part towards the end of " out of sight , " as the maid of a crooked financier . in keeping with her past performances , allen wears little more than a green velvet victoria\'s secret ensemble which begs the question , did they really need a costume designer for this ?  " out of sight " is not likely to secure ms . allen the kind of plaudits bestowed upon pam grier , who likewise returned from out of obscurity to appear in another recent elmore leonard adaptation , " jackie brown , " but it is intriguing to see her name kicking off the " also starring " credits given her limited screen time . the real stars of " out of sight " are george clooney ( " batman & robin " by way of " e . r . " ) and jennifer lopez ( " selena " ) , whose winning chemistry , coupled with steven soderbergh\'s freeze-frame directorial technique , help the film retain the charismatic charm of leonard\'s original work . wit

In [8]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[5969, 1406, 9040, 24347, 23111, 14778, 13513, 3334, 10552, 19979, 39107, 23080, 15633, 23111, 8197, 35602, 6406, 5554, 19979, 16865, 9040, 14824, 9040, 27544, 30953, 19979, 12792, 9040, 23111, 23746, 13219, 10552, 6318, 26373, 35976, 34539, 6011, 5969, 24420, 8708, 26824, 18488, 23111, 24567, 10157, 35497, 26869, 4443, 19871, 5686, 23825, 19979, 39171, 18588, 3981, 10017, 38883, 23111, 22751, 1207, 19064, 30535, 14824, 9040, 27544, 23642, 33384, 18214, 32041, 10717, 21962, 5969, 19979, 1683, 9040, 35844, 17172, 35444, 28403, 30182, 30592, 2435, 716, 35908, 14824, 9040, 21997, 32041, 3615, 10552, 17846, 37837, 29813, 38123, 901, 4818, 855, 39506, 26051, 23642, 23574, 32041, 34295, 35976, 31051, 26647, 17980, 19979, 32650, 11271, 33080, 13864, 35976, 13957, 28159, 28674, 19979, 5506, 38280, 9040, 14824, 9040, 27544, 1039, 20109, 31098, 14403, 23893, 405, 3312, 9040, 10641, 33459, 18747, 9209, 16936, 25726, 11198, 31168, 19218, 34428, 26373, 32429, 34069, 26869, 2988, 14622, 7323, 29359,

In [9]:
def encode(text_tensor, label):
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text, label

def encode_map_fn(text, label):
  # py_func doesn't set the shape of the returned tensors.
  encoded_text, label = tf.py_function(encode, 
                                       inp=[text, label], 
                                       Tout=(tf.int64, tf.int32))

  return encoded_text, label

In [10]:
pos_encoded_data = pos_labeled_data.map(encode_map_fn)

example_encoding = next(iter(pos_encoded_data))[0].numpy()
print(example_encoding)

neg_encoded_data = neg_labeled_data.map(encode_map_fn)

example_encoding = next(iter(neg_encoded_data))[0].numpy()
print(example_encoding)


[ 5969  1406  9040 24347 23111 14778 13513  3334 10552 19979 39107 23080
 15633 23111  8197 35602  6406  5554 19979 16865  9040 14824  9040 27544
 30953 19979 12792  9040 23111 23746 13219 10552  6318 26373 35976 34539
  6011  5969 24420  8708 26824 18488 23111 24567 10157 35497 26869  4443
 19871  5686 23825 19979 39171 18588  3981 10017 38883 23111 22751  1207
 19064 30535 14824  9040 27544 23642 33384 18214 32041 10717 21962  5969
 19979  1683  9040 35844 17172 35444 28403 30182 30592  2435   716 35908
 14824  9040 21997 32041  3615 10552 17846 37837 29813 38123   901  4818
   855 39506 26051 23642 23574 32041 34295 35976 31051 26647 17980 19979
 32650 11271 33080 13864 35976 13957 28159 28674 19979  5506 38280  9040
 14824  9040 27544  1039 20109 31098 14403 23893   405  3312  9040 10641
 33459 18747  9209 16936 25726 11198 31168 19218 34428 26373 32429 34069
 26869  2988 14622  7323 29359 33539 19979 25844 10749 19979  9148 24001
  9040 38123 26869 39688 15276 26373 19979 23229  9

## Split Train/Test Data

In [11]:
import math 

TRAIN_AMT = 0.8
BATCH_SIZE = 15

take_size = math.ceil(len(list(neg_encoded_data)) * (1 - TRAIN_AMT))
print(take_size)


200


In [12]:
train_data_pos = pos_encoded_data.skip(take_size).shuffle(BUFFER_SIZE)
train_data_neg = neg_encoded_data.skip(take_size).shuffle(BUFFER_SIZE)

all_labeled_train_data = train_data_pos.concatenate(train_data_neg)
train_data = all_labeled_train_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

test_data_pos = pos_encoded_data.take(take_size)
test_data_neg = neg_encoded_data.take(take_size)
all_labeled_test_data = test_data_pos.concatenate(test_data_neg)
test_data = all_labeled_test_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

print(len(list(train_data_pos)) + len(list(train_data_neg)))
print(len(list(test_data_pos)) + len(list(test_data_neg)))
train_data_size = len(list(train_data))
test_data_size = len(list(test_data))
print(train_data_size)
print(test_data_size)

1600
400
107
27


In [13]:
for batch in train_data.take(1):
    print(batch)

(<tf.Tensor: shape=(15, 1171), dtype=int64, numpy=
array([[ 8463, 33282,  7029, ..., 11137, 30472, 25257],
       [30003, 10351, 11362, ...,     0,     0,     0],
       [36448, 26869, 23111, ...,     0,     0,     0],
       ...,
       [ 1201, 38896, 22665, ...,     0,     0,     0],
       [38980, 23111, 39668, ...,     0,     0,     0],
       [30535, 11556, 23642, ...,     0,     0,     0]])>, <tf.Tensor: shape=(15,), dtype=int32, numpy=array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)>)


In [14]:
s_train = set()
for text, labels in train_data:
    s_train.add(labels[0].numpy())
    
s_test = set()
for text, labels in test_data:
    s_test.add(labels[0].numpy())
print(s_train)
print(s_test)


{0, 1}
{0, 1}


In [15]:
sample_text, sample_labels = next(iter(test_data))

print(sample_text)
sample_text[0].numpy(), sample_labels[0].numpy()

tf.Tensor(
[[ 5969  1406  9040 ...     0     0     0]
 [  189 26489  8847 ...     0     0     0]
 [30051 23038  1039 ...     0     0     0]
 ...
 [23111  3334 13092 ...     0     0     0]
 [26051 26869 33384 ...     0     0     0]
 [30953 26373  2994 ...     0     0     0]], shape=(15, 1114), dtype=int64)


(array([5969, 1406, 9040, ...,    0,    0,    0]), 1)

In [16]:
vocab_size += 1 # we added 0 for the padding

## Word Embeddings

We are going to create 5 word embeddings:

1. Bag of words encoding
2. Manually trained word embedding on data vocabulary set (Continuous bag of words model)
3. Pre-trained Glove 100-dimension embedding
4. Pre-trained Glove 300-dimension embedding
5. Pre-trained Word2Vec embedding

### 1. Bag of words

Here's an example of how we can use tf.one_hot and bitwise or to create a bag of words encoding for our vocab

In [17]:
from tensorflow.python.ops import bitwise_ops
test_vocab = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
test_sentences = [[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9],
                  [0, 5, 1]]
depth = len(test_vocab)

def bitwise_or_multiple(tensors):
    res = tensors[0]
    for i in range(1, len(tensors)):
        res = bitwise_ops.bitwise_or(res, tensors[i])
    
    return res

bag_of_words_list = []
for sentence in tf.one_hot(test_sentences, depth):
    print(sentence)
    bag_of_words_list.append(bitwise_or_multiple(tf.cast(sentence, tf.uint32)))

bag_of_words = tf.stack(bag_of_words_list)
bag_of_words

tf.Tensor(
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]], shape=(3, 10), dtype=float32)
tf.Tensor(
[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]], shape=(3, 10), dtype=float32)
tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]], shape=(3, 10), dtype=float32)
tf.Tensor(
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(3, 10), dtype=float32)


<tf.Tensor: shape=(4, 10), dtype=uint32, numpy=
array([[0, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [1, 1, 0, 0, 0, 1, 0, 0, 0, 0]], dtype=uint32)>

Applying the above to our data

In [18]:
def bag_of_words(d, vocab_size):    
    bow_list = []
    for s in tf.one_hot(d, vocab_size):
        bow_list.append(bitwise_or_multiple(tf.cast(s, tf.uint32))[1:])

    return tf.stack(bow_list)

def bag_of_words_fn(text, labels):
  # py_func doesn't set the shape of the returned tensors.
  encoded_text = tf.py_function(bag_of_words, 
                                       inp=[text, vocab_size], 
                                       Tout=tf.uint32)

  return encoded_text, labels


print(bag_of_words(sample_text, vocab_size))

tf.Tensor(
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]], shape=(15, 39696), dtype=uint32)


In [19]:
## NOTE: This takes a while
# train_data_bow = train_data.map(bag_of_words_fn)

# sample_text_bow, sample_labels_bow = next(iter(train_data_bow))

# print(sample_text_bow)
# sample_text_bow[0].numpy(), sample_labels_bow[0].numpy()

In [20]:
# test_data_bow = test_data.map(bag_of_words_fn)

# sample_text_bow, sample_labels_bow = next(iter(test_data_bow))

# print(sample_text_bow)
# sample_text_bow[0].numpy(), sample_labels_bow[0].numpy()

### 2. Manually Trained Word Embedding

In [21]:
embedding_dim=16

model = tf.keras.Sequential([
  tf.keras.layers.Embedding(encoder.vocab_size, embedding_dim),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dense(1)
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          635168    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 635,457
Trainable params: 635,457
Non-trainable params: 0
_________________________________________________________________


In [22]:
sample_text

<tf.Tensor: shape=(15, 1114), dtype=int64, numpy=
array([[ 5969,  1406,  9040, ...,     0,     0,     0],
       [  189, 26489,  8847, ...,     0,     0,     0],
       [30051, 23038,  1039, ...,     0,     0,     0],
       ...,
       [23111,  3334, 13092, ...,     0,     0,     0],
       [26051, 26869, 33384, ...,     0,     0,     0],
       [30953, 26373,  2994, ...,     0,     0,     0]])>

In [23]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

history = model.fit(
    train_data,
    epochs=10,
    validation_data=test_data, validation_steps=20)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100

KeyboardInterrupt: 