## Setup

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

import os

NEG_DIRECTORY_PATH = './review_polarity/txt_sentoken/neg'
POS_DIRECTORY_PATH = './review_polarity/txt_sentoken/pos'

## Load Text Data

1. iterate through negative and positive text files

2. concat all lines per file to a single string

3. create tensor dataset from list of strings

4. label tensor dataset with 0 - negative | 1 - positive

In [2]:
labeled_data_sets = []

neg_file_names = list(os.listdir(NEG_DIRECTORY_PATH))
pos_file_names = list(os.listdir(POS_DIRECTORY_PATH))

lines_list = []
for file_name in neg_file_names:
  file = open(os.path.join(NEG_DIRECTORY_PATH, file_name))
  lines = ''
  for line in file:
    lines += line.rstrip() + ' '
  lines_list.append(lines)
  file.close()

lines_dataset = tf.data.Dataset.from_tensor_slices(lines_list)
labeled_data_set = lines_dataset.map(lambda ex: (ex, 0))
labeled_data_sets.append(labeled_data_set)

lines_list = []
for file_name in pos_file_names:
  file = open(os.path.join(POS_DIRECTORY_PATH, file_name))
  lines = ''
  for line in file:
    lines += line.rstrip() + ' '
  lines_list.append(lines)
  file.close()

lines_dataset = tf.data.Dataset.from_tensor_slices(lines_list)
labeled_data_set = lines_dataset.map(lambda ex: (ex, 1))
labeled_data_sets.append(labeled_data_set)

## Prepare Data

1. Concat positive and negative reviews
2. Double check size of full dataset
3. Shuffle data

In [3]:
BUFFER_SIZE = 1000

neg_labeled_data = labeled_data_sets[0]
pos_labeled_data = labeled_data_sets[1]
print("Negative labeled data len:", len(list(neg_labeled_data)))
print("Negative data len:", len(neg_file_names))
print("Positive labeled data len:", len(list(pos_labeled_data)))
print("Positive data len:", len(pos_file_names))

pos_labeled_data = pos_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)
neg_labeled_data = neg_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

Negative labeled data len: 1000
Negative data len: 1000
Positive labeled data len: 1000
Positive data len: 1000


In [4]:
for ex in neg_labeled_data.take(1):
  print(ex)
print("--------------")
for ex in pos_labeled_data.take(1):
  print(ex)

(<tf.Tensor: shape=(), dtype=string, numpy=b'seen december 2 , 1997 at 6 : 50 p . m . at the glenwood movieplex cinemas ( oneida , ny ) , theater #3 , by myself for free ( free pass ) . [theater rating : * * * : good seats , sound , and picture] there are many philosophies as to why we are so fascinated with cartoons . they provide a method of total escapism in which anything will work within their context , from the outrageous slapstick of looney tunes to the intensity of japanimation . watching " flubber " really clinched this idea for me , because it\'s just a live action cartoon that presents itself as a regular comedy . it proves how painfully unfunny all those gags and slapstick would be in reality , and how important it is to actually have a story . the film wastes no time in establishing its lighthearted , cartoony atmosphere . we meet medfield college chemistry professor phillip brainard ( williams ) , the typical , supposedly likable mad scientist . within the first 10 minute

## Tokenize and Encode Words

1. Get unique vocabulary set among data
2. Create encoder based on vocabulary set
3. Encode data text -> int using vocabulary as dictionary

In [5]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in neg_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

for text_tensor, _ in pos_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

39696

In [6]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [7]:
example_text = next(iter(pos_labeled_data))[0].numpy()
print(example_text)

b'clue is an unfairly ignored comedy , very similar to 1976\'s murder by death . this big screen version of the classic board game ( what\'s next . . . chutes and ladders : the motion picture ? ) is filled with slapstick antics and silly dialogue . the plot , for what it\'s worth , has all the characters from the game ( in this case , the names from the game are used as aliases ) meeting in an isolated mansion to confront mr . boddy ( lee ving ) , the man who\'s been blackmailing them all . when he turns up dead , everyone ( including the audience ) must figure out whodunnit . . . and in what room , and with what object . while not as witty as neil simon\'s murder by death , clue definitely has it moments . it has so many moments in fact that i use a lot of the lines from the film when i\'m joking around with my friends . to this day , whenever someone says the phrase " well , to make a long story short " i have the follow up phrase " too late " ready to go . the cast ( all very good c

In [8]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[21508, 16125, 29981, 6207, 23693, 292, 31415, 33530, 3730, 6726, 33923, 21229, 31746, 27701, 5978, 29141, 11487, 26417, 4477, 30687, 28934, 5634, 29489, 39456, 33923, 39425, 11993, 13083, 9935, 30687, 23710, 25757, 16125, 30781, 23373, 25400, 1190, 13083, 10301, 734, 30687, 39065, 19557, 39456, 12625, 33923, 15004, 23136, 34663, 30687, 6807, 33622, 30687, 29489, 37023, 5978, 2782, 30687, 19229, 33622, 30687, 29489, 11366, 23037, 17908, 304, 37008, 37023, 29981, 32442, 23183, 3730, 30927, 11733, 15199, 36497, 17313, 30687, 31515, 33567, 33923, 28957, 25546, 7273, 34663, 20055, 35984, 2238, 38169, 39507, 35907, 4282, 30687, 9491, 22603, 22320, 1027, 22183, 13083, 37023, 39456, 33847, 13083, 23373, 39456, 9177, 25574, 3050, 17908, 23150, 17908, 17429, 20298, 33923, 21229, 31746, 27701, 21508, 26287, 23136, 12625, 17995, 12625, 23136, 7378, 14609, 17995, 37023, 10073, 27316, 34805, 2484, 26578, 23173, 4477, 30687, 19284, 33622, 30687, 4453, 20055, 34805, 12232, 34077, 11217, 23373, 32903,

In [9]:
def encode(text_tensor, label):
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text, label

def encode_map_fn(text, label):
  # py_func doesn't set the shape of the returned tensors.
  encoded_text, label = tf.py_function(encode, 
                                       inp=[text, label], 
                                       Tout=(tf.int64, tf.int32))

  return encoded_text, label

In [10]:
pos_encoded_data = pos_labeled_data.map(encode_map_fn)

example_encoding = next(iter(pos_encoded_data))[0].numpy()
print(example_encoding)

neg_encoded_data = neg_labeled_data.map(encode_map_fn)

example_encoding = next(iter(neg_encoded_data))[0].numpy()
print(example_encoding)


[21508 16125 29981  6207 23693   292 31415 33530  3730  6726 33923 21229
 31746 27701  5978 29141 11487 26417  4477 30687 28934  5634 29489 39456
 33923 39425 11993 13083  9935 30687 23710 25757 16125 30781 23373 25400
  1190 13083 10301   734 30687 39065 19557 39456 12625 33923 15004 23136
 34663 30687  6807 33622 30687 29489 37023  5978  2782 30687 19229 33622
 30687 29489 11366 23037 17908   304 37008 37023 29981 32442 23183  3730
 30927 11733 15199 36497 17313 30687 31515 33567 33923 28957 25546  7273
 34663 20055 35984  2238 38169 39507 35907  4282 30687  9491 22603 22320
  1027 22183 13083 37023 39456 33847 13083 23373 39456  9177 25574  3050
 17908 23150 17908 17429 20298 33923 21229 31746 27701 21508 26287 23136
 12625 17995 12625 23136  7378 14609 17995 37023 10073 27316 34805  2484
 26578 23173  4477 30687 19284 33622 30687  4453 20055 34805 12232 34077
 11217 23373 32903 24422  3730  5978 30300 21943  9119 17422 30687 11837
  7171  3730  6873 26578 22153 26812 20966 34805 17

## Split Train/Test Data

In [11]:
import math 

TRAIN_AMT = 0.8
BATCH_SIZE = 15

take_size = math.ceil(len(list(neg_encoded_data)) * (1 - TRAIN_AMT))
print(take_size)


200


In [12]:
train_data_pos = pos_encoded_data.skip(take_size).shuffle(BUFFER_SIZE)
train_data_neg = neg_encoded_data.skip(take_size).shuffle(BUFFER_SIZE)

all_labeled_train_data = train_data_pos.concatenate(train_data_neg)
train_data = all_labeled_train_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

test_data_pos = pos_encoded_data.take(take_size)
test_data_neg = neg_encoded_data.take(take_size)
all_labeled_test_data = test_data_pos.concatenate(test_data_neg)
test_data = all_labeled_test_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

print(len(list(train_data_pos)) + len(list(train_data_neg)))
print(len(list(test_data_pos)) + len(list(test_data_neg)))
train_data_size = len(list(train_data))
test_data_size = len(list(test_data))
print(train_data_size)
print(test_data_size)

1600
400
107
27


In [13]:
for batch in train_data.take(1):
    print(batch)

(<tf.Tensor: shape=(15, 1397), dtype=int64, numpy=
array([[39372, 32742, 13083, ...,     0,     0,     0],
       [30687,  8037,  5316, ...,     0,     0,     0],
       [32651, 11366, 21848, ...,     0,     0,     0],
       ...,
       [15799, 10903, 31746, ...,     0,     0,     0],
       [14609, 12429, 18441, ...,  9528, 19759, 39035],
       [37982, 20055, 26042, ...,     0,     0,     0]])>, <tf.Tensor: shape=(15,), dtype=int32, numpy=array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], dtype=int32)>)


In [14]:
s_train = set()
for text, labels in train_data:
    s_train.add(labels[0].numpy())
    
s_test = set()
for text, labels in test_data:
    s_test.add(labels[0].numpy())
print(s_train)
print(s_test)


{0, 1}
{0, 1}


In [15]:
sample_text, sample_labels = next(iter(test_data))

print(sample_text)
sample_text[0].numpy(), sample_labels[0].numpy()

tf.Tensor(
[[21508 16125 29981 ...     0     0     0]
 [26021 17908 26578 ...     0     0     0]
 [14442  9129  8695 ...     0     0     0]
 ...
 [30687 20480 39000 ...     0     0     0]
 [30687 26812  4477 ...     0     0     0]
 [17407 19759 18002 ...     0     0     0]], shape=(15, 1312), dtype=int64)


(array([21508, 16125, 29981, ...,     0,     0,     0]), 1)

In [16]:
vocab_size += 1 # we added 0 for the padding

## Word Embeddings

We are going to create 5 word embeddings:

1. Bag of words encoding
2. Manually trained word embedding on data vocabulary set (Continuous bag of words model)
3. Pre-trained Glove 100-dimension embedding
4. Pre-trained Glove 300-dimension embedding
5. Pre-trained Word2Vec embedding

### 1. Bag of words

Here's an example of how we can use tf.one_hot and bitwise or to create a bag of words encoding for our vocab

In [17]:
from tensorflow.python.ops import bitwise_ops
test_vocab = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
test_sentences = [[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9],
                  [0, 5, 1]]
depth = len(test_vocab)

def bitwise_or_multiple(tensors):
    res = tensors[0]
    for i in range(1, len(tensors)):
        res = bitwise_ops.bitwise_or(res, tensors[i])
    
    return res

bag_of_words_list = []
for sentence in tf.one_hot(test_sentences, depth):
    print(sentence)
    bag_of_words_list.append(bitwise_or_multiple(tf.cast(sentence, tf.uint32)))

bag_of_words = tf.stack(bag_of_words_list)
bag_of_words

tf.Tensor(
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]], shape=(3, 10), dtype=float32)
tf.Tensor(
[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]], shape=(3, 10), dtype=float32)
tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]], shape=(3, 10), dtype=float32)
tf.Tensor(
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(3, 10), dtype=float32)


<tf.Tensor: shape=(4, 10), dtype=uint32, numpy=
array([[0, 1, 1, 1, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 1, 1, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 1, 1, 1],
       [1, 1, 0, 0, 0, 1, 0, 0, 0, 0]], dtype=uint32)>

Applying the above to our data

In [18]:
def bag_of_words(d, vocab_size):    
    bow_list = []
    for s in tf.one_hot(d, vocab_size):
        bow_list.append(bitwise_or_multiple(tf.cast(s, tf.uint32))[1:])

    return tf.stack(bow_list)

def bag_of_words_fn(text, labels):
  # py_func doesn't set the shape of the returned tensors.
  encoded_text = tf.py_function(bag_of_words, 
                                       inp=[text, vocab_size], 
                                       Tout=tf.uint32)

  return encoded_text, labels


print(bag_of_words(sample_text, vocab_size))

tf.Tensor(
[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]], shape=(15, 39696), dtype=uint32)


In [19]:
## NOTE: This takes a while
# train_data_bow = train_data.map(bag_of_words_fn)

# sample_text_bow, sample_labels_bow = next(iter(train_data_bow))

# print(sample_text_bow)
# sample_text_bow[0].numpy(), sample_labels_bow[0].numpy()

In [20]:
# test_data_bow = test_data.map(bag_of_words_fn)

# sample_text_bow, sample_labels_bow = next(iter(test_data_bow))

# print(sample_text_bow)
# sample_text_bow[0].numpy(), sample_labels_bow[0].numpy()

### 2. Manually Trained Word Embedding

In [21]:
embedding_dim=16

model = tf.keras.Sequential([
  tf.keras.layers.Embedding(encoder.vocab_size, embedding_dim),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dense(1)
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          635168    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 635,457
Trainable params: 635,457
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

history = model.fit(
    train_data,
    epochs=10,
    validation_data=test_data, validation_steps=20)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
