## Setup

In [1]:
import tensorflow as tf
import tensorflow_datasets as tfds

import os

NEG_DIRECTORY_PATH = './review_polarity/txt_sentoken/neg'
POS_DIRECTORY_PATH = './review_polarity/txt_sentoken/pos'

## Load Text Data

1. iterate through negative and positive text files

2. concat all lines per file to a single string

3. create tensor dataset from list of strings

4. label tensor dataset with 0 - negative | 1 - positive

In [2]:
labeled_data_sets = []

neg_file_names = list(os.listdir(NEG_DIRECTORY_PATH))
pos_file_names = list(os.listdir(POS_DIRECTORY_PATH))

all_lines_list = []
neg_lines_list = []
for file_name in neg_file_names:
  file = open(os.path.join(NEG_DIRECTORY_PATH, file_name))
  lines = ''
  for line in file:
    lines += line.rstrip() + ' '
  neg_lines_list.append(lines)
  all_lines_list.append(lines)
  file.close()

lines_dataset = tf.data.Dataset.from_tensor_slices(neg_lines_list)
labeled_data_set = lines_dataset.map(lambda ex: (ex, 0))
labeled_data_sets.append(labeled_data_set)

pos_lines_list = []
for file_name in pos_file_names:
  file = open(os.path.join(POS_DIRECTORY_PATH, file_name))
  lines = ''
  for line in file:
    lines += line.rstrip() + ' '
  pos_lines_list.append(lines)
  all_lines_list.append(lines)
  file.close()

lines_dataset = tf.data.Dataset.from_tensor_slices(pos_lines_list)
labeled_data_set = lines_dataset.map(lambda ex: (ex, 1))
labeled_data_sets.append(labeled_data_set)

## Prepare Data

1. Concat positive and negative reviews
2. Double check size of full dataset
3. Shuffle data

In [3]:
BUFFER_SIZE = 1000

neg_labeled_data = labeled_data_sets[0]
pos_labeled_data = labeled_data_sets[1]
print("Negative labeled data len:", len(list(neg_labeled_data)))
print("Negative data len:", len(neg_file_names))
print("Positive labeled data len:", len(list(pos_labeled_data)))
print("Positive data len:", len(pos_file_names))

pos_labeled_data = pos_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)
neg_labeled_data = neg_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

Negative labeled data len: 1000
Negative data len: 1000
Positive labeled data len: 1000
Positive data len: 1000


## Data Exploration

In [4]:
import explore_data as ed

print("# Samples:", len(all_lines_list))
print("Median num words per sample:", ed.get_num_words_per_sample(all_lines_list))
ed.plot_frequency_distribution_of_ngrams(all_lines_list,num_ngrams=20)
ed.plot_sample_length_distribution(all_lines_list)

# Samples: 2000
Median num words per sample: 696.5


<Figure size 640x480 with 1 Axes>

<Figure size 640x480 with 1 Axes>

In [5]:
for ex in neg_labeled_data.take(1):
  print(ex)
print("--------------")
for ex in pos_labeled_data.take(1):
  print(ex)

(<tf.Tensor: id=4046, shape=(), dtype=string, numpy=b'the premise of this movie is , well , pretty far-fetched . tom berenger plays shale , a mercenary who is temporarily out of work ( those fools at the cia have denied his existence just because he and his buddies botched a job in cuba ) . fortunately , his girl friend ( diane venora ) , a teacher at christopher columbus high school in miami , gets her knee cap broken by a disgruntled student , creating a job opening for shale as a substitute teacher . not telling his girl friend , who might object on pedagogical grounds , he creates a number of fake higher degrees for himself ( from yale , harvard , princeton , et al ) and begins his tenure as a high school teacher . the students ( junkies , drug dealers , gang members , sleazy sluts , ice-pick wielders . . . you get the picture ) don\'t really take to him right away , so he hits one in the face with a can and breaks a few fingers . this gets their attention to a certain extent , so 

## Tokenize and Encode Words

1. Get unique vocabulary set among data
2. Create encoder based on vocabulary set
3. Encode data text -> int using vocabulary as dictionary

In [6]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in neg_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

for text_tensor, _ in pos_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

39696

In [7]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [8]:
example_text = next(iter(pos_labeled_data))[0].numpy()
print(example_text)

b"one of the last entries in the long-running carry on series , carry on behind is very similar to carry on camping in that it involves a group of holidaymakers descending on a 'caravan' site . professors anna vrooshka ( elke sommer ) and roland crump ( kenneth williams ) and a group of archaeology students stay in the caravan site owned by major leep ( kenneth connor ) so that they can explore the nearby roman settlement remains . anna has a little trouble understanding english and sometimes people get the wrong end of the stick : - for instance , when anna is asking for 'scrubbers for dirty caravan' , she means that she wants a scrubbing brush to clean the caravan ! arthur upmore ( bernard bresslaw ) and his wife linda ( patsy rowlands ) take her mother daphne barnes ( joan sims ) and her minah bird on holiday with them . mother-in-law jokes prevail . furthermore , the trouble that joe and norma baxter ( ian lavender and adrienne posta ) have with their large irish greyhound allows f

In [9]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[38635, 3705, 6109, 27587, 24761, 37778, 6109, 2397, 3577, 7693, 8479, 9288, 7693, 8479, 12253, 3459, 36172, 28713, 12203, 7693, 8479, 1050, 37778, 11518, 18749, 6422, 36736, 33415, 3705, 7878, 24492, 8479, 36736, 26686, 13023, 5157, 17300, 21543, 12734, 34329, 33040, 31650, 4404, 15735, 5599, 33040, 36736, 33415, 3705, 26484, 26429, 12120, 37778, 6109, 26686, 13023, 27713, 19355, 13829, 8937, 15735, 15879, 1214, 11518, 20143, 34334, 33650, 6109, 35197, 33787, 34774, 17964, 17300, 11878, 36736, 31922, 18710, 20689, 31265, 33040, 36495, 32644, 6604, 6109, 38802, 19724, 3705, 6109, 6942, 19266, 36468, 9529, 17300, 3459, 3315, 19266, 31812, 19266, 12161, 26686, 18991, 13000, 11518, 18991, 4264, 36736, 15844, 37992, 12203, 39093, 6109, 26686, 22223, 31308, 39069, 38258, 33040, 27995, 11994, 6675, 14187, 15160, 10742, 33965, 15923, 27477, 38970, 14590, 37180, 33040, 33965, 13219, 23107, 8479, 5613, 2300, 17438, 15923, 37778, 10142, 1055, 35536, 34371, 6109, 18710, 11518, 39550, 33040, 32082

In [10]:
def encode(text_tensor, label):
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text, label

def encode_map_fn(text, label):
  # py_func doesn't set the shape of the returned tensors.
  encoded_text, label = tf.py_function(encode, 
                                       inp=[text, label], 
                                       Tout=(tf.int64, tf.int32))

  return encoded_text, label

In [11]:
pos_encoded_data = pos_labeled_data.map(encode_map_fn)

example_encoding = next(iter(pos_encoded_data))[0].numpy()
print(example_encoding)

neg_encoded_data = neg_labeled_data.map(encode_map_fn)

example_encoding = next(iter(neg_encoded_data))[0].numpy()
print(example_encoding)


[38635  3705  6109 27587 24761 37778  6109  2397  3577  7693  8479  9288
  7693  8479 12253  3459 36172 28713 12203  7693  8479  1050 37778 11518
 18749  6422 36736 33415  3705  7878 24492  8479 36736 26686 13023  5157
 17300 21543 12734 34329 33040 31650  4404 15735  5599 33040 36736 33415
  3705 26484 26429 12120 37778  6109 26686 13023 27713 19355 13829  8937
 15735 15879  1214 11518 20143 34334 33650  6109 35197 33787 34774 17964
 17300 11878 36736 31922 18710 20689 31265 33040 36495 32644  6604  6109
 38802 19724  3705  6109  6942 19266 36468  9529 17300  3459  3315 19266
 31812 19266 12161 26686 18991 13000 11518 18991  4264 36736 15844 37992
 12203 39093  6109 26686 22223 31308 39069 38258 33040 27995 11994  6675
 14187 15160 10742 33965 15923 27477 38970 14590 37180 33040 33965 13219
 23107  8479  5613  2300 17438 15923 37778 10142  1055 35536 34371  6109
 18710 11518 39550 33040 32082  4204 27061 22511 33040 36013 30002  1673
  2300 33164 28094 32555  6783 24033 19266 24545 26

## Split Train/Test Data

In [12]:
import math 

TRAIN_AMT = 0.8
BATCH_SIZE = 5

take_size = math.ceil(len(list(neg_encoded_data)) * (1 - TRAIN_AMT))
print(take_size)


200


In [13]:
train_data_pos = pos_encoded_data.skip(take_size).shuffle(BUFFER_SIZE)
train_data_neg = neg_encoded_data.skip(take_size).shuffle(BUFFER_SIZE)

all_labeled_train_data = train_data_pos.concatenate(train_data_neg).shuffle(BUFFER_SIZE * 2)
train_data = all_labeled_train_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

test_data_pos = pos_encoded_data.take(take_size)
test_data_neg = neg_encoded_data.take(take_size)
all_labeled_test_data = test_data_pos.concatenate(test_data_neg).shuffle(BUFFER_SIZE)
test_data = all_labeled_test_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

print(len(list(train_data_pos)) + len(list(train_data_neg)))
print(len(list(test_data_pos)) + len(list(test_data_neg)))
train_data_size = len(list(train_data))
test_data_size = len(list(test_data))
print(train_data_size)
print(test_data_size)

1600
400
320
80


In [14]:
for batch in train_data.take(1):
    print(batch)

(<tf.Tensor: id=38402, shape=(5, 931), dtype=int64, numpy=
array([[30718, 24164,   349, ...,     0,     0,     0],
       [ 7195, 19355, 19555, ..., 20544,  6109, 27314],
       [ 1607, 21642,  2887, ...,     0,     0,     0],
       [ 4188, 32581,  6091, ...,     0,     0,     0],
       [ 6109, 13982, 35053, ...,     0,     0,     0]])>, <tf.Tensor: id=38403, shape=(5,), dtype=int32, numpy=array([1, 0, 0, 0, 0], dtype=int32)>)


In [15]:
s_train = set()
for text, labels in train_data:
    s_train.add(labels[0].numpy())
    
s_test = set()
for text, labels in test_data:
    s_test.add(labels[0].numpy())
print(s_train)
print(s_test)


{0, 1}
{0, 1}


In [16]:
sample_text, sample_labels = next(iter(test_data))

print(sample_text)
sample_text[0].numpy(), sample_labels[0].numpy()

tf.Tensor(
[[ 3033 32219  3517 ...     0     0     0]
 [ 4515 18827  5179 ...     0     0     0]
 [18749  7929 34072 ...  6363  5170  4160]
 [30570  7422 23207 ...     0     0     0]
 [ 1214  4188 21243 ...     0     0     0]], shape=(5, 1235), dtype=int64)


(array([ 3033, 32219,  3517, ...,     0,     0,     0]), 0)

In [17]:
vocab_size += 1 # we added 0 for the padding

## Word Embeddings

We are going to create 5 word embeddings:

1. Bag of words encoding
2. Manually trained word embedding on data vocabulary set (Continuous bag of words model)
3. Pre-trained Glove 100-dimension embedding
4. Pre-trained Glove 300-dimension embedding
5. Pre-trained Word2Vec embedding

### 1. Bag of words

Here's an example of how we can use tf.one_hot and bitwise or to create a bag of words encoding for our vocab

In [18]:
from tensorflow.python.ops import bitwise_ops
test_vocab = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9}
test_sentences = [[1, 2, 3],
                  [4, 5, 6],
                  [7, 8, 9],
                  [0, 5, 1]]
depth = len(test_vocab)

def bitwise_or_multiple(tensors):
    res = tensors[0]
    for i in range(1, len(tensors)):
        res = bitwise_ops.bitwise_or(res, tensors[i])
    
    return res

bag_of_words_list = []
for sentence in tf.one_hot(test_sentences, depth):
    print(sentence)
    bag_of_words_list.append(tf.cast(bitwise_or_multiple(tf.cast(sentence, tf.uint32)), tf.float32))

bag_of_words = tf.stack(bag_of_words_list)
bag_of_words

tf.Tensor(
[[0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]], shape=(3, 10), dtype=float32)
tf.Tensor(
[[0. 0. 0. 0. 1. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]], shape=(3, 10), dtype=float32)
tf.Tensor(
[[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]], shape=(3, 10), dtype=float32)
tf.Tensor(
[[1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]], shape=(3, 10), dtype=float32)


<tf.Tensor: id=49314, shape=(4, 10), dtype=float32, numpy=
array([[0., 1., 1., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 1., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.],
       [1., 1., 0., 0., 0., 1., 0., 0., 0., 0.]], dtype=float32)>

### 2. Manually Trained Word Embedding

In [23]:
embedding_dim=16

model = tf.keras.Sequential([
  tf.keras.layers.Embedding(encoder.vocab_size, embedding_dim),
  tf.keras.layers.GlobalAveragePooling1D(),
  tf.keras.layers.Dense(16, activation='relu'),
  tf.keras.layers.Dense(1)
])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 16)          635168    
_________________________________________________________________
global_average_pooling1d (Gl (None, 16)                0         
_________________________________________________________________
dense (Dense)                (None, 16)                272       
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 17        
Total params: 635,457
Trainable params: 635,457
Non-trainable params: 0
_________________________________________________________________


In [24]:
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

history = model.fit(
    train_data,
    epochs=10,
    validation_data=test_data)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [25]:
e = model.layers[0]
weights_manual = e.get_weights()[0]
print(weights_manual.shape) # shape: (vocab_size, embedding_dim)

print(weights_manual[0])

(39698, 16)
[-0.01200673 -0.04377279 -0.01682586 -0.05665943 -0.0985653   0.1759432
 -0.0046955  -0.05334025 -0.07617784  0.03212772  0.10440928 -0.06768385
  0.03056481 -0.0538655   0.01223458  0.02964083]


## Create Final Model and Test Predictions

In [26]:
model = tf.keras.Sequential()

model.add(tf.keras.layers.Embedding(input_dim=encoder.vocab_size, 
                                    output_dim=embedding_dim, 
                                    weights=[weights_manual],
                                    mask_zero=True,
                                    trainable=False))

model.add(tf.keras.layers.Conv1D(filters=64,
                          kernel_size=5,
                          activation='relu',
                                ))

model.add(tf.keras.layers.GlobalAveragePooling1D())
model.add(tf.keras.layers.Dropout(rate=0))
model.add(tf.keras.layers.Dense(16, activation='relu'))
model.add(tf.keras.layers.Dense(1))


# Compile and train model
model.compile(optimizer='adam',
              loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              metrics=['accuracy'])

model.fit(train_data, epochs=8, validation_data=test_data)
eval_loss, eval_acc = model.evaluate(test_data)

Epoch 1/8
Epoch 2/8
Epoch 3/8
Epoch 4/8
Epoch 5/8
Epoch 6/8
Epoch 7/8
Epoch 8/8


In [27]:
pos_review = "This movie is a gosh darn masterpiece. It will make you belly laugh, \
            it will chill you to the bone, and it will make you shed a tear. This \
            movie will stay with you long after the credits are over. If you plan \
            on watching this movie, AVOID SPOILERS AT ALL COSTS."

neg_review_1 = "Saving Christmas needed saving...and by that, I mean it should have \
been mercy killed. the acting is horrendous and it's story is only a reminder that \
anyone that paid to see it wasted their money. Even if you are a die-hard Christian, \
please do not see this movie, it's one of the worst movies you could ever watch ever, \
and considering stuff like The Room and Birdemic exists, That's saying A lot. oh, and \
that thing Kirk Cameron's doing to try to boost the rating on Rotten Tomatoes: Cameron,\
you should know the Bible verse 'Thou shalt not bear false witness' it's one of the 10 \
commandments. That's all I have to say on this matter, Don't watch the film. I mean \
Seriously Cameron, you should know better."

neg_review_2 = "There is no script. Action poor. Acting Poor. A strict no! Pleas save \
find your money on this one! I wouldn't even rate if possible. Worst ever music! No heads \
or tails!"


In [28]:
for review in [(pos_review, 1), (neg_review_1, 0),(neg_review_2, 0)]:
    print("Review:",review[0])
    print("Class:", review[1])
    
    encoded = encoder.encode(review[0])
    print("Encoded:", encoded)
    
    print("Prediction:", model.predict_classes([encoded]))
    print("----------")
   

Review: This movie is a gosh darn masterpiece. It will make you belly laugh,             it will chill you to the bone, and it will make you shed a tear. This             movie will stay with you long after the credits are over. If you plan             on watching this movie, AVOID SPOILERS AT ALL COSTS.
Class: 1
Encoded: [39697, 1246, 3459, 36736, 21134, 38181, 36855, 39697, 22613, 10614, 16371, 30159, 26688, 18749, 22613, 26613, 16371, 12203, 6109, 37840, 33040, 18749, 22613, 10614, 16371, 402, 36736, 38075, 39697, 1246, 22613, 12120, 2300, 16371, 2397, 14292, 6109, 31432, 349, 24049, 39697, 16371, 20410, 8479, 32833, 4137, 1246, 39697, 39697, 39697, 39697, 39697]
Prediction: [[1]]
----------
Review: Saving Christmas needed saving...and by that, I mean it should have been mercy killed. the acting is horrendous and it's story is only a reminder that anyone that paid to see it wasted their money. Even if you are a die-hard Christian, please do not see this movie, it's one of the worst 