## Setup

In [2]:
import tensorflow as tf
import tensorflow_datasets as tfds

import os

NEG_DIRECTORY_PATH = './review_polarity/txt_sentoken/neg'
POS_DIRECTORY_PATH = './review_polarity/txt_sentoken/pos'

## Load Text Data

1. iterate through negative and positive text files

2. concat all lines per file to a single string

3. create tensor dataset from list of strings

4. label tensor dataset with 0 - negative | 1 - positive

In [3]:
labeled_data_sets = []

neg_file_names = list(os.listdir(NEG_DIRECTORY_PATH))
pos_file_names = list(os.listdir(POS_DIRECTORY_PATH))

lines_list = []
for file_name in neg_file_names:
  file = open(os.path.join(NEG_DIRECTORY_PATH, file_name))
  lines = ''
  for line in file:
    lines += line.rstrip() + ' '
  lines_list.append(lines)
  file.close()

lines_dataset = tf.data.Dataset.from_tensor_slices(lines_list)
labeled_data_set = lines_dataset.map(lambda ex: (ex, 0))
labeled_data_sets.append(labeled_data_set)

lines_list = []
for file_name in pos_file_names:
  file = open(os.path.join(POS_DIRECTORY_PATH, file_name))
  lines = ''
  for line in file:
    lines += line.rstrip() + ' '
  lines_list.append(lines)
  file.close()

lines_dataset = tf.data.Dataset.from_tensor_slices(lines_list)
labeled_data_set = lines_dataset.map(lambda ex: (ex, 1))
labeled_data_sets.append(labeled_data_set)

## Prepare Data

1. Concat positive and negative reviews
2. Double check size of full dataset
3. Shuffle data

In [28]:
BUFFER_SIZE = 250

#all_labeled_data = labeled_data_sets[0].concatenate(labeled_data_sets[1])
neg_labeled_data = labeled_data_sets[0]
pos_labeled_data = labeled_data_sets[1]
print(len(neg_file_names))
print(len(pos_file_names))
#print(len(list(all_labeled_data)))

pos_labeled_data = pos_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)
neg_labeled_data = neg_labeled_data.shuffle(
    BUFFER_SIZE, reshuffle_each_iteration=False)

print(len(list(neg_labeled_data)))
print(len(list(pos_labeled_data)))

1000
1000
1000
1000


In [30]:
for ex in neg_labeled_data.take(1):
  print(ex)
print("--------------")
for ex in pos_labeled_data.take(1):
  print(ex)

(<tf.Tensor: id=41384, shape=(), dtype=string, numpy=b'one of the responses those that enjoy " detroit rock city " ( probably kiss fans , mostly ) might have upon first glance at the rating i\'ve given the film might be something like : " oh , that casey\'s gone and become a jaded critic on us . just what did he expect out of a dumb teenage rock n\' roll movie like this ? " i\'m wondering the same thing . i feel like i should have had a grand time with " detroit rock city . " it\'s the sort of movie i wish i could\'ve had a lot of fun with , but i didn\'t . i just didn\'t . surely this film isn\'t trying to win any major awards , so should i have expected an oscar-caliber film ? no , but i expected something . a funny joke . a clever prank . a plot development . anything . the movie never delivers . you\'ve got to marvel at how the filmmakers managed to come up with a movie that is truly about nothing .  " detroit rock city " is one of those films that you walk out of after the credits

## Tokenize and Encode Words

1. Get unique vocabulary set among data
2. Create encoder based on vocabulary set
3. Encode data text -> int using vocabulary as dictionary

In [31]:
tokenizer = tfds.features.text.Tokenizer()

vocabulary_set = set()
for text_tensor, _ in neg_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

for text_tensor, _ in pos_labeled_data:
  some_tokens = tokenizer.tokenize(text_tensor.numpy())
  vocabulary_set.update(some_tokens)

vocab_size = len(vocabulary_set)
vocab_size

39696

In [32]:
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

In [34]:
example_text = next(iter(pos_labeled_data))[0].numpy()
print(example_text)
print("--------------------------------")
example_text = next(iter(neg_labeled_data))[0].numpy()
print(example_text)

b'jerry springer has got nothing on " wild things . " john mcnaughton\'s new thriller tackles more tawdry themes in less than two hours than springer\'s notoriously sleazy talk show broadcasts in two weeks -- bisexuality , threesomes , poolside catfights , slutty rich bimbos , even redneck gator-wrestling , they\'re all part of the movie\'s raucous , complex storyline . but even trash tv topicality can\'t drag " wild things " down -- this crazy campfest plays like something you\'d find late-night on the usa network , only infinitely more palatable and with a solid ensemble cast . despite a smattering of needless scenes ( most of them sexual in nature ) , there\'s wicked fun to be had here .  " wild things " would be a guilty pleasure , only there\'s no guilty feeling involved in having a good time with it . high school guidance counselor sam lombardo ( matt dillon ) is well-liked in the town of blue bay , especially by pretty , popular kelly van ryan ( denise richards ) , whose family 

In [35]:
encoded_example = encoder.encode(example_text)
print(encoded_example)

[34252, 8614, 13535, 24262, 14969, 16878, 3983, 7078, 27837, 24393, 8024, 14139, 26408, 15541, 18348, 25622, 10904, 5014, 11337, 30048, 13535, 20372, 17742, 35658, 26566, 13535, 19492, 18348, 3008, 7178, 23642, 16612, 16878, 17763, 2424, 37628, 4479, 20022, 10988, 25420, 32245, 21555, 19217, 4220, 34068, 29771, 18051, 30799, 7422, 8614, 10988, 38739, 4168, 27837, 18562, 23357, 192, 23642, 3975, 17742, 6773, 22230, 13535, 28599, 11627, 17742, 17107, 23642, 17742, 7085, 25622, 3031, 10988, 5432, 5083, 27241, 7078, 27837, 24393, 19235, 2424, 13535, 11799, 8614, 192, 17742, 38529, 17742, 11688, 35658, 3031, 10988, 38280, 8614, 13496, 27241, 19742, 17742, 34031, 2182, 17742, 4220, 34031, 2182, 21629, 3975, 19492, 34377, 2182, 23458, 23353, 3754, 31752, 7023, 17397, 22551, 7085, 17742, 25622, 7338, 39387, 19283, 8914, 19492, 28363, 19742, 17742, 7338, 7178, 10988, 33883, 922, 10988, 26852, 9092, 10988, 27713, 8428, 24039, 13535, 192, 9502, 2391, 14325, 35658, 25067, 23353, 19503, 30048, 1337

In [36]:
def encode(text_tensor, label):
  encoded_text = encoder.encode(text_tensor.numpy())
  return encoded_text, label

def encode_map_fn(text, label):
  # py_func doesn't set the shape of the returned tensors.
  encoded_text, label = tf.py_function(encode, 
                                       inp=[text, label], 
                                       Tout=(tf.int64, tf.int32))

  return encoded_text, label

In [37]:
pos_encoded_data = pos_labeled_data.map(encode_map_fn)

example_encoding = next(iter(pos_encoded_data))[0].numpy()
#x = next(iter(all_encoded_data))[1].numpy()
print(example_encoding)

neg_encoded_data = neg_labeled_data.map(encode_map_fn)

example_encoding = next(iter(neg_encoded_data))[0].numpy()
#x = next(iter(all_encoded_data))[1].numpy()
print(example_encoding)
#print(x)
#print(labels)

[ 4084 33198 38012 25067 28716 21555 32927  1367 12267 11746  2424 12851
  7777 16416 15076 10460 11230 37926  7359 29228 27313 16565 29228 33198
  2424 18741 12811 35087 19352 23081 37926 27313  7544 31294  7578  8492
 13417  5596  5363   322 17433 31311 31020 16388  9377 15334 21846 11948
  8614 13535   192  2424 31025 24104 37411 19742 17433 35028 22275  6681
 13271  2182  4085 32927  1367 23196  3975 24731 23643 34720 23642  7178
 14325 13664 31496 37942 39311 21555 13535 22984 17581 31101 24922 15076
 33366  4479 27241 10988 21859 17881 30248 22466 10988 39447  8614 19961
  9528   141  8614 34896  1848 37926 28170 15780  2424 30254 13496 23353
  3008  3031 11569 32927  1367 10787  3008 10988 32763 21493 31101 15780
  2424 28363 32763 29487  9740 37926 38908 10988 13293  5083 27241 19235
 30000 28498 22722 19506   460 34028 21169 39692 23533 23449 15175 37926
 13535 37714  8614  8852 30834  1235 16472 35903 16128 20622  4038 35380
 20740  4375   644  4458 39582 23533 33545 13535  4

## Split Train/Test Data

In [38]:
import math 

TRAIN_AMT = 0.8
BATCH_SIZE = 25

take_size = math.ceil(len(list(pos_encoded_data)) * (1 - TRAIN_AMT))
print(take_size)

take_size = math.ceil(len(list(neg_encoded_data)) * (1 - TRAIN_AMT))
print(take_size)


200
200


In [46]:
train_data_pos = pos_encoded_data.skip(take_size).shuffle(BUFFER_SIZE)
train_data_neg = neg_encoded_data.skip(take_size).shuffle(BUFFER_SIZE)
#print(type(train_data_pos), type(train_data_neg))
all_labeled_train_data = train_data_pos.concatenate(train_data_neg)
#print(all_labeled_data)
#for ex in all_labeled_data.take(1):
#    print(ex)
#print(len(list(all_labeled_data)))
train_data = all_labeled_train_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

test_data_pos = pos_encoded_data.take(take_size)
test_data_neg = neg_encoded_data.take(take_size)
all_labeled_test_data = test_data_pos.concatenate(test_data_neg)
print(len(list(all_labeled_test_data)))
test_data = all_labeled_test_data.padded_batch(BATCH_SIZE, padded_shapes=([None],[]))

400


In [53]:
s_train = set()
for text, labels in train_data:
    #print(text[0].numpy(), labels[0].numpy())
    s_train.add(labels[0].numpy())
    
s_test = set()
for text, labels in test_data:
    #print(text[0].numpy(), labels[0].numpy())
    s_test.add(labels[0].numpy())
print(s_train)
print(s_test)
#sample_text, sample_labels = next(iter(test_data))

#print(sample_text)
#sample_text[0].numpy(), sample_labels[0].numpy()
#for i,j in test_data:
#    print(i, j)
        #print("Test Data Tensor:", i)
        #print("Test Data Tensor Length:", len(i))

{0, 1}
{0, 1}


In [48]:
sample_text, sample_labels = next(iter(test_data))

print(sample_text)
sample_text[0].numpy(), sample_labels[0].numpy()

tf.Tensor(
[[ 4084 33198 38012 ...     0     0     0]
 [21555 19148 14138 ...     0     0     0]
 [ 8385 28870  2424 ...     0     0     0]
 ...
 [19235 23533 28153 ...     0     0     0]
 [22979 32151 30038 ...     0     0     0]
 [33233 28168  6615 ...     0     0     0]], shape=(25, 1615), dtype=int64)


(array([ 4084, 33198, 38012, ...,     0,     0,     0]), 1)

In [54]:
vocab_size += 1

In [56]:
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(encoder.vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
              optimizer=tf.keras.optimizers.Adam(1e-4),
              metrics=['accuracy'])
history = model.fit(train_data, epochs=3,
                    validation_data=test_data,
                    validation_steps=30)
test_loss, test_acc = model.evaluate(test_data)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))
#plot_graphs(history, 'accuracy')
#plot_graphs(history, 'loss')

Epoch 1/3
Test Loss: 0.6931550316512585
Test Accuracy: 0.5


NameError: name 'plot_graphs' is not defined

In [50]:
from keras.layers import Dense, Dropout, Activation
from keras.layers import Conv1D, MaxPooling1D
model = tf.keras.Sequential()
pool_size = 4
filters = 64
kernel_size = 5

model.add(tf.keras.layers.Embedding(vocab_size, 8))
#model.add(tf.keras.layers.Flatten())
#model.add(tf.keras.layers.Dense(units, activation='relu'))
model.add(tf.keras.layers.Dropout(0.25))
model.add(tf.keras.layers.Conv1D(filters, kernel_size, padding='valid', activation='relu', strides=1))
model.add(tf.keras.layers.MaxPooling1D(pool_size=pool_size))
#model.add(tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(50)))
#for units in [50, 50]:
#    model.add(tf.keras.layers.Dense(units, activation='relu'))
model.add(tf.keras.layers.LSTM(50))
model.add(tf.keras.layers.Dense(3))
model.add(tf.keras.layers.Activation('sigmoid'))
model.compile(optimizer='adam',
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True),
              metrics=['accuracy'])
model.fit(train_data, epochs=3, validation_data=test_data)
eval_loss, eval_acc = model.evaluate(test_data)

print('\nEval loss: {:.3f}, Eval accuracy: {:.3f}'.format(eval_loss, eval_acc))

Epoch 1/3
Epoch 2/3
Epoch 3/3

Eval loss: 0.874, Eval accuracy: 0.500


In [51]:
#embedding_layer = layers.Embedding(1000, 5)
##result = embedding_layer(tf.constant([1,2,3]))
#result.numpy()
#result = embedding_layer(tf.constant([[0,1,2],[3,4,5]]))
#result.shape
for i in test_data:
    print("Test Data Tensor:", i)
    print("Test Data Tensor Length:", len(i))
    print("Prediction:", model.predict_classes(i))

Test Data Tensor: (<tf.Tensor: id=102825, shape=(25, 1615), dtype=int64, numpy=
array([[ 4084, 33198, 38012, ...,     0,     0,     0],
       [21555, 19148, 14138, ...,     0,     0,     0],
       [ 8385, 28870,  2424, ...,     0,     0,     0],
       ...,
       [19235, 23533, 28153, ...,     0,     0,     0],
       [22979, 32151, 30038, ...,     0,     0,     0],
       [33233, 28168,  6615, ...,     0,     0,     0]])>, <tf.Tensor: id=102826, shape=(25,), dtype=int32, numpy=
array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1], dtype=int32)>)
Test Data Tensor Length: 2
Prediction: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Test Data Tensor: (<tf.Tensor: id=103464, shape=(25, 1216), dtype=int64, numpy=
array([[ 4381, 13535, 37329, ...,     0,     0,     0],
       [13535, 34260,  8614, ...,     0,     0,     0],
       [21555, 13535,   823, ...,     0,     0,     0],
       ...,
       [14325, 25795, 37077, ...,     0,     0,     0],
 

Prediction: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Test Data Tensor: (<tf.Tensor: id=108568, shape=(25, 1185), dtype=int64, numpy=
array([[35949, 37926,  4921, ...,     0,     0,     0],
       [ 3539, 22619, 23533, ...,     0,     0,     0],
       [11569,  2424, 10988, ...,     0,     0,     0],
       ...,
       [31424, 37926, 13535, ...,     0,     0,     0],
       [25379, 23533, 26258, ...,     0,     0,     0],
       [24440, 13260, 23486, ...,     0,     0,     0]])>, <tf.Tensor: id=108569, shape=(25,), dtype=int32, numpy=
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0], dtype=int32)>)
Test Data Tensor Length: 2
Prediction: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
Test Data Tensor: (<tf.Tensor: id=109206, shape=(25, 951), dtype=int64, numpy=
array([[13712, 10763, 26055, ...,     0,     0,     0],
       [34252,  8614, 13535, ...,     0,     0,     0],
       [17742, 10787, 13276, ...,     0,     0,     0],
      

Prediction: [1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1]
