In [1]:
import tensorflow as tf

In [2]:
# Classify movie reviews as positive or negative based on the text content of the reviews.

### IMDB Data

- Set of 50000 highly polarized reviews from the IMDB dataset.
- Split into 25000 training set and 25000 test set.
- Each set contains 50% positive reviews; 50% negative reviews.
- Dataset comes preprocessed and packaged with Keras.
- The reviews (sequences of word) have been turned into sequences of integers.
- Each integer sttands for a specific word in the dictionary.

In [6]:
# load the IMDB dataset.

from keras.datasets import imdb

In [7]:
#num_words - means you keep the top 10000 most frequently occuring words in training data
#rare words discarded.
#manageable vector
#train/test data - list of reviews
#each review is a list of word indices
#train/test labels - 0s and 1s (negative, positive review)

(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=10000)

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz


  x_train, y_train = np.array(xs[:idx]), np.array(labels[:idx])
  x_test, y_test = np.array(xs[idx:]), np.array(labels[idx:])


In [9]:
train_data[0]
train_labels[0]

1

In [10]:
# no word index will exceed 10000

max([max(sequence) for sequence in train_data])

9999

In [12]:
# decode one of the reviews back to English words
# word_index() - dictionary mapping words to an index.

word_index = imdb.get_word_index()

#reverses, mapping integer indices to words
reverse_word_index = dict([(value,key) for (key,value) in word_index.items()])

#decode review
decoded_review = "".join([reverse_word_index.get(i - 3, "?") for i in train_data[0]])
#indices are offset by 3 (0,1,2) - reserved indices for "padding", "start of sequence", "unknown"


### Preparing the data

- Can't feed list of integers into a neural network; turn them to tensors.
- Pad your lists so they all have same length.
- Turn them into an integer tensor of shape(sample, word_indices)
- Use as first layer in your network.(Embedding Layer) - integer tensors.

- One-hot encode your lists to turn them into vectors of 0s and 1s.
- Turning the sequence [3, 5] into a 10000 dimensional vector.
- All 0s except for indices 3 and 5 will be 1s.
- Use the first layer in your network a *Dense* Layer - handles floating pt data. 

In [13]:
# encoding integer sequences into a binary matrix.

import numpy as np

In [32]:
dimension = int(10000)

def vectorize_sequences (sequences, dimension=dimension):
    #create an all-zero of matrix of shape(len(sequences), dimension) 
    result = np.zeros(len(sequences), dimension)
    
    for i, sequence in enumerate(sequences):
        #set specific indices of results[i] to 1s
        results[i, sequence] = 1
    return results

In [33]:
train_data.shape

(25000,)

In [34]:
test_data.shape

(25000,)

In [38]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()

In [39]:
x_train = mlb.fit_transform(train_data)
x_test = mlb.fit_transform(test_data)

In [41]:
x_train[0]

array([1, 1, 1, ..., 0, 0, 0])

In [42]:
#vectorize labels

y_train = np.asarray(train_labels).astype("float32")
y_test = np.asarray(test_labels).astype("float32")

### Building The Network