# 20180110 - My Neural Network Lab notes

In [59]:
import numpy as np
import keras
from keras.datasets import imdb
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation
from keras.preprocessing.text import Tokenizer

import matplotlib.pyplot as plt
%matplotlib inline

# Fix NumPy's global random seed so NumPy-driven randomness repeats across runs.
# NOTE(review): the Keras backend may keep its own RNG that this call does not
# seed - confirm before relying on bit-identical training results.
np.random.seed(666)

## Data
Keras includes sample datasets, e.g. movie reviews from IMDB.

Dataset of 25,000 movie reviews from IMDB, labeled by sentiment (positive/negative).

Reviews have been preprocessed, and each review is encoded as a sequence of word indexes (integers). For convenience, words are indexed by overall frequency in the dataset, so that for instance the integer "3" encodes the 3rd most frequent word in the data. This allows for quick filtering operations such as: "only consider the top 10,000 most common words, but eliminate the top 20 most common words".

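As a convention, "0" does not stand for a specific word, but instead is used to encode any unknown word. Note, however, that `load_data` replaces words removed by `num_words` or `skip_top` with `oov_char` (2 by default), which is why the sample review printed below contains so many 2s.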

This dataset comes preloaded with Keras, so one simple command will get us training and testing data. There is a parameter for how many words we want to look at. We've set it to 2000 (and we skip the 50 most frequent words with `skip_top=50`), but feel free to experiment.

Usage:

```python
from keras.datasets import imdb

(x_train, y_train), (x_test, y_test) = imdb.load_data(path="imdb.npz",
                                                      num_words=None,
                                                      skip_top=0,
                                                      maxlen=None,
                                                      seed=113,
                                                      start_char=1,
                                                      oov_char=2,
                                                      index_from=3)
```

Returns:

2 tuples:

x_train, x_test: list of sequences, which are lists of indexes (integers). If the num_words argument was specified, the maximum possible index value is num_words-1. If the maxlen argument was specified, the largest possible sequence length is maxlen.

y_train, y_test: list of integer labels (1 or 0).


Arguments:
path: if you do not have the data locally (at '~/.keras/datasets/' + path), it will be downloaded to this location.

num_words: integer or None. Top most frequent words to consider. Any less frequent word will appear as oov_char value in the sequence data.

skip_top: integer. Top most frequent words to ignore (they will appear as oov_char value in the sequence data).

maxlen: int. Maximum sequence length. Any longer sequence will be truncated.

seed: int. Seed for reproducible data shuffling.

start_char: int. The start of a sequence will be marked with this character. Set to 1 because 0 is usually the padding character.

oov_char: int. words that were cut out because of the num_words or skip_top limit will be replaced with this character.

index_from: int. Index actual words with this index and higher.

In [60]:
# Fetch the IMDB reviews, keeping only the 2000 most frequent words and
# masking out the 50 most frequent ones.
(x_train, y_train), (x_test, y_test) = imdb.load_data(skip_top=50, num_words=2000)

# Number of training reviews.
print('\nLength of data vector', len(x_train), sep='\n')

# Both splits are 1-D arrays whose elements are variable-length reviews.
print('Shape of data vectors:', x_train.shape, x_test.shape, sep='\n')

# A single encoded review followed by its sentiment label.
print('\nFirst element of data vectors', x_train[0], y_train[0], sep='\n')


Length of data vector
25000
Shape of data vectors:
(25000,)
(25000,)

First element of data vectors
[2, 2, 2, 2, 2, 530, 973, 1622, 1385, 65, 458, 2, 66, 2, 2, 173, 2, 256, 2, 2, 100, 2, 838, 112, 50, 670, 2, 2, 2, 480, 284, 2, 150, 2, 172, 112, 167, 2, 336, 385, 2, 2, 172, 2, 1111, 2, 546, 2, 2, 447, 2, 192, 50, 2, 2, 147, 2, 2, 2, 2, 2, 1920, 2, 469, 2, 2, 71, 87, 2, 2, 2, 530, 2, 76, 2, 2, 1247, 2, 2, 2, 515, 2, 2, 2, 626, 2, 2, 2, 62, 386, 2, 2, 316, 2, 106, 2, 2, 2, 2, 2, 480, 66, 2, 2, 2, 130, 2, 2, 2, 619, 2, 2, 124, 51, 2, 135, 2, 2, 1415, 2, 2, 2, 2, 215, 2, 77, 52, 2, 2, 407, 2, 82, 2, 2, 2, 107, 117, 2, 2, 256, 2, 2, 2, 2, 2, 723, 2, 71, 2, 530, 476, 2, 400, 317, 2, 2, 2, 2, 1029, 2, 104, 88, 2, 381, 2, 297, 98, 2, 2, 56, 2, 141, 2, 194, 2, 2, 2, 226, 2, 2, 134, 476, 2, 480, 2, 144, 2, 2, 2, 51, 2, 2, 224, 92, 2, 104, 2, 226, 65, 2, 2, 1334, 88, 2, 2, 283, 2, 2, 2, 113, 103, 2, 2, 2, 2, 2, 178, 2]
1


## Data encoding
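Converting the input vectors into one-hot style encoded vectors: each review (a variable-length list of word indexes) becomes a fixed-length binary vector with one position per word index, where position i is 1 if word i occurs in the review. Because a review contains many words, several positions can be 1 at once (a "multi-hot" or bag-of-words vector); the table below shows plain one-hot codes for single values.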

0	00000001

1	00000010

2	00000100

https://en.wikipedia.org/wiki/One-hot


In [61]:
# NOTE: this cell overwrites x_train/x_test/y_train/y_test with their encoded
# versions, so run it exactly once after the data loading cell.

# Turn every review (a list of word indexes) into a fixed-width 0/1 row with
# one column per word index below 2000.
tokenizer = Tokenizer(num_words=2000)
x_train, x_test = (
    tokenizer.sequences_to_matrix(sequences, mode='binary')
    for sequences in (x_train, x_test)
)

# Expand the integer sentiment labels into two-column categorical vectors.
y_train, y_test = (
    keras.utils.to_categorical(labels, 2)
    for labels in (y_train, y_test)
)

# Sanity check: shapes of the encoded inputs, then of the encoded labels.
print(x_train.shape, x_test.shape, sep='\n')
print(y_train.shape, y_test.shape, sep='\n')

(25000, 2000)
(25000, 2000)
(25000, 2)
(25000, 2)


## Neural net model

In [None]:
# Sequential model architecture: one hidden Dense layer of 512 ReLU units,
# followed by dropout (rate 0.5) and a softmax output with one unit per class.
num_classes = 2
# Take the input width from the encoded training matrix instead of repeating
# the vocabulary size, so the model keeps matching x_train if that size changes.
input_dim = x_train.shape[1]

model = Sequential()
model.add(Dense(512, activation='relu', input_dim=input_dim))
model.add(Dropout(0.5))
model.add(Dense(num_classes, activation='softmax'))
model.summary()

# Compiling the model using categorical_crossentropy loss, and rmsprop optimizer.
model.compile(loss='categorical_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_19 (Dense)             (None, 512)               1024512   
_________________________________________________________________
dropout_11 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_20 (Dense)             (None, 2)                 1026      
Total params: 1,025,538
Trainable params: 1,025,538
Non-trainable params: 0
_________________________________________________________________


## Model training

In [None]:
# Train for 10 epochs in mini-batches of 16 samples. The test split is passed
# as validation data; `hist` keeps the returned training history for the
# plots in the evaluation cell.
hist = model.fit(
    x_train,
    y_train,
    validation_data=(x_test, y_test),
    epochs=10,
    batch_size=16,
    verbose=1,
)

Train on 25000 samples, validate on 25000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
 2176/25000 [=>............................] - ETA: 14s - loss: 0.2944 - acc: 0.9030

## Model evaluation

In [None]:
# Older Keras releases record the 'accuracy' metric under the key 'acc', newer
# ones under 'accuracy'. Resolve the key at run time so the cell works on both.
acc_key = 'acc' if 'acc' in hist.history else 'accuracy'

print('Quick visualization of model training history')
# One figure per metric: training curve vs. validation curve over the epochs.
for metric_key, label in ((acc_key, 'accuracy'), ('loss', 'loss')):
    plt.plot(hist.history[metric_key])
    plt.plot(hist.history['val_' + metric_key])
    plt.title('model ' + label)
    plt.ylabel(label)
    plt.xlabel('epoch')
    plt.legend(['train', 'val'], loc='upper left')
    plt.show()

# evaluate() returns [loss, accuracy] because the model was compiled with
# metrics=['accuracy']; report the accuracy as a percentage.
score = model.evaluate(x_test, y_test, verbose=0)
print('accuracy: %.2f%%' % (score[1]*100))

## Prediction

Take a look at the `model.predict` function:

predict(self, x, batch_size=None, verbose=0, steps=None)

Generates output predictions for the input samples.

The input samples are processed batch by batch.

Arguments

x: the input data, as a Numpy array.

batch_size: Integer. If unspecified, it will default to 32.

verbose: verbosity mode, 0 or 1.

steps: Total number of steps (batches of samples) before declaring the prediction round finished. Ignored with the default value of None.

Returns

A Numpy array of predictions.

In [None]:
# predict() returns one row per review; with the two-unit softmax output layer
# each row holds the predicted scores for class 0 and class 1.
prediction = model.predict(x_test)
print(prediction)

# Per-class absolute difference between the predicted scores and the
# categorical (two-column) targets.
print(abs(prediction-y_test))

# Turn score rows into hard class labels by taking the highest-scoring column,
# and do the same for the targets so the two can be compared directly.
predicted_labels = np.argmax(prediction, axis=1)
true_labels = np.argmax(y_test, axis=1)
print('predicted labels:', predicted_labels[:10])
print('true labels:     ', true_labels[:10])

misclassified = int((predicted_labels != true_labels).sum())
print('misclassified: %d of %d' % (misclassified, len(true_labels)))

# More advanced example

https://www.kaggle.com/c/sentiment-analysis-on-movie-reviews