In [1]:
import pandas as pd

import os
import re
import tarfile
import tqdm

import numpy as np  # Keras takes care of most of this but it likes to see Numpy arrays
from keras.preprocessing import sequence    # A helper module to handle padding input
from keras.models import Sequential         # The base keras Neural Network model
from keras.layers import Dense, Dropout, Activation   # The layer objects we will pile into the model
from keras.layers import Conv1D, GlobalMaxPooling1D

import glob
import os

from random import shuffle

# from pugnlp.futil import path_status, find_files
# from nlpia.web import requests_get

Using TensorFlow backend.


In [2]:
BIG_URLS = {
    'w2v': (
        'https://www.dropbox.com/s/965dir4dje0hfi4/GoogleNews-vectors-negative300.bin.gz?dl=1',
        1647046227,
    ),
    'slang': (
        'https://www.dropbox.com/s/43c22018fbfzypd/slang.csv.gz?dl=1',
        117633024,
    ),
    'tweets': (
        'https://www.dropbox.com/s/5gpb43c494mc8p0/tweets.csv.gz?dl=1',
        311725313,
    ),
    'lsa_tweets': (
        'https://www.dropbox.com/s/rpjt0d060t4n1mr/lsa_tweets_5589798_2003588x200.tar.gz?dl=1',
        3112841563,  # 3112841312,
    ),
    'imdb': (
        'https://www.dropbox.com/s/yviic64qv84x73j/aclImdb_v1.tar.gz?dl=1',
        3112841563,  # 3112841312,
    ),
}

In [3]:
# download_file(BIG_URLS['w2v'][0])

In [4]:
def pre_process_data(filepath):
    """
    This is dependent on your training data source but we will try to generalize it as best as possible.
    """
    positive_path = os.path.join(filepath, 'pos')
    negative_path = os.path.join(filepath, 'neg')

    pos_label = 1
    neg_label = 0

    dataset = []

    for filename in glob.glob(os.path.join(positive_path, '*.txt')):
        with open(filename, 'r',encoding="utf-8") as f:
            dataset.append((pos_label, f.read()))

    for filename in glob.glob(os.path.join(negative_path, '*.txt')):
        with open(filename, 'r',encoding="utf-8") as f:
            dataset.append((neg_label, f.read()))

    shuffle(dataset)

    return dataset

In [5]:
dataset = pre_process_data('./aclImdb/train')

In [6]:
pd.DataFrame(dataset).head()

Unnamed: 0,0,1
0,0,I have to admit that I'm a great fan of this s...
1,1,The plot of GOODNIGHT MR TOM on paper makes it...
2,0,well its official. they have just killed Ameri...
3,0,"Kitten Natividad, of Russ Meyer film fame, pla..."
4,0,"Follow-up to 1973's ""Walking Tall"" continues t..."


In [63]:
dataset

[(0,
  'I have to admit that I\'m a great fan of this show, so you must know how disappointed I got when I watched this movie. First of all, the plot was awful, I thought it was going to be something more interesting, like to see what happened to Arnold fathers, or something more interesting, but NOOOOOOO, a maniac wants to destroy Arnold\'s house, between many other places, so many people tries to stop this.<br /><br />I must admit that the plot wasn\'t so bad after all, but what really sucked were the steps that Arnold and his friends do to stop this maniac, they become friends of a spy,; they drive a bus (based on a video game, for God sake), and to worse everything, they make super-moves on the bus, things that many persons had already tried and died, but not Arnold, Gerald and Helga, \'cause they are experts on a video game.<br /><br />Honestly, my mom, my sister, even me got really disappointed after watching this movie, \'cause it was the worst way to finish a really good cartoo

In [7]:
print(dataset[0])

(0, 'I have to admit that I\'m a great fan of this show, so you must know how disappointed I got when I watched this movie. First of all, the plot was awful, I thought it was going to be something more interesting, like to see what happened to Arnold fathers, or something more interesting, but NOOOOOOO, a maniac wants to destroy Arnold\'s house, between many other places, so many people tries to stop this.<br /><br />I must admit that the plot wasn\'t so bad after all, but what really sucked were the steps that Arnold and his friends do to stop this maniac, they become friends of a spy,; they drive a bus (based on a video game, for God sake), and to worse everything, they make super-moves on the bus, things that many persons had already tried and died, but not Arnold, Gerald and Helga, \'cause they are experts on a video game.<br /><br />Honestly, my mom, my sister, even me got really disappointed after watching this movie, \'cause it was the worst way to finish a really good cartoon. 

In [8]:
len(dataset)

25000

In [9]:
pd.DataFrame(dataset).iloc[:, 0].sum()

12500

In [10]:
from nltk.tokenize import TreebankWordTokenizer
from gensim.models import KeyedVectors
word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True, limit=200000)

In [11]:
word_vectors

<gensim.models.keyedvectors.Word2VecKeyedVectors at 0x21c2ac3db70>

In [12]:
dataset[0][0]

0

In [13]:
dataset[0][1]

'I have to admit that I\'m a great fan of this show, so you must know how disappointed I got when I watched this movie. First of all, the plot was awful, I thought it was going to be something more interesting, like to see what happened to Arnold fathers, or something more interesting, but NOOOOOOO, a maniac wants to destroy Arnold\'s house, between many other places, so many people tries to stop this.<br /><br />I must admit that the plot wasn\'t so bad after all, but what really sucked were the steps that Arnold and his friends do to stop this maniac, they become friends of a spy,; they drive a bus (based on a video game, for God sake), and to worse everything, they make super-moves on the bus, things that many persons had already tried and died, but not Arnold, Gerald and Helga, \'cause they are experts on a video game.<br /><br />Honestly, my mom, my sister, even me got really disappointed after watching this movie, \'cause it was the worst way to finish a really good cartoon. I mu

In [14]:
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(dataset[0][1])
print(tokens)
print(len(tokens))

['I', 'have', 'to', 'admit', 'that', 'I', "'m", 'a', 'great', 'fan', 'of', 'this', 'show', ',', 'so', 'you', 'must', 'know', 'how', 'disappointed', 'I', 'got', 'when', 'I', 'watched', 'this', 'movie.', 'First', 'of', 'all', ',', 'the', 'plot', 'was', 'awful', ',', 'I', 'thought', 'it', 'was', 'going', 'to', 'be', 'something', 'more', 'interesting', ',', 'like', 'to', 'see', 'what', 'happened', 'to', 'Arnold', 'fathers', ',', 'or', 'something', 'more', 'interesting', ',', 'but', 'NOOOOOOO', ',', 'a', 'maniac', 'wants', 'to', 'destroy', 'Arnold', "'s", 'house', ',', 'between', 'many', 'other', 'places', ',', 'so', 'many', 'people', 'tries', 'to', 'stop', 'this.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'I', 'must', 'admit', 'that', 'the', 'plot', 'was', "n't", 'so', 'bad', 'after', 'all', ',', 'but', 'what', 'really', 'sucked', 'were', 'the', 'steps', 'that', 'Arnold', 'and', 'his', 'friends', 'do', 'to', 'stop', 'this', 'maniac', ',', 'they', 'become', 'friends', 'of', 'a', 'spy', ','

In [15]:
tokenizer = TreebankWordTokenizer()
tokens = tokenizer.tokenize(dataset[1][1])
print(tokens)
print(len(tokens))

['The', 'plot', 'of', 'GOODNIGHT', 'MR', 'TOM', 'on', 'paper', 'makes', 'it', 'seem', 'we', 'are', 'in', 'for', 'a', 'large', 'dose', 'of', 'maudlin', ',', 'sickly', 'sentiment.But', ',', 'talented', 'director', 'Jack', 'Gold', 'is', 'an', 'expert', 'on', 'touching', 'the', 'emotions', 'in', 'the', 'right', 'manner', ',', 'and', 'it', 'emerges', 'instead', 'as', 'a', 'compelling', ',', 'deeply', 'moving', 'wartime', 'drama', 'with', 'excellent', 'production', 'and', 'lead', 'performances.One', 'of', 'the', 'best', ',', 'if', 'not', 'the', 'best', 'TV', 'movies', 'of', 'the', '1990', "'s", 'which', 'possibly', 'would', "'ve", 'had', 'even', 'greater', 'success', 'if', 'it', 'had', 'been', 'released', 'in', 'the', 'cinemas.', '<', 'br', '/', '>', '<', 'br', '/', '>', 'The', 'evacuation', 'of', 'children', 'to', 'countryside', 'towns', 'and', 'villages', 'in', 'World', 'War', 'II', 'was', 'of', 'course', 'a', 'common', 'practice', ',', 'but', 'in', 'the', 'case', 'of', 'the', 'young', 'bo

In [16]:
print(tokens[0])
print(word_vectors[tokens[0]].shape)
print(word_vectors[tokens[0]])

The
(300,)
[-0.17285156  0.27929688  0.10693359 -0.15820312 -0.08447266  0.05908203
  0.04077148  0.00254822  0.25976562  0.18066406  0.09765625 -0.08105469
 -0.01049805  0.09814453  0.00060272  0.07080078 -0.015625   -0.09521484
 -0.08105469 -0.02868652 -0.03320312  0.16503906  0.03979492 -0.03710938
  0.04101562 -0.12695312 -0.12890625  0.12353516  0.04980469  0.01257324
  0.05786133 -0.00830078 -0.02832031 -0.03320312  0.16113281  0.07519531
 -0.25976562  0.08935547  0.13574219  0.00460815 -0.04418945  0.02319336
 -0.10449219 -0.05151367  0.08349609 -0.02050781 -0.02172852 -0.02734375
  0.16015625  0.19042969 -0.0324707   0.06787109  0.10302734 -0.25390625
  0.00634766  0.20507812  0.02111816 -0.21679688 -0.02441406  0.17089844
 -0.21875     0.10009766 -0.15527344 -0.12597656 -0.03833008 -0.05419922
  0.19238281  0.21777344  0.12109375 -0.02648926  0.05297852 -0.0201416
  0.0534668   0.07666016  0.0456543   0.01977539  0.12451172  0.10205078
  0.15234375  0.25195312  0.04296875 -0.1

In [17]:
def tokenize_and_vectorize(dataset):
    tokenizer = TreebankWordTokenizer()
    vectorized_data = []
    expected = []
    for sample in dataset:
        tokens = tokenizer.tokenize(sample[1])
        sample_vecs = []
        for token in tokens:
            try:
                sample_vecs.append(word_vectors[token])

            except KeyError:
                pass  # No matching token in the Google w2v vocab

        vectorized_data.append(sample_vecs)

    return vectorized_data

In [18]:
def collect_expected(dataset):
    """ Peel of the target values from the dataset """
    expected = []
    for sample in dataset:
        expected.append(sample[0])
    return expected

In [19]:
vectorized_data = tokenize_and_vectorize(dataset)
expected = collect_expected(dataset)

In [20]:
print(len(vectorized_data))
print(len(vectorized_data[0]))
print(vectorized_data[0][0])

25000
281
[ 0.07910156 -0.0050354   0.11181641  0.21289062  0.13085938 -0.01470947
 -0.03540039 -0.07763672  0.04077148  0.11474609  0.00147247 -0.29101562
  0.00457764 -0.20019531 -0.19238281  0.08007812  0.10107422  0.04858398
  0.15722656 -0.09521484 -0.05004883  0.25        0.33007812 -0.09716797
 -0.05566406 -0.0071106  -0.16796875 -0.13574219  0.05102539 -0.00598145
  0.10791016  0.16503906 -0.03955078 -0.03955078  0.04321289  0.12060547
  0.13476562  0.09375     0.00909424  0.1640625   0.21289062 -0.05322266
  0.33398438  0.01586914  0.10449219  0.24121094 -0.0189209  -0.04199219
  0.05834961  0.03271484  0.09863281  0.18945312  0.04125977  0.01501465
 -0.05883789  0.10253906  0.01538086  0.03198242  0.02722168 -0.13769531
  0.12695312  0.06396484 -0.13574219 -0.012146    0.07617188 -0.02319336
 -0.21191406  0.20996094 -0.01953125  0.02038574  0.16113281 -0.00897217
  0.04663086  0.03881836 -0.4609375  -0.1796875   0.12792969 -0.00564575
  0.24121094  0.21777344 -0.02600098 -0.1

In [21]:
print(len(vectorized_data))
print(len(vectorized_data[1]))
print(vectorized_data[1][0])

25000
150
[-0.17285156  0.27929688  0.10693359 -0.15820312 -0.08447266  0.05908203
  0.04077148  0.00254822  0.25976562  0.18066406  0.09765625 -0.08105469
 -0.01049805  0.09814453  0.00060272  0.07080078 -0.015625   -0.09521484
 -0.08105469 -0.02868652 -0.03320312  0.16503906  0.03979492 -0.03710938
  0.04101562 -0.12695312 -0.12890625  0.12353516  0.04980469  0.01257324
  0.05786133 -0.00830078 -0.02832031 -0.03320312  0.16113281  0.07519531
 -0.25976562  0.08935547  0.13574219  0.00460815 -0.04418945  0.02319336
 -0.10449219 -0.05151367  0.08349609 -0.02050781 -0.02172852 -0.02734375
  0.16015625  0.19042969 -0.0324707   0.06787109  0.10302734 -0.25390625
  0.00634766  0.20507812  0.02111816 -0.21679688 -0.02441406  0.17089844
 -0.21875     0.10009766 -0.15527344 -0.12597656 -0.03833008 -0.05419922
  0.19238281  0.21777344  0.12109375 -0.02648926  0.05297852 -0.0201416
  0.0534668   0.07666016  0.0456543   0.01977539  0.12451172  0.10205078
  0.15234375  0.25195312  0.04296875 -0.18

In [22]:
print(len(vectorized_data[1]))
print(len(vectorized_data[2]))

150
123


In [23]:
print(len(expected))
pd.DataFrame(expected).head()

25000


Unnamed: 0,0
0,0
1,1
2,0
3,0
4,0


In [24]:
split_point = int(len(vectorized_data) * .8)

x_train = vectorized_data[:split_point]
y_train = expected[:split_point]
x_test = vectorized_data[split_point:]
y_test = expected[split_point:]

In [25]:
print(len(x_train))
print(len(x_train[0]))
print(len(x_train[0][0]))

20000
281
300


In [26]:
maxlen = 400
batch_size = 32         # How many samples to show the net before backpropogating the error and updating the weights
embedding_dims = 300    # Length of the token vectors we will create for passing into the Convnet
filters = 250           # Number of filters we will train
kernel_size = 3         # The width of the filters, actual filters will each be a matrix of weights of size: embedding_dims x kernel_size or 50 x 3 in our case
hidden_dims = 250       # Number of neurons in the plain feed forward net at the end of the chain
epochs = 2              # Number of times we will pass the entire training dataset through the network

def pad_trunc(data, maxlen):
    """ For a given dataset pad with zero vectors or truncate to maxlen """
    new_data = []

    # Create a vector of 0's the length of our word vectors
    zero_vector = []
    for _ in range(len(data[0][0])):
        zero_vector.append(0.0)

    for sample in data:

        if len(sample) > maxlen:
            temp = sample[:maxlen]
        elif len(sample) < maxlen:
            temp = sample
            additional_elems = maxlen - len(sample)
            for _ in range(additional_elems):
                temp.append(zero_vector)
        else:
            temp = sample
        new_data.append(temp)
    return new_data


In [27]:
print(len(x_train))
print(len(x_train[0]))
print(len(x_train[0][0]))
print(x_train[0][0])

20000
281
300
[ 0.07910156 -0.0050354   0.11181641  0.21289062  0.13085938 -0.01470947
 -0.03540039 -0.07763672  0.04077148  0.11474609  0.00147247 -0.29101562
  0.00457764 -0.20019531 -0.19238281  0.08007812  0.10107422  0.04858398
  0.15722656 -0.09521484 -0.05004883  0.25        0.33007812 -0.09716797
 -0.05566406 -0.0071106  -0.16796875 -0.13574219  0.05102539 -0.00598145
  0.10791016  0.16503906 -0.03955078 -0.03955078  0.04321289  0.12060547
  0.13476562  0.09375     0.00909424  0.1640625   0.21289062 -0.05322266
  0.33398438  0.01586914  0.10449219  0.24121094 -0.0189209  -0.04199219
  0.05834961  0.03271484  0.09863281  0.18945312  0.04125977  0.01501465
 -0.05883789  0.10253906  0.01538086  0.03198242  0.02722168 -0.13769531
  0.12695312  0.06396484 -0.13574219 -0.012146    0.07617188 -0.02319336
 -0.21191406  0.20996094 -0.01953125  0.02038574  0.16113281 -0.00897217
  0.04663086  0.03881836 -0.4609375  -0.1796875   0.12792969 -0.00564575
  0.24121094  0.21777344 -0.02600098 

In [28]:
print(x_train[0])

[array([ 0.07910156, -0.0050354 ,  0.11181641,  0.21289062,  0.13085938,
       -0.01470947, -0.03540039, -0.07763672,  0.04077148,  0.11474609,
        0.00147247, -0.29101562,  0.00457764, -0.20019531, -0.19238281,
        0.08007812,  0.10107422,  0.04858398,  0.15722656, -0.09521484,
       -0.05004883,  0.25      ,  0.33007812, -0.09716797, -0.05566406,
       -0.0071106 , -0.16796875, -0.13574219,  0.05102539, -0.00598145,
        0.10791016,  0.16503906, -0.03955078, -0.03955078,  0.04321289,
        0.12060547,  0.13476562,  0.09375   ,  0.00909424,  0.1640625 ,
        0.21289062, -0.05322266,  0.33398438,  0.01586914,  0.10449219,
        0.24121094, -0.0189209 , -0.04199219,  0.05834961,  0.03271484,
        0.09863281,  0.18945312,  0.04125977,  0.01501465, -0.05883789,
        0.10253906,  0.01538086,  0.03198242,  0.02722168, -0.13769531,
        0.12695312,  0.06396484, -0.13574219, -0.012146  ,  0.07617188,
       -0.02319336, -0.21191406,  0.20996094, -0.01953125,  0.0

In [29]:
x_train = pad_trunc(x_train, maxlen)
x_test = pad_trunc(x_test, maxlen)

In [30]:
print(len(x_train))
print(len(x_train[0]))
print(len(x_train[0][0]))
# print(x_train[0][0])

20000
400
300


In [31]:
type(x_train)
print(len(x_train))
print(len(x_train[0]))
print(len(x_train[0][0]))

20000
400
300


In [32]:
x_train = np.reshape(x_train, (len(x_train), maxlen, embedding_dims))
y_train = np.array(y_train)
x_test = np.reshape(x_test, (len(x_test), maxlen, embedding_dims))
y_test = np.array(y_test)

In [33]:
print(x_train.shape)
print(y_train.shape)
print(x_test.shape)
print(y_test.shape)

(20000, 400, 300)
(20000,)
(5000, 400, 300)
(5000,)


Ch8

simple RNN

In [34]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, SimpleRNN

num_neurons = 50

print('Build model...')
model = Sequential()

model.add(SimpleRNN(num_neurons, return_sequences=True, \
                    input_shape=(maxlen, embedding_dims)))
model.add(Dropout(.2))

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile('rmsprop', 'binary_crossentropy',  metrics=['accuracy'])
print(model.summary())

Build model...
Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_1 (SimpleRNN)     (None, 400, 50)           17550     
_________________________________________________________________
dropout_1 (Dropout)          (None, 400, 50)           0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 20000)             0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 20001     
Total params: 37,551
Trainable params: 37,551
Non-trainable params: 0
_________________________________________________________________
None


In [36]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
model_structure = model.to_json()
with open("simplernn_model1.json", "w") as json_file:
    json_file.write(model_structure)

model.save_weights("simplernn_weights1.h5")
print('Model saved.')

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2
Model saved.


In [37]:
from keras.models import model_from_json
with open("simplernn_model1.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)

model.load_weights('simplernn_weights1.h5')

In [38]:
sample_1 = "I'm hate that the dismal weather that had me down for so long, when will it break! Ugh, when does happiness return?  The sun is blinding and the puffy clouds are too thin.  I can't wait for the weekend."

In [39]:
# We pass a dummy value in the first element of the tuple just because our helper expects it from the way processed the initial data.  That value won't ever see the network, so it can be whatever.
vec_list = tokenize_and_vectorize([(1, sample_1)])

# Tokenize returns a list of the data (length 1 here)
test_vec_list = pad_trunc(vec_list, maxlen)

test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))

In [40]:
model.predict_classes(test_vec)

array([[0]])

In [41]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Flatten, SimpleRNN

num_neurons = 100

print('Build model...')
model = Sequential()

model.add(SimpleRNN(num_neurons, return_sequences=True, input_shape=(maxlen, embedding_dims)))
model.add(Dropout(.2))

model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))

model.compile('rmsprop', 'binary_crossentropy',  metrics=['accuracy'])
print(model.summary())

Build model...
Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_2 (SimpleRNN)     (None, 400, 100)          40100     
_________________________________________________________________
dropout_2 (Dropout)          (None, 400, 100)          0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 40000)             0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 40001     
Total params: 80,101
Trainable params: 80,101
Non-trainable params: 0
_________________________________________________________________
None


In [42]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
model_structure = model.to_json()
with open("simplernn_model2.json", "w") as json_file:
    json_file.write(model_structure)

model.save_weights("simplernn_weights2.h5")
print('Model saved.')

Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2
Model saved.


### ch9 cnn

In [43]:



# In[ ]:


print('Build model...')
model = Sequential()

# we add a Convolution1D, which will learn filters
# word group filters of size filter_length:
model.add(Conv1D(filters,
                 kernel_size,
                 padding='valid',
                 activation='relu',
                 strides=1,
                 input_shape=(maxlen, embedding_dims)))
# we use max pooling:
model.add(GlobalMaxPooling1D())
# We add a vanilla hidden layer:
model.add(Dense(hidden_dims))
model.add(Dropout(0.2))
model.add(Activation('relu'))
# We project onto a single unit output layer, and squash it with a sigmoid:
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          validation_data=(x_test, y_test))
model_structure = model.to_json()
with open("cnn_model.json", "w") as json_file:
    json_file.write(model_structure)

model.save_weights("cnn_weights.h5")
print('Model saved.')


# In[ ]:


from keras.models import model_from_json
with open("cnn_model.json", "r") as json_file:
    json_string = json_file.read()
model = model_from_json(json_string)

model.load_weights('cnn_weights.h5')


# In[ ]:


sample_1 = "I'm hate that the dismal weather that had me down for so long, when will it break! Ugh, when does happiness return?  The sun is blinding and the puffy clouds are too thin.  I can't wait for the weekend."


# In[ ]:


# We pass a dummy value in the first element of the tuple just because our helper expects it from the way processed the initial data.  That value won't ever see the network, so it can be whatever.
vec_list = tokenize_and_vectorize([(1, sample_1)])

# Tokenize returns a list of the data (length 1 here)
test_vec_list = pad_trunc(vec_list, maxlen)

test_vec = np.reshape(test_vec_list, (len(test_vec_list), maxlen, embedding_dims))
model.predict(test_vec)


# In[ ]:


model.predict_classes(test_vec)

Build model...
Train on 20000 samples, validate on 5000 samples
Epoch 1/2
Epoch 2/2
Model saved.


array([[0]])

ch9

In [44]:
import numpy as np
np.random.seed(1337)  # for reproducibility

from keras.preprocessing import sequence
from keras.models import Sequential
from keras.layers import Dense, Activation, Embedding
from keras.layers import LSTM
from keras.datasets import imdb

max_features = 20000
maxlen = 80  # cut texts after this number of words (among top max_features most common words)
batch_size = 32

print('Loading data...')
(X_train, y_train), (X_test, y_test) = imdb.load_data(nb_words=max_features)
print(len(X_train), 'train sequences')
print(len(X_test), 'test sequences')

print('Pad sequences (samples x time)')
X_train = sequence.pad_sequences(X_train, maxlen=maxlen)
X_test = sequence.pad_sequences(X_test, maxlen=maxlen)
print('X_train shape:', X_train.shape)
print('X_test shape:', X_test.shape)

print('Build model...')
model = Sequential()
model.add(Embedding(max_features, 128, dropout=0.2))
model.add(LSTM(128, dropout_W=0.2, dropout_U=0.2))  # try using a GRU instead, for fun
model.add(Dense(1))
model.add(Activation('sigmoid'))

# try using different optimizers and different optimizer configs
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

print('Train...')
model.fit(X_train, y_train, batch_size=batch_size, nb_epoch=15,
          validation_data=(X_test, y_test))
score, acc = model.evaluate(X_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Loading data...




25000 train sequences
25000 test sequences
Pad sequences (samples x time)
X_train shape: (25000, 80)
X_test shape: (25000, 80)
Build model...




Train...


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 25000 samples, validate on 25000 samples
Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15
Test score: 1.0555996658229827
Test accuracy: 0.8107600212097168


處理TF2.0 記憶體不足的問題，使用 multiprocessing

In [45]:
# Hi @HristoBuyukliev , this is a very old issue that everyone is facing in TF 1.x as well as TF 2.x, it seems to be a design flaw and the TF team doesn't seem to care about fixing (I have been facing this issue for more than 2 years now).

# What worked well for me was just to run my train/eval in a separate process and wait for it to finish. So when the process finishes the system kills it and releases the GPU resources automatically.
# You can achieve this by doing something like:

# import multiprocessing

# process_eval = multiprocessing.Process(target=evaluate, args=(...))
# process_eval.start()
# process_eval.join()

In [46]:
import multiprocessing

In [48]:
# from keras.models import Sequential
# from keras.layers import Dense, Dropout, Flatten, SimpleRNN

# num_neurons = 100

# print('Build model...')
# model = Sequential()

# model.add(SimpleRNN(num_neurons, return_sequences=True, input_shape=(maxlen, embedding_dims)))
# model.add(Dropout(.2))

# model.add(Flatten())
# model.add(Dense(1, activation='sigmoid'))

# model.compile('rmsprop', 'binary_crossentropy',  metrics=['accuracy'])
# print(model.summary())

Build model...
Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
simple_rnn_3 (SimpleRNN)     (None, 80, 100)           40100     
_________________________________________________________________
dropout_4 (Dropout)          (None, 80, 100)           0         
_________________________________________________________________
flatten_3 (Flatten)          (None, 8000)              0         
_________________________________________________________________
dense_6 (Dense)              (None, 1)                 8001      
Total params: 48,101
Trainable params: 48,101
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
# model.fit(x_train, y_train,
#           batch_size=batch_size,
#           epochs=epochs,
#           validation_data=(x_test, y_test))

In [60]:
def _training_worker():
#     設定模型
    model = Sequential()
    model.add(SimpleRNN(num_neurons, return_sequences=True, input_shape=(maxlen, embedding_dims)))
    model.add(Dropout(.2))
    model.add(Flatten())
    model.add(Dense(1, activation='sigmoid'))
    model.compile('rmsprop', 'binary_crossentropy',  metrics=['accuracy'])
#     fit model
    model.fit(x_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(x_test, y_test))
    print(model.summary())
    print(model.predict(x_train))

def train_new_model():
    training_process = \
    multiprocessing.Process(target=_training_worker)
    training_process.start()
#     get_message_from_training_process(...)
    training_process.join()
    print("計算結束")

In [62]:
# _training_worker()

In [57]:
training_process = multiprocessing.Process(target=_training_worker)
training_process.start()
training_process.join()

In [58]:
train_new_model()

計算結束
