In [11]:
from keras import layers, models
import numpy as np
import matplotlib.pyplot as plt

from keras.preprocessing.text import Tokenizer

There are two main ways to process sequence data:
1. recurrent neural network
2. 1D Convnet

Applications include:
- document classification and time-series classification
- comparing how closely related two documents or two stock time series are
- language translation, sequence to sequence learning
- Sentiment analysis
- time sequence prediction, for instance, weather forecast

## keras one-hot

In [16]:
# Two toy sentences used to demonstrate word-level encoding with Keras.
samples = [
    'The cat sat on the mat',
    'The dog ate my homework.',
]

# Build the word index from the samples; num_words caps the vocabulary
# that the encoding methods below will use.
tk = Tokenizer(num_words=1000)
tk.fit_on_texts(samples)

# Mapping word -> integer index learned by fit_on_texts.
word_index = tk.word_index

# Each sample as a list of integer word indices.
sequences = tk.texts_to_sequences(samples)

# Each sample as one row of a matrix, encoded with mode='binary'.
one_hot_results = tk.texts_to_matrix(samples, mode='binary')

vocabulary_size = len(word_index)
print('Found {} unique tokens.'.format(vocabulary_size))
print(word_index)

Found 9 unique tokens.
{'the': 1, 'cat': 2, 'sat': 3, 'on': 4, 'mat': 5, 'dog': 6, 'ate': 7, 'my': 8, 'homework': 9}


## Instantiate an Embedding layer
Explanation: https://zhuanlan.zhihu.com/p/27830489

In [19]:
from keras.datasets import imdb
from keras import preprocessing
# max_features: vocabulary size requested from imdb.load_data (num_words).
# maxlen: sequence length handed to pad_sequences.
max_features, maxlen = 10000, 20

In [20]:
# Load the IMDB dataset with the vocabulary limited via num_words=max_features,
# then unpack each split into its samples and labels.
train_split, test_split = imdb.load_data(num_words=max_features)
x_train, y_train = train_split
x_test, y_test = test_split

In [21]:
# Bring both splits to a common sequence length of `maxlen` with pad_sequences.
x_train, x_test = (
    preprocessing.sequence.pad_sequences(split, maxlen=maxlen)
    for split in (x_train, x_test)
)

In [22]:
# Minimal binary classifier: Embedding -> Flatten -> single sigmoid unit.
embedding_dim = 8  # size of each learned word vector

model = models.Sequential()
# The embedding's vocabulary size is taken from max_features (the same value
# passed to imdb.load_data) instead of a duplicated literal, so the two cannot
# drift apart when the vocabulary size is tuned.
model.add(layers.Embedding(max_features, embedding_dim, input_length=maxlen))
# Flatten the (maxlen, embedding_dim) output into one vector per sample.
model.add(layers.Flatten())
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop', loss='binary_crossentropy', metrics=['acc'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 20, 8)             80000     
_________________________________________________________________
flatten_1 (Flatten)          (None, 160)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 161       
Total params: 80,161
Trainable params: 80,161
Non-trainable params: 0
_________________________________________________________________


In [23]:
# Train for 10 epochs in batches of 32, holding out 20% of the training data
# for validation; the returned History object is kept in `history`.
history = model.fit(
    x_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

Train on 20000 samples, validate on 5000 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


## Use pre-trained embeddings
such as word2vec or GloVe

In [31]:
# download the dataset
!wget --no-check-certificate http://mng.bz/0tIo -O fullIMDB.zip

--2019-01-04 15:55:30--  http://mng.bz/0tIo
正在解析主机 mng.bz (mng.bz)... 35.166.24.88
正在连接 mng.bz (mng.bz)|35.166.24.88|:80... 已连接。
已发出 HTTP 请求，正在等待回应... 301 Moved Permanently
位置：https://mng.bz/0tIo [跟随至新的 URL]
--2019-01-04 15:55:31--  https://mng.bz/0tIo
正在连接 mng.bz (mng.bz)|35.166.24.88|:443... 已连接。
警告: 无法验证 mng.bz 的由 “CN=Go Daddy Secure Certificate Authority - G2,OU=http://certs.godaddy.com/repository/,O=GoDaddy.com\\, Inc.,L=Scottsdale,ST=Arizona,C=US” 颁发的证书:
  无法本地校验颁发者的权限。
已发出 HTTP 请求，正在等待回应... 301 
位置：http://s3.amazonaws.com/text-datasets/aclImdb.zip [跟随至新的 URL]
--2019-01-04 15:55:31--  http://s3.amazonaws.com/text-datasets/aclImdb.zip
正在解析主机 s3.amazonaws.com (s3.amazonaws.com)... 52.216.239.45
正在连接 s3.amazonaws.com (s3.amazonaws.com)|52.216.239.45|:80... 已连接。
已发出 HTTP 请求，正在等待回应... 302 Found
位置：http://120.52.51.16/s3.amazonaws.com/text-datasets/aclImdb.zip [跟随至新的 URL]
--2019-01-04 15:55:32--  http://120.52.51.16/s3.amazonaws.com/text-datasets/aclImdb.zip
正在连接 120.52.51.16:80... 已连接

In [36]:
import subprocess
import os
import zipfile

# Extract with the standard library instead of shelling out to `unzip`:
# a failure now raises instead of being ignored as an unchecked return code,
# and no external binary is required.
# Extraction is skipped when aclImdb/ already exists so the cell can be
# re-run safely; delete the directory to force a clean re-extraction.
if not os.path.isdir('aclImdb'):
    with zipfile.ZipFile('fullIMDB.zip') as archive:
        archive.extractall()

# Show the top-level entries of the extracted dataset (hidden entries are
# skipped, as a plain `ls` would do).
for entry in sorted(os.listdir('aclImdb')):
    if not entry.startswith('.'):
        print(entry)

test
train


In [40]:
# Locate the extracted dataset relative to the current working directory.
# os.getcwd() replaces spawning a bash process just to echo $PWD.
root = os.getcwd()
imdb_dir = os.path.join(root, "aclImdb")

train_dir = os.path.join(imdb_dir, 'train')
labels = []
texts = []

# Read every .txt review under train/neg and train/pos.
# Reviews from 'neg' are labelled 0, reviews from 'pos' are labelled 1.
for label_value, label_type in enumerate(['neg', 'pos']):
    dir_name = os.path.join(train_dir, label_type)
    # Sorted so the load order is deterministic (os.listdir order is arbitrary).
    for fn in sorted(os.listdir(dir_name)):
        if fn.endswith('.txt'):
            # Explicit encoding: do not depend on the platform's default locale.
            with open(os.path.join(dir_name, fn), encoding='utf-8') as f:
                texts.append(f.read())
            labels.append(label_value)


In [44]:
# Sanity check: there should be exactly one label per review text.
label_count, text_count = len(labels), len(texts)
print(label_count, text_count)

25000 25000


In [47]:
from keras.preprocessing.sequence import pad_sequences

maxlen = 100                # sequence length passed to pad_sequences
training_samples = 200      # number of samples in the training split
validation_samples = 10000  # number of samples in the validation split
max_words = 10000           # vocabulary cap passed to the Tokenizer
seed = 42                   # fixed seed so the split is reproducible

tk = Tokenizer(num_words=max_words)
tk.fit_on_texts(texts)
sequences = tk.texts_to_sequences(texts)

word_index = tk.word_index
print('Found {} unique tokens.'.format(len(word_index)))

data = pad_sequences(sequences, maxlen=maxlen)

# np.asarray leaves an existing array unchanged, so this rebinding is safe to re-run.
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

if training_samples + validation_samples > data.shape[0]:
    raise ValueError(
        'training_samples + validation_samples exceeds the number of '
        'available samples ({})'.format(data.shape[0])
    )

# Shuffle with a seeded generator so every run produces the same split.
# `data` and `labels` are deliberately NOT overwritten with their shuffled
# versions: `data` is rebuilt from `texts` on every run, so rebinding `labels`
# to a shuffled copy would misalign reviews and labels on a second run.
rng = np.random.RandomState(seed)
indices = rng.permutation(data.shape[0])
shuffled_data = data[indices]
shuffled_labels = labels[indices]

split_end = training_samples + validation_samples
x_train = shuffled_data[:training_samples]
y_train = shuffled_labels[:training_samples]
x_val = shuffled_data[training_samples:split_end]
y_val = shuffled_labels[training_samples:split_end]

Found 88582 unique tokens.
Shape of data tensor: (25000, 100)
Shape of label tensor: (25000,)


In [None]:
if("glove.6B.zip" not in os.listdir(root)):
    # download the dataset
    !wget --no-check-certificate http://nlp.stanford.edu/data/glove.6B.zip

--2019-01-04 16:37:09--  http://nlp.stanford.edu/data/glove.6B.zip
正在解析主机 nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
正在连接 nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... 已连接。
已发出 HTTP 请求，正在等待回应... 302 Found
位置：https://nlp.stanford.edu/data/glove.6B.zip [跟随至新的 URL]
--2019-01-04 16:37:10--  https://nlp.stanford.edu/data/glove.6B.zip
正在连接 nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... 已连接。
已发出 HTTP 请求，正在等待回应... 200 OK
长度：862182613 (822M) [application/zip]
正在保存至: “glove.6B.zip”
