In [None]:
"""
Train convolutional network for sentiment analysis on IMDB corpus. Based on
"Convolutional Neural Networks for Sentence Classification" by Yoon Kim
http://arxiv.org/pdf/1408.5882v2.pdf
For "CNN-rand" and "CNN-non-static" gets to 88-90%, and "CNN-static" - 85% after 2-5 epochs with following settings:
embedding_dim = 50          
filter_sizes = (3, 8)
num_filters = 10
dropout_prob = (0.5, 0.8)
hidden_dims = 50
Differences from original article:
- larger IMDB corpus, longer sentences; sentence length is very important, just like data size
- smaller embedding dimension, 50 instead of 300
- 2 filter sizes instead of original 3
- fewer filters; original work uses 100, experiments show that 3-10 is enough;
- random initialization is no worse than word2vec init on IMDB corpus
- sliding Max Pooling instead of original Global Pooling
"""

import numpy as np
import data_helpers
from w2v import train_word2vec

from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Flatten, Input, MaxPooling1D, Convolution1D, Embedding
from keras.layers.merge import Concatenate
from keras.datasets import imdb
from keras.preprocessing import sequence
np.random.seed(0)

# ---------------------- Parameters section -------------------
#
# Model type. See Kim Yoon's Convolutional Neural Networks for Sentence Classification, Section 3
#   CNN-rand       - embedding layer is left at its default initialization
#   CNN-non-static - embedding layer is initialized with word2vec vectors
#   CNN-static     - no embedding layer; inputs are converted to word2vec vectors up front
model_type = "CNN-non-static"  # CNN-rand|CNN-non-static|CNN-static

# Data source (selects the branch taken in load_data)
data_source = "keras_data_set"  # keras_data_set|local_dir

# Model Hyperparameters
embedding_dim = 50  # embedding width; also passed to train_word2vec as num_features
filter_sizes = (3, 8)  # one convolutional branch is built per kernel size
num_filters = 10  # filters in each convolutional branch
dropout_prob = (0.5, 0.8)  # (dropout on the convolution input, dropout after the merged branches)
hidden_dims = 50  # units of the dense layer before the output

# Training parameters
batch_size = 64
num_epochs = 10

# Preprocessing parameters
sequence_length = 400  # pad/truncate length for the Keras data set
max_words = 5000  # passed to imdb.load_data as num_words

# Word2Vec parameters (see train_word2vec)
min_word_count = 1
context = 10

#
# ---------------------- Parameters end -----------------------


def load_data(data_source):
    """
    Load the corpus and split it into training and test parts.

    data_source  # "keras_data_set": IMDB data set from Keras, limited to
                 #   `max_words` words and padded/truncated (at the end) to
                 #   `sequence_length` tokens
                 # "local_dir": data returned by data_helpers.load_data(),
                 #   shuffled and split 90/10

    Returns x_train, y_train, x_test, y_test, vocabulary_inv, where
    vocabulary_inv is a dict {int index: word}.
    Any other data_source fails the assertion below.
    """
    assert data_source in ["keras_data_set", "local_dir"], "Unknown data source"

    if data_source != "keras_data_set":
        x, y, vocabulary, vocabulary_inv_list = data_helpers.load_data()
        vocabulary_inv = dict(enumerate(vocabulary_inv_list))
        # Labels arrive as one row per sample; keep only the index of the largest entry
        y = y.argmax(axis=1)

        # Shuffle samples and labels with the same permutation
        order = np.random.permutation(np.arange(len(y)))
        x, y = x[order], y[order]

        # First 90% for training, the rest for testing
        split_at = int(len(x) * 0.9)
        x_train, x_test = x[:split_at], x[split_at:]
        y_train, y_test = y[:split_at], y[split_at:]
        return x_train, y_train, x_test, y_test, vocabulary_inv

    (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_words, start_char=None,
                                                          oov_char=None, index_from=None)

    pad_options = dict(maxlen=sequence_length, padding="post", truncating="post")
    x_train = sequence.pad_sequences(x_train, **pad_options)
    x_test = sequence.pad_sequences(x_test, **pad_options)

    # Invert the word -> index mapping; index 0 is what pad_sequences fills with
    vocabulary = imdb.get_word_index()
    vocabulary_inv = {index: word for word, index in vocabulary.items()}
    vocabulary_inv[0] = "<PAD/>"

    return x_train, y_train, x_test, y_test, vocabulary_inv


# Data Preparation
print("Load data...")
x_train, y_train, x_test, y_test, vocabulary_inv = load_data(data_source)

# The loaded arrays are authoritative: adopt their width if it differs from the configured length
actual_length = x_test.shape[1]
if actual_length != sequence_length:
    print("Adjusting sequence length for actual size")
    sequence_length = actual_length

print("x_train shape:", x_train.shape)
print("x_test shape:", x_test.shape)
vocabulary_size = len(vocabulary_inv)
print("Vocabulary Size: {:d}".format(vocabulary_size))

# Prepare embedding layer weights and convert inputs for static model
print("Model type is", model_type)
if model_type == "CNN-rand":
    # No pre-trained vectors are used for this model type
    embedding_weights = None
elif model_type in ("CNN-non-static", "CNN-static"):
    # word2vec is trained on the training and test sentences together
    all_sentences = np.vstack((x_train, x_test))
    embedding_weights = train_word2vec(all_sentences, vocabulary_inv,
                                       num_features=embedding_dim,
                                       min_word_count=min_word_count,
                                       context=context)
    if model_type == "CNN-static":
        # The static model has no embedding layer, so replace every word index by its vector here
        x_train = np.stack([np.stack([embedding_weights[index] for index in row]) for row in x_train])
        x_test = np.stack([np.stack([embedding_weights[index] for index in row]) for row in x_test])
        print("x_train static shape:", x_train.shape)
        print("x_test static shape:", x_test.shape)
else:
    raise ValueError("Unknown model type")

# Build model
# The static model receives pre-embedded vectors; the other models receive word indices
is_static = model_type == "CNN-static"
input_shape = (sequence_length, embedding_dim) if is_static else (sequence_length,)

model_input = Input(shape=input_shape)

# Static model does not have embedding layer
if is_static:
    z = model_input
else:
    z = Embedding(len(vocabulary_inv), embedding_dim, input_length=sequence_length, name="embedding")(model_input)

z = Dropout(dropout_prob[0])(z)


def _conv_branch(inputs, kernel_size):
    """Build one convolution -> max pooling -> flatten branch for a single kernel size."""
    branch = Convolution1D(filters=num_filters,
                           kernel_size=kernel_size,
                           padding="valid",
                           activation="relu",
                           strides=1)(inputs)
    branch = MaxPooling1D(pool_size=2)(branch)
    return Flatten()(branch)


# Convolutional block: one branch per filter size, merged when there is more than one
conv_blocks = [_conv_branch(z, sz) for sz in filter_sizes]
if len(conv_blocks) > 1:
    z = Concatenate()(conv_blocks)
else:
    z = conv_blocks[0]

z = Dropout(dropout_prob[1])(z)
z = Dense(hidden_dims, activation="relu")(z)
model_output = Dense(1, activation="sigmoid")(z)

model = Model(model_input, model_output)
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Initialize weights with word2vec
if model_type == "CNN-non-static":
    # Row i of the embedding matrix must hold the vector of word index i.
    # Dict iteration order is not guaranteed to be index order (for the Keras
    # data set, load_data inserts the padding index 0 last), so the rows are
    # ordered by key explicitly instead of relying on .values().
    weights = np.array([embedding_weights[index] for index in sorted(embedding_weights)])
    print("Initializing embedding layer with word2vec weights, shape", weights.shape)
    embedding_layer = model.get_layer("embedding")
    embedding_layer.set_weights([weights])

# Train the model; the test split doubles as validation data
model.fit(x_train, y_train, batch_size=batch_size, epochs=num_epochs,
          validation_data=(x_test, y_test), verbose=2)

In [None]:
import string

# Sample Vietnamese text used to try punctuation stripping on a unicode string
text = u"Trường đại học bách khoa hà nội 0910833798"
# translate() on a unicode string takes a single mapping {code point: replacement};
# mapping a code point to None deletes that character. Strings are immutable,
# so the result has to be assigned back.
text = text.translate({ord(char): None for char in string.punctuation})
print(text)

In [17]:
def train_word2vec(sentence_matrix, vocabulary_inv,
                   num_features=100, min_word_count=1, context=10):
    """
    Trains, saves, loads Word2Vec model
    Returns initial weights for embedding layer.

    inputs:
    sentence_matrix # int matrix: num_sentences x max_sentence_len
    vocabulary_inv  # dict {int: str} or {str: str}; a vocabulary loaded from
                    # JSON has string keys ("123"), both forms are accepted
    num_features    # Word vector dimensionality
    min_word_count  # Minimum word count
    context         # Context window size

    returns:
    dict with the same keys as vocabulary_inv, each mapped to the vector of
    its word. Words the Word2Vec model does not know get a random vector
    drawn uniformly from [-0.25, 0.25).

    note:
    the model file under 'models' is named after the three hyper-parameters
    only, so an existing file is loaded even when the data have changed.
    """
    model_dir = 'models'
    model_name = "{:d}features_{:d}minwords_{:d}context".format(num_features, min_word_count, context)
    model_name = join(model_dir, model_name)
    if exists(model_name):
        embedding_model = word2vec.Word2Vec.load(model_name)
        print('Load existing Word2Vec model \'%s\'' % split(model_name)[-1])
    else:
        # Set values for various parameters
        num_workers = 2  # Number of threads to run in parallel
        downsampling = 1e-3  # Downsample setting for frequent words

        # Initialize and train the model
        print('Training Word2Vec model...')
        # Normalize the keys to str once, so that int-keyed vocabularies and
        # JSON-loaded (str-keyed) ones resolve through the same lookup.
        words_by_index = {str(key): word for key, word in vocabulary_inv.items()}
        sentences = [[words_by_index[str(w)] for w in s] for s in sentence_matrix]
        embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                            size=num_features, min_count=min_word_count,
                                            window=context, sample=downsampling)

        # If we don't plan to train the model any further, calling
        # init_sims will make the model much more memory-efficient.
        embedding_model.init_sims(replace=True)

        # Saving the model for later use. You can load it later using Word2Vec.load()
        if not exists(model_dir):
            os.mkdir(model_dir)
        print('Saving Word2Vec model \'%s\'' % split(model_name)[-1])
        embedding_model.save(model_name)

    # add unknown words
    embedding_weights = {key: embedding_model[word] if word in embedding_model else
                              np.random.uniform(-0.25, 0.25, embedding_model.vector_size)
                         for key, word in vocabulary_inv.items()}
    return embedding_weights

In [11]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, MaxPooling1D, Flatten, Dropout
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
from keras.utils.np_utils import to_categorical
import json
from __future__ import print_function
from gensim.models import word2vec
from os.path import join, exists, split
import os

# Load the sample matrix, the labels and both vocabularies from disk
x = np.load('/data_x.npy')
y = np.load('/data_y.npy')
with open('vocabularies.json', 'r') as fv:
    vocabulary = json.load(fv)
with open('vocabularies_inv.json', 'r') as fvi:
    vocabulary_inv = json.load(fvi)

# Shuffle data (fixed seed keeps the split reproducible)
np.random.seed(47)
print(len(x))
print(len(y))

shuffle_indices = np.random.permutation(np.arange(len(y)))
x = np.asarray(x[shuffle_indices])
y = y[shuffle_indices]

# Reduce every label row to the index of its largest entry and report the size of classes 0..3
Y = [np.argmax(entry) for entry in y]
for class_id in range(4):
    print(sum(1 for label in Y if label == class_id))

# Back to one-hot rows for the softmax output
Y = to_categorical(Y)

# First 80% of the shuffled samples for training, the rest for testing
train_len = int(len(x) * 0.8)
X_train, X_test = x[:train_len], x[train_len:]
y_train, y_test = Y[:train_len], Y[train_len:]

print("Finish splitting data")
print()

sequence_length = 4000

# The loaded matrix is authoritative: adopt its width if it differs from the value above
actual_length = X_test.shape[1]
if actual_length != sequence_length:
    print("Adjusting sequence length for actual size")
    sequence_length = actual_length

print("x_train shape:", X_train.shape)
print("x_test shape:", X_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

# Show the first three one-hot label rows
for i in range(3):
    print(Y[i])

print("Loading data...")

3933
3933
1030
1414
1079
410
Finish splitting data

Adjusting sequence length for actual size
x_train shape: (3146, 3997)
x_test shape: (787, 3997)
Vocabulary Size: 125697
[ 1.  0.  0.  0.]
[ 0.  1.  0.  0.]
[ 1.  0.  0.  0.]
Loading data...
Training Word2Vec model...


KeyError: 123

In [44]:
# Train (or load) word2vec on all sentences; num_features must match the last
# dimension of the input_shape given to the first convolution of the model
all_sentences = np.vstack((X_train, X_test))
w = train_word2vec(
    all_sentences,
    vocabulary_inv,
    num_features=75,
    min_word_count=3,
    context=15,
)

Training Word2Vec model...
Saving Word2Vec model '75features_3minwords_15context'


In [45]:
def _embed_sentences(index_matrix):
    """Replace every word index of a 2-D index matrix by its vector from `w` (keys of `w` are strings)."""
    return np.stack([np.stack([w[str(index)] for index in row]) for row in index_matrix])


x_train_new = _embed_sentences(X_train)
x_test_new = _embed_sentences(X_test)

In [61]:
# Classifier over pre-embedded input: no Keras Embedding layer is used here,
# the network is fed x_train_new / x_test_new, which already hold word vectors.
# NOTE: the two values below are not read by the model in this cell; they are
# kept because they are module-level names.
embedding_dim = 150
embedding_vecor_length = 300
model = Sequential()

# Convolutional model (3x conv, flatten, 2x dense)
# The last dimension of input_shape (75) has to equal the num_features used when training word2vec.
# `padding` is the Keras 2 name of the former `border_mode` argument; using it
# keeps the call consistent with the other Keras 2 arguments (kernel_size, strides).
model.add(Convolution1D(64, input_shape=(sequence_length, 75), kernel_size=3, strides=1, activation="relu", padding='same'))
model.add(Dropout(0.2))
model.add(Convolution1D(32, kernel_size=3, strides=1, activation="relu", padding='same'))
model.add(MaxPooling1D(pool_size=2))
model.add(Convolution1D(16, kernel_size=3, strides=1, activation="relu", padding='same'))
model.add(MaxPooling1D(pool_size=2))
model.add(Flatten())
model.add(Dropout(0.2))
model.add(Dense(75, activation='relu'))
model.add(Dropout(0.5))
# Four output units, one per class of the one-hot labels
model.add(Dense(4, activation='softmax'))

# Log to tensorboard
tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# summary() prints the table itself and returns None, so it is not wrapped in print()
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_38 (Conv1D)           (None, 3997, 64)          14464     
_________________________________________________________________
dropout_33 (Dropout)         (None, 3997, 64)          0         
_________________________________________________________________
conv1d_39 (Conv1D)           (None, 3997, 32)          6176      
_________________________________________________________________
max_pooling1d_29 (MaxPooling (None, 1998, 32)          0         
_________________________________________________________________
conv1d_40 (Conv1D)           (None, 1998, 16)          1552      
_________________________________________________________________
max_pooling1d_30 (MaxPooling (None, 999, 16)           0         
_________________________________________________________________
flatten_13 (Flatten)         (None, 15984)             0         
__________

  if __name__ == '__main__':
  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]


In [None]:
# Train on the pre-embedded sentences, validating on the held-out split and logging to TensorBoard
model.fit(
    x_train_new, y_train,
    epochs=20,
    batch_size=64,
    validation_data=(x_test_new, y_test),
    callbacks=[tensorBoardCallback],
)

# Evaluation on the test set; entry 1 is the accuracy metric requested in compile()
scores = model.evaluate(x_test_new, y_test, verbose=0)
accuracy_percent = scores[1] * 100
print("Accuracy: %.2f%%" % accuracy_percent)

Train on 3146 samples, validate on 787 samples
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20

In [None]:
# Inspect the raw sample matrix; print() as a function works with and without
# `from __future__ import print_function`
print(np.load('/data_x.npy'))