In [1]:
import tensorflow as tf
from keras.backend.tensorflow_backend import set_session
config = tf.ConfigProto()
config.gpu_options.per_process_gpu_memory_fraction = 0.75
set_session(tf.Session(config=config))

Using TensorFlow backend.


In [2]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM, Convolution1D, MaxPooling1D, AveragePooling1D, GlobalMaxPooling1D, Flatten, Dropout, Input, Merge
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.callbacks import TensorBoard
from keras.utils.np_utils import to_categorical
from sklearn.utils import class_weight
import pickle
import json
from __future__ import print_function
from gensim.models import word2vec
from os.path import join, exists, split
import os

In [3]:
def train_word2vec(sentence_matrix, vocabulary_inv,
                   num_features=100, min_word_count=1, context=10):
    """
    Trains, saves, loads Word2Vec model
    Returns initial weights for embedding layer.
   
    inputs:
    sentence_matrix # int matrix: num_sentences x max_sentence_len
    vocabulary_inv  # dict {int: str}
    num_features    # Word vector dimensionality                      
    min_word_count  # Minimum word count                        
    context         # Context window size 
    """
    model_dir = 'models'
    model_name = "{:d}features_{:d}minwords_{:d}context".format(num_features, min_word_count, context)
    model_name = join(model_dir, model_name)
    if exists(model_name):
        embedding_model = word2vec.Word2Vec.load(model_name)
        print('Load existing Word2Vec model \'%s\'' % split(model_name)[-1])
    else:
        # Set values for various parameters
        num_workers = 2  # Number of threads to run in parallel
        downsampling = 1e-3  # Downsample setting for frequent words

        # Initialize and train the model
        print('Training Word2Vec model...')
        sentences = [[vocabulary_inv[str(w)] for w in s] for s in sentence_matrix]
        embedding_model = word2vec.Word2Vec(sentences, workers=num_workers,
                                            size=num_features, min_count=min_word_count,
                                            window=context, sample=downsampling)

        # If we don't plan to train the model any further, calling 
        # init_sims will make the model much more memory-efficient.
        embedding_model.init_sims(replace=True)

        # Saving the model for later use. You can load it later using Word2Vec.load()
        if not exists(model_dir):
            os.mkdir(model_dir)
        print('Saving Word2Vec model \'%s\'' % split(model_name)[-1])
        embedding_model.save(model_name)
    
    for i, word in enumerate(embedding_model.wv.index2word):
        if i < 3:
            print("Index: %s Word: %s" % (i, word))

    print("First embedding word: %s" % embedding_model.wv.index2word[1])
    print("Embedding vector of the word (và): %s", embedding_model.wv[u'và'])
    
    # add unknown words
    embedding_weights = {key: embedding_model[word] if word in embedding_model else
                              np.random.uniform(-0.25, 0.25, embedding_model.vector_size)
                         for key, word in vocabulary_inv.items()}
    return embedding_weights

In [4]:
# Using keras to load the dataset with the top_words
with open(r"data_x_new_200_800.pkl", "rb") as load_x:
    x = pickle.load(load_x)
with open(r"data_y_new_200_800.pkl", "rb") as load_y:
    y = pickle.load(load_y)
    
with open('vocabularies.json', 'r') as fv:
    vocabulary = json.load(fv)
with open('vocabularies_inv.json', 'r') as fvi:
    vocabulary_inv = json.load(fvi)
    
# Shuffle data
np.random.seed(47)
print(len(x))
print(len(y))

shuffle_indices = np.random.permutation(np.arange(len(y)))

x = x[shuffle_indices]
y = y[shuffle_indices]
x = np.asarray(x)
Y = []
for entry in y:
    Y.append(np.argmax(entry))
train_len = int(len(x) * 0.80)
validate_len = int(train_len * 0.2)
print("Age range A has: %s. %s%% of the dataset" % (len([1 for entry in Y[:train_len] if entry == 0]), float(len([1 for entry in Y if entry == 0]))/len(Y[:train_len])))
print("Age range B has: %s. %s%% of the dataset" % (len([1 for entry in Y[:train_len] if entry == 1]), float(len([1 for entry in Y if entry == 1]))/len(Y[:train_len])))
print("Age range C has: %s. %s%% of the dataset" % (len([1 for entry in Y[:train_len] if entry == 2]), float(len([1 for entry in Y if entry == 2]))/len(Y[:train_len])))
print("Age range D has: %s. %s%% of the dataset" % (len([1 for entry in Y[:train_len] if entry == 3]), float(len([1 for entry in Y if entry == 3]))/len(Y[:train_len])))
#class_weights = class_weight.compute_class_weight('balanced', np.unique(Y_train), list(Y_train))
#print(get_class_weights(Y[:train_len]))
#print(class_weights)
print(y)
Y = to_categorical(Y)
X_train = x[:train_len][validate_len:]
y_train = Y[:train_len][validate_len:]
X_validate = x[:train_len][:validate_len]
y_validate = Y[:train_len][:validate_len]
X_test = x[train_len:]
y_test = Y[train_len:]
word2vec_f = False

print("Finish splitting data")
print()

sequence_length = 1000

if sequence_length != X_test.shape[1]:
    print("Adjusting sequence length for actual size")
    sequence_length = X_test.shape[1]

print("X_train shape:", X_train.shape)
print("X_validate shape:", X_validate.shape)
print("X_test shape:", X_test.shape)
print("Vocabulary Size: {:d}".format(len(vocabulary_inv)))

for i in range(3):
    print(y_train[i])
    
print("Loading data...")

22694
22694
Age range A has: 5465. 0.379289451942% of the dataset
Age range B has: 7837. 0.536821812173% of the dataset
Age range C has: 3957. 0.27243183696% of the dataset
Age range D has: 896. 0.0614706692371% of the dataset
[[0 1 0 0]
 [0 1 0 0]
 [1 0 0 0]
 ..., 
 [1 0 0 0]
 [0 0 1 0]
 [1 0 0 0]]
Finish splitting data

Adjusting sequence length for actual size
X_train shape: (14524, 800)
X_validate shape: (3631, 800)
X_test shape: (4539, 800)
Vocabulary Size: 252081
[ 1.  0.  0.  0.]
[ 0.  0.  0.  1.]
[ 0.  1.  0.  0.]
Loading data...


In [None]:
balance_klass = SMOTE()
l = {0: 2, 1: 3}
X_train_smote, y_train_smote = balance_klass.execute(l, samples=X_train, labels=y_train)

In [5]:
if word2vec_f:
    w = train_word2vec(np.vstack((x[:train_len], X_test)), vocabulary_inv, num_features=200, min_word_count=1, context=15)
#model_name = "vi.bin"
#model_temp = word2vec.Word2Vec.load("./models/" + model_name)
#print('Load existing Word2Vec model \'%s\'' % split(model_name)[-1])

In [6]:
if word2vec_f:
    x_train_new = np.stack([np.stack([w[str(word)] for word in sentence]) for sentence in X_train_smote])
    x_validate_new = np.stack([np.stack([w[str(word)] for word in sentence]) for sentence in X_validate])
    x_test_new = np.stack([np.stack([w[str(word)] for word in sentence]) for sentence in X_test])

In [7]:
if word2vec_f:
    #save and load data in order to save space 
    #in case you need them specifically for when computing word2vec  
    print(x_train_new.shape)
    np.save('/train_x_1.npy', x_train_new)
    np.save('/validate_x_1.npy', x_validate_new)
    np.save('/test_x_1.npy', x_test_new)
    x_train_new = np.load('/train_x_1.npy')
    x_validate_new = np.load('/validate_x_1.npy')
    x_test_new = np.load('/test_x_1.npy')

In [19]:
if not word2vec_f:
    # Using embedding from Keras
    from keras.regularizers import l2
    model = Sequential()
    print(X_train.shape)
    model.add(Embedding(len(vocabulary_inv), 8, input_length=sequence_length))

    model.add(Convolution1D(96, activation='relu', kernel_size=3, padding='same'))
    model.add(Dropout(0.4))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Convolution1D(64, activation='relu', kernel_size=2, strides = 1, padding='same'))
    model.add(Dropout(0.4))
    model.add(MaxPooling1D(pool_size=2))

    #model.add(Convolution1D(192, activation='relu', kernel_size=3, padding='same'))
    #model.add(Convolution1D(192, activation='relu', kernel_size=1, padding='same'))
    #model.add(Convolution1D(64, kernel_size=1))
    #model.add(AveragePooling1D(pool_size=8, strides=1))
    model.add(Flatten())
    model.add(Dense(32, activation='relu'))
    model.add(Dropout(0.25))
    #model.add(LSTM(128, dropout=0.3, recurrent_dropout=0.3))
    model.add(Dense(4, activation='softmax', W_regularizer=l2(0.01)))

    # Log to tensorboard
    #tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
    #model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    print(model.summary())

(14524, 800)
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 800, 8)            2016648   
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 800, 96)           2400      
_________________________________________________________________
dropout_13 (Dropout)         (None, 800, 96)           0         
_________________________________________________________________
max_pooling1d_9 (MaxPooling1 (None, 400, 96)           0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 400, 64)           12352     
_________________________________________________________________
dropout_14 (Dropout)         (None, 400, 64)           0         
_________________________________________________________________
max_pooling1d_10 (MaxPooling (None, 200, 64)           0       



In [10]:
if word2vec_f:
    # Using embedding from Keras
    from keras.regularizers import l2

    embedding_dim = 100
    embedding_vecor_length = 300
    model = Sequential()

    #Convolutional model (3x conv, flatten, 2x dense)
    #input_shape=(sequence_length, 100), 
    model.add(Convolution1D(512, input_shape=(sequence_length, 200), kernel_size=3, activation="relu", padding='same'))
    #model.add(Convolution1D(128, kernel_size=3, activation="relu", padding='same'))
    model.add(MaxPooling1D(pool_size=3))
    model.add(Dropout(0.25))
    model.add(Convolution1D(256, input_shape=(sequence_length, 200), kernel_size=3, activation="relu", padding='same'))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Convolution1D(128, kernel_size=3, strides = 1, padding='same'))
    model.add(Dropout(0.25))
    model.add(MaxPooling1D(pool_size=1))
    model.add(Flatten())
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.25))
    model.add(Dense(4, activation='softmax'))
    #model.add(Dense(4, activation='softmax', W_regularizer=l2(0.01)))

    # Log to tensorboard
    #tensorBoardCallback = TensorBoard(log_dir='./logs', write_graph=True)
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    print(model.summary())

In [None]:
#y_train = y_train.reshape((-1, 1))
if word2vec_f:
    print("Combining with word2vec model!")
    model.fit(x_train_new, y_train, epochs=10, validation_data=(x_validate_new, y_validate), batch_size=64, verbose=2) #class_weight={0:0.82856529, 1:0.57931884, 2:1.14952324, 3:5.07605263})
    #{0:0.82704321, 1:0.58675982, 2:1.13997847, 3:4.7758016})
else:
    print("Running without word2vec")
    model.fit(X_train, y_train, epochs=10, validation_data=(X_validate, y_validate), batch_size=64, verbose=2) #class_weight={0:0.82856529, 1:0.57931884, 2:1.14952324, 3:5.07605263})

Running without word2vec
Train on 14524 samples, validate on 3631 samples
Epoch 1/10
 - 8s - loss: 1.1732 - acc: 0.4351 - val_loss: 1.0303 - val_acc: 0.5059
Epoch 2/10
 - 8s - loss: 0.8864 - acc: 0.5914 - val_loss: 0.8168 - val_acc: 0.6615
Epoch 3/10
 - 8s - loss: 0.5785 - acc: 0.7572 - val_loss: 0.7368 - val_acc: 0.6910
Epoch 4/10
 - 8s - loss: 0.3413 - acc: 0.8746 - val_loss: 0.8169 - val_acc: 0.7050
Epoch 5/10
 - 8s - loss: 0.2141 - acc: 0.9305 - val_loss: 1.0291 - val_acc: 0.6962
Epoch 6/10
 - 8s - loss: 0.1456 - acc: 0.9566 - val_loss: 1.0884 - val_acc: 0.7009
Epoch 7/10
 - 8s - loss: 0.1119 - acc: 0.9699 - val_loss: 1.3353 - val_acc: 0.7001
Epoch 8/10


In [18]:
# Evaluation on the test set
if word2vec_f:
    scores = model.evaluate(x_test_new, y_test)
else:
    scores = model.evaluate(X_test, y_test)
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 71.03%


In [None]:
#in case you want to run LSTM model on this project
#too much time intensive 
model.add(Embedding(len(vocabulary_inv), 100, input_length = sequence_length))
model.add(Dropout(0.4))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(4, activation='softmax'))

In [None]:
'''
for i, word in enumerate(model_temp.index2word):
    if i < 3:
        print("Index: %s Word: %s" % (i, word))

print(model_temp.wv.index2word[1])
print(model_temp.wv[u'và'])
index = 0

embedding_weights = {key: model_temp.wv[word] if word in model_temp.index2word else
                        np.random.uniform(-0.25, 0.25, model_temp.vector_size)
                        for key, word in vocabulary_inv.items()}
                        
nb_words = min(vocabulary_inv)+1

embedding_matrix = np.zeros((nb_words, EMBEDDING_DIM))
for word, i in word_index.items():
    if word in word2vec.vocab:
        embedding_matrix[i] = word2vec.word_vec(word)
print('Null word embeddings: %d' % np.sum(np.sum(embedding_matrix, axis=1) == 0))
'''