In [1]:
import os

import numpy as np
import pandas as pd
from gensim.models import KeyedVectors
from keras.callbacks import CSVLogger, ModelCheckpoint

from malstm import model, f1_score, f2_score
from nn_utils import build_embeddings, build_vocabulary, convert_questions, get_max_seq_length, prepare_dataset

Using TensorFlow backend.
  return f(*args, **kwds)


# Load data
Use the extended training dataset (with newly generated PerfectMatch'ed pairs) and the regular test dataset (`test_df`).

In [2]:
# Root folder for datasets, pretrained vectors and training artifacts.
# Override it per machine (e.g. on Paperspace) by setting the MALSTM_DATA_DIR
# environment variable; when unset, the original local path is used.
DATA_DIR = os.environ.get('MALSTM_DATA_DIR', '/Volumes/DataDrive')

In [3]:
# Embedding vector size, passed to build_embeddings() and model() further down.
# NOTE(review): presumably has to match the pretrained vectors loaded later
# (their file name ends in "negative300") — confirm before changing.
embeddings_dim = 300

In [4]:
# Read the extended training pairs and the regular test pairs from the merged data folder
train_df, test_df = (
    pd.read_csv(DATA_DIR + csv_path)
    for csv_path in ('/merged/en-train-extended-2.csv', '/merged/en-test.csv')
)

In [5]:
# Load the pretrained word2vec vectors (binary format) from the models folder
w2v_model = KeyedVectors.load_word2vec_format(
    DATA_DIR + '/models/GoogleNews-vectors-negative300.bin',
    binary=True,
)

# Build the vocabulary from both dataframes, then the embeddings matrix from that vocabulary
vocabulary = build_vocabulary([train_df, test_df], w2v_model)
embeddings = build_embeddings(vocabulary, w2v_model, embeddings_dim)

# The word2vec model is not referenced after this point, so drop the name
del w2v_model

In [6]:
# Convert questions to number representations, using the vocabulary built above.
# NOTE(review): the return value is discarded, so this presumably rewrites
# train_df and test_df in place; re-running this cell on already converted
# frames may therefore not be idempotent — confirm against nn_utils.
convert_questions([train_df, test_df], vocabulary)

In [7]:
# Longest sequence length, computed over both the train and the test dataframe
max_seq_length = get_max_seq_length([train_df, test_df])

# Split the training dataframe into a training part and a 5000-sample validation part
(X_train, Y_train), (X_validation, Y_validation) = prepare_dataset(
    train_df,
    validation_size=5000,
    max_seq_length=max_seq_length,
)

# Report the resulting shapes of the "left" inputs as a quick sanity check
print('X_train.shape={}, X_validation.shape={}'.format(
    X_train['left'].shape,
    X_validation['left'].shape,
))

X_train.shape=(25407, 1021), X_validation.shape=(5000, 1021)


In [8]:
# Create the MaLSTM network from the prepared embeddings matrix,
# tracking accuracy, MAE and the custom F2 score during training
malstm = model(
    embeddings,
    max_seq_length,
    n_hidden=50,
    embedding_dim=embeddings_dim,
    metrics=['accuracy', 'mae', f2_score],
)

In [9]:
# Setup callbacks

# Stream the per-epoch metrics to a CSV file
csv_logger = CSVLogger(DATA_DIR + '/training/logs/train-3.csv')

# Checkpoint file name template, filled in by Keras with the epoch number and
# the validation loss. The epoch spec must be `:02d` (no space): a space in the
# format spec is the "sign" option and would render epoch 1 as " 1", putting a
# blank inside the file name.
checkpoint_path = DATA_DIR + '/training/models/model-test-{epoch:02d}-{val_loss:.2f}.hdf5'

# Check every epoch (period=1); with save_best_only=True a file is written
# only when the monitored quantity improves (see the Keras ModelCheckpoint docs)
checkpoint = ModelCheckpoint(filepath=checkpoint_path, period=1, save_best_only=True)

callbacks = [csv_logger, checkpoint]

In [10]:
# The "left" and "right" arrays of each split are passed to the network as two separate inputs
train_input = [X_train['left'], X_train['right']]
validation_input = [X_validation['left'], X_validation['right']]

# Fit for a single epoch, validating on the held-out split and running the callbacks
trained = malstm.fit(
    train_input,
    Y_train,
    epochs=1,
    batch_size=64,
    callbacks=callbacks,
    validation_data=(validation_input, Y_validation),
)

Train on 25407 samples, validate on 5000 samples
Epoch 1/1
  640/25407 [..............................] - ETA: 20:09 - loss: 0.2238 - acc: 0.7562 - mean_absolute_error: 0.2669 - f2_score: 0.0000e+00

KeyboardInterrupt: 