In [40]:
from baselines import MODEL_WEM
import os
import pickle
import numpy as np
import time, datetime
from sklearn.model_selection import train_test_split
from keras.regularizers import l2
from keras.callbacks import Callback, ModelCheckpoint

In [39]:
DATA_SET_PATH = "/root/mounted/datasets/data_0515/"
KN_CSV = os.path.join(DATA_SET_PATH, "knowledge.csv")
TRAIN_CSV = os.path.join(DATA_SET_PATH, "train.csv")
TEST_CSV = os.path.join(DATA_SET_PATH, "test.csv")
SUBMIT_CSV = os.path.join(DATA_SET_PATH, "submit.csv")
CHAR_EMBED_PKL = os.path.join(DATA_SET_PATH, "char_embed.pkl")
WORD_EMBED_PKL = os.path.join(DATA_SET_PATH, "word_embed.pkl")
QUESTION_PKL = os.path.join(DATA_SET_PATH, "question.pkl")
INTERMEDIATE_DATA_PATH = os.path.join(DATA_SET_PATH, 'intermediate')
KN_TRAIN_CSV = os.path.join(INTERMEDIATE_DATA_PATH, 'kn_train.csv')
KN_TRAIN_WV_CV_CSV = os.path.join(INTERMEDIATE_DATA_PATH, 'kn_train_wv_cv.csv')
TEST_WV_CV_CSV = os.path.join(INTERMEDIATE_DATA_PATH, 'test_wv_cv.csv')
WEM_PKL = os.path.join(INTERMEDIATE_DATA_PATH, 'WEM.pkl')
CEM_PKL = os.path.join(INTERMEDIATE_DATA_PATH, 'CEM.pkl')
KN_TRAIN_WIDS_CIDS_PKL = os.path.join(INTERMEDIATE_DATA_PATH, 'kn_train_wids_cids.pkl')
TEST_WIDS_CIDS_PKL = os.path.join(INTERMEDIATE_DATA_PATH, 'test_wids_cids.pkl')
KN_TRAIN_WIDS_CIDS_PADDED_PKL = os.path.join(INTERMEDIATE_DATA_PATH, 'kn_train_wids_cids_padded.pkl')
TEST_WIDS_CIDS_PADDED_PKL = os.path.join(INTERMEDIATE_DATA_PATH, 'test_wids_cids_padded.pkl')

MODEL_WEIGHTS_FILE = './keras_saved_models/baseline_model_wem.h5'

WORDS_NUM = 20891
CHARS_NUM = 3048
WORD_EMBEDDING_DIM = 300
CHAR_EMBEDDING_DIM = 300

MAX_WSEQ_LEN = 39
MAX_CSEQ_LEN = 58

VALIDATION_SPLIT = 0.1
RNG_SEED = 13371447
NB_EPOCHS = 25
BATCH_SIZE = 320

In [2]:
# 加载模型
model = MODEL_WEM()

In [3]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 39)           0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 39)           0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 39, 300)      6267600     input_1[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 39, 300)      6267600     input_2[0][0]                    
__________________________________________________________________________________________________
time_distr

In [9]:
# 加载数据集，并切分成train 和 validation
with open(KN_TRAIN_WIDS_CIDS_PADDED_PKL, 'rb') as f:
    kn_train_wids_cids_padded = pickle.load(f)

kn_train_wids_cids_padded.head()

Unnamed: 0,qid1,qid2,words1,chars1,words2,chars2,label
0,Q131177,Q112611,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
1,Q336221,Q112611,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
2,Q659732,Q112611,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
3,Q686996,Q112611,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
4,Q630751,Q112611,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1


In [36]:
q1_inputs = kn_train_wids_cids_padded['words1'].values
q2_inputs = kn_train_wids_cids_padded['words2'].values
inputs = np.stack((q1_inputs, q2_inputs), axis=1)
labels = kn_train_wids_cids_padded['label']
inputs_train, inputs_val, labels_train, labels_val = train_test_split(inputs, labels, test_size=VALIDATION_SPLIT, random_state=RNG_SEED)
q1_train = inputs_train[:,0]
q2_train = inputs_train[:,1]
q1_val = inputs_val[:,0]
q2_val = inputs_val[:,1]

In [35]:
inputs_train.shape, inputs_val.shape

((3644496, 2), (404944, 2))

In [45]:
print("Starting training at", datetime.datetime.now())
t0 = time.time()
callbacks = [ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_acc', save_best_only=True)]
history = model.fit([q1_train, q2_train],
                    labels_train,
                    epochs=NB_EPOCHS,
                    validation_data = ([q1_val, q2_val], labels_val),
                    verbose=2,
                    batch_size=BATCH_SIZE,
                    callbacks=callbacks)
t1 = time.time()
print("Training ended at", datetime.datetime.now())
print("Minutes elapsed: %f" % ((t1 - t0) / 60.))

Starting training at 2018-05-16 09:55:01.128546


ValueError: Error when checking input: expected input_1 to have shape (39,) but got array with shape (1,)