In [15]:
import numpy as np
import pandas as pd
from keras.layers import Bidirectional, Dense, Flatten, Dropout, Input, concatenate, GRU, CuDNNGRU
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, train_test_split

In [2]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import tensorflow as tf

**Node information**

In [3]:
# Column labels for the node-information table (passed as `names=` when the CSV is read).
columes = [
    "id",
    "year",
    "title",
    "authors",
    "journal",
    "abstract",
]

In [4]:
# Read the node-information CSV, labelling its columns with `columes`.
df = pd.read_csv(
    "dataset/node_information.csv",
    names=columes,
)

**Train set**

In [5]:
# Load the training rows from disk and cast every value to int64.
data_set = (
    np.loadtxt("dataset/training_set.txt")
    .astype(np.int64)
)

In [6]:
# The first two columns hold the paper-id pair; the last column is the label.
data = data_set[:, :2]
label = data_set[:, -1]

# Swap in the pre-cleaned text BEFORE building the lookup table, so the text
# that is later converted to sequences is the same text the tokenizer is
# fitted on. (Previously data_dic captured the raw title/abstract while the
# tokenizer was fitted on the cleaned columns.)
# NOTE(review): read_pickle can execute code embedded in the file; only load
# pickles produced by a trusted step of this pipeline.
clean_title = pd.read_pickle('clean_title.pkl')['text'].values.tolist()
clean_abstract = pd.read_pickle('clean_abstract.pkl')['text'].values.tolist()
df['title'] = clean_title
df['abstract'] = clean_abstract

# paper id -> (year, title, authors, journal, abstract)
# Iterating the columns in lock-step avoids the per-row Series that
# DataFrame.iterrows() builds.
data_dic = {
    paper_id: (year, title, authors, journal, abstract)
    for paper_id, year, title, authors, journal, abstract in zip(
        df['id'], df['year'], df['title'], df['authors'], df['journal'], df['abstract']
    )
}

**K-fold split**

In [7]:
# Per-fold row indices into `data` / `label`.
train_all = []
evaluate_all = []

k = 3
# 1-based number of the fold this run trains and evaluates on.
fold_num = 1

# Fixed seed so the folds are identical across runs.
skf = StratifiedKFold(n_splits=k, random_state=1234, shuffle=True)
for train_index, evaluate_index in skf.split(data, label):
    train_all.append(train_index)
    evaluate_all.append(evaluate_index)
    print(train_index.shape, evaluate_index.shape)

# Persist the indices of the fold selected by `fold_num` (previously fold 0
# was hard-coded, which silently disagreed with the rest of the notebook
# whenever fold_num != 1).
train_df = pd.DataFrame({'index': train_all[fold_num - 1]})
evaluate_df = pd.DataFrame({'index': evaluate_all[fold_num - 1]})
train_df.to_pickle('train_df.pkl')
evaluate_df.to_pickle('test_df.pkl')

(410341,) (205171,)
(410341,) (205171,)
(410342,) (205170,)


"\nsplit_train_data, split_test_data, split_train_label, split_test_label = train_test_split(data, label, test_size=0.2, random_state=7)\ntrain_df = pd.DataFrame()\ntest_df = pd.DataFrame()\ntrain_df['doc1'] = split_train_data[:,0]\ntrain_df['doc2'] = split_train_data[:,1]\ntrain_df['label'] = split_train_label\ntest_df['doc1'] = split_test_data[:,0]\ntest_df['doc2'] = split_test_data[:,1]\ntest_df['label'] = split_test_label\ntrain_df.to_pickle('train_df.pkl')\ntest_df.to_pickle('test_df.pkl')\nprint(split_train_data.shape, split_test_data.shape, split_train_label.shape, split_test_label.shape) \n"

In [8]:
def get_data(fold_index, max_sentence_len):
    """Build padded model inputs and the embedding matrix for one fold.

    A tokenizer (top 7000 words) is fitted on "title abstract" for every row
    of ``df``. Each paper in a pair is then represented by the title and
    abstract stored in ``data_dic``, converted to word indices and padded or
    truncated to ``max_sentence_len``.

    Args:
        fold_index: 1-based fold number; selects ``train_all[fold_index - 1]``
            and ``evaluate_all[fold_index - 1]``.
        max_sentence_len: ``maxlen`` passed to ``pad_sequences``.

    Returns:
        Tuple ``(x_train1, x_train2, train_label, x_eval1, x_eval2,
        eval_label, embedding_matrix)``. ``embedding_matrix`` has one
        100-wide row per entry of the tokenizer's word index (plus row 0);
        rows for words missing from ``embeddings_index`` stay all-zero.
    """
    tokenizer = Tokenizer(num_words=7000)
    # Walk both columns once. The previous comprehension rebuilt both column
    # lists for every single row, which is quadratic in the number of papers.
    texts = [title + ' ' + abstract for title, abstract in zip(df["title"], df["abstract"])]
    tokenizer.fit_on_texts(texts)

    word_index = tokenizer.word_index

    embedding_matrix = np.zeros((len(word_index) + 1, 100))
    for word, i in word_index.items():
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    train_index = train_all[fold_index - 1]
    evaluate_index = evaluate_all[fold_index - 1]

    x_train1, x_train2 = _pairs_to_padded(tokenizer, data[train_index], max_sentence_len)
    x_eval1, x_eval2 = _pairs_to_padded(tokenizer, data[evaluate_index], max_sentence_len)

    return (x_train1, x_train2, label[train_index],
            x_eval1, x_eval2, label[evaluate_index],
            embedding_matrix)


def _pairs_to_padded(tokenizer, pairs, max_sentence_len):
    """Convert the two id columns of ``pairs`` into two padded index matrices."""
    padded = []
    for column in (0, 1):
        # Each paper is represented by "title abstract" from the lookup table.
        texts = [data_dic[pid][1] + ' ' + data_dic[pid][4] for pid in pairs[:, column]]
        # convert texts to numbers
        seqs = tokenizer.texts_to_sequences(texts)
        padded.append(sequence.pad_sequences(seqs, maxlen=max_sentence_len))
    return padded[0], padded[1]


**Create Model**

In [9]:
def create_model(maxlen=150, max_features=7000, embed_size=100, embedding_weights=None):
    """Build the two-input classifier over a pair of padded token sequences.

    Both inputs go through the same embedding layer and the same
    bidirectional GRU layer instance, so the two branches share weights. The
    two encodings are concatenated and fed through two ReLU dense layers
    into a single sigmoid unit.

    Args:
        maxlen: Length of each input sequence.
        max_features: Unused; kept so existing callers keep working.
        embed_size: Unused; the embedding width is taken from the weight matrix.
        embedding_weights: 2-D array used to size and initialise the embedding
            layer. When ``None`` (the default) the notebook-level
            ``embedding_matrix`` is used, which was the only behaviour before.

    Returns:
        An uncompiled ``Model`` taking ``[x_seq1, x_seq2]``.
    """
    if embedding_weights is None:
        # Backward-compatible fallback to the global produced by get_data().
        embedding_weights = embedding_matrix

    # Inputs
    input1 = Input(shape=[maxlen], name='x_seq1')
    input2 = Input(shape=[maxlen], name='x_seq2')

    # Embedding layer, initialised from the supplied weights and fine-tuned.
    shared_embedding_layer = Embedding(
        embedding_weights.shape[0],
        embedding_weights.shape[1],
        weights=[embedding_weights],
        trainable=True,
    )

    # two inputs share embedding layer
    encoded1 = shared_embedding_layer(input1)
    encoded2 = shared_embedding_layer(input2)

    # rnn layer; one instance applied to both branches
    gru = Bidirectional(CuDNNGRU(units=100))
    gru1 = gru(encoded1)
    gru2 = gru(encoded2)

    # merge two branches
    merge = concatenate([gru1, gru2], axis=-1)

    hidden1 = Dense(64, activation='relu')(merge)
    hidden2 = Dense(32, activation='relu')(hidden1)

    output = Dense(units=1, activation='sigmoid')(hidden2)
    model = Model(inputs=[input1, input2], outputs=output)

    return model

# Pretrained embedding

In [10]:
# word -> float32 vector, read from the pretrained embedding text file.
embeddings_index = {}
# Explicit UTF-8 so parsing does not depend on the platform's default
# encoding, and `with` so the handle is closed even if a line fails to parse.
with open('glove.6B.100d.txt', encoding='utf-8') as f:
    for line in f:
        # Each line is: <word> <coef_1> ... <coef_n>, separated by whitespace.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


**Training**

In [11]:
# Build the padded train/eval inputs and the embedding matrix for the chosen fold.
(
    x_train1,
    x_train2,
    train_label,
    x_eval1,
    x_eval2,
    eval_label,
    embedding_matrix,
) = get_data(fold_num, 150)

In [12]:
def plot_history(history, metric_name):
    """Plot training vs. validation curves for the loss and one metric.

    Args:
        history: Object exposing ``epoch`` (x values) and ``history`` (dict of
            per-epoch series), as returned by ``model.fit``.
        metric_name: Key of the training metric in ``history.history``; the
            validation series is read from ``"val_" + metric_name``.
            ``"accuracy"`` and ``"acc"`` are treated as aliases because the
            logged key differs between Keras versions.

    Raises:
        KeyError: If neither ``metric_name`` nor its alias was logged.

    NOTE(review): relies on a notebook-level ``plt`` (presumably
    ``matplotlib.pyplot``) that is not imported in the visible cells.
    """
    logs = history.history

    # Resolve the accuracy spelling that was actually logged.
    aliases = {"accuracy": "acc", "acc": "accuracy"}
    if metric_name not in logs and aliases.get(metric_name) in logs:
        metric_name = aliases[metric_name]
    if metric_name not in logs:
        raise KeyError(
            "metric %r not found in history; available keys: %s"
            % (metric_name, sorted(logs))
        )

    fig, (ax_loss, ax_score) = plt.subplots(1, 2, figsize=(15, 5))

    ax_loss.plot(history.epoch, logs["loss"], label="Train loss")
    ax_loss.plot(history.epoch, logs["val_loss"], label="Validation loss")
    ax_loss.set_title("Loss")
    ax_loss.set_xlabel("Epoch")
    ax_loss.legend()

    ax_score.plot(history.epoch, logs[metric_name], label="Train score")
    ax_score.plot(history.epoch, logs["val_" + metric_name], label="Validation score")
    ax_score.set_title(metric_name)
    ax_score.set_xlabel("Epoch")
    ax_score.legend()


In [13]:
def f1(label, pred):
    """Wrap sklearn's ``f1_score`` as a graph op over labels and thresholded predictions.

    ``pred`` is compared against 0.5 before being handed to ``f1_score``
    through ``tf.py_func``; the op is declared to return ``tf.float64``.
    """
    thresholded = pred > 0.5
    return tf.py_func(f1_score, [label, thresholded], tf.float64)

In [16]:
# Build the network with its default arguments and configure it for training.
model = create_model()
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

In [17]:
save_model_name = 'model_stack_' + str(fold_num) + '.ckpt'

# Save the model whenever validation accuracy improves on the best seen so far.
model_checkpoint = ModelCheckpoint(
    save_model_name,
    monitor='val_acc',
    mode='max',
    save_best_only=True,
    verbose=1,
)
# Halve the learning rate after 10 epochs without a validation-accuracy gain.
reduce_lr = ReduceLROnPlateau(
    monitor='val_acc',
    mode='max',
    factor=0.5,
    patience=10,
    min_lr=0.0000001,
    verbose=1,
)

batch_size = 64
epochs = 50
history = model.fit(
    x=[x_train1, x_train2],
    y=train_label,
    # (inputs, targets) tuple, as documented for validation_data.
    validation_data=([x_eval1, x_eval2], eval_label),
    batch_size=batch_size,
    epochs=epochs,
    callbacks=[model_checkpoint, reduce_lr],
    verbose=2,
    shuffle=True,
)

# The accuracy metric is logged under 'acc' / 'val_acc' here (the same keys
# the callbacks monitor and the training log prints), so plot that key;
# asking for 'accuracy' would look up a series that was never recorded.
plot_history(history, 'acc')

Train on 410341 samples, validate on 205171 samples
Epoch 1/50
Epoch 00001: val_acc improved from -inf to 0.86681, saving model to model_stack_2.ckpt
 - 411s - loss: 0.4145 - acc: 0.8041 - val_loss: 0.3154 - val_acc: 0.8668
Epoch 2/50
Epoch 00002: val_acc improved from 0.86681 to 0.89946, saving model to model_stack_2.ckpt
 - 411s - loss: 0.2648 - acc: 0.8923 - val_loss: 0.2506 - val_acc: 0.8995
Epoch 3/50
Epoch 00003: val_acc improved from 0.89946 to 0.91014, saving model to model_stack_2.ckpt
 - 412s - loss: 0.2035 - acc: 0.9219 - val_loss: 0.2310 - val_acc: 0.9101
Epoch 4/50
Epoch 00004: val_acc improved from 0.91014 to 0.91677, saving model to model_stack_2.ckpt
 - 412s - loss: 0.1663 - acc: 0.9381 - val_loss: 0.2216 - val_acc: 0.9168
Epoch 5/50
Epoch 00005: val_acc improved from 0.91677 to 0.91940, saving model to model_stack_2.ckpt
 - 412s - loss: 0.1392 - acc: 0.9491 - val_loss: 0.2160 - val_acc: 0.9194
Epoch 6/50
Epoch 00006: val_acc improved from 0.91940 to 0.91956, saving mod

KeyboardInterrupt: 

In [None]:
# Quick check on the first 1000 evaluation pairs only.
scores = model.evaluate(
    [x_eval1[:1000], x_eval2[:1000]],
    eval_label[:1000],
)

In [None]:
# Report the first two entries of `scores` as loss and accuracy.
print('test_loss: %f, accuracy: %f' % tuple(scores[:2]))

In [None]:
# Score the full evaluation split with F1 after thresholding at 0.5.
pred = model.predict(x=[x_eval1, x_eval2])
# Use a dedicated name: assigning the score to `f1` would overwrite the f1()
# function defined earlier in the notebook. The model ends in Dense(units=1),
# so the prediction is flattened to the 1-D shape of the labels.
eval_f1 = f1_score(eval_label, (pred > 0.5).astype(np.int8).ravel())
print(eval_f1)
                      

**load test data and create submission file**

In [99]:
# Load the test rows from disk and cast every value to int64.
test_set = (
    np.loadtxt("dataset/testing_set.txt")
    .astype(np.int64)
)

In [100]:
# Re-create the tokenizer with the same settings and corpus as in get_data().
tokenizer = Tokenizer(num_words=7000)
# Walk both columns once; the previous comprehension rebuilt both column lists
# for every row, which is quadratic in the number of papers.
texts = [title + ' ' + abstract for title, abstract in zip(df["title"], df["abstract"])]
tokenizer.fit_on_texts(texts)

In [101]:
# Split the test rows into their two paper-id columns.
test_paper_id1, test_paper_id2 = test_set[:, 0], test_set[:, 1]

# Each paper is represented by its stored title and abstract joined by a space.
test_input1 = [' '.join((data_dic[pid][1], data_dic[pid][4])) for pid in test_paper_id1]
test_input2 = [' '.join((data_dic[pid][1], data_dic[pid][4])) for pid in test_paper_id2]

# First side of the pair: texts -> word indices -> fixed length of 150.
x_test_seq1 = tokenizer.texts_to_sequences(test_input1)
x_test1 = sequence.pad_sequences(x_test_seq1, maxlen=150)

# Second side of the pair, processed the same way.
x_test_seq2 = tokenizer.texts_to_sequences(test_input2)
x_test2 = sequence.pad_sequences(x_test_seq2, maxlen=150)

In [102]:
# Model outputs for every test pair.
prediction = model.predict(
    x=[x_test1, x_test2],
)

In [103]:
# Threshold at 0.5 and store the decisions as int8 (0 or 1).
prediction = np.greater(prediction, 0.5).astype(np.int8)

In [104]:
# Map each row position to its prediction row.
pred_dict = dict(enumerate(prediction))

In [105]:
# One row per test pair, keyed by row position.
submission = pd.DataFrame.from_dict(
    pred_dict,
    orient='index',
)
# Label the single value column and the index.
submission.columns = ['category']
submission.index.names = ['id']
submission.to_pickle('result_' + str(fold_num) + '.pkl')

In [106]:
# Predict on the evaluation split and threshold at 0.5.
e_prediction = np.greater(
    model.predict(x=[x_eval1, x_eval2]),
    0.5,
).astype(np.int8)

# Row position -> prediction row, then one DataFrame row per evaluation pair.
e_pred_dict = dict(enumerate(e_prediction))
e_sub = pd.DataFrame.from_dict(e_pred_dict, orient='index')

In [107]:
# Flatten the single prediction column into a plain Python list.
e_list = e_sub[0].tolist()

In [108]:
# Display the number of evaluation predictions.
len(e_list)

205171

In [109]:
# One slot per training row; -1 marks rows outside this fold's evaluation split.
stack = [-1] * len(data_set)
# Write each evaluation prediction back to its original row position.
for row_idx, predicted in zip(evaluate_all[fold_num - 1], e_list):
    stack[row_idx] = predicted

In [110]:
# Display the length of `stack` (one entry per row of data_set).
len(stack)

615512

In [111]:
# Persist this fold's out-of-fold predictions as a one-column DataFrame.
stack_1 = pd.DataFrame(np.asarray(stack))
stack_1.to_pickle('stack_' + str(fold_num) + '.pkl')