In [2]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense, Flatten, Conv1D, MaxPooling1D, Dropout, Input, concatenate
from keras.layers.embeddings import Embedding
from keras.models import Model
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

Using TensorFlow backend.


In [0]:
from keras.callbacks import EarlyStopping, ModelCheckpoint, ReduceLROnPlateau
import tensorflow as tf

**Node information**

In [0]:
# Column names applied to the node information CSV when it is read.
columes = [
    "id",
    "year",
    "title",
    "authors",
    "journal",
    "abstract",
]

In [0]:
# Paper metadata table. Because `names=` is supplied, pandas treats the
# first line of the file as data rather than as a header row.
df = pd.read_csv(
    "node_information.csv",
    names=columes,
)

**Training set**

In [0]:
# loadtxt parses the file as floats by default; every value is then cast
# to a 64-bit integer.
data_set = np.loadtxt(
    fname="training_set.txt",
).astype(dtype=np.int64)

In [0]:
# First two columns: the pair of paper ids. Last column: the target label.
data, label = data_set[:, :2], data_set[:, -1]

**K-fold split**

In [7]:
# Stratified, shuffled k-fold split with a fixed seed. The row indices of
# every fold are kept so a fold can be selected later by position.
k = 5
skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=1234)
train_all, evaluate_all = [], []
for train_index, evaluate_index in skf.split(data, label):
    train_all += [train_index]
    evaluate_all += [evaluate_index]
    # Show the size of each split as a quick sanity check.
    print(train_index.shape, evaluate_index.shape)

(492409,) (123103,)
(492409,) (123103,)
(492410,) (123102,)
(492410,) (123102,)
(492410,) (123102,)


In [0]:
def get_data(fold_index, max_sentence_len, num_words=3000):
    """Build padded token-id inputs and labels for one cross-validation fold.

    Reads the module-level ``df`` (paper metadata), ``data`` (paper id pairs),
    ``label`` (targets) and the fold index lists ``train_all`` /
    ``evaluate_all``. A tokenizer is fitted on every abstract in ``df`` and
    used to encode the abstracts of both papers of every pair.

    Args:
        fold_index: 1-based fold number; selects ``train_all[fold_index - 1]``
            and ``evaluate_all[fold_index - 1]``.
        max_sentence_len: ``maxlen`` passed to ``sequence.pad_sequences``.
        num_words: ``num_words`` passed to ``Tokenizer``. Defaults to 3000,
            the value that was previously hard-coded.

    Returns:
        Tuple ``(x_train1, x_train2, train_label, x_eval1, x_eval2,
        eval_label)``: the padded sequences for the first and second paper of
        each training pair, the training labels, and the same three items for
        the evaluation split.

    Raises:
        ValueError: if ``fold_index`` is not in ``1..len(train_all)``. Without
            this check ``fold_index=0`` would silently select the last fold.
        KeyError: if a paper id in ``data`` has no row in ``df``.
    """
    if not 1 <= fold_index <= len(train_all):
        raise ValueError(
            "fold_index must be between 1 and %d, got %r"
            % (len(train_all), fold_index)
        )

    tokenizer = Tokenizer(num_words=num_words)
    tokenizer.fit_on_texts(list(df["abstract"].values))

    # Build the id -> abstract lookup once instead of scanning the whole
    # DataFrame for every pair. Iterating in reverse makes the first row of a
    # duplicated id win, which matches the previous `.values[0]` behaviour.
    abstract_by_id = dict(
        zip(df["id"].tolist()[::-1], df["abstract"].tolist()[::-1])
    )

    def encode(paper_ids):
        # ids -> abstracts -> token id sequences -> fixed-length padded array
        abstracts = [abstract_by_id[paper_id] for paper_id in paper_ids.tolist()]
        sequences = tokenizer.texts_to_sequences(abstracts)
        return sequence.pad_sequences(sequences, maxlen=max_sentence_len)

    train_index = train_all[fold_index - 1]
    evaluate_index = evaluate_all[fold_index - 1]

    # Training split
    train_pairs = data[train_index]
    train_label = label[train_index]
    x_train1 = encode(train_pairs[:, 0])
    x_train2 = encode(train_pairs[:, 1])

    # Evaluation split
    eval_pairs = data[evaluate_index]
    eval_label = label[evaluate_index]
    x_eval1 = encode(eval_pairs[:, 0])
    x_eval2 = encode(eval_pairs[:, 1])

    return x_train1, x_train2, train_label, x_eval1, x_eval2, eval_label


**Create Model**

In [0]:
def create_model(maxlen=150, max_features=3000, embed_size=32):
    """Build the two-input text CNN used to classify a pair of sequences.

    Both inputs go through the same embedding layer and the same set of
    convolution layers (weights are shared between the two branches). The
    pooled features of both branches are concatenated and fed to a small
    dense head ending in a single sigmoid unit.

    Args:
        maxlen: length of each input sequence.
        max_features: input dimension (vocabulary size) of the embedding.
        embed_size: output dimension of the embedding.

    Returns:
        An uncompiled ``Model`` taking ``[x_seq1, x_seq2]`` and producing one
        sigmoid output.
    """
    seq_a = Input(shape=(maxlen,), name='x_seq1')
    seq_b = Input(shape=(maxlen,), name='x_seq2')

    # One embedding instance applied to both inputs.
    token_embedding = Embedding(max_features, embed_size)
    embedded_a = token_embedding(seq_a)
    embedded_b = token_embedding(seq_b)

    features_a, features_b = [], []
    for kernel_width in (2, 3, 4, 5):
        conv = Conv1D(filters=64, kernel_size=kernel_width, activation='relu')
        # The pool window spans the whole convolution output (default 'valid'
        # padding, stride 1), so each filter contributes a single value.
        pool = MaxPooling1D(maxlen - kernel_width + 1)
        flatten = Flatten()
        # Apply the same conv/pool/flatten layers to both branches, first
        # input first.
        for embedded, features in ((embedded_a, features_a),
                                   (embedded_b, features_b)):
            features.append(flatten(pool(conv(embedded))))

    branch_a = concatenate(features_a, axis=1)
    branch_b = concatenate(features_b, axis=1)

    # Join the two branches; no dropout is applied before the dense head.
    joined = concatenate([branch_a, branch_b], axis=-1)

    dense_64 = Dense(64, activation='relu')(joined)
    dense_32 = Dense(32, activation='relu')(dense_64)
    probability = Dense(units=1, activation='sigmoid')(dense_32)

    return Model(inputs=[seq_a, seq_b], outputs=probability)

**Training**

In [0]:
# Encode fold 1 with sequences padded/truncated to 150 tokens.
(x_train1, x_train2, train_label,
 x_eval1, x_eval2, eval_label) = get_data(fold_index=1, max_sentence_len=150)

In [0]:
def plot_history(history,metric_name):
    """Plot training/validation loss and one metric in two side-by-side panels.

    Uses the module-level ``plt`` (``matplotlib.pyplot``).

    Args:
        history: object exposing ``epoch`` (x values) and ``history`` (dict of
            per-epoch series), e.g. the value returned by ``model.fit``.
        metric_name: key of the metric in ``history.history``; its validation
            series is read from ``"val_" + metric_name``. ``"accuracy"`` and
            ``"acc"`` are treated as interchangeable: if the requested one is
            not logged but the other is, the logged one is plotted.

    Raises:
        KeyError: if the loss series or the (resolved) metric series is
            missing from ``history.history``.
    """
    logs = history.history

    # The training log of this notebook reports `acc` / `val_acc`, while the
    # caller passes 'accuracy'; accept either spelling instead of failing.
    aliases = {"accuracy": "acc", "acc": "accuracy"}
    if metric_name not in logs and aliases.get(metric_name) in logs:
        metric_name = aliases[metric_name]

    fig, (ax_loss, ax_score) = plt.subplots(1, 2, figsize=(15,5))

    ax_loss.plot(history.epoch, logs["loss"], label="Train loss")
    ax_loss.plot(history.epoch, logs["val_loss"], label="Validation loss")
    ax_loss.set_title("Loss")
    ax_loss.set_xlabel("Epoch")
    ax_loss.legend()

    ax_score.plot(history.epoch, logs[metric_name], label="Train score")
    ax_score.plot(history.epoch, logs["val_" + metric_name], label="Validation score")
    ax_score.set_title(metric_name)
    ax_score.set_xlabel("Epoch")
    ax_score.legend()


In [0]:
def f1(label, pred):
    """Return sklearn's ``f1_score`` wrapped as a TensorFlow op.

    ``pred`` is thresholded at 0.5 before being handed, together with
    ``label``, to ``f1_score`` through ``tf.py_func``; the op is declared to
    produce a ``tf.float64`` value.
    """
    binary_pred = pred > 0.5
    return tf.py_func(f1_score, [label, binary_pred], tf.float64)

In [0]:
# Build the network with its default hyper-parameters and compile it for
# binary classification.
model = create_model()
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=['accuracy'],
)

In [19]:
save_model_name = "model.ckpt"

# Keep only the checkpoint with the best validation accuracy, and halve the
# learning rate when validation accuracy has not improved for 10 epochs.
model_checkpoint = ModelCheckpoint(save_model_name, monitor='val_acc', 
                               mode='max', save_best_only=True, verbose=1)
reduce_lr = ReduceLROnPlateau(monitor='val_acc', mode='max',
                              factor=0.5, patience=10, min_lr=0.0000001, verbose=1)

batch_size = 64
epochs = 50
# validation_data is passed as the documented (inputs, targets) tuple rather
# than a list.
history = model.fit(x=[x_train1, x_train2],
          y=train_label,
          validation_data=([x_eval1, x_eval2], eval_label),
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[ model_checkpoint,reduce_lr],
          verbose=2,
          shuffle=True)

# The accuracy series is logged as 'acc' / 'val_acc' (the same key the
# callbacks above monitor), so that is the key to plot.
plot_history(history, 'acc')

Train on 492409 samples, validate on 123103 samples
Epoch 1/50
 - 301s - loss: 0.4111 - acc: 0.8058 - val_loss: 0.3332 - val_acc: 0.8564
Epoch 2/50




 - 304s - loss: 0.3007 - acc: 0.8738 - val_loss: 0.2942 - val_acc: 0.8789
Epoch 3/50
 - 304s - loss: 0.2568 - acc: 0.8960 - val_loss: 0.2687 - val_acc: 0.8907
Epoch 4/50
 - 300s - loss: 0.2298 - acc: 0.9088 - val_loss: 0.2617 - val_acc: 0.8951
Epoch 5/50
 - 302s - loss: 0.2088 - acc: 0.9188 - val_loss: 0.2455 - val_acc: 0.9033
Epoch 6/50
 - 305s - loss: 0.1925 - acc: 0.9258 - val_loss: 0.2405 - val_acc: 0.9069
Epoch 7/50
 - 302s - loss: 0.1800 - acc: 0.9315 - val_loss: 0.2445 - val_acc: 0.9064
Epoch 8/50
 - 300s - loss: 0.1690 - acc: 0.9363 - val_loss: 0.2339 - val_acc: 0.9102
Epoch 9/50
 - 304s - loss: 0.1583 - acc: 0.9411 - val_loss: 0.2498 - val_acc: 0.9067
Epoch 10/50
 - 305s - loss: 0.1499 - acc: 0.9445 - val_loss: 0.2439 - val_acc: 0.9085
Epoch 11/50
 - 300s - loss: 0.1422 - acc: 0.9473 - val_loss: 0.2326 - val_acc: 0.9138
Epoch 12/50
 - 304s - loss: 0.1350 - acc: 0.9502 - val_loss: 0.2380 - val_acc: 0.9119
Epoch 13/50
 - 301s - loss: 0.1285 - acc: 0.9526 - val_loss: 0.2375 - val

KeyboardInterrupt: ignored

In [20]:
# Quick check on the first 1000 evaluation pairs only.
scores = model.evaluate(
    x=[x_eval1[:1000], x_eval2[:1000]],
    y=eval_label[:1000],
)



In [21]:
# scores[0] is the loss and scores[1] the accuracy returned by evaluate().
print('test_loss: %f, accuracy: %f' % tuple(scores[:2]))

test_loss: 0.290044, accuracy: 0.913000


In [23]:
pred = model.predict(x=[x_eval1, x_eval2])
# The score is stored as `eval_f1`, not `f1`, so the `f1` metric function
# defined earlier in the notebook is not overwritten by a float.
# The single-unit model output has shape (n, 1); it is thresholded at 0.5 and
# flattened to a 1-D label vector for f1_score.
eval_f1 = f1_score(eval_label, (pred > 0.5).astype(np.int8).ravel())
print(eval_f1)
                      

0.920840283099671


**Load test data and create submission file**

In [0]:
# Same loading recipe as the training pairs: parse as float, cast to int64.
test_set = np.loadtxt(
    fname="testing_set.txt",
).astype(dtype=np.int64)

In [0]:
# Re-create the tokenizer outside get_data: same vocabulary size (3000),
# fitted on every abstract in the node information table.
texts = list(df["abstract"].values)
tokenizer = Tokenizer(num_words=3000)
tokenizer.fit_on_texts(texts)

In [0]:
# Build the id -> abstract lookup once instead of scanning the whole
# DataFrame for every test pair. Iterating in reverse makes the first row of
# a duplicated id win, which matches the previous `.values[0]` behaviour.
abstract_by_id = dict(
    zip(df["id"].tolist()[::-1], df["abstract"].tolist()[::-1])
)

test_paper_id1 = test_set[:, 0]
test_paper_id2 = test_set[:, 1]
test_input1 = [abstract_by_id[idx1] for idx1 in test_paper_id1.tolist()]
test_input2 = [abstract_by_id[idx2] for idx2 in test_paper_id2.tolist()]
# convert texts to numbers
x_test_seq1 = tokenizer.texts_to_sequences(test_input1)
x_test_seq2 = tokenizer.texts_to_sequences(test_input2)

# Pad/truncate to the same length (150) that was used for training.
x_test1 = sequence.pad_sequences(x_test_seq1, maxlen=150)
x_test2 = sequence.pad_sequences(x_test_seq2, maxlen=150)

In [0]:
# Sigmoid outputs of the model for every test pair.
prediction = model.predict([x_test1, x_test2])

In [0]:
# Turn the sigmoid outputs into hard 0/1 labels with a 0.5 threshold.
prediction = np.greater(prediction, 0.5).astype(np.int8)

In [0]:
# Key every prediction row by its position in the test set.
pred_dict = dict(enumerate(prediction))

In [0]:
# One row per test pair: the row position becomes the `id` index and the
# predicted label becomes the `category` column.
submission = pd.DataFrame.from_dict(pred_dict, orient='index')
submission.columns = ['category']
submission.index.name = 'id'
submission.to_csv("submission.csv")