In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
import string
import gensim
from gensim.models import Word2Vec, KeyedVectors
import keras
from keras.utils import np_utils
from keras.layers.core import Reshape, Flatten
from keras import regularizers
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Input, Dense, Activation, Embedding, Flatten, GlobalMaxPool1D, Dropout, Conv1D, LSTM, MaxPooling1D, concatenate
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint
from keras.losses import binary_crossentropy
from keras.optimizers import Adam
from keras.models import Model
from keras.initializers import RandomUniform, glorot_uniform
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import LinearSVC
from xgboost import XGBClassifier
from sklearn.model_selection import KFold
from time import time
from sklearn.metrics import classification_report, f1_score
import matplotlib.pyplot as plt

import io
import random

%matplotlib inline

Using TensorFlow backend.


In [0]:
from reader import Reader
from keyphrase import Keyphrase
from publication import Publication
from wordembedder import Wordembedder

In [3]:
# Colab-only step: open the browser upload dialog so the input CSV
# (data-classification.csv, per the cell output) lands in the runtime's
# working directory.
from google.colab import files
uploaded = files.upload()

Saving data-classification.csv to data-classification.csv


In [4]:
# Load the phrase/label table; the preview shows the two columns used below:
# `phrases` (free text) and `labels` (class name).
data = pd.read_csv('data-classification.csv')
data.head()

Unnamed: 0,phrases,labels
0,Nuclear theory,Task
1,thermalization,Task
2,thermalization,Process
3,semi-classical methods,Process
4,nuclear reactions,Process


In [0]:
def tokenize(msg):
    """Split ``msg`` into lowercase tokens with ASCII punctuation removed.

    Every character found in ``string.punctuation`` is deleted (not replaced
    by a space), so "semi-classical" becomes the single token
    "semiclassical". What is left is lowercased and split on whitespace.
    """
    stripped = ''.join(ch for ch in msg if ch not in string.punctuation)
    return stripped.lower().split()

In [6]:
# Length, in tokens, of the longest phrase in the dataset.
maxlen = max(len(tokenize(phrase)) for phrase in data.phrases)
print(maxlen)

25


In [0]:
# Raw phrase column; handed to Wordembedder in a later cell.
docs = data.phrases

In [0]:
# Map the string class names to integer ids, then expand the ids into
# one-hot rows so they can be used with a softmax output layer.
encoder = LabelEncoder()
encoded_Y = encoder.fit_transform(data['labels'])
dummy_y = np_utils.to_categorical(encoded_Y)

In [0]:
# Instantiate the project's Wordembedder over the phrases.
# NOTE(review): `we` is not referenced again in the cells visible here; the
# embedding matrix and encoded documents are loaded from .npy files below.
# Confirm whether this cell is still required.
we = Wordembedder(docs)

In [10]:
# Colab-only step: upload the precomputed embedding matrix
# (embedding-matrix-classification.npy, per the cell output).
# `files` was already imported in an earlier cell; re-importing is harmless.
from google.colab import files
uploaded = files.upload()

Saving embedding-matrix-classification.npy to embedding-matrix-classification.npy


In [0]:
# Precomputed word-embedding weights; later passed as the initial weights of
# the Keras Embedding layer.
embedding_matrix = np.load('embedding-matrix-classification.npy')

In [12]:
# Colab-only step: upload the pre-encoded documents
# (encoded-docs-classification.npy, per the cell output).
uploaded = files.upload()

Saving encoded-docs-classification.npy to encoded-docs-classification.npy


In [0]:
# Pre-encoded phrases: one row per phrase, one integer id per position.
encoded_docs = np.load('encoded-docs-classification.npy')

# Read the sizes from the arrays instead of hard-coding 5379 / 25. The
# Embedding layer needs vocab_size to equal the number of rows of
# embedding_matrix, and the model input length must equal the width of
# encoded_docs, so a literal would silently go stale if new files are uploaded.
vocab_size = embedding_matrix.shape[0]
maxlen = encoded_docs.shape[1]

In [0]:
 # Hold out 20% of the encoded phrases as a test set; random_state is fixed so
 # the split is the same on every run.
 x_train, x_test, y_train, y_test = train_test_split(encoded_docs, dummy_y, test_size=0.2, random_state=42)

In [0]:
def create_conv_layers(num_filters, filter_sizes, embedding, seed=7):
    """Build one Conv1D branch per kernel size, all reading the same input.

    Args:
        num_filters: number of filters in every branch.
        filter_sizes: iterable of kernel widths; one Conv1D is created per entry.
        embedding: tensor produced by the embedding layer.
        seed: integer seed for the Glorot-uniform kernel initializer.

    Returns:
        List of Conv1D output tensors, in the order of ``filter_sizes``.
    """
    conv_layers = []
    for size in filter_sizes:
        # The initializer seed must be an integer. The previous
        # `seed=random.seed(7)` evaluated to None (random.seed returns
        # nothing), so the kernels were never actually seeded.
        conv = Conv1D(
            num_filters,
            size,
            activation='tanh',
            kernel_initializer=glorot_uniform(seed=seed),
            kernel_regularizer=regularizers.l2(0.01),
        )(embedding)
        conv_layers.append(conv)
    return conv_layers

def max_pools(maxlen, filter_sizes, conv_layers):
    """Max-pool every convolution branch with a branch-specific window.

    Branch ``k`` is pooled with window ``maxlen - filter_sizes[k] + 1`` and
    stride 1. For the model summarised in this notebook that window equals
    the branch's output length (e.g. 24 for kernel size 2 with maxlen 25).

    Returns the pooled tensors in the same order as ``conv_layers``.
    """
    return [
        MaxPooling1D(maxlen - filter_sizes[idx] + 1, strides=1)(branch)
        for idx, branch in enumerate(conv_layers)
    ]

def create_cnn_model(filter_sizes, num_filters, embedding_matrix, embedding_dim, vocabulary_size, maxlen, num_classes):
    """Build (but do not compile) the multi-branch CNN text classifier.

    Architecture: frozen Embedding initialised from ``embedding_matrix`` ->
    one Conv1D + max-pool branch per entry of ``filter_sizes`` -> concatenate
    -> Flatten -> Dense(128) -> Dropout(0.5) -> softmax over ``num_classes``.

    Args:
        filter_sizes: kernel widths, one convolution branch each.
        num_filters: number of filters per branch.
        embedding_matrix: initial (and, since the layer is frozen, final)
            embedding weights.
        embedding_dim: width of one embedding vector.
        vocabulary_size: number of rows in the embedding table.
        maxlen: length of every input sequence.
        num_classes: number of output classes.

    Returns:
        An uncompiled ``keras.models.Model``; its summary is printed as a
        side effect.
    """
    drop = 0.5

    inputs = Input(shape=(maxlen,))
    embedding_layer = Embedding(vocabulary_size,
                                embedding_dim,
                                weights=[embedding_matrix],
                                trainable=False)
    embedding = embedding_layer(inputs)

    convs = create_conv_layers(num_filters, filter_sizes, embedding)
    pools = max_pools(maxlen, filter_sizes, convs)
    merged_tensor = concatenate(pools, axis=1)

    flatten = Flatten()(merged_tensor)

    # Seed with the integer itself: `random.seed(7)` returns None, so the
    # previous `seed=random.seed(7)` left this initializer unseeded.
    # NOTE(review): no activation is given, so this Dense layer is linear;
    # confirm that is intended.
    dense1 = Dense(128, kernel_initializer=RandomUniform(seed=7))(flatten)
    dropout = Dropout(drop)(dense1)

    output = Dense(units=num_classes, activation='softmax', kernel_regularizer=regularizers.l2(0.01))(dropout)

    # Functional model from the token-id input to the class probabilities.
    # Compilation is left to the caller.
    model = Model(inputs, output)
    # summary() prints by itself and returns None; wrapping it in print()
    # added a stray "None" line to the output.
    model.summary()
    return model
# def create_cnn_model():
#     filter_sizes = filter_sizes
#     num_filters = num_filters
#     drop = 0.5
    
#     inputs = Input(shape=(maxlen,))
#     embedding_layer = Embedding(vocabulary_size,
#                             embedding_dim,
#                             weights=[embedding_matrix],
#                             trainable=True)
#     embedding = embedding_layer(inputs)
    
#     convs = create_conv_layers(num_filters, filter_sizes, embedding)
#     pools = max_pools(maxlen, filter_sizes, convs)
#     merged_tensor = concatenate(pools, axis=1)

#     flatten = Flatten()(merged_tensor)
    
#     dense1 = Dense(128, kernel_initializer=RandomUniform(seed=random.seed(7)))(flatten)
#     dropout = Dropout(drop)(dense1)
    
#     output = Dense(units=num_classes, activation='softmax',kernel_regularizer=regularizers.l2(0.01))(dropout)
    
#     # this creates a model that includes
#     model = Model(inputs, output)
#     model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
#     print(model.summary())
#     return model

In [0]:
def fit_and_evaluate(x_train, y_train, x_val, y_val, x_test, y_test, epochs, filepath,
                     filter_sizes=(1, 2), num_filters=128, learning_rate=1e-3,
                     batch_size=32, patience=4):
    """Train a fresh CNN on one train/validation split and score it on the test set.

    Reads the module-level ``embedding_matrix`` for the embedding weights.
    The vocabulary size, embedding width, sequence length and class count are
    taken from the array shapes rather than hard-coded, so they cannot drift
    from the data.

    Args:
        x_train, y_train: training sequences and one-hot labels.
        x_val, y_val: validation data monitored by the callbacks.
        x_test, y_test: data passed to ``model.evaluate`` after training.
        epochs: maximum number of epochs (early stopping may end sooner).
        filepath: file to which ModelCheckpoint writes the best model so far.
        filter_sizes, num_filters: convolution branch configuration.
        learning_rate: Adam learning rate.
        batch_size: mini-batch size for ``fit``.
        patience: EarlyStopping patience, in epochs.

    Returns:
        ``(history, model, loss, accuracy)``. ``loss`` and ``accuracy`` come
        from evaluating the in-memory model after training stops, not from
        reloading the checkpoint written to ``filepath``.
    """
    vocabulary_size, embedding_dim = embedding_matrix.shape
    model = create_cnn_model(list(filter_sizes), num_filters, embedding_matrix,
                             embedding_dim, vocabulary_size,
                             x_train.shape[1], y_train.shape[1])
    adam = Adam(lr=learning_rate)
    model.compile(loss='categorical_crossentropy', metrics=['accuracy'],
                  optimizer=adam)
    callbacks = [EarlyStopping(patience=patience),
                 ModelCheckpoint(filepath=filepath, save_best_only=True)]
    results = model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs,
                        validation_data=(x_val, y_val),
                        callbacks=callbacks, verbose=1)
    # With a single metric, evaluate() returns [loss, accuracy].
    loss, accuracy = model.evaluate(x_test, y_test)
    return results, model, loss, accuracy

In [0]:
import os

# Checkpoint locations. `path_dir` is passed to fit_and_evaluate in the
# cross-validation cell below.
# NOTE(review): `path_best` and `path_best_dir` are not referenced in the
# cells visible here.
path = 'model/model-cnn.h5'
path_best = 'model/model-cnn-cv.h5'
path_dir = os.path.dirname(path)
path_best_dir = os.path.dirname(path_best)

In [82]:
# Pass shuffle by keyword: current scikit-learn accepts only n_splits
# positionally, so KFold(10, True, ...) raises a TypeError there.
kfold = KFold(n_splits=10, shuffle=True, random_state=7)

# Keep every fold's training history so the curves can be plotted later.
model_history = []

# One [test loss, test accuracy] row per fold.
table_cnn = []

# ModelCheckpoint is given a file path below, so make sure the directory
# it lives in exists before training starts.
os.makedirs(path_dir, exist_ok=True)

for fold, (t, v) in enumerate(kfold.split(x_train, y_train), start=1):
    print("Training on Fold: ", fold)
    t_x = x_train[t]
    val_x = x_train[v]
    t_y = y_train[t]
    val_y = y_train[v]

    # Pass the checkpoint *file* (`path`). The directory name `path_dir` was
    # passed before, which is not a valid model file location.
    # NOTE(review): every fold overwrites the same file; confirm whether
    # `path` or `path_best` is the intended target.
    history, cnn_model, loss, accuracy = fit_and_evaluate(t_x, t_y, val_x, val_y, x_test, y_test, 100, path)
    model_history.append(history)

    # Test-set predictions of this fold's model (the last fold's remain
    # available after the loop).
    y_pred = cnn_model.predict(x_test)

    table_cnn.append([loss, accuracy])

Training on Fold:  1
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_24 (InputLayer)           (None, 25)           0                                            
__________________________________________________________________________________________________
embedding_24 (Embedding)        (None, 25, 300)      1613700     input_24[0][0]                   
__________________________________________________________________________________________________
conv1d_49 (Conv1D)              (None, 25, 128)      38528       embedding_24[0][0]               
__________________________________________________________________________________________________
conv1d_50 (Conv1D)              (None, 24, 128)      76928       embedding_24[0][0]               
________________________________________________________________________________________

In [83]:
# Tabulate the per-fold test loss and accuracy; the index is the 1-based
# fold number.
df_cnn = pd.DataFrame(table_cnn, columns=['losses', 'accuracies'], index=range(1,len(table_cnn)+1))
display(df_cnn)

Unnamed: 0,losses,accuracies
1,0.878892,0.687454
2,0.863848,0.678545
3,0.874627,0.68003
4,0.870686,0.659985
5,0.876661,0.674833
6,0.895287,0.654788
7,0.867976,0.678545
8,0.934366,0.683742
9,0.874676,0.665924
10,0.875627,0.668894
