# Neural Network Test

This script attempts to build a neural network that classifies a passage of text as belonging to either the discussion or the conclusion section of a scientific paper. It attempts to replicate the work done in the paper "Character-level Convolutional Networks for Text Classification" by Zhang et al.

Here we import our libraries and set some basic parameters.

In [None]:
import keras
import theano
from __future__ import print_function
from time import time

import h5py
import keras
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation, Flatten, Reshape
from keras.layers.convolutional import Convolution1D, MaxPooling1D
from keras.optimizers import SGD
from keras.utils.np_utils import to_categorical
from keras.layers.normalization import BatchNormalization as BN
from keras.layers.embeddings import Embedding
from sklearn.cross_validation import train_test_split

import numpy as np
import os
import pickle

# Mini-batch size; passed as batch_size to model.fit further down.
BATCH_SIZE = 16
# NOTE(review): FIELD_SIZE, STRIDE and N_FILTERS are not referenced anywhere
# else in the visible notebook -- presumably left over from an earlier
# experiment; confirm before relying on or removing them.
FIELD_SIZE = 5 * 300
STRIDE = 1
N_FILTERS = 200

Define a function that turns a string into a list of integer character codes (obtained with `ord`), one per character, keeping only the first 1014 characters of the string.

In [None]:
def vectorizeData(text, maxLength=1014):
    """Convert the leading characters of a string to their integer codes.

    Args:
        text: The string (or other sliceable sequence of single characters)
            to encode.
        maxLength: Maximum number of leading characters to encode; anything
            beyond this is dropped. Defaults to 1014, the value that used to
            be hard-coded here.

    Returns:
        A list holding ``ord(ch)`` for each of the first ``maxLength``
        characters of ``text``. Shorter input yields a shorter list.
    """
    # Slice before encoding so that long documents are not fully expanded
    # into a list only to have most of it thrown away.
    return [ord(ch) for ch in text[:maxLength]]

# Preprocess Data

Import the dataset.

In [None]:
# Merge the contents of every pickle file in the BioMedProcessed directory
# into a single dictionary. Later cells expect keys that start with
# "conclusion" or "discussion" and values holding the section text.
validDocsDict = dict()
fileList = os.listdir("BioMedProcessed")
for f in fileList:
    # SECURITY: pickle.load can execute arbitrary code while unpickling, so
    # only point this at files you produced yourself or otherwise trust.
    # The with-block guarantees the file handle is closed after each load.
    with open(os.path.join("BioMedProcessed", f), "rb") as pickleFile:
        validDocsDict.update(pickle.load(pickleFile))

#validDocsDict2 = dict()
#fileList = os.listdir("PubMedProcessed")
#for f in fileList:
#    validDocsDict2.update(pickle.load(open("PubMedProcessed/" + f, "rb")))

Define some parameters for use later. This was developed to handle multiple datasets. Keep only the conclusion and discussion sections that are at least charLength characters long, vectorize them, and put them in the documents list. Then split the data into several train/test sets.

In [None]:
# Build the labelled corpus: every conclusion/discussion section of at least
# charLength characters is vectorised and labelled 0 (conclusion) or
# 1 (discussion). Per-class counts and raw text lengths are accumulated for
# the summary printed below, and charList collects every distinct character
# code that occurs in the vectorised documents.
print("Loading dataset...")
t0 = time()
documents = []
testPubDocuments = []
allDocuments = []
labels = []
testPubLabels = []
concLengthTotal = 0
discLengthTotal = 0
concCount = 0
discCount = 0
charLength = 1014
charList = set()

#combinedDicts = validDocsDict.copy()
#combinedDicts.update(validDocsDict2.copy())

for k, sectionText in validDocsDict.items():
    # Sections shorter than charLength are skipped regardless of their type.
    if len(sectionText) < charLength:
        continue
    if k.startswith("conclusion"):
        sectionLabel = 0
        concCount += 1
        concLengthTotal += len(sectionText)
    elif k.startswith("discussion"):
        sectionLabel = 1
        discCount += 1
        discLengthTotal += len(sectionText)
    else:
        # Any other section type is not part of this classification task.
        continue
    # Vectorise once and reuse the result for both the document list and the
    # character vocabulary.
    sectionVector = vectorizeData(sectionText)
    labels.append(sectionLabel)
    documents.append(sectionVector)
    charList.update(sectionVector)

#for k in validDocsDict2.keys():
#    if k.startswith("conclusion"):
#        testPubLabels.append("conclusion")
#        testPubDocuments.append(vectorizeData(validDocsDict2[k]))
#        concCount += 1
#        concLengthTotal += len(validDocsDict2[k])
#    elif k.startswith("discussion"):
#        testPubLabels.append("discussion")
#        testPubDocuments.append(vectorizeData(validDocsDict2[k]))
#        discCount += 1
#        discLengthTotal += len(validDocsDict2[k])

#for k in combinedDicts.keys():
#    if k.startswith("conclusion"):
#        allDocuments.append(vectorizeData(combinedDicts[k]))
#    elif k.startswith("discussion"):
#        allDocuments.append(vectorizeData(combinedDicts[k]))

# Summary: number of usable documents, then the mean raw text length of the
# conclusion and discussion sections. A class with no documents prints nan
# instead of raising ZeroDivisionError.
print(len(documents))
print(concLengthTotal * 1.0 / concCount if concCount else float("nan"))
print(discLengthTotal * 1.0 / discCount if discCount else float("nan"))


# NOTE(review): test_size = 0.95 leaves only 5% of the documents for training,
# and the second split keeps 10% of the held-out data as test1 -- confirm
# that these proportions are intentional.
train, test, labelsTrain, labelsTest = train_test_split(documents, labels, test_size = 0.95)
test1, test2, labelsTest1, labelsTest2 = train_test_split(test, labelsTest, test_size = 0.9)
print(len(train))
print(len(labelsTrain))

Get an identity matrix whose size is the length of the charList set (so we know how many distinct characters, i.e. features, we have). This identity matrix is used for the one-hot encoding of the characters: each character in the charList set is assigned a different row of the identity matrix. Then we create the X_train and X_test sets, using this mapping to convert character codes to one-hot encodings. We also create Y_train, which maps each section (discussion or conclusion) to a one-hot encoded vector of length two.

In [None]:
# One row of the identity matrix per distinct character code: row i is the
# one-hot vector of the i-th code.
npVecs = np.eye(len(charList))
# Sorting the codes makes the code -> row assignment reproducible between
# runs instead of depending on set iteration order.
numToVec = {code: npVecs[row] for row, code in enumerate(sorted(charList))}
# Class label -> one-hot target: 0 (conclusion) -> [1, 0], 1 (discussion) -> [0, 1].
labelsToVec = {0: np.array([1, 0]), 1: np.array([0, 1])}

# Each document is a list of character codes, so every code is looked up
# directly in numToVec. (Using the code as a position into the document, as
# in numToVec[doc[code]], would encode the wrong character and can raise
# IndexError for codes at or beyond the document length.)
X_train = np.array([np.array([numToVec[code] for code in doc]) for doc in train])
Y_train = np.array([labelsToVec[label] for label in labelsTrain])
X_test = np.array([np.array([numToVec[code] for code in doc]) for doc in test1])

In [None]:
# Inspect the dimensions of the encoded training array.
X_train.shape
#X_train = np.expand_dims(X_train, axis = 1)

In [None]:
# Display the one-hot encoded training labels as a quick sanity check.
Y_train

# Creating and Running the Neural Network

Define the model for use in the neural network. This model was taken from the Zhang paper and is attempting to replicate their work. 

In [None]:
# Character-level convolutional network: six 1-D convolution layers with 256
# filters each (kernel sizes 7, 7, 3, 3, 3, 3; max-pooling of 3 after the
# 1st, 2nd and 6th), followed by three fully connected layers with dropout
# in between. The input is one document: (characters, one-hot width).
model = Sequential()
model.add(Convolution1D(256, 7, border_mode = 'valid', input_shape=(X_train.shape[1], X_train.shape[2])))
model.add(Activation('relu'))
model.add(MaxPooling1D(3))

# NOTE(review): every second convolution uses a sigmoid activation, whereas
# the Zhang et al. paper this notebook says it replicates uses a
# rectifier-style threshold throughout -- confirm that this is intended.
model.add(Convolution1D(256, 7, border_mode = 'valid'))
model.add(Activation('sigmoid'))
model.add(MaxPooling1D(3))

model.add(Convolution1D(256, 3, border_mode = 'valid'))
model.add(Activation('relu'))

model.add(Convolution1D(256, 3, border_mode = 'valid'))
model.add(Activation('sigmoid'))

model.add(Convolution1D(256, 3, border_mode = 'valid'))
model.add(Activation('relu'))

model.add(Convolution1D(256, 3, border_mode = 'valid'))
model.add(Activation('sigmoid'))
model.add(MaxPooling1D(3))

model.add(Flatten())
# The hidden dense layers need a non-linearity; without one, consecutive
# Dense layers collapse into a single linear map.
model.add(Dense(1024))
model.add(Activation('relu'))
model.add(Dropout(0.5))
model.add(Dense(2048))
model.add(Activation('relu'))
model.add(Dropout(0.5))
# Two output units, one per class. The softmax turns them into class
# probabilities, which the categorical cross-entropy loss used at compile
# time expects.
model.add(Dense(2))
model.add(Activation('softmax'))

Compile the model and start training it on X_train and Y_train.

In [None]:
# Configure training: Adadelta optimizer with a categorical cross-entropy loss.
model.compile(
    optimizer='adadelta',
    loss='categorical_crossentropy'
)

In [None]:
# Train on the encoded documents in mini-batches of BATCH_SIZE, holding back
# 10% of the training data for validation and reporting accuracy as it runs.
model.fit(
    X_train,
    Y_train,
    batch_size=BATCH_SIZE,
    nb_epoch=5000,
    validation_split=0.1,
    show_accuracy=True,
    verbose=1
)

Get the predicted classes for independent test set data, compare them with known labels and output the accuracy.

In [None]:
# Predicted class for each document in X_test; compared against labelsTest1
# in the next cell.
Y_guess = model.predict_classes(X_test)

In [None]:
# Count the test documents whose predicted class equals the known label,
# then report the raw count followed by the accuracy (fraction correct).
numCorrect = sum(
    1
    for position, expected in enumerate(labelsTest1)
    if Y_guess[position] == expected
)
print(numCorrect)
print(float(numCorrect) / len(labelsTest1))