In [3]:
import multiprocessing
from gensim.models import Word2Vec

import os
import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
import re
from sklearn import model_selection
from sklearn.utils import shuffle

In [4]:
data_dir = './api_data'
ignore_files = []
data_files = os.listdir(data_dir)
files = list(set(data_files)-set(ignore_files))

categories = files

predict_from = {'abstract': True, 'title': False, 'full_page': False}

files_codes = dict(zip(files, range(0, len(files))))
do_preproc = False

In [5]:
def create_df(route, label):
    pages_df = pd.DataFrame(columns=['document', 'category'])
    f = open(route, "rt", errors='ignore')
    articles = f.read()
    f.close()
    soup_atricles = BeautifulSoup(articles, 'xml')
    article_list = soup_atricles.find_all('PubmedArticle')
    for article, i in zip(article_list, range(0, len(article_list))):
        extracted = ''
        if predict_from['full_page']:
            extracted = article.get_text()
        elif predict_from['title']:
            extracted = article.find('ArticleTitle').get_text()
        elif predict_from['abstract']:
            maybe_abstract = article.find("Abstract")
            if maybe_abstract:
                extracted = maybe_abstract.get_text()
        
        extracted = re.sub("\W"," ",extracted)
        extracted = re.sub(r" +"," ",extracted)
        
        pages_df.loc[i] = (extracted, label)
    return pages_df

In [6]:
all_pages = pd.DataFrame(columns=['document', 'category', 'label'])

train_x = pd.Series()
valid_x = pd.Series()
train_y = pd.Series()
valid_y = pd.Series()

for file_name in files_codes:
    category = file_name.split('_')[0]
    code = files_codes[file_name]
    
    #Drop empty rows
    pages_df = create_df(data_dir+f'/{file_name}', category)
    pages_df.replace('', np.nan, inplace=True)
    pages_df.dropna(subset=['document'], inplace=True)

    pages_df['label'] =code
    
    all_pages = all_pages.append(pages_df, ignore_index=True)
    
    #Representative train-val sets
    train_x_, valid_x_, train_y_, valid_y_ = model_selection.train_test_split(pages_df['document'], pages_df['label'], test_size =  0.15)
    
    #Append to main dataframe
    train_x = train_x.append(train_x_, ignore_index=True)
    valid_x = valid_x.append(valid_x_, ignore_index=True)
    train_y = train_y.append(train_y_, ignore_index=True)
    valid_y = valid_y.append(valid_y_, ignore_index=True)
    print(category, "-ready")
train_x, train_y = shuffle(train_x, train_y, random_state=42)
valid_x, valid_y = shuffle(valid_x, valid_y, random_state=42)

cysticfibrosis -ready
pneumonia -ready
COPD -ready
asthma -ready
acutebronchitis -ready
flu -ready
lungcancer -ready


In [7]:
all_pages.document = all_pages.apply(lambda x: x.document.lower(), axis = 1)

In [8]:
all_pages['doc_list'] = all_pages.apply(lambda x: x.document.split(), axis = 1)

In [121]:
all_pages['doc_list_len'] = all_pages.apply(lambda x: len(x.doc_list), axis = 1)
all_pages['doc_list_len'].describe()

count    6207.000000
mean      245.030127
std        95.749661
min         0.000000
25%       190.000000
50%       249.000000
75%       290.000000
max      1079.000000
Name: doc_list_len, dtype: float64

In [9]:
a = np.array(all_pages.label.astype(int))
deep_target = np.zeros((a.size, a.max()+1))
deep_target[np.arange(a.size),a] = 1
deep_target

array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])

In [10]:
model = Word2Vec(all_pages['doc_list'], size=200, window=5, min_count=1, workers=4)

In [141]:
from keras.preprocessing import text, sequence
from tensorflow.keras import layers, models, optimizers

In [12]:
dataset = np.zeros(shape=(all_pages.shape[0], 200) )

### TF-IDF scores

In [13]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# our corpus
data = all_pages['document']

cv = CountVectorizer()

# convert text data into term-frequency matrix
data = cv.fit_transform(data)

tfidf_transformer = TfidfTransformer()

# convert term-frequency matrix into tf-idf
tfidf_matrix = tfidf_transformer.fit_transform(data)

# create dictionary to find a tfidf word each word
word2tfidf = dict(zip(cv.get_feature_names(), tfidf_transformer.idf_))

dictis = {}
for word, score in word2tfidf.items():
    dictis[word] = score

### Doc to vec

In [None]:
for ind, row in all_pages.iterrows():
    sentence_vec = np.zeros(shape = (200,))
    for word in row['doc_list']:
        if word in dictis:
            word_vec = model[word] * dictis[word]
        else:
            word_vec = model[word]
        sentence_vec = np.add(word_vec, sentence_vec)
    sentence_vec = sentence_vec / len(row['doc_list'])
    dataset[ind] = sentence_vec *10

## Shallow neural net

In [17]:
input_size = dataset[0].shape

In [84]:
dataset = tf.keras.utils.normalize(dataset)

In [85]:
nans = np.isnan(dataset)
dataset[nans] = 0

In [86]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(dataset, deep_target, test_size =  0.15)

In [109]:
# create input layer 
input_layer = layers.Input(input_size, sparse=True)

# create hidden layer
hidden_layer = layers.Dense(200, activation="tanh")(input_layer)

# create output layer
output_layer = layers.Dense(7, activation="softmax")(hidden_layer)

classifier = models.Model(inputs = input_layer, outputs = output_layer)
classifier.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])


In [110]:
classifier.summary()

Model: "model_20"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_21 (InputLayer)        [(None, 200)]             0         
_________________________________________________________________
dense_41 (Dense)             (None, 200)               40200     
_________________________________________________________________
dense_42 (Dense)             (None, 7)                 1407      
Total params: 41,607
Trainable params: 41,607
Non-trainable params: 0
_________________________________________________________________


In [113]:
classifier.fit(x=train_x, y=train_y, batch_size=30, epochs=30)
classifier.evaluate(x=valid_x, y=valid_y)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


[0.8585413098335266, 0.7070815563201904]

## LSTM

In [122]:
dataset = np.zeros(shape=(all_pages.shape[0], 300, 200) )

In [125]:
for ind, row in all_pages.iterrows():
    sentence_vec = np.zeros(shape = (300, 200))
    for i, word in enumerate(row['doc_list'][0:300]):
        if word in dictis:
            word_vec = model[word] * dictis[word]
        else:
            word_vec = model[word]
        sentence_vec[i] = word_vec * 10
    dataset[ind] = sentence_vec

  """
  import sys


In [139]:
input_size = dataset[0].shape

In [140]:
dataset = tf.keras.utils.normalize(dataset)

KeyboardInterrupt: 

In [None]:
nans = np.isnan(dataset)
dataset[nans] = 0

In [None]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(dataset, deep_target, test_size =  0.15)

In [None]:
input_layer = layers.Input(input_size)
lstm_layer = layers.LSTM(200)(input_layer)
output_layer1 = layers.Dense(100, activation="tanh")(lstm_layer)
output_layer1 = layers.Dropout(0.25)(output_layer1)
output_layer2 = layers.Dense(7, activation="softmax")(output_layer1)

classifier = models.Model(inputs = input_layer, outputs = output_layer2)
classifier.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])

In [None]:
classifier.fit(x=train_x, y=train_y, batch_size=30, epochs=30)
classifier.evaluate(x=valid_x, y=valid_y)