## Imports

In [2]:
import urllib
import requests
from bs4 import BeautifulSoup
import re
import os

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np

from sklearn.feature_extraction.text import  CountVectorizer, TfidfVectorizer
from sklearn.utils import shuffle
from sklearn import naive_bayes, metrics, model_selection, linear_model
from sklearn.metrics import confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Init 'global' variables

In [3]:
data_dir = './api_data'
ignore_files = []
data_files = os.listdir(data_dir)
files = list(set(data_files)-set(ignore_files))

categories = files

predict_from = {'abstract': True, 'title': False, 'full_page': False}

files_codes = dict(zip(files, range(0, len(files))))
do_preproc = False

In [4]:
files_codes

{'flu_articles.xml': 0,
 'COPD_articles.xml': 1,
 'cysticfibrosis_articles.xml': 2,
 'acutebronchitis_articles.xml': 3,
 'pneumonia_articles.xml': 4,
 'lungcancer_articles.xml': 5,
 'asthma_articles.xml': 6}

### Create dataframe from webpage files

In [5]:
def create_df(route, label):
    pages_df = pd.DataFrame(columns=['document', 'category'])
    f = open(route, "rt", errors='ignore')
    articles = f.read()
    f.close()
    soup_atricles = BeautifulSoup(articles, 'xml')
    article_list = soup_atricles.find_all('PubmedArticle')
    for article, i in zip(article_list, range(0, len(article_list))):
        extracted = ''
        if predict_from['full_page']:
            extracted = article.get_text()
        elif predict_from['title']:
            extracted = article.find('ArticleTitle').get_text()
        elif predict_from['abstract']:
            maybe_abstract = article.find("Abstract")
            if maybe_abstract:
                extracted = maybe_abstract.get_text()
        
        extracted = re.sub("\W"," ",extracted)
        extracted = re.sub(r" +"," ",extracted)
        
        pages_df.loc[i] = (extracted, label)
    return pages_df

### Read the data from differenct directories

In [6]:
def preproc(document):
    new_doc = ''
    lemmatizer = WordNetLemmatizer() 
    for word in document.split(' '):
        if word not in set(stopwords.words('english')) and len(word) > 0:
            
            word = lemmatizer.lemmatize(word)
            new_doc += ' ' + word
    return new_doc

In [6]:
all_pages = pd.DataFrame(columns=['document', 'category', 'label'])

train_x = pd.Series()
valid_x = pd.Series()
train_y = pd.Series()
valid_y = pd.Series()

for file_name in files_codes:
    category = file_name.split('_')[0]
    code = files_codes[file_name]
    
    #Drop empty rows
    pages_df = create_df(data_dir+f'/{file_name}', category)
    pages_df.replace('', np.nan, inplace=True)
    pages_df.dropna(subset=['document'], inplace=True)

    pages_df['label'] =code
    
    all_pages = all_pages.append(pages_df, ignore_index=True)
    
    #Representative train-val sets
    train_x_, valid_x_, train_y_, valid_y_ = model_selection.train_test_split(pages_df['document'], pages_df['label'], test_size =  0.15)
    
    #Append to main dataframe
    train_x = train_x.append(train_x_, ignore_index=True)
    valid_x = valid_x.append(valid_x_, ignore_index=True)
    train_y = train_y.append(train_y_, ignore_index=True)
    valid_y = valid_y.append(valid_y_, ignore_index=True)
    print(category, "-ready")
train_x, train_y = shuffle(train_x, train_y, random_state=42)
valid_x, valid_y = shuffle(valid_x, valid_y, random_state=42)

flu -ready
COPD -ready
cysticfibrosis -ready
acutebronchitis -ready
pneumonia -ready
lungcancer -ready
asthma -ready


### Preproc before vectorization (optional)

In [23]:
all_pages['proc'] = ''
all_pages['proc'] = all_pages.apply(lambda x: preproc(x['document']), axis=1)

In [37]:
train_x, valid_x, train_y, valid_y = model_selection.train_test_split(all_pages['proc'], all_pages['label'], test_size =  0.15)
train_y = train_y.astype(int)
valid_y = valid_y.astype(int)

### How much data left

In [13]:
all_pages['one'] = 1

In [14]:
all_pages.groupby(['category']).sum()

Unnamed: 0_level_0,one
category,Unnamed: 1_level_1
COPD,890
acutebronchitis,929
asthma,850
cysticfibrosis,900
flu,937
lungcancer,930
pneumonia,771


### Word Count vectorizer

In [39]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(all_pages['document'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [40]:
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)

### TF-IDF vectorizer

In [31]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_matrix = tfidf_vect.fit_transform(all_pages['document'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

In [57]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
# our corpus
data = all_pages['document']

cv = CountVectorizer()

# convert text data into term-frequency matrix
data = cv.fit_transform(data)

tfidf_transformer = TfidfTransformer()

# convert term-frequency matrix into tf-idf
tfidf_matrix = tfidf_transformer.fit_transform(data)

# create dictionary to find a tfidf word each word
word2tfidf = dict(zip(cv.get_feature_names(), tfidf_transformer.idf_))

dictis = {}
for word, score in word2tfidf.items():
    dictis[word] = score

In [65]:
dictis

{'00': 5.1184735450217955,
 '000': 4.639843861056292,
 '0000': 9.04044688130311,
 '00001': 7.941834592635,
 '0000175': 9.04044688130311,
 '00004': 9.04044688130311,
 '00006': 9.04044688130311,
 '00008': 9.04044688130311,
 '0001': 4.850792139276684,
 '00017': 9.04044688130311,
 '0001â': 9.04044688130311,
 '0002': 7.3356987890646845,
 '0003': 7.787683912807742,
 '000300': 9.04044688130311,
 '0004': 7.654152520183219,
 '0005': 7.431008968869009,
 '0006': 8.124156149428954,
 '0007': 8.124156149428954,
 '000726': 9.04044688130311,
 '0008': 8.347299700743164,
 '0009': 8.347299700743164,
 '000â': 8.347299700743164,
 '001': 3.509035632587695,
 '0010': 8.124156149428954,
 '0011': 9.04044688130311,
 '0012': 9.04044688130311,
 '0013': 8.347299700743164,
 '0013â': 9.04044688130311,
 '0014': 8.347299700743164,
 '0015': 8.347299700743164,
 '0016': 8.634981773194944,
 '0017': 7.787683912807742,
 '001932': 9.04044688130311,
 '001for': 9.04044688130311,
 '001â': 9.04044688130311,
 '002': 5.089203162721

### Naiv Bayes - word count

In [42]:
classifier = naive_bayes.MultinomialNB()

classifier.fit(xtrain_count, train_y)

predictions = classifier.predict(xvalid_count)

metrics.accuracy_score(predictions, valid_y)

0.7178111587982833

In [47]:
confusion_matrix(predictions, valid_y)

array([[ 83,   2,   2,  10,   3,   0,   3],
       [  4, 101,  10,   0,   1,   1,   3],
       [  4,   8,  82,   6,   1,   5,   3],
       [ 24,   1,   7,  94,   7,   8,  15],
       [  2,   0,   5,   3, 121,   2,   4],
       [  7,   3,   2,   4,   4, 144,   2],
       [ 10,   4,   5,   4,   2,   0, 116]], dtype=int64)

### Naiv Bayes - TF-IDF

In [49]:
classifier = naive_bayes.MultinomialNB()

classifier.fit(xtrain_tfidf, train_y)

predictions = classifier.predict(xvalid_tfidf)

metrics.accuracy_score(predictions, valid_y)

0.7156652360515021

In [50]:
confusion_matrix(predictions, valid_y)

array([[ 68,   4,   1,  10,   3,   0,   2],
       [  5,  91,  22,   0,   1,   1,   5],
       [  2,   8,  66,   3,   2,   6,   2],
       [ 37,   4,   9,  86,  17,  15,  15],
       [  3,   1,   5,  11, 106,   2,   7],
       [  6,   3,   3,   5,   3, 136,   1],
       [ 13,   8,   7,   6,   7,   0, 114]], dtype=int64)

### Linear - word count

In [51]:
classifier = linear_model.LogisticRegression(solver='liblinear',multi_class='auto')

classifier.fit(xtrain_count, train_y)

predictions = classifier.predict(xvalid_count)

metrics.accuracy_score(predictions, valid_y)

0.7832618025751072

In [52]:
confusion_matrix(predictions, valid_y)

array([[ 82,   3,   1,  19,   5,   2,   2],
       [  5,  99,   8,   3,   2,   0,   2],
       [  4,   8,  90,   7,   6,   8,   4],
       [ 25,   1,   7,  78,  10,   6,   7],
       [  5,   1,   2,   6, 112,   1,   3],
       [  9,   3,   3,   4,   2, 142,   1],
       [  4,   4,   2,   4,   2,   1, 127]], dtype=int64)

### Linear - TF-IDF

In [53]:
classifier = linear_model.LogisticRegression(solver='lbfgs')

classifier.fit(xtrain_tfidf, train_y)

predictions = classifier.predict(xvalid_tfidf)

metrics.accuracy_score(predictions, valid_y)

0.7950643776824035

In [54]:
confusion_matrix(predictions, valid_y)

array([[ 83,   2,   2,  10,   3,   0,   3],
       [  4, 101,  10,   0,   1,   1,   3],
       [  4,   8,  82,   6,   1,   5,   3],
       [ 24,   1,   7,  94,   7,   8,  15],
       [  2,   0,   5,   3, 121,   2,   4],
       [  7,   3,   2,   4,   4, 144,   2],
       [ 10,   4,   5,   4,   2,   0, 116]], dtype=int64)

### NLTK library

In [55]:
import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import time
import random

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ivani\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [56]:
word_count_dict = {}
start = time.time()
lemmatizer = WordNetLemmatizer() 

dp = display('Process started',display_id=True)
for index, row in all_pages.iterrows():
    words = re.split(' ', row['document'])
    
    for word in words:
        word = word.lower()
        if word not in set(stopwords.words('english')) and len(word) > 1:
            word = lemmatizer.lemmatize(word)
            if word not in word_count_dict:
                word_count_dict[word] = 1
            else:
                word_count_dict[word] = word_count_dict[word] + 1
    if index % 10 == 0:
        now = time.time()
        percent_ready = (index / all_pages.shape[0]) + 0.0000001
        dp.update('expected finish:'+ time.ctime(start + ((now-start) / percent_ready)))

'expected finish:Mon May 25 04:47:31 2020'

In [57]:
word_dict_list = sorted(word_count_dict.items(), key=lambda x: x[1], reverse = True)
words_shrinked = word_dict_list[0:6000]
word_list = [w for (w,n) in words_shrinked]

In [58]:
words_shrinked[0:15]

[('patient', 13439),
 ('study', 6693),
 ('disease', 6566),
 ('lung', 5326),
 ('cell', 4935),
 ('treatment', 4020),
 ('group', 3996),
 ('infection', 3872),
 ('asthma', 3700),
 ('respiratory', 3587),
 ('year', 3505),
 ('clinical', 3491),
 ('copd', 3166),
 ('associated', 3147),
 ('risk', 3145)]

In [59]:
def document_features(document):
    document_words = set(re.split(' ',document))
    features = {}
    for word in word_list:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [60]:
train_set = []
for ind, row in all_pages.iterrows():
    feature = (document_features(row['document']), row['category'])
    train_set.append(feature)
random.shuffle(train_set)

In [61]:
train_part = int(len(train_set)*0.85)

In [62]:
classifier_nb = nltk.NaiveBayesClassifier.train(train_set[0:train_part])

In [63]:
nltk.classify.accuracy(classifier_nb, train_set[train_part:len(train_set)])

0.6984978540772532

In [64]:
classifier_nb.show_most_informative_features(25)

Most Informative Features
        contains(cystic) = True           cystic : flu    =    239.3 : 1.0
      contains(fibrosis) = True           cystic : flu    =    118.7 : 1.0
           contains(flu) = True              flu : acuteb =    115.8 : 1.0
 contains(bronchiolitis) = True           acuteb : COPD   =    115.6 : 1.0
      contains(allergic) = True           asthma : lungca =    105.6 : 1.0
    contains(bronchitis) = True           acuteb : flu    =     99.1 : 1.0
        contains(asthma) = True           asthma : pneumo =     94.0 : 1.0
   contains(obstructive) = True             COPD : flu    =     86.6 : 1.0
     contains(influenza) = True              flu : cystic =     83.1 : 1.0
   contains(conductance) = True           cystic : lungca =     75.8 : 1.0
    contains(aeruginosa) = True           cystic : lungca =     75.1 : 1.0
      contains(outbreak) = True           pneumo : lungca =     74.1 : 1.0
    contains(metastasis) = True           lungca : COPD   =     73.7 : 1.0