## Imports

In [35]:
import urllib
import requests
from bs4 import BeautifulSoup
import re
import os
import pandas as pd

from sklearn.feature_extraction.text import  CountVectorizer, TfidfVectorizer

from sklearn import naive_bayes, metrics, model_selection, linear_model

### Init 'global' variables

In [55]:
#ignore_dir = ['IT_company']
ignore_dir = []

data_dir = './data'
routes = os.listdir(data_dir)
routes = list(set(routes)-set(ignore_dir))

categories = routes

predict_from = {'abstract': True, 'title': False, 'full_page': False}

label_cat_dict = dict(zip(routes, range(0, len(routes))))

### Create dataframe from webpage files

In [58]:
def create_df(route, label):
    pages_df = pd.DataFrame(columns=['document', 'category'])
    files = os.listdir(f'{data_dir}/'+route)
    for file, i in zip(files, range(0, len(files))):
        f = open(f'{data_dir}/'+route + "/" + file, "rt", errors='ignore')
        
        page = f.read()
        f.close()
        soup_page = BeautifulSoup(page, "html.parser")
        page_content = ''
        if predict_from['full_page']:
            page_content = soup_page.get_text()
        elif predict_from['title']:
            page_content = soup_page.find_all('h1', class_='heading-title')[0].get_text()
        elif predict_from['abstract']:
            abstract = soup_page.find_all('div', class_='abstract-content selected')
            if abstract:
                page_content = abstract[0].get_text()
            
        cleaned_page_content = re.sub("\W"," ",page_content)
        cleaned_page_content = re.sub(r" +"," ",cleaned_page_content)
        pages_df.loc[i] = (cleaned_page_content, label)
    return pages_df

### Read the data from differenct directories

In [59]:
all_pages = pd.DataFrame(columns=['document', 'category', 'label'])

train_x = pd.Series()
valid_x = pd.Series()
train_y = pd.Series()
valid_y = pd.Series()

for route, category in zip(routes, categories):
    label = label_cat_dict[category]
    pages_df = create_df(route, category)
    pages_df['label'] =label
    all_pages = all_pages.append(pages_df, ignore_index=True)
    
    train_x_, valid_x_, train_y_, valid_y_ = model_selection.train_test_split(pages_df['document'], pages_df['label'], test_size =  0.15)
    
    train_x = train_x.append(train_x_, ignore_index=True)
    valid_x = valid_x.append(valid_x_, ignore_index=True)
    train_y = train_y.append(train_y_, ignore_index=True)
    valid_y = valid_y.append(valid_y_, ignore_index=True)
    print(category, "-ready")

chemistry -ready
disease -ready
food -ready


### Word Count vectorizer

In [60]:
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(all_pages['document'])

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='\\w{1,}', tokenizer=None,
        vocabulary=None)

In [61]:
xtrain_count = count_vect.transform(train_x)
xvalid_count = count_vect.transform(valid_x)

### TF-IDF vectorizer

In [62]:
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(all_pages['document'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

### Naiv Bayes - word count

In [63]:
classifier = naive_bayes.MultinomialNB()

classifier.fit(xtrain_count, train_y)

predictions = classifier.predict(xvalid_count)

metrics.accuracy_score(predictions, valid_y)

0.9185185185185185

### Naiv Bayes - TF-IDF

In [64]:
classifier = naive_bayes.MultinomialNB()

classifier.fit(xtrain_tfidf, train_y)

predictions = classifier.predict(xvalid_tfidf)

metrics.accuracy_score(predictions, valid_y)

0.9185185185185185

### Linear - word count

In [65]:
classifier = linear_model.LogisticRegression(solver='liblinear',multi_class='auto')

classifier.fit(xtrain_count, train_y)

predictions = classifier.predict(xvalid_count)

metrics.accuracy_score(predictions, valid_y)

1.0

### Linear - TF-IDF

In [66]:
classifier = linear_model.LogisticRegression(solver='lbfgs')

classifier.fit(xtrain_tfidf, train_y)

predictions = classifier.predict(xvalid_tfidf)

metrics.accuracy_score(predictions, valid_y)



1.0

### TLDR

In [67]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
import time
import random

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\ivani\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [68]:
word_dict = {}
start = time.time()
for index, row in all_pages.iterrows():
    words = re.split(' ', row['document'])
    
    for word in words:
        word = word.lower()
        if word not in set(stopwords.words('english')) and len(word) > 2:
            if word not in word_dict:
                word_dict[word] = 1
            else:
                word_dict[word] = word_dict[word] + 1
    if index % 10 == 0:
        now = time.time()
        percent_ready = (index / all_pages.shape[0]) + 0.0000001
        print(index,'expected finish:', time.ctime(start + ((now-start) / percent_ready)), end='\r')

890 expected finish: Thu May 21 21:53:35 2020

In [69]:
word_dict_list = sorted(word_dict.items(), key=lambda x: x[1], reverse = True)
words_shrinked = word_dict_list[0:6000]
word_list = [w for (w,n) in words_shrinked]

In [70]:
def document_features(document):
    document_words = set(re.split(' ',document))
    features = {}
    for word in word_list:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [71]:
train_set = []
for ind, row in all_pages.iterrows():
    feature = (document_features(row['document']), row['category'])
    train_set.append(feature)
random.shuffle(train_set)

In [72]:
classifier_nb = nltk.NaiveBayesClassifier.train(train_set[0:400])

In [73]:
nltk.classify.accuracy(classifier_nb, train_set[400:len(train_set)])

0.968

In [74]:
classifier_nb.show_most_informative_features(100)

Most Informative Features
           contains(gas) = True           chemis : diseas =     33.7 : 1.0
    contains(expression) = True             food : chemis =     33.0 : 1.0
        contains(tomato) = True             food : chemis =     30.8 : 1.0
     contains(emissions) = True           chemis : food   =     29.7 : 1.0
          contains(soil) = True           chemis : diseas =     26.3 : 1.0
        contains(plants) = True             food : diseas =     23.4 : 1.0
         contains(total) = True             food : diseas =     21.1 : 1.0
       contains(content) = True             food : diseas =     20.4 : 1.0
   contains(temperature) = True           chemis : diseas =     20.3 : 1.0
 contains(concentration) = True           chemis : diseas =     19.0 : 1.0
        contains(genome) = True             food : chemis =     18.9 : 1.0
          contains(acid) = True             food : diseas =     17.4 : 1.0
      contains(syndrome) = True           diseas : chemis =     17.0 : 1.0