In [4]:
import json
import os
from pymongo import MongoClient
from tqdm import tqdm

## Count Number of Documents by Parse Quality

Full Parse = title, authors, abstract, references, body

Partial Parse = abstract, body, references

No Body Parse = no body

No Parse = Little to nothing parsed

In [5]:
# Connect with MongoClient's default settings and select the papers
# collection of the ds_documents database.
client = MongoClient()
db = client['ds_documents']
col = db['papers']

In [3]:
# Count the documents in each parse-status bucket.
# Collection.count() and Cursor.count() are deprecated since PyMongo 3.7 and
# removed in PyMongo 4.0; count_documents() is the supported replacement and
# returns an exact count for the given filter.
total_docs = col.count_documents({})
full_parse = col.count_documents({'parse_status': 'full_parse'})
partial_parse = col.count_documents({'parse_status': 'partial_parse'})
no_body_parse = col.count_documents({'parse_status': 'no_body'})
no_parse = col.count_documents({'parse_status': 'no_parse'})

In [4]:
# Report every parse-status bucket as an absolute count plus its share of
# the whole collection.
print('Total Documents: '+str(total_docs))
for label, n_docs in (('Full Parse', full_parse),
                      ('Partial Parse', partial_parse),
                      ('No Body Parse', no_body_parse),
                      ('No Parse', no_parse)):
    # An empty collection would otherwise raise ZeroDivisionError here.
    share = round(n_docs/total_docs*100, 2) if total_docs else 0.0
    print(label + ': ' + str(n_docs) + ' (' + str(share) + '%' + ')')

Total Documents: 128418
Full Parse: 98717 (76.87%)
Partial Parse: 21152 (16.47%)
No Body Parse: 42 (0.03%)
No Parse: 8507 (6.62%)


## Paper Parsing and Concept Vocab Generation
-------------------------------------------------------------

This runs the paper processor methods, which take in a corpus of documents from a specific domain and create a concept dictionary for all the concepts/subjects covered in the texts.  DBpedia is used as a knowledge base to validate the concepts.

The script also outputs a TFIDF matrix for the concept terms within the papers.  This is to be later used for creating similarity vectors for the papers.

In [1]:
from paper_processor import *

In [2]:
# Stop-word set for the vectorizer: NLTK's English stop words plus
# corpus-specific tokens that should never be treated as concepts.
ignored_words = set(nltk.corpus.stopwords.words('english')).union((
    'cid', 'et', 'e.g.', 'et al', 'al', 'yes', 'method',
    'results', 'citation', 'use', 'used', 'submitted',
    'published', 'professor', 'dtu', 'pubdb',
    'university', 'acknowledgements', 'arxiv',
    'association', 'society', '.', ',',
))

### Initial Parse and Tokenization

The concept_vectorizer class is a modified count vectorizer based on Sklearn's implementation.  It differs in that it creates n-grams from the passed documents and uses collocations of these terms to search a larger space of possible concepts and subjects.

In [6]:
# Cursor over every paper whose parse_status is 'full_parse'.
full_parse_cursor = col.find({'parse_status': 'full_parse'})

In [8]:
print('##### Starting first parse ####')

# First pass: fit the concept vectorizer on the fully parsed papers, reading
# them from the database cursor.
path = full_parse_cursor
counter = concept_vectorizer(
    stop_words=ignored_words,
    min_df=5,
    path=True,
    doc_path_type='db_cursor',
)
counter_vecs = counter.fit_transform(path)

##### Starting first parse ####


 97%|█████████▋| 95821/98717 [11:15<00:20, 141.87it/s]


In [9]:
import pickle

# Persist the first-pass matrix and the fitted vectorizer so the parse does
# not have to be repeated.
for pickle_name, payload in (('counter_vecs.pickle', counter_vecs),
                             ('counter.pickle', counter)):
    with open(pickle_name, 'wb') as fh:
        pickle.dump(payload, fh)

In [11]:
# Reload the fitted vectorizer and its count matrix from disk.
# Context managers make sure the file handles are closed after loading.
# NOTE: pickle.load can execute arbitrary code; only load files produced by
# this notebook.
with open('counter.pickle', 'rb') as fh:
    counter = pickle.load(fh)
with open('counter_vecs.pickle', 'rb') as fh:
    counter_vecs = pickle.load(fh)

### Concept Validation

The concept_validator function takes the concept_vectorizer object and extracts the feature names for the count vectorized n_gram frequency matrix.  
It queries each of these n_grams against dbpedia and validates whether the concept exists or not and also returns a list of possible similar terms for the same concept providing disambiguation terms.
This results in creating a validated and reduced concept vocabulary for a given corpus of texts, giving the full list of concepts covered in these documents.

In [None]:
print('##### Validating the topics on DBPedia ####')

# Validate the vectorizer's candidate concepts, then build the vocabulary
# dictionary from the validated concepts.
concept_dict = concept_validator(counter)
vocab_dict = dbpedia_metadata(concept_dict)

# Save the vocabulary as JSON so validation does not have to be re-run.
with open('vocab_dict.json', 'w') as fh:
    json.dump(vocab_dict, fh)

print('------ Vocab dict written to json')

In [7]:
# Load the validated vocabulary written by the validation step.
with open('vocab_dict.json','r') as fh:
    vocab_dict = json.load(fh)

# Build the fixed vocabulary used for the second parse. The result is bound
# directly to fix_vocab: assigning it to `fixed_concept_dict` would overwrite
# the imported function of the same name and make this cell fail when re-run.
fix_vocab = fixed_concept_dict(vocab_dict)

100%|██████████| 15723/15723 [00:00<00:00, 86189.74it/s]


### Counting Concepts

Now with a fixed domain vocabulary the corpus is reparsed and tokenized but with a fixed search vocabulary for the counts of the concepts covered in each document.
Disambiguation terms map synonyms of a concept onto one fixed term, so that, for example, "ML" is counted as "Machine Learning".

After the count-vectorized concept matrix is built, a TF-IDF transform is applied to it.

In [None]:
print('#### Reparsing documents with vocabulary #####')

# Second pass: count only the concepts in the fixed vocabulary.
counter_fixed = concept_vectorizer(stop_words=ignored_words,min_df=3,fixed_vocab=fix_vocab,doc_path_type='db_cursor')

# Open a fresh cursor rather than reusing full_parse_cursor: a PyMongo cursor
# can only be iterated once, and the first parse has already consumed it.
path = col.find(filter={'parse_status':'full_parse'})
counter_vec_fixed = counter_fixed.fit_transform(path)

# Re-weight the raw concept counts with TF-IDF.
tfidf = TfidfTransformer()
tfidf_vec = tfidf.fit_transform(counter_vec_fixed)

### Pickle Models

In [None]:
# Persist the fitted models and matrices. Each file is written inside a
# context manager so it is flushed and closed before the next one is opened;
# the original open() calls were never closed.
for pickle_name, payload in (('counter_fixed.p', counter_fixed),
                             ('counter_vec_fixed.p', counter_vec_fixed),
                             ('tfidf.p', tfidf),
                             ('tfidf_vec.p', tfidf_vec)):
    with open(pickle_name, 'wb') as model_file:
        pickle.dump(payload, model_file)

print('#### Finished and models pickled ####')