In [1]:
from __future__ import absolute_import
from __future__ import division, print_function, unicode_literals

from sumy.parsers.html import HtmlParser
from sumy.parsers.plaintext import PlaintextParser
from sumy.nlp.tokenizers import Tokenizer
from sumy.summarizers.text_rank import TextRankSummarizer as Summarizer
from sumy.nlp.stemmers import Stemmer
from sumy.utils import get_stop_words

In [4]:
def sum_form_file(text_file, language="english", sentences_cout=10):

    parser = PlaintextParser.from_file(text_file, Tokenizer(language))
    stemmer = Stemmer(language)

    summarizer = Summarizer(stemmer)
    summarizer.stem_words = get_stop_words(language)

    sentences = summarizer(parser.document, sentences_cout)

    return sentences

def sum_form_url(url, language="english", sentences_cout=10):

    parser = HtmlParser.from_url(url, Tokenizer(language))
    stemmer = Stemmer(language)

    summarizer = Summarizer(stemmer)
    summarizer.stem_words = get_stop_words(language)

    sentences = summarizer(parser.document, sentences_cout)

    return sentences

In [5]:
sentences = sum_form_url('https://www.nature.com/articles/s41467-017-02681-z', sentences_cout=3)

In [8]:
sentences[0]._text

u'Using the WSBM to uncover a network\u2019s community structure involves inferring both the parameters of these processes and the nodes\u2019 community assignments that maximize the log-evidence that the WSBM generated the observed network (see Methods for details on connectome reconstruction and the WSBM).'

In [13]:
import sys
sys.path.append("relation_extraction/")
from pycorenlp import StanfordCoreNLP

corenlp = StanfordCoreNLP('http://localhost:9000')
corenlp_properties = {
    'annotators': 'tokenize, pos, ner',
    'outputFormat': 'json'
}

def get_tagged_from_server(input_text):
    """
    Send the input_text to the CoreNLP server and retrieve the tokens, named entity tags and part-of-speech tags.
    """
    corenlp_output = corenlp.annotate(input_text,properties=corenlp_properties).get("sentences", [])[0]
    tagged = [(t['originalText'], t['ner'], t['pos']) for t in corenlp_output['tokens']]
    return tagged

In [21]:
get_tagged_from_server(sentences[0]._text.encode('ascii','ignore'))

[(u'Using', u'O', u'VBG'),
 (u'the', u'O', u'DT'),
 (u'WSBM', u'O', u'NNP'),
 (u'to', u'O', u'TO'),
 (u'uncover', u'O', u'VB'),
 (u'a', u'O', u'DT'),
 (u'networks', u'O', u'NNS'),
 (u'community', u'O', u'NN'),
 (u'structure', u'O', u'NN'),
 (u'involves', u'O', u'VBZ'),
 (u'inferring', u'O', u'VBG'),
 (u'both', u'O', u'CC'),
 (u'the', u'O', u'DT'),
 (u'parameters', u'O', u'NNS'),
 (u'of', u'O', u'IN'),
 (u'these', u'O', u'DT'),
 (u'processes', u'O', u'NNS'),
 (u'and', u'O', u'CC'),
 (u'the', u'O', u'DT'),
 (u'nodes', u'O', u'NNS'),
 (u'community', u'O', u'NN'),
 (u'assignments', u'O', u'NNS'),
 (u'that', u'O', u'WDT'),
 (u'maximize', u'O', u'VBP'),
 (u'the', u'O', u'DT'),
 (u'log-evidence', u'O', u'NN'),
 (u'that', u'O', u'IN'),
 (u'the', u'O', u'DT'),
 (u'WSBM', u'ORGANIZATION', u'NNP'),
 (u'generated', u'O', u'VBD'),
 (u'the', u'O', u'DT'),
 (u'observed', u'O', u'VBN'),
 (u'network', u'O', u'NN'),
 (u'(', u'O', u'-LRB-'),
 (u'see', u'O', u'VB'),
 (u'Methods', u'O', u'NNS'),
 (u'for', 

In [20]:
type(sentences[0]._text.encode('ascii','ignore'))

str