In [5]:
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

In [22]:
# Run the loaded pipeline over a free-text narrative, then pretty-print
# a (text, label) pair for every entity it found.
doc = nlp('Call came directly to the IC for medical inquiry and adverse event noted for Crestor. Patient started Crestor 10 mg daily by mouth in Jan2008 for high cholesterol. Patient also taking Toprol XL 50 mg daily by mouth started in Nov2000 for high blood pressure. Patient reported\nthe following that she has skipped doses of Crestor over the years since she started taking it and the last time was in Dec2017. She would just take her usual dose the next day and not try to make up the missed dose. Her HCP is aware, no treatment offered. Had high\ncholesterol since about 2003 and currently taking brand Crestor. She is nervous now about taking new medications, unknown start date and if her HCP is aware. Patient to start generic Crestor when finish her brand Crestor. No further information was provided.\nFollow up received on 03-Jan-2018.\nUpdated on 03Jan2018. The following should have been included in the above narrative. Patient stated that she is fine with the brand Crestor, she no problems with it. This information was received in the original phone call. Same sender table.')
pprint([(ent.text, ent.label_) for ent in doc.ents])

[('IC', 'ORG'),
 ('Crestor', 'ORG'),
 ('Crestor', 'ORG'),
 ('10', 'CARDINAL'),
 ('daily', 'DATE'),
 ('Toprol', 'ORG'),
 ('50', 'CARDINAL'),
 ('daily', 'DATE'),
 ('Nov2000', 'GPE'),
 ('\n', 'GPE'),
 ('Crestor', 'ORG'),
 ('the years', 'DATE'),
 ('the next day', 'DATE'),
 ('HCP', 'ORG'),
 ('\n', 'GPE'),
 ('about 2003', 'DATE'),
 ('Crestor', 'ORG'),
 ('HCP', 'ORG'),
 ('Crestor', 'ORG'),
 ('Crestor', 'ORG'),
 ('\n', 'GPE'),
 ('\n', 'GPE'),
 ('Updated', 'ORG'),
 ('03Jan2018', 'CARDINAL'),
 ('Crestor', 'ORG')]


In [23]:
# Per-token view: the token, its IOB entity flag, and its entity type
# (empty string when the token is outside any entity).
pprint([
    (token, token.ent_iob_, token.ent_type_)
    for token in doc
])

[(Call, 'O', ''),
 (came, 'O', ''),
 (directly, 'O', ''),
 (to, 'O', ''),
 (the, 'O', ''),
 (IC, 'B', 'ORG'),
 (for, 'O', ''),
 (medical, 'O', ''),
 (inquiry, 'O', ''),
 (and, 'O', ''),
 (adverse, 'O', ''),
 (event, 'O', ''),
 (noted, 'O', ''),
 (for, 'O', ''),
 (Crestor, 'B', 'ORG'),
 (., 'O', ''),
 (Patient, 'O', ''),
 (started, 'O', ''),
 (Crestor, 'B', 'ORG'),
 (10, 'B', 'CARDINAL'),
 (mg, 'O', ''),
 (daily, 'B', 'DATE'),
 (by, 'O', ''),
 (mouth, 'O', ''),
 (in, 'O', ''),
 (Jan2008, 'O', ''),
 (for, 'O', ''),
 (high, 'O', ''),
 (cholesterol, 'O', ''),
 (., 'O', ''),
 (Patient, 'O', ''),
 (also, 'O', ''),
 (taking, 'O', ''),
 (Toprol, 'B', 'ORG'),
 (XL, 'O', ''),
 (50, 'B', 'CARDINAL'),
 (mg, 'O', ''),
 (daily, 'B', 'DATE'),
 (by, 'O', ''),
 (mouth, 'O', ''),
 (started, 'O', ''),
 (in, 'O', ''),
 (Nov2000, 'B', 'GPE'),
 (for, 'O', ''),
 (high, 'O', ''),
 (blood, 'O', ''),
 (pressure, 'O', ''),
 (., 'O', ''),
 (Patient, 'O', ''),
 (reported, 'O', ''),
 (
, 'B', 'GPE'),
 (the, 'O', ''

In [24]:
from bs4 import BeautifulSoup
import requests
import re

def url_to_string(url, timeout=30):
    """Download a web page and return its text content as one line.

    Parameters
    ----------
    url : str
        Address of the page to fetch.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get``.
        Defaults to 30 so a stalled connection cannot hang the kernel.

    Returns
    -------
    str
        Text of the parsed page with ``<script>``, ``<style>`` and
        ``<aside>`` elements removed and every run of newlines/tabs
        replaced by a single space.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status code.
    requests.Timeout
        If the server does not respond within ``timeout``.
    """
    res = requests.get(url, timeout=timeout)
    # Fail loudly on an error response instead of silently parsing the
    # error page and feeding it to the NLP pipeline.
    res.raise_for_status()
    soup = BeautifulSoup(res.text, 'html5lib')
    # Drop elements whose content is not part of the article body.
    for tag in soup(["script", "style", 'aside']):
        tag.extract()
    return " ".join(re.split(r'[\n\t]+', soup.get_text()))

# Fetch the article text, run it through the pipeline, and show how many
# entities were detected.
ny_bb = url_to_string(
    'https://www.nytimes.com/2018/08/13/us/politics/peter-strzok-fired-fbi.html?hp&action=click&pgtype=Homepage&clickSource=story-heading&module=first-column-region&region=top-news&WT.nav=top-news'
)
article = nlp(ny_bb)
len(article.ents)

188

In [25]:
# Tally how often each entity label occurs in the article.
labels = [ent.label_ for ent in article.ents]
Counter(labels)

Counter({'PERSON': 83,
         'GPE': 36,
         'ORG': 24,
         'DATE': 30,
         'CARDINAL': 6,
         'EVENT': 1,
         'NORP': 5,
         'ORDINAL': 1,
         'WORK_OF_ART': 1,
         'LOC': 1})

In [26]:
# The three entity strings that appear most often in the article.
items = [ent.text for ent in article.ents]
Counter(items).most_common(3)

[('Strzok', 32), ('F.B.I.', 17), ('Trump', 10)]

In [27]:
# Materialise the sentence iterator so a sentence can be picked by index.
sentences = list(article.sents)
print(sentences[20])

Firing Mr. Strzok, however, removes a favorite target of Mr. Trump from the ranks of the F.B.I. and gives Mr. Bowdich and the F.B.I. director, Christopher A. Wray, a chance to move beyond the president’s ire.


In [28]:
# Re-parse the sample sentence and render it with displaCy's 'ent' style.
displacy.render(
    nlp(str(sentences[20])),
    style='ent',
    jupyter=True,
)

In [13]:
# Re-parse the sample sentence and render it with displaCy's 'dep' style,
# passing a 'distance' option of 120.
displacy.render(
    nlp(str(sentences[20])),
    style='dep',
    options={'distance': 120},
    jupyter=True,
)

In [14]:
# For every token that is neither a stop word nor tagged as punctuation,
# show its surface form, part-of-speech tag, and lemma.
[
    (token.orth_, token.pos_, token.lemma_)
    for token in nlp(str(sentences[20]))
    if not token.is_stop and token.pos_ != 'PUNCT'
]

[('Firing', 'VERB', 'fire'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Strzok', 'PROPN', 'strzok'),
 ('removes', 'VERB', 'remove'),
 ('favorite', 'ADJ', 'favorite'),
 ('target', 'NOUN', 'target'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Trump', 'PROPN', 'trump'),
 ('ranks', 'NOUN', 'rank'),
 ('F.B.I.', 'PROPN', 'f.b.i.'),
 ('gives', 'VERB', 'give'),
 ('Mr.', 'PROPN', 'mr.'),
 ('Bowdich', 'PROPN', 'bowdich'),
 ('F.B.I.', 'PROPN', 'f.b.i.'),
 ('director', 'NOUN', 'director'),
 ('Christopher', 'PROPN', 'christopher'),
 ('A.', 'PROPN', 'a.'),
 ('Wray', 'PROPN', 'wray'),
 ('chance', 'NOUN', 'chance'),
 ('president', 'NOUN', 'president'),
 ('’s', 'PART', '’s'),
 ('ire', 'NOUN', 'ire')]

In [15]:
# Map each entity string in the sample sentence to its label; if the same
# string occurs more than once, the last occurrence wins.
{str(ent): ent.label_ for ent in nlp(str(sentences[20])).ents}

{'Strzok': 'PERSON',
 'Trump': 'PERSON',
 'F.B.I.': 'GPE',
 'Bowdich': 'PERSON',
 'Christopher A. Wray': 'PERSON'}