In [1]:
!conda install -y spacy

Fetching package metadata ...........
Solving package specifications: ..........

Package plan for installation in environment /opt/conda/envs/python2:

The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    cymem-1.31.2               |           py27_0          55 KB
    murmurhash-0.26.4          |           py27_0          29 KB
    pathlib-1.0.1              |           py27_0          25 KB
    plac-0.9.6                 |           py27_0          31 KB
    regex-2017.4.5             |           py27_0         568 KB
    termcolor-1.1.0            |           py27_0           7 KB
    tqdm-4.15.0                |           py27_0          48 KB
    ujson-1.35                 |           py27_0          57 KB
    wrapt-1.10.11              |           py27_0          64 KB
    cytoolz-0.8.2              |           py27_0         913 KB
    preshed-1.0.0              |           py27_0    

In [2]:
# Load spaCy's English pipeline once; the resulting `nlp` object is reused by
# the cells below to build `doc`.
# NOTE(review): the recorded output of this cell says "Only loading the 'en'
# tokenizer." -- confirm the English model data is installed before relying on
# the tagging / parsing cells further down.
import spacy                        
nlp = spacy.load('en')



    Only loading the 'en' tokenizer.



### Sentence detection

In [2]:
txt =u"""Prescribing sick days due to diagnosis of influenza.Jane complains about flu-like symptoms.Jane may be experiencing some sort of flu episode.Jane’s RIDT came back negative for influenza.
Jane is at high risk for flu if she’s not vaccinated.Jane’s older brother had the flu last month.Jane had a severe case of flu last year.Joe expressed concerns about the risks of bird flu.
Joe shows no signs of stroke, except for numbness.Nausea, vomiting and ankle swelling negative.Patient denies alcohol abuse. Allergies: Penicillin, Dust, Sneezing.
There's an outbreak of happiness in San Jose organized by O'Reilly Media, today, March 15, 2017, involving thousands of people."""

print 'Language:',nlp.lang
print 'Vocabulary size:',nlp.vocab.length
print
doc = nlp(txt) 
data=[]
for sent in doc.sents:
#     print 'Text:',sent.text.replace('\n','')
#     print 'Start:',sent.start,'End:',sent.end
    data.append((sent.start,sent.end,sent.text.replace('\n','')))
import pandas as pd
sents = pd.DataFrame(data=data,columns = ['Start','End','Sentence Text'])
sents

Language: en
Vocabulary size: 1297614



Unnamed: 0,Start,End,Sentence Text
0,0,9,Prescribing sick days due to diagnosis of infl...
1,9,17,Jane complains about flu-like symptoms.
2,17,27,Jane may be experiencing some sort of flu epis...
3,27,37,Jane’s RIDT came back negative for influenza.
4,37,50,Jane is at high risk for flu if she’s not vacc...
5,50,60,Jane’s older brother had the flu last month.
6,60,70,Jane had a severe case of flu last year.
7,70,81,Joe expressed concerns about the risks of bird...
8,81,92,"Joe shows no signs of stroke, except for numbn..."
9,92,100,"Nausea, vomiting and ankle swelling negative."


### Part of speech tagging and Named Entity extraction

In [6]:
# One row per token, visited sentence by sentence.  Each row holds, in the
# column order declared below: idx (character offset of the token in the
# text), text, lex_id, lemma_, pos_, the head token, dep_ and ent_type_.
data = [
    [w.idx, w.text, w.lex_id, w.lemma_, w.pos_, w.head, w.dep_, w.ent_type_]
    for sent in doc.sents
    for w in sent
]
token_columns = ['Index', 'Token', 'Id_in_vocab', 'Lemma', 'POS',
                 'Depends_on', 'Dependency_type', 'Entity_Type']
tokens = pd.DataFrame(data=data, columns=token_columns)
tokens

Unnamed: 0,Index,Token,Id_in_vocab,Lemma,POS,Depends_on,Dependency_type,Entity_Type
0,0,Prescribing,258460,prescribe,VERB,days,amod,
1,12,sick,1239,sick,ADJ,days,amod,
2,17,days,360,day,NOUN,days,ROOT,
3,22,due,586,due,ADJ,days,amod,
4,26,to,5,to,ADP,due,pcomp,
5,29,diagnosis,8171,diagnosis,NOUN,due,pobj,
6,39,of,8,of,ADP,diagnosis,prep,
7,42,influenza,47577,influenza,NOUN,of,pobj,
8,51,.,1,.,PUNCT,days,punct,
9,52,Jane,10305,jane,PROPN,complains,nsubj,PERSON


### Using the syntactic dependencies

In [7]:
from spacy.symbols import nsubj, VERB
# Finding a verb with a subject 
pairs = []
for possible_subject in doc:
    if possible_subject.dep == nsubj and possible_subject.head.pos == VERB:
        pairs.append((possible_subject,possible_subject.head))

for pair in pairs:
    print 'Subject: ',pair[0],' verb: ',pair[1]

Subject:  Jane  verb:  complains
Subject:  Jane  verb:  experiencing
Subject:  RIDT  verb:  came
Subject:  Jane  verb:  is
Subject:  Jane  verb:  had
Subject:  brother  verb:  had
Subject:  Jane  verb:  had
Subject:  Joe  verb:  expressed
Subject:  Joe  verb:  shows
Subject:  ankle  verb:  swelling
Subject:  Patient  verb:  denies


### Custom pipeline: adding negation detection

In [8]:
from negex import *
# Read the NegEx trigger file and turn its lines into the rule list that
# negation_tag passes to negTagger as `rules`.
# The context manager closes the file as soon as the lines have been read;
# previously the handle returned by open() was never closed.
with open(r'negex_triggers.txt') as rfile:
    irules = sortRules(rfile.readlines())

def negation_tag(doc):
    """Label every token of ``doc`` as negated or affirmative, per sentence.

    For each sentence, the texts of all tokens whose ``pos_`` is neither
    ``'ADP'`` nor ``'PUNCT'`` are passed to ``negTagger`` as candidate
    phrases, together with the sentence text and the module-level ``irules``.
    Every scope string returned by ``getScopes()`` has the ``[NEGATED]``
    marker, periods and commas removed and is then split on single spaces
    into words.

    Side effect: ``dep_`` of *every* token in the sentence is overwritten
    with ``u'NEGATED'`` if the token's text is one of those words, otherwise
    with ``u'AFFIRMATIVE'`` (the previous dependency label is replaced).

    :param doc: document object exposing ``sents``; modified in place.
    :return: None.
    """
    for sent in doc.sents:
        candidates = set(
            tok.text for tok in sent if tok.pos_ not in ('ADP', 'PUNCT')
        )
        tagger = negTagger(sentence=sent.text, phrases=list(candidates),
                           rules=irules, negP=False)

        negated_words = set()
        for scope in tagger.getScopes():
            cleaned = (scope.replace('[NEGATED]', '')
                            .replace('.', '')
                            .replace(',', ''))
            # split(' ') returns [cleaned] when there is no space, so a
            # single update handles one-word and multi-word scopes alike.
            negated_words.update(cleaned.split(' '))

        for tok in sent:
            tok.dep_ = u'NEGATED' if tok.text in negated_words else u'AFFIRMATIVE'


def custom_pipeline(nlp):
    """Return the processing steps for the negation-aware pipeline.

    The result is a 3-tuple, in order: ``nlp.tagger``, ``nlp.parser`` and
    ``negation_tag``.  ``negation_tag`` comes last because it reads ``pos_``
    and sentence spans (presumably produced by the tagger and parser --
    confirm against the spaCy version in use).
    """
    steps = (nlp.tagger, nlp.parser, negation_tag)
    return steps

# Second English pipeline whose processing steps are supplied by
# custom_pipeline (tagger, parser, then negation_tag).
nlp_neg = spacy.load('en', create_pipeline=custom_pipeline)

In [9]:
doc2 = nlp_neg(txt)
for sent in doc2.sents:
    negs = []
    for word in sent:
        if word.dep_==u'NEGATED':
            negs.append(word)
#         if word.dep_== u'NEGATED':
#             negs.append[word]
    if len(negs)>0:
        print sent
        print 'Negated words: ',negs

Jane is at high risk for flu if she’s not vaccinated.
Negated words:  [vaccinated]
Joe shows no signs of stroke, except for numbness.
Negated words:  [stroke]
Patient denies alcohol abuse.
Negated words:  [alcohol, abuse]
