In [83]:
#imports
import re

import pandas as pd
import spacy
import stanza
from spacy import displacy
from spacy.matcher import Matcher
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
from spacy_stanza import StanzaLanguage

In [84]:
# LOAD ENTITY RULER

# Bulgarian Stanza pipeline, wrapped so it can be used as a spaCy language object.
snlp = stanza.Pipeline(lang="bg")
nlp = StanzaLanguage(snlp)

# Restore the ruler from the "entity_ruler" path and append it to the pipeline.
ruler = EntityRuler(nlp).from_disk("entity_ruler")
nlp.add_pipe(ruler)

2020-11-04 19:02:44 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package |
-----------------------
| tokenize  | btb     |
| pos       | btb     |
| lemma     | btb     |
| depparse  | btb     |

2020-11-04 19:02:44 INFO: Use device: gpu
2020-11-04 19:02:44 INFO: Loading: tokenize
2020-11-04 19:02:44 INFO: Loading: pos
2020-11-04 19:02:45 INFO: Loading: lemma
2020-11-04 19:02:45 INFO: Loading: depparse
2020-11-04 19:02:46 INFO: Done loading processors!


In [85]:
def getDisplayOptions():
    """Return the options dict used when visualizing the named entities.

    Returns:
        dict: ``"ents"`` holds the list of entity labels and ``"colors"``
        maps each of those labels to a hex colour string.
    """
    palette = {
        "ORGAN": "#F9E79F",
        "ANATOMICAL_SYSTEM": "#6fcbf7",
        "SYMPTOM": "#F4D03F",
        "FAMILY": "#faa0eb",
        "RISK_FACTOR": "#f8717d",
        "COMPLAINT": "#A9DFBF",
    }
    labels = [
        "ORGAN",
        "ANATOMICAL_SYSTEM",
        "SYMPTOM",
        "COMPLAINT",
        "FAMILY",
        "RISK_FACTOR",
    ]
    return {"ents": labels, "colors": palette}

In [99]:
# text to analyze
text = "Оплаква се от световъртеж, неспирно дразнене  на носа, силна плачливост, обща отпадналост при ехография на коремни органи установена Ту формация в малък таз. Страда от болки в ръката. Наследствено обременена по майчина линия Консултирана с АГ.Оперирана В момента провежда химиотерапия; Не съобщава за оплаквания от страна на сърдечносъдовата система. Пуши от 20 години. "

# Lowercase the text and collapse every run of spaces into a single space.
lowerTextWithoutMultipleSpaces = re.sub(' +', ' ', text.lower())

# Put exactly one space in front of every "." so it is separated from the word before it.
# The optional space in the pattern absorbs a space that is already there; matching
# only the "." would add a second space and undo the collapsing done above.
formatedText = re.sub(r' ?\.', ' .', lowerTextWithoutMultipleSpaces)

# Run the pipeline (Stanza processors + entity ruler) on the normalized text.
doc = nlp(formatedText)

In [100]:
#ANALYZE TEXT

# Change the doc tokenization: merge the tokens of each ruler entity into one
# token, so that every entity can be matched below as a single token.
# NOTE(review): this modifies `doc` in place - rebuild `doc` before re-running this cell.
with doc.retokenize() as retokenizer:
    for ent in doc.ents:
        retokenizer.merge(doc[ent.start:ent.end])


def _adjectiveTokens():
    """Token pattern: zero or more adjectives."""
    return {'POS': 'ADJ', 'OP': '*'}


def _adpositionTokens():
    """Token pattern: one or more adpositions."""
    return {'POS': 'ADP', 'OP': '+'}


def _entityToken(label):
    """Token pattern: exactly one token whose entity type matches the regex `label`.

    Earlier versions of these patterns carried an 'OP': '+' key *inside* the
    ENT_TYPE dict, next to 'REGEX'. Quantifiers are read from the token-level
    dict only, so that key had no effect; it is left out here, which keeps the
    matching behaviour unchanged.
    NOTE(review): adding 'OP': '+' at token level would fuse adjacent entities
    of the same type into one span - decide explicitly if that is wanted.
    """
    return {'ENT_TYPE': {'REGEX': label}}


# Matcher rules - built from POS tags and the entity types assigned by the ruler.
# Each entry is (label given to the match, token pattern). A label may appear more
# than once; the order is the order in which the rules are added to the matcher.
matcherRules = [
    ('ORGAN', [_adjectiveTokens(), _entityToken('ORGAN')]),
    ('ANATOMICAL_SYSTEM', [_entityToken('ANATOMICAL_SYSTEM')]),
    ('SYMPTOM', [_adjectiveTokens(), _entityToken('SYMPTOM')]),
    ('COMPLAINT', [_adjectiveTokens(), _entityToken('COMPLAINT')]),
    ('FAMILY', [_entityToken('FAMILY')]),
    ('RISK_FACTOR', [_entityToken('RISK_FACTOR')]),
    # complaint / symptom followed by adposition(s) and an (optionally modified) organ
    ('COMPLAINT', [_adjectiveTokens(), _entityToken('COMPLAINT'), _adpositionTokens(),
                   _adjectiveTokens(), _entityToken('ORGAN')]),
    ('SYMPTOM', [_adjectiveTokens(), _entityToken('SYMPTOM'), _adpositionTokens(),
                 _adjectiveTokens(), _entityToken('ORGAN')]),
]

matcher = Matcher(nlp.vocab)
for label, pattern in matcherRules:
    matcher.add(label, None, pattern)

# The matcher must run while the ruler's entities are still set on the doc,
# because the patterns test ENT_TYPE.
matches = matcher(doc)

# Create a Span for each match, using the match_id as the label, then drop
# overlapping spans.
spans = spacy.util.filter_spans(
    [Span(doc, start, end, label=match_id) for match_id, start, end in matches]
)

# Replace the entities created by the ruler with the matched spans in one assignment.
doc.ents = spans

In [101]:
#DISPLAY RESULTS
# Render the document's entities inline in the notebook, using the labels and
# colours returned by getDisplayOptions().
displacy.render(
    doc,
    options=getDisplayOptions(),
    jupyter=True,
    style='ent',
)