In [1]:
#imports
import spacy
import stanza
from spacy_stanza import StanzaLanguage
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
import pandas as pd
from spacy import displacy
import re

In [2]:
# LOAD ENTITY RULER
# Builds the global `nlp` object used by analyzeText: a stanza pipeline for
# language code "bg", wrapped for spaCy, with a rule-based entity ruler added.

# stanza pipeline for Bulgarian (the load log lists tokenize, pos, lemma, depparse)
snlp = stanza.Pipeline(lang="bg")
# wrap the stanza pipeline so it can be called like a spaCy language object
nlp = StanzaLanguage(snlp)
# entity ruler whose patterns are read from the "entity_ruler" path
# (relative path, resolved against the current working directory)
ruler = EntityRuler(nlp)
ruler.from_disk("entity_ruler")  
# register the ruler as a pipeline component so nlp(text) fills doc.ents
nlp.add_pipe(ruler)

2020-11-18 13:30:48 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package |
-----------------------
| tokenize  | btb     |
| pos       | btb     |
| lemma     | btb     |
| depparse  | btb     |

2020-11-18 13:30:48 INFO: Use device: gpu
2020-11-18 13:30:48 INFO: Loading: tokenize
2020-11-18 13:30:56 INFO: Loading: pos
2020-11-18 13:30:57 INFO: Loading: lemma
2020-11-18 13:30:57 INFO: Loading: depparse
2020-11-18 13:30:59 INFO: Done loading processors!


In [3]:
# Display options for displaCy: which entity labels are shown and their colours.
def getDisplayOptions():
    """Return the ``options`` dict passed to ``displacy.render``.

    Returns:
        dict with two keys:
            "ents"   - list of the entity labels to visualise,
            "colors" - mapping from entity label to its hex colour.
    """
    palette = {
        "ORGAN": "#F9E79F",
        "ANATOMICAL_SYSTEM": "#6fcbf7",
        "SYMPTOM": "#F4D03F",
        "FAMILY": "#faa0eb",
        "RISK_FACTOR": "#f8717d",
        "COMPLAINT": "#A9DFBF",
    }
    labels = [
        "ORGAN",
        "ANATOMICAL_SYSTEM",
        "SYMPTOM",
        "COMPLAINT",
        "FAMILY",
        "RISK_FACTOR",
    ]
    return {"ents": labels, "colors": palette}

In [4]:
# import re
# # text to analyze
# text = "Оплаква се от световъртеж, загуба на вкус и обоняние, гадене, дразнене в гърлото, неспирно дразнене  на носа, силна плачливост, фебрилен, обща отпадналост при ехография на коремни органи установена Ту формация в малък таз. Страда от болки в ръката. Наследствено обременена по майчина линия Консултирана с АГ.Оперирана В момента провежда химиотерапия; Не съобщава за оплаквания от страна на сърдечносъдовата система. Пуши от 20 години. "

# lowerTextWithoutMultipleSpaces = re.sub(' +', ' ', text.lower()) # make text to lowercase and remove multiple spaces

# formatedText = re.sub('\\.', ' .', lowerTextWithoutMultipleSpaces) # add space before "."
# doc = nlp(formatedText)

In [5]:
# ANALYZE TEXT
def analyzeText(text):
    """Run the pipeline on ``text`` and return a doc whose entities come from matcher rules.

    Steps:
      1. Lower-case the text, collapse runs of spaces, and put a space on both
         sides of every "." and ",".
      2. Run the global ``nlp`` pipeline and merge each entity in ``doc.ents``
         into a single token.
      3. Apply token-level ``Matcher`` rules (optional adjectives before an
         entity, and COMPLAINT/SYMPTOM entities extended by
         "adposition + organ" or "adposition + noun") and replace ``doc.ents``
         with the non-overlapping matches kept by ``spacy.util.filter_spans``.

    Args:
        text: raw input string.

    Returns:
        The processed doc; its ``ents`` are the filtered matcher spans, each
        labelled with the match id of the rule that produced it.
    """
    # make text lowercase and remove multiple spaces
    normalized = re.sub(' +', ' ', text.lower())

    # Pad "." and "," in a single pass. Previously the "," substitution was
    # applied to the un-padded string, which silently discarded the "." padding.
    formatedText = re.sub(r'([.,])', r' \1 ', normalized)
    doc = nlp(formatedText)

    # change doc tokenization: merge the words of one entity into one token
    with doc.retokenize() as retokenizer:
        for ent in doc.ents:
            retokenizer.merge(doc[ent.start:ent.end])

    # Local import: Matcher is not part of the notebook's top-level imports.
    from spacy.matcher import Matcher
    matcher = Matcher(nlp.vocab)

    # Token-pattern building blocks. Each call returns a fresh dict so no
    # pattern shares mutable state with another.
    def adjectives():
        return {'POS': 'ADJ', 'OP': '*'}

    def adpositions():
        return {'POS': 'ADP', 'OP': '+'}

    def nouns():
        return {'POS': 'NOUN', 'OP': '+'}

    def entity(label):
        # NOTE(review): 'OP' sits inside the ENT_TYPE predicate dict rather than
        # next to it; it was presumably meant as a token quantifier. Kept
        # exactly as before to preserve matching behaviour - TODO confirm
        # against the Matcher documentation before moving it.
        return {'ENT_TYPE': {'REGEX': label, 'OP': '+'}}

    # matcher rules - using POS tags and the entity types set by the ruler.
    # The order of registration is the same as in the original rule list.
    rules = [
        ('ORGAN', [adjectives(), entity('ORGAN')]),
        ('ANATOMICAL_SYSTEM', [entity('ANATOMICAL_SYSTEM')]),
        ('SYMPTOM', [adjectives(), entity('SYMPTOM')]),
        ('COMPLAINT', [adjectives(), entity('COMPLAINT')]),
        ('FAMILY', [entity('FAMILY')]),
        ('RISK_FACTOR', [entity('RISK_FACTOR')]),
        # complaint/symptom + adposition + organ
        ('COMPLAINT', [adjectives(), entity('COMPLAINT'), adpositions(), adjectives(), entity('ORGAN')]),
        ('SYMPTOM', [adjectives(), entity('SYMPTOM'), adpositions(), adjectives(), entity('ORGAN')]),
        # complaint/symptom + adposition + noun
        ('COMPLAINT', [adjectives(), entity('COMPLAINT'), adpositions(), adjectives(), nouns()]),
        ('SYMPTOM', [adjectives(), entity('SYMPTOM'), adpositions(), adjectives(), nouns()]),
    ]
    for label, pattern in rules:
        matcher.add(label, None, pattern)

    matches = matcher(doc)

    # create a new Span for each match and use the match_id as the label
    spans = [Span(doc, start, end, label=match_id) for match_id, start, end in matches]

    # Replace the ruler's entities with the matcher results in one assignment;
    # filter_spans removes overlapping spans first.
    doc.ents = spacy.util.filter_spans(spans)
    return doc

In [6]:
# User interface

from ipywidgets import widgets  
from IPython.display import clear_output
import re
import os.path

# Folder (with trailing separator) where saved annotations are written.
# Built with os.path.join so the separators match the running OS; the previous
# hard-coded backslashes only formed a directory path on Windows.
DATA_PATH = os.path.join('..', 'data', '')

# Shared Layout objects used to hide / show widgets by assigning them to `.layout`.
layout_hidden, layout_visible = (
    widgets.Layout(visibility=state) for state in ('hidden', 'visible')
)

def updateDocEnts(tuples):
    """Replace the entities of the global ``doc`` with the given ones and redraw.

    Args:
        tuples: iterable of ``(text, label, start, end)``. ``start`` and ``end``
            are passed to ``Span`` together with ``label``; ``text`` is unused.
    """
    # create a new Span for each tuple, labelled with the tuple's label
    spans = [Span(doc, start, end, label) for _text, label, start, end in tuples]

    # Drop overlapping spans and replace doc.ents in a single assignment.
    doc.ents = spacy.util.filter_spans(spans)

    # Render once, after all entities are set. The call used to sit inside the
    # per-span loop, so it redrew for every span and not at all for an empty list.
    displayResult()

def analyzeAndDisplayResult(btn):
    """Click handler: analyse the input text, draw the result and reveal the edit controls.

    Stores the analysed doc in the global ``doc`` and fills ``checkResultBox``
    with one ``(text, label, start, end)`` tuple per line.

    Args:
        btn: the clicked button (not used).
    """
    global doc
    doc = analyzeText(inputBox.value)
    displayResult()

    # One line per entity, in the textual form of a Python tuple.
    entityLines = [
        str((ent.text, ent.label_, ent.start, ent.end)) for ent in doc.ents
    ]
    checkResultBox.value = "\n".join(entityLines)

    # Reveal the widgets that only make sense once a result exists.
    for widget in (checkResultBox, saveButton, applyChangesButton, resultBoxLabel):
        widget.layout = layout_visible
    
    
def saveToCsv(btn):
    """Click handler: append the current doc and its entities as one row to the CSV file.

    The row has two columns: the doc text and a list of
    ``(start_char, end_char, label)`` tuples. The header row is written only
    when the file does not exist yet.

    Args:
        btn: the clicked button (not used).
    """
    entities = [(ent.start_char, ent.end_char, ent.label_) for ent in doc.ents]
    df = pd.DataFrame({'sentense': [doc.text],
                       'entities': [entities]})
    file = '{0}{1}'.format(DATA_PATH, "saved_anotations.csv")

    # Make sure the target folder exists; otherwise the first save fails.
    targetDir = os.path.dirname(file)
    if targetDir:
        os.makedirs(targetDir, exist_ok=True)

    writeHeader = not os.path.isfile(file)  # if file is new add headers
    df.to_csv(file, mode='a', header=writeHeader, index=False)
    


def displayResult():
    """Redraw the entity visualisation of the global ``doc`` inside the ``output`` widget."""
    renderOptions = getDisplayOptions()
    with output:
        # remove the previous rendering before drawing the new one
        clear_output(wait=True)
        # draw the entities with the configured labels and colours
        displacy.render(doc, style='ent', jupyter=True, options=renderOptions)

def applyEntitiesChanges(btn):
    """Click handler: apply the entity list edited in ``checkResultBox`` to the doc.

    The current result is redrawn first. If the edited text passes validation
    and differs from the doc's current entities, the doc's entities are
    replaced by the edited ones.

    Args:
        btn: the clicked button (not used).
    """
    displayResult()
    if not validateEntitiesString(checkResultBox.value):
        return

    currentSpans = {(ent.text, ent.label_, ent.start, ent.end) for ent in doc.ents}
    userSpans = convert(checkResultBox.value)

    # The sets differ exactly when the user removed or added at least one span.
    if currentSpans != set(userSpans):
        updateDocEnts(userSpans)
            

    
def validateEntitiesString(string):
    """Check that every line of ``string`` looks like ``('text', 'LABEL', start, end)``.

    On the first line that does not match, a message is printed into the
    ``output`` widget and ``False`` is returned.

    Args:
        string: content of the entity box, one entity per line.

    Returns:
        True if all lines match the expected pattern, False otherwise.
    """
    # Raw string so "\(" and "\d" reach the regex engine without relying on
    # Python leaving unknown string escapes untouched.
    # \d+ instead of \d*: a missing number is rejected here rather than
    # raising later when the value is converted with int().
    linePattern = re.compile(r"\('.*', '.*', \d+, \d+\)$")
    with output:
        for line in string.split("\n"):
            if linePattern.match(line) is None:
                print('Added entites not match entities pattern')
                return False
        return True
    
# convert string to list of tuples (used to read changes in entities)
def convert(rawString):
    """Parse the entity box content into a list of ``(text, label, start, end)`` tuples.

    Each line is expected to be the textual form of a Python tuple, e.g.
    ``('болки', 'SYMPTOM', 0, 1)``. Lines are parsed with ``ast.literal_eval``
    so entity text containing ", ", parentheses or quotes is kept intact.
    Lines that are not a valid literal of that shape fall back to the older,
    lenient parsing (strip ``(``, ``)``, ``'`` and split on ", ").

    Args:
        rawString: one entity per line, separated by "\\n".

    Returns:
        list of ``(str, str, int, int)`` tuples, one per line.

    Raises:
        ValueError, IndexError: if a line cannot be parsed by either method.
    """
    import ast  # only needed here; safe literal parsing, no code execution

    result = []
    for entity in rawString.split("\n"):  # every entity is on new line
        try:
            parsed = ast.literal_eval(entity.strip())
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            parsed = None

        isEntityTuple = (
            isinstance(parsed, tuple)
            and len(parsed) == 4
            and isinstance(parsed[0], str)
            and isinstance(parsed[1], str)
            and type(parsed[2]) is int
            and type(parsed[3]) is int
        )
        if isEntityTuple:
            result.append(parsed)
            continue

        # Fallback: previous behaviour for lines that are not strict literals.
        tupleString = entity.replace("(", "").replace(")", "").replace("'", "")  # remove (,),'
        tupleArray = tupleString.split(", ")
        result.append((tupleArray[0], tupleArray[1], int(tupleArray[2]), int(tupleArray[3])))
    return result

# --- Text areas ---------------------------------------------------------
# Input text to analyse (always visible).
inputBox = widgets.Textarea(
    placeholder='Напишете текста, който искате да бъде анализиран',
    disabled=False,
    layout=layout_visible,
)

# Editable list of found entities; hidden until an analysis has been run.
checkResultBox = widgets.Textarea(
    placeholder='Намерени обекти в текста',
    continuous_update=False,
    disabled=False,
    layout=layout_hidden,
)

# Area into which the displaCy rendering and validation messages are written.
output = widgets.Output()

# --- Buttons ------------------------------------------------------------
analyzeButton = widgets.Button(description="Анализирай", layout=layout_visible)
analyzeButton.on_click(analyzeAndDisplayResult)

applyChangesButton = widgets.Button(description="Виж промените", layout=layout_hidden)
applyChangesButton.on_click(applyEntitiesChanges)

saveButton = widgets.Button(
    description="Запази",
    tooltip='Текста и обектите се записват във файл data/saved_anotations.csv',
    layout=layout_hidden,
)
saveButton.on_click(saveToCsv)

# --- Labels -------------------------------------------------------------
resultBoxLabel = widgets.Label(value="Резултат от анализа:", layout=layout_hidden)

# NOTE(review): inputLabel is not placed in any row below; the first content
# row builds its own label with the same text.
inputLabel = widgets.Label(value="Текст за анализиране:", layout=layout_visible)

title = widgets.HTML(
    value="<h2>Анализатор на медицински текстове</h2> <h5>Разпознават се обекти от следните категории: ORGAN, ANATOMICAL_SYSTEM, SYMPTOM, COMPLAINT, FAMILY, RISK_FACTOR</h5>"
)

# --- Page layout --------------------------------------------------------
# Fixed-width first column so the two text areas line up.
_labelColumnLayout = {'width': '150px'}

row0 = widgets.HBox([title])
row1 = widgets.HBox([
    widgets.HBox([widgets.Label(value="Текст за анализиране:")], layout=_labelColumnLayout),
    inputBox,
    analyzeButton,
])
row2 = widgets.HBox([
    widgets.HBox([resultBoxLabel], layout=_labelColumnLayout),
    checkResultBox,
    applyChangesButton,
])
row3 = widgets.HBox([output])
row4 = widgets.HBox([saveButton])

# Example input: болки в кръста и гърба, диария, фебрилен
# Last expression of the cell, so the assembled UI is displayed.
widgets.VBox([row0, row1, row2, row3, row4])
# болки в кръста и гърба, диария, фебрилен

VBox(children=(HBox(children=(HTML(value='<h2>Анализатор на медицински текстове</h2> <h5>Разпознават се обекти…