In [1]:
#imports
import spacy
import stanza
from spacy_stanza import StanzaLanguage
from spacy.matcher import PhraseMatcher
from spacy.pipeline import EntityRuler
from spacy.tokens import Span
import pandas as pd
from spacy import displacy

In [2]:
# Folder holding the lexicon CSV files, relative to the notebook's working directory.
# Forward slashes are understood by Python on Windows as well as on POSIX systems, so the
# notebook is not tied to Windows-only backslash separators.
# The trailing separator is required: file names are appended to this string directly.
DATA_PATH = '../data/'

In [3]:
# Load lexicons
def _load_lexicon(filename, sep):
    """Read the ``name`` column of one lexicon file and return it as a Series.

    Parameters
    ----------
    filename : str
        File name appended directly to ``DATA_PATH`` (which must end with a separator).
    sep : str
        Field separator handed to ``pandas.read_csv`` unchanged.

    Returns
    -------
    pandas.Series
        The single ``name`` column of the file.
    """
    frame = pd.read_csv('{0}{1}'.format(DATA_PATH, filename), sep=sep, usecols=['name'])
    # The ``squeeze=True`` keyword of read_csv is deprecated; squeezing the column axis of
    # the one-column frame yields the same Series.
    return frame.squeeze("columns")


# NOTE(review): sep='\n' treats every line as a single field in the pandas version this
# notebook was written for; newer pandas releases are reported to reject a line terminator
# as separator -- confirm against the pinned pandas version before upgrading.
organs = _load_lexicon("organs.csv", sep='\n')
complaints = _load_lexicon("complaints.csv", sep='\n')
symptoms = _load_lexicon("symptoms.csv", sep='\n')
anatomicalSystems = _load_lexicon("systems.csv", sep=',')
familyRelations = _load_lexicon("familyRelations.csv", sep='\n')
riskFactors = _load_lexicon("riskFactors.csv", sep=',')

In [4]:
# Build the entity ruler from the lexicons and save it to disk

snlp = stanza.Pipeline(lang="bg")
nlp = StanzaLanguage(snlp)
# NOTE(review): phrase_matcher_attr concerns string (phrase) patterns; every pattern added
# below is a token pattern, so this setting presumably has no effect here -- confirm intent.
ruler = EntityRuler(nlp, overwrite_ents=True, phrase_matcher_attr='LEMMA')


def _token_patterns(label, terms):
    """Turn lexicon terms into EntityRuler token patterns for one entity label.

    Each term is tokenized with ``nlp.make_doc`` and every token is matched on its
    lower-cased text.

    Parameters
    ----------
    label : str
        Entity label assigned to every generated pattern.
    terms : iterable of str
        Lexicon entries to convert.

    Returns
    -------
    list of dict
        One ``{"label": ..., "pattern": [...]}`` entry per term, in input order.
    """
    return [
        {
            "label": label,
            "pattern": [{"lower": token.lower_} for token in nlp.make_doc(term)],
        }
        for term in terms
    ]


# (entity label, lexicon) pairs, in the order their patterns are registered.
labelled_lexicons = [
    ("ORGAN", organs),
    ("COMPLAINT", complaints),
    ("SYMPTOM", symptoms),
    ("ANATOMICAL_SYSTEM", anatomicalSystems),
    ("FAMILY", familyRelations),
    ("RISK_FACTOR", riskFactors),
]

patterns = []
for label, terms in labelled_lexicons:
    patterns.extend(_token_patterns(label, terms))

# Register everything in one call instead of one call per term: same patterns in the same
# order, without paying the per-call overhead of add_patterns for every lexicon entry.
ruler.add_patterns(patterns)

ruler.to_disk("entity_ruler")

2020-11-23 21:17:09 INFO: Loading these models for language: bg (Bulgarian):
| Processor | Package |
-----------------------
| tokenize  | btb     |
| pos       | btb     |
| lemma     | btb     |
| depparse  | btb     |

2020-11-23 21:17:09 INFO: Use device: gpu
2020-11-23 21:17:09 INFO: Loading: tokenize
2020-11-23 21:17:12 INFO: Loading: pos
2020-11-23 21:17:13 INFO: Loading: lemma
2020-11-23 21:17:13 INFO: Loading: depparse
2020-11-23 21:17:14 INFO: Done loading processors!
