In [3]:
import spacy

from funcs.data_processing.stage1_processing import get_ebi_data

In [4]:
ebi_data = get_ebi_data(verbose=False)
print(ebi_data.info())
ebi_data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1191 entries, 0 to 1190
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   query              1191 non-null   object
 1   MAPPED_TERM_LABEL  1191 non-null   object
 2   MAPPED_TERM_URI    1191 non-null   object
 3   MAPPING_TYPE       1191 non-null   object
 4   id                 1191 non-null   object
 5   full_id            1191 non-null   object
 6   mapping_id         1191 non-null   int64 
dtypes: int64(1), object(6)
memory usage: 65.3+ KB
None


Unnamed: 0,query,MAPPED_TERM_LABEL,MAPPED_TERM_URI,MAPPING_TYPE,id,full_id,mapping_id
0,gonarthrosis,osteoarthritis || knee,EFO_0004616,Broad,EFO_0004616,http://www.ebi.ac.uk/efo/EFO_0004616,1
1,psoriatic and enteropathic arthropathies,psoriatic arthritis,EFO_0003778,? Broad,EFO_0003778,http://www.ebi.ac.uk/efo/EFO_0003778,2
2,pain associated with micturition,dysuria,EFO_0003901,? Broad,EFO_0003901,http://www.ebi.ac.uk/efo/EFO_0003901,3
3,other mood,mood disorder,EFO_0004247,? Broad,EFO_0004247,http://www.ebi.ac.uk/efo/EFO_0004247,4
4,preterm delivery,premature birth,EFO_0003917,? Exact,EFO_0003917,http://www.ebi.ac.uk/efo/EFO_0003917,5
...,...,...,...,...,...,...,...
1186,malignant neoplasm without specification of site,cancer,EFO_0000311,Broad,EFO_0000311,http://www.ebi.ac.uk/efo/EFO_0000311,1187
1187,other and unspecified types of non-hodgkin's l...,non-Hodgkins lymphoma,EFO_0005952,Exact,EFO_0005952,http://www.ebi.ac.uk/efo/EFO_0005952,1188
1188,candidiasis,"Candidiasis, Invasive",EFO_1001283,Narrow,EFO_1001283,http://www.ebi.ac.uk/efo/EFO_1001283,1189
1189,other predominantly sexually transmitted disea...,bacterial sexually transmitted disease,EFO_0003955,Narrow,EFO_0003955,http://www.ebi.ac.uk/efo/EFO_0003955,1190


In [58]:
SAMPLE_SIZE = 50
ebi_sample = ebi_data[ebi_data["MAPPING_TYPE"] == "Exact"] \
    [["query", "MAPPED_TERM_LABEL", "MAPPED_TERM_URI"]] \
    .head(SAMPLE_SIZE).drop_duplicates().reset_index(drop=True)
ebi_sample

Unnamed: 0,query,MAPPED_TERM_LABEL,MAPPED_TERM_URI
0,alzheimer s disease,Alzheimer's disease,EFO_0000249
1,endocarditis valve unspecified,endocarditis,EFO_0000465
2,family history of other conditions,family history,EFO_0000493
3,unspecified maternal hypertension,preeclampsia,EFO_0000668
4,stroke not specified as haemorrhage or infarction,stroke,EFO_0000712
5,subarachnoid haemorrhage,subarachnoid hemorrhage,EFO_0000713
6,unspecified human immunodeficiency virus,HIV infection,EFO_0000764
7,other endocrine disorders,endocrine system disease,EFO_0001379
8,other diseases of liver,liver disease,EFO_0001421
9,primary disorders of muscles,muscular disease,EFO_0002970


In [67]:
nlp0 = spacy.load("en_core_web_md")
nlp1 = spacy.load("en_core_web_md")
nlp0.pipe_names

['tok2vec', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner']

In [68]:
def clean_text(term, nlp):
    clean_text = " ".join([_.text for _ in nlp(term) if _.pos_ != "PART" and _.text != "s"])
    return clean_text

def get_norm_pattern(term, nlp):
    doc = nlp(clean_text(term=term, nlp=nlp))
    # remove apostrophe
    res = [{"NORM": _.norm_} for _ in doc if _.pos_ != "PART"]
    return res

patterns = [
    {
        "label": "EFO",
        "pattern": get_norm_pattern(_["MAPPED_TERM_LABEL"], nlp=nlp0),
        "id": _["MAPPED_TERM_URI"],
    }
    for _ in ebi_sample.to_dict(orient="records")
]
print(patterns[:3])

[{'label': 'EFO', 'pattern': [{'NORM': 'alzheimer'}, {'NORM': 'disease'}], 'id': 'EFO_0000249'}, {'label': 'EFO', 'pattern': [{'NORM': 'endocarditis'}], 'id': 'EFO_0000465'}, {'label': 'EFO', 'pattern': [{'NORM': 'family'}, {'NORM': 'history'}], 'id': 'EFO_0000493'}]


In [69]:
ruler = nlp1.add_pipe("entity_ruler")

In [70]:
ruler.add_patterns(patterns)

In [71]:
for idx, _ in enumerate(ebi_sample["MAPPED_TERM_LABEL"]):
    print(f"# {idx}: {_}")
    doc = nlp1(clean_text(_, nlp=nlp0))
    for ent in doc.ents:
        print(ent.text, ent.label_, ent.ent_id_)
    print("\n")

# 0: Alzheimer's disease
Alzheimer disease EFO EFO_0000249


# 1: endocarditis
endocarditis EFO EFO_0000465


# 2: family history
family history EFO EFO_0000493


# 3: preeclampsia
preeclampsia EFO EFO_0000668


# 4: stroke
stroke EFO EFO_0000712


# 5: subarachnoid hemorrhage
subarachnoid hemorrhage EFO EFO_0000713


# 6: HIV infection
HIV infection EFO EFO_0000764


# 7: endocrine system disease
endocrine system disease EFO EFO_0001379


# 8: liver disease
liver disease EFO EFO_0001421


# 9: muscular disease
muscular disease EFO EFO_0002970


# 10: pervasive developmental disorder - not otherwise specified
pervasive developmental disorder - otherwise specified EFO EFO_0003759


# 11: cerebrovascular disorder
cerebrovascular disorder EFO EFO_0003763


# 12: gallbladder disease
gallbladder disease EFO EFO_0003832


# 13: spinal fracture'
spinal fracture ' EFO EFO_0003902


# 14: hepatitis B infection
hepatitis B infection EFO EFO_0004197


# 15: cardiac arrythmia
cardiac arrythmia EFO

In [72]:
for idx, _ in enumerate(ebi_sample["query"]):
    print(f"# {idx}: {_}")
    doc = nlp1(clean_text(_, nlp=nlp0))
    for ent in doc.ents:
        print(ent.text, ent.label_, ent.ent_id_)
    print("\n")

# 0: alzheimer s disease
alzheimer disease EFO EFO_0000249


# 1: endocarditis valve unspecified
endocarditis EFO EFO_0000465


# 2: family history of other conditions
family history EFO EFO_0000493


# 3: unspecified maternal hypertension


# 4: stroke not specified as haemorrhage or infarction
stroke EFO EFO_0000712


# 5: subarachnoid haemorrhage
subarachnoid haemorrhage EFO EFO_0000713


# 6: unspecified human immunodeficiency virus


# 7: other endocrine disorders


# 8: other diseases of liver


# 9: primary disorders of muscles


# 10: pervasive developmental disorders


# 11: other cerebrovascular diseases


# 12: other diseases of gallbladder


# 13: fracture of spine level unspecified


# 14: acute hepatitis b


# 15: abnormalities of heart beat


# 16: other congenital malformations of heart


# 17: disorders of tooth development and eruption


# 18: intracerebral haemorrhage
intracerebral haemorrhage EFO EFO_0005669


# 19: retinal detachments and breaks


# 20: tuberculosi

In [73]:
nlp1.to_disk("entity_ruler")

In [74]:
nlp2 = spacy.load("entity_ruler")

In [75]:
nlp2.pipe_names

['tok2vec',
 'tagger',
 'parser',
 'attribute_ruler',
 'lemmatizer',
 'ner',
 'entity_ruler']

In [76]:
for idx, _ in enumerate(ebi_sample["query"]):
    print(f"# {idx}: {_}")
    doc = nlp2(clean_text(_, nlp=nlp0))
    for ent in doc.ents:
        print(ent.text, ent.label_, ent.ent_id_)
    print("\n")

# 0: alzheimer s disease
alzheimer disease EFO EFO_0000249


# 1: endocarditis valve unspecified
endocarditis EFO EFO_0000465


# 2: family history of other conditions
family history EFO EFO_0000493


# 3: unspecified maternal hypertension


# 4: stroke not specified as haemorrhage or infarction
stroke EFO EFO_0000712


# 5: subarachnoid haemorrhage
subarachnoid haemorrhage EFO EFO_0000713


# 6: unspecified human immunodeficiency virus


# 7: other endocrine disorders


# 8: other diseases of liver


# 9: primary disorders of muscles


# 10: pervasive developmental disorders


# 11: other cerebrovascular diseases


# 12: other diseases of gallbladder


# 13: fracture of spine level unspecified


# 14: acute hepatitis b


# 15: abnormalities of heart beat


# 16: other congenital malformations of heart


# 17: disorders of tooth development and eruption


# 18: intracerebral haemorrhage
intracerebral haemorrhage EFO EFO_0005669


# 19: retinal detachments and breaks


# 20: tuberculosi