# Linking named entities
Filip Gregora

In [1]:
import pandas as pd

# Load the entity table from a path relative to the notebook's working directory.
data = pd.read_csv("data/NER_entities.csv")

In [2]:
# Preview the first rows of the loaded table.
data.head()

Unnamed: 0,label,text
0,symptom,jemný fibrózní proužek
1,procedura,neoadjuvantní CHT
2,medikace,Novalgin
3,symptom,Označena SLU v levé axile.
4,procedura,st.p. totální ME + SNB vlevo


In [3]:
def _rows_with_label(label):
    """Return the rows of `data` whose label equals `label`, without the label column."""
    return data[data["label"] == label].drop(columns=["label"])


# One frame per entity label.
symptom = _rows_with_label("symptom")
procedura = _rows_with_label("procedura")
medikace = _rows_with_label("medikace")
ne_symptom = _rows_with_label("NE symptom")
os_anamneza = _rows_with_label("osobní anamnéza")
ne_os_anamneza = _rows_with_label("NE osobní anamnéza")
ne_medikace = _rows_with_label("NE medikace")

In [4]:
# Sanity check: the seven per-label frames together account for every row of `data`.
_label_frames = (
    os_anamneza,
    symptom,
    procedura,
    medikace,
    ne_symptom,
    ne_os_anamneza,
    ne_medikace,
)
assert sum(len(frame) for frame in _label_frames) == len(data)

In [5]:
# Preview the first ten symptom texts before cleaning.
symptom.head(10)

Unnamed: 0,text
0,jemný fibrózní proužek
3,Označena SLU v levé axile.
18,mamma l.sin. dysplazie
24,další ojedinělé
29,stále brnění HKK i DKK
36,hmatná tu léze v DKK
59,další velikostní regrese TU
60,nespavost
64,vlevo jizva ZHK změklá
85,bolesti patní ostruhy vlevo


In [6]:
def clean_db(db):
    """Normalise and de-duplicate the ``text`` column of an entity table.

    Every string in ``text`` has leading/trailing spaces, commas, periods and
    colons removed and is lower-cased. Only the first row of each normalised
    text is kept, and rows containing missing values are dropped.

    Parameters
    ----------
    db : pandas.DataFrame
        Frame with a ``text`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame that keeps the original index labels of the surviving rows.
    """
    def _normalise(value):
        # Non-string cells (e.g. NaN from an empty CSV field) have no .strip();
        # pass them through unchanged so dropna() below removes the missing ones.
        if isinstance(value, str):
            return value.strip(" ,.:").lower()
        return value

    db_copy = db.copy()
    db_copy["text"] = db_copy["text"].apply(_normalise)
    # Drop repeated texts row-wise; this does not rely on index alignment, so it
    # also works when the index contains duplicate labels.
    db_copy = db_copy.drop_duplicates(subset=["text"])
    return db_copy.dropna()

In [8]:
# NOTE: this rebinds `symptom` to its cleaned version; the uncleaned frame is
# no longer reachable under this name afterwards.
symptom = clean_db(symptom)
symptom.head(10)

Unnamed: 0,text
0,jemný fibrózní proužek
3,označena slu v levé axile
18,mamma l.sin. dysplazie
24,další ojedinělé
29,stále brnění hkk i dkk
36,hmatná tu léze v dkk
59,další velikostní regrese tu
60,nespavost
64,vlevo jizva zhk změklá
85,bolesti patní ostruhy vlevo


## Linking to MASH
MASH is an international medical database: https://uts.nlm.nih.gov/uts/.

First I search the database for the whole medical text. If that finds nothing, I search for every combination of the text's words that is one word shorter. I repeat this with ever shorter combinations until a search succeeds (or all combinations have been tried).

In [9]:
import requests
from bs4 import BeautifulSoup
import json
from itertools import combinations

def filter_short(string):
    """Tell whether `string` should be skipped instead of searched in the database.

    Returns True for strings that consist only of digits and for strings
    shorter than two characters (the empty string included); False otherwise.
    """
    if string.isdigit():
        return True
    return len(string) < 2

def mash_search(string, api_key=None, timeout=30):
    """Search the UTS REST endpoint for `string`, backing off to shorter word combinations.

    The text is split into words and the full text is queried first. If no
    query of a given word count returns a hit, every combination of the words
    that is one word shorter is queried, and so on down to single words. The
    search stops after the first word count that produced at least one result.

    NOTE: the number of combinations grows very quickly with the number of
    words, and each combination costs one HTTP request.

    Parameters
    ----------
    string : str
        Text to look up. It is split on whitespace, so repeated spaces do not
        produce empty words.
    api_key : str, optional
        API key for the service. When omitted, the ``UMLS_API_KEY``
        environment variable is used, so the key is never stored in the notebook.
    timeout : float, optional
        Seconds to wait for each HTTP request before giving up on it.

    Returns
    -------
    list of tuple
        ``(ui, name)`` pairs gathered from all successful responses at the
        first word count that matched; an empty list if nothing matched.

    Raises
    ------
    RuntimeError
        If no API key is passed and ``UMLS_API_KEY`` is not set.
    """
    import os  # local import: this cell runs before the notebook's own `import os`

    if api_key is None:
        api_key = os.environ.get("UMLS_API_KEY")
    if not api_key:
        raise RuntimeError(
            "No API key: pass api_key=... or set the UMLS_API_KEY environment variable."
        )

    path = 'https://uts-ws.nlm.nih.gov/rest/search/current'
    words = string.split()
    result = []
    tried = set()  # repeated words produce identical queries; send each only once

    for size in range(len(words), 0, -1):
        for combo in combinations(words, size):
            term = " ".join(combo)
            if term in tried or filter_short(term):
                continue
            tried.add(term)

            query = {
                'string': term,
                'apiKey': api_key,
            }
            try:
                res = requests.get(path, params=query, timeout=timeout)
            except requests.RequestException as error:
                # Best effort, like the non-200 branch below: report and move on.
                print(f"Request for {term!r} failed: {error}")
                continue

            if res.status_code == 200:
                for hit in res.json()["result"]["results"]:
                    result.append((hit["ui"], hit["name"]))
            else:
                print(res.status_code, res.text)

        # Stop at the longest word count that found anything.
        if result:
            break

    return result
        
# Try the back-off search on one symptom phrase (this performs live HTTP requests).
r = mash_search('bolesti patní ostruhy vlevo')
print(r)
# print(json.dumps(r, indent=4))

[('C0149756', 'Fasciitis, Plantar')]


In [10]:
def search_db_mash(db):
    """Clean `db` with `clean_db` and look up each remaining text with `mash_search`.

    Returns the cleaned frame with an added ``search`` column that holds, per
    row, whatever `mash_search` returned for that row's ``text``. The frame
    passed in is not modified.
    """
    cleaned = clean_db(db)
    return cleaned.assign(search=cleaned["text"].apply(mash_search))

# Run the live search for every "NE medikace" text (HTTP requests for each row).
ne_medikace_mash = search_db_mash(ne_medikace)
# 53 is an index label of this frame (see the preview), not a row position.
print(ne_medikace_mash["search"][53])
ne_medikace_mash.head()

[('C0700589', 'Contraceptive methods'), ('C0029151', 'Oral contraception'), ('C0419527', 'Contraception, Postcoital'), ('C0851160', 'Morning after pill method of contraception'), ('C0009866', 'Contraceptive Failure'), ('C0558252', 'Emergency Contraception'), ('C0086580', 'Male Contraception'), ('C0009892', 'Contraceptive History'), ('C0009899', 'Contraceptive Usage'), ('C2985296', 'Hormonal Contraception'), ('C0009870', 'Contraception, Immunologic'), ('C1168146', 'Transdermal contraception'), ('C1720756', 'Emergency Contraceptives'), ('C0086286', 'Female Contraception'), ('C4761168', 'Spermicidal contraceptive method'), ('C4761165', 'Intrauterine contraceptive method'), ('C0004764', 'Contraception, Barrier'), ('C0009891', 'Contraceptive Distribution'), ('C0009885', 'Contraceptive Availability'), ('C0009862', 'Contraception Behavior'), ('C4505082', 'Contraceptive Effectiveness'), ('C0021064', 'Immunological Fertility Control'), ('C4324371', 'Contraception via partner'), ('C1262153', 'In

Unnamed: 0,text,search
53,antikoncepce: 0,"[(C0700589, Contraceptive methods), (C0029151,..."
186,antikoncepce,"[(C0700589, Contraceptive methods), (C0029151,..."
557,tramal,"[(C0040611, Tramal), (C1699769, Tramal SR)]"
615,indometacin,"[(C3653099, indometacin, combinations), (C0021..."
856,hormonální léčba,"[(C0282402, Hormone replacement therapy), (C16..."


In [39]:
def print_stats(data):
    """Print summary statistics about the number of matches per searched text.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a ``search`` column whose cells support ``len()``
        (one collection of matches per row).

    Prints the count and percentage of rows without any match, followed by the
    mean, median and maximum number of matches per row. For a frame with no
    rows only a short notice is printed, because the percentage and the
    averages are undefined.
    """
    total = len(data)
    if total == 0:
        # Guard against dividing by zero when nothing was searched.
        print("Number of empty: 0 (0.0 %)")
        print("No rows to summarise.")
        return

    number_of_matches = data["search"].apply(len)
    empty = int((number_of_matches == 0).sum())
    print(f"Number of empty: {empty} ({empty / total * 100} %)")

    print(f"Mean from number of matches: {number_of_matches.mean()}")
    print(f"Median from number of matches: {number_of_matches.median()}")
    print(f"Maximal of matches: {number_of_matches.max()}")
    
# Summarise how many matches the search found per "NE medikace" text.
print_stats(ne_medikace_mash)

Number of empty: 2 (10.0 %)
Mean from number of matches: 20.4
Median from number of matches: 25.0
Maximal of matches: 50


In [37]:
def from_strin_to_list(string):
    """Parse the text form of a ``search`` cell back into a list of tuples.

    The saved CSV stores each result list as its Python text representation,
    e.g. ``"[('C0149756', 'Fasciitis, Plantar')]"``. The text is parsed as a
    Python literal, so names that contain apostrophes, commas or parentheses
    are restored correctly.

    Parameters
    ----------
    string : str
        Text representation of a list of tuples; may be ``"[]"`` or empty.

    Returns
    -------
    list of tuple
        The parsed entries; an empty list for ``"[]"`` or blank input.

    Raises
    ------
    ValueError, SyntaxError
        If the text is not a valid Python literal.
    """
    import ast  # local import so the function works wherever the cell is run

    text = string.strip()
    if not text:
        return []
    # literal_eval only accepts literals, so it is safe on file content.
    return [tuple(item) for item in ast.literal_eval(text)]

In [86]:
import os

# Reuse the saved search when it exists; otherwise run the live search and save it.
os_anamneza_cache = "saved_search/os_anamneza_mash.csv"
if os.path.isfile(os_anamneza_cache):
    os_anamneza_mash = pd.read_csv(os_anamneza_cache).set_index("Unnamed: 0")
    # The CSV holds each result list as text; turn it back into a list of tuples.
    os_anamneza_mash["search"] = os_anamneza_mash["search"].apply(from_strin_to_list)
else:
    # Create the cache folder before the slow search, so the results are not
    # lost to a missing directory when they are written.
    os.makedirs(os.path.dirname(os_anamneza_cache), exist_ok=True)
    os_anamneza_mash = search_db_mash(os_anamneza)
    os_anamneza_mash.to_csv(os_anamneza_cache)

print_stats(os_anamneza_mash)

Number of empty: 24 (10.434782608695652 %)
Mean from number of matches: 20.64782608695652
Median from number of matches: 18.5
Maximal of matches: 86


In [41]:
# Reuse the saved search when it exists; otherwise run the live search and save it.
ne_medikace_cache = "saved_search/ne_medikace_mash.csv"
if os.path.isfile(ne_medikace_cache):
    ne_medikace_mash = pd.read_csv(ne_medikace_cache).set_index("Unnamed: 0")
    # The CSV holds each result list as text; turn it back into a list of tuples.
    ne_medikace_mash["search"] = ne_medikace_mash["search"].apply(from_strin_to_list)
else:
    # Create the cache folder before the slow search, so the results are not
    # lost to a missing directory when they are written.
    os.makedirs(os.path.dirname(ne_medikace_cache), exist_ok=True)
    ne_medikace_mash = search_db_mash(ne_medikace)
    ne_medikace_mash.to_csv(ne_medikace_cache)

print_stats(ne_medikace_mash)

Number of empty: 2 (10.0 %)
Mean from number of matches: 20.4
Median from number of matches: 25.0
Maximal of matches: 50


In [None]:
# Reuse the saved search when it exists; otherwise run the live search and save it.
ne_symptom_cache = "saved_search/ne_symptom_mash.csv"
if os.path.isfile(ne_symptom_cache):
    ne_symptom_mash = pd.read_csv(ne_symptom_cache).set_index("Unnamed: 0")
    # The CSV holds each result list as text; turn it back into a list of tuples.
    ne_symptom_mash["search"] = ne_symptom_mash["search"].apply(from_strin_to_list)
else:
    # Create the cache folder before the slow search, so the results are not
    # lost to a missing directory when they are written.
    os.makedirs(os.path.dirname(ne_symptom_cache), exist_ok=True)
    ne_symptom_mash = search_db_mash(ne_symptom)
    ne_symptom_mash.to_csv(ne_symptom_cache)

print_stats(ne_symptom_mash)

In [None]:
import xml.etree.ElementTree as elt

# Parse the XML file (path relative to the notebook) and keep its root element.
content = elt.parse('databaze/MeSH2023_Marc21.xml').getroot()

In [None]:
# Placeholder loop over the root's child elements; it currently does nothing.
for child in content:
#     print(child)
    pass
    