# Linking named entities
Filip Gregora

In [52]:
import pandas as pd

# Load the named entities produced by the NER step from a path relative to the notebook.
data = pd.read_csv("data/NER_entities.csv")

In [53]:
# Preview the first ten rows.
data.head(10)

Unnamed: 0,label,text
0,symptom,jemný fibrózní proužek
1,procedura,neoadjuvantní CHT
2,medikace,Novalgin
3,symptom,Označena SLU v levé axile.
4,procedura,st.p. totální ME + SNB vlevo
5,medikace,NOVALGIN
6,procedura,Založení TE l.sin
7,procedura,Cytostatika
8,NE symptom,"přiměřené echogenity,"
9,NE symptom,nezvětšena


## Data exploration
We start by exploring the data.

- Some texts start with a capital letter (but be careful when the whole first word is written in upper case)
- Some texts end with punctuation
- There are lots of duplicates
- The length of the text varies and the longest text has 20 words (this can be a problem later on)


In [55]:
# Number of rows that repeat an earlier row exactly.
int(data.duplicated().sum())

2588

In [126]:
def clean_db(db):
    """Return a cleaned, de-duplicated copy of an entity table.

    The input frame is not modified. Each value of the ``text`` column is:

    1. stripped of leading/trailing spaces and punctuation,
    2. given a lowercase first letter, but only when the second character
       is lowercase (so ``"Novalgin"`` becomes ``"novalgin"`` while
       ``"NOVALGIN"`` is left as is),
    3. normalised so that every run of whitespace becomes a single space.

    Rows whose cleaned text repeats an earlier row are then removed. The
    comparison uses ``text`` only, so the same text under a different label
    is dropped as well. Finally, rows with a missing value in any column
    are removed.

    Parameters
    ----------
    db : pandas.DataFrame
        Table with a string column named ``text``.

    Returns
    -------
    pandas.DataFrame
        The cleaned rows, keeping their original index labels.
    """

    def _lower_leading_capital(text):
        # Texts shorter than two characters have no second character to
        # inspect; they are returned unchanged instead of raising IndexError.
        if len(text) > 1 and text[1].islower():
            return text[0].lower() + text[1:]
        return text

    # Missing texts cannot be cleaned and would be dropped at the end anyway.
    db_copy = db.dropna(subset=["text"]).copy()
    db_copy["text"] = (
        db_copy["text"]
        .apply(lambda x: x.strip(" ,.:(){}[]_-/"))
        .apply(_lower_leading_capital)
        .apply(lambda x: " ".join(x.split()))  # collapse repeated whitespace
    )
    return db_copy.drop_duplicates(subset="text").dropna()

# Replace the raw table with its cleaned, de-duplicated version and preview it.
# NOTE(review): this rebinds `data`, so the raw frame is no longer reachable
# under that name after the cell has run.
data = clean_db(data)
data.head(10)

Unnamed: 0,label,text
0,symptom,jemný fibrózní proužek
1,procedura,neoadjuvantní CHT
2,medikace,novalgin
3,symptom,označena SLU v levé axile
4,procedura,st.p. totální ME + SNB vlevo
5,medikace,NOVALGIN
6,procedura,založení TE l.sin
7,procedura,cytostatika
8,NE symptom,přiměřené echogenity
9,NE symptom,nezvětšena


In [57]:
import math
def comb_sum(j):
    """Return how many non-empty subsets can be picked from ``j`` items.

    This is C(j, 1) + C(j, 2) + ... + C(j, j); the result is 0 when ``j``
    is smaller than 1.
    """
    return sum(math.comb(j, size) for size in range(1, j + 1))

# For every word count from 1 to 20, print how many non-empty word
# combinations exist, i.e. how many queries an exhaustive search could need.
for i in range(1, 21):
    print(i, comb_sum(i), sep = ": ", end = " | ")
    
# Count the texts that consist of at least 7 space-separated tokens.
lenght_data = data["text"].apply(lambda x: len(x.split(" ")))
len(lenght_data[lenght_data >= 7])

1: 1 | 2: 3 | 3: 7 | 4: 15 | 5: 31 | 6: 63 | 7: 127 | 8: 255 | 9: 511 | 10: 1023 | 11: 2047 | 12: 4095 | 13: 8191 | 14: 16383 | 15: 32767 | 16: 65535 | 17: 131071 | 18: 262143 | 19: 524287 | 20: 1048575 | 

148

In [134]:
# One frame per entity label; the label column is dropped because it is
# constant within each frame.
symptom = data.loc[data["label"] == "symptom"].drop(columns="label")
procedura = data.loc[data["label"] == "procedura"].drop(columns="label")
medikace = data.loc[data["label"] == "medikace"].drop(columns="label")
ne_symptom = data.loc[data["label"] == "NE symptom"].drop(columns="label")
os_anamneza = data.loc[data["label"] == "osobní anamnéza"].drop(columns="label")
ne_os_anamneza = data.loc[data["label"] == "NE osobní anamnéza"].drop(columns="label")
ne_medikace = data.loc[data["label"] == "NE medikace"].drop(columns="label")

In [59]:
# The seven per-label frames must together account for every row of `data`.
assert sum(map(len, (os_anamneza, symptom, procedura, medikace,
                     ne_symptom, ne_os_anamneza, ne_medikace))) == len(data)

## Linking to MeSH
MeSH (Medical Subject Headings) is an international medical vocabulary. It is searched here through the UMLS Terminology Services: https://uts.nlm.nih.gov/uts/.

I tried searching the database for all combinations of words from the text. Longer combinations have higher priority.

There is one big problem: the number of combinations grows exponentially with the number of words (in the worst case, a text of 20 words needs around $10^6$ queries). My solution is to go bottom-up: start with combinations of length 1 and build longer combinations only from the words that appeared in a successful search.


In [128]:
import requests
from bs4 import BeautifulSoup
import json
from itertools import combinations

def filter_short(string):
    """Tell whether ``string`` should be skipped instead of searched.

    Returns True for strings made only of digits and for strings shorter
    than two characters (the empty string included); False otherwise.
    """
    if string.isdigit():
        return True
    return len(string) < 2


_UMLS_SEARCH_URL = 'https://uts-ws.nlm.nih.gov/rest/search/current'
_UMLS_API_KEY_ENV = "UMLS_API_KEY"


def _umls_api_key():
    """Return the UMLS API key stored in the ``UMLS_API_KEY`` environment variable.

    Raises
    ------
    RuntimeError
        If the variable is unset or empty.
    """
    import os  # local import so this cell does not rely on a later cell's import

    api_key = os.environ.get(_UMLS_API_KEY_ENV)
    if not api_key:
        raise RuntimeError(
            f"Set the {_UMLS_API_KEY_ENV} environment variable to your UMLS API key "
            "before searching; credentials must not be stored in the notebook."
        )
    return api_key


def _umls_search(term, api_key, *log_context):
    """Send one search request and return its hits as ``(ui, name)`` tuples.

    A response whose status is not 200 is reported with ``print`` (followed
    by any ``log_context`` values) and ``None`` is returned, so callers can
    treat it like a search without hits.
    """
    res = requests.get(_UMLS_SEARCH_URL, params={'string': term, 'apiKey': api_key})
    if res.status_code != 200:
        print(res.status_code, res.text, *log_context)
        return None

    payload = json.loads(res.text)
    return [(hit["ui"], hit["name"]) for hit in payload["result"]["results"]]


def mash_search(string):
    """Exhaustively search word combinations of ``string``, longest first.

    The text is split on single spaces. Starting with the full word count
    and going down to single words, every combination of that size is
    joined with spaces and searched, unless ``filter_short`` rejects it.
    The first size that yields any hit ends the search.

    Parameters
    ----------
    string : str
        Entity text to look up.

    Returns
    -------
    list of tuple
        ``(ui, name)`` pairs of all hits found at the largest successful
        combination size; an empty list when nothing matched.

    Raises
    ------
    RuntimeError
        If the ``UMLS_API_KEY`` environment variable is not set.
    """
    words = string.split(" ")
    api_key = _umls_api_key()

    for size in range(len(words), 0, -1):
        matches = []
        for combo in combinations(words, size):
            term = " ".join(combo)
            if filter_short(term):
                continue
            hits = _umls_search(term, api_key)
            if hits:
                matches.extend(hits)

        if matches:
            return matches

    return []


def mash_search_optimized(string):
    """Search word combinations of ``string`` bottom-up, pruning dead words.

    Combination sizes are tried from 1 upwards. After each size, only the
    words that took part in at least one successful search are kept for the
    next size; they are de-duplicated and kept in their original order, so
    longer queries preserve the word order of the input text. The search
    stops at the first size that stores no hit.

    A combination rejected by ``filter_short`` is still searched, because a
    hit keeps its words alive for longer combinations, but its hits are not
    stored.

    Parameters
    ----------
    string : str
        Entity text to look up.

    Returns
    -------
    list of tuple
        ``(ui, name)`` pairs found at the largest combination size that
        stored any hit; an empty list when nothing matched.

    Raises
    ------
    RuntimeError
        If the ``UMLS_API_KEY`` environment variable is not set.
    """
    words = string.split(" ")
    api_key = _umls_api_key()
    best = []

    # The range is fixed by the initial word count; once pruning leaves fewer
    # words than `size`, no combinations exist and the loop stops via `break`.
    for size in range(1, len(words) + 1):
        matches = []
        hit_words = set()

        for combo in combinations(words, size):
            term = " ".join(combo)
            hits = _umls_search(term, api_key, words)
            if not hits:
                continue
            hit_words.update(combo)
            if filter_short(term):
                continue
            matches.extend(hits)

        # dict.fromkeys de-duplicates while keeping first-seen (original) order.
        words = [word for word in dict.fromkeys(words) if word in hit_words]
        if not matches:
            break
        best = matches

    return best
       
    
def search_db_mash(db):
    """Return a copy of ``db`` with a ``search`` column of exhaustive search hits.

    Each value of the ``text`` column is passed to ``mash_search``; the
    input frame is left untouched.
    """
    hits = db["text"].apply(mash_search)
    return db.assign(search=hits)


def search_db_mash_optimized(db):
    """Return a copy of ``db`` with a ``search`` column of bottom-up search hits.

    Each value of the ``text`` column is passed to ``mash_search_optimized``;
    the input frame is left untouched.
    """
    hits = db["text"].apply(mash_search_optimized)
    return db.assign(search=hits)
    
    
def print_stats(data):
    """Print summary statistics of the ``search`` column of ``data``.

    Reports how many rows have no match (count and percentage) followed by
    the mean, median and maximum number of matches per row. An empty frame
    is reported with a single line instead of failing on a division by zero.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``search`` column holds one list of matches per row.
    """
    total = len(data)
    if total == 0:
        print("Number of empty: 0 (no rows to summarise)")
        return

    number_of_matches = data["search"].apply(len)
    empty = int((number_of_matches == 0).sum())
    print(f"Number of empty: {empty} ({empty / total * 100} %)")
    print(f"Mean from number of matches: {number_of_matches.mean()}")
    print(f"Median from number of matches: {number_of_matches.median()}")
    print(f"Maximal of matches: {number_of_matches.max()}")

In [103]:
# Smoke-test the exhaustive search on a single phrase.
r = mash_search('bolesti patní ostruhy vlevo')
print(r)
# print(json.dumps(r, indent=4))

# Run both search variants on the same table and print their statistics
# one after the other so they can be compared.
ne_medikace_mash = search_db_mash(ne_medikace)
print_stats(ne_medikace_mash)

print()
ne_medikace_mash_optimized = search_db_mash_optimized(ne_medikace)
print_stats(ne_medikace_mash_optimized)

[('C0149756', 'Fasciitis, Plantar')]
Number of empty: 1 (6.25 %)
Mean from number of matches: 19.3125
Median from number of matches: 15.5
Maximal of matches: 50

Number of empty: 1 (6.25 %)
Mean from number of matches: 19.3125
Median from number of matches: 15.5
Maximal of matches: 50


In [105]:
def from_string_to_list(string):
    """Turn the text form of a list of tuples back into a list of tuples.

    The input is expected to look like ``"[('C01', 'Name'), ...]"``, i.e.
    the ``repr`` of a Python list of string tuples. It is parsed with
    ``ast.literal_eval`` so that values containing quotes or the sequence
    ``', '`` are restored correctly (``repr`` switches to double quotes for
    a name such as ``"Crohn's disease"``, which a plain split on ``', '``
    cannot handle).

    If the input is not a valid literal list of tuples, the function falls
    back to splitting the text on ``"), ("`` and ``"', '"``.

    Parameters
    ----------
    string : str
        Text form of the list.

    Returns
    -------
    list of tuple
        The parsed tuples; an empty list for ``"[]"`` or an empty string.
    """
    import ast  # local import keeps the cell self-contained

    try:
        parsed = ast.literal_eval(string)
    except (ValueError, SyntaxError):
        parsed = None

    if isinstance(parsed, list) and all(isinstance(item, tuple) for item in parsed):
        return parsed

    # Fallback: naive split-based parser for text that is not a clean literal.
    result = []
    for chunk in string.strip("[]()").split("), ("):
        if len(chunk) == 0:
            continue
        result.append(tuple(chunk.strip("'").split("', '")))

    return result

In [117]:
import os
# Run the (slow) search only when no cached result exists; otherwise reload
# the cache, restore the saved index and parse the stored match lists.
if not os.path.isfile("saved_search/os_anamneza_mash.csv"):
    os_anamneza_mash = search_db_mash_optimized(os_anamneza)
    os_anamneza_mash.to_csv("saved_search/os_anamneza_mash.csv")
else:
    os_anamneza_mash = pd.read_csv("saved_search/os_anamneza_mash.csv").set_index("Unnamed: 0")
    os_anamneza_mash["search"] = os_anamneza_mash["search"].apply(from_string_to_list)

print_stats(os_anamneza_mash)

Number of empty: 23 (10.69767441860465 %)
Mean from number of matches: 20.934883720930234
Median from number of matches: 20.0
Maximal of matches: 86


In [116]:
# Run the (slow) search only when no cached result exists; otherwise reload
# the cache, restore the saved index and parse the stored match lists.
if not os.path.isfile("saved_search/ne_medikace_mash.csv"):
    ne_medikace_mash = search_db_mash_optimized(ne_medikace)
    ne_medikace_mash.to_csv("saved_search/ne_medikace_mash.csv")
else:
    ne_medikace_mash = pd.read_csv("saved_search/ne_medikace_mash.csv").set_index("Unnamed: 0")
    ne_medikace_mash["search"] = ne_medikace_mash["search"].apply(from_string_to_list)

print_stats(ne_medikace_mash)

Number of empty: 1 (6.25 %)
Mean from number of matches: 19.3125
Median from number of matches: 15.5
Maximal of matches: 50


In [149]:
# Run the (slow) search only when no cached result exists; otherwise reload
# the cache, restore the saved index and parse the stored match lists.
if not os.path.isfile("saved_search/ne_symptom_mash.csv"):
    ne_symptom_mash = search_db_mash_optimized(ne_symptom)
    ne_symptom_mash.to_csv("saved_search/ne_symptom_mash.csv")
else:
    ne_symptom_mash = pd.read_csv("saved_search/ne_symptom_mash.csv").set_index("Unnamed: 0")
    ne_symptom_mash["search"] = ne_symptom_mash["search"].apply(from_string_to_list)

print_stats(ne_symptom_mash)

Number of empty: 109 (10.603112840466926 %)
Mean from number of matches: 22.61284046692607
Median from number of matches: 25.0
Maximal of matches: 138


In [None]:
# Run the (slow) search only when no cached result exists; otherwise reload
# the cache, restore the saved index and parse the stored match lists.
if not os.path.isfile("saved_search/symptom_mash.csv"):
    symptom_mash = search_db_mash_optimized(symptom)
    symptom_mash.to_csv("saved_search/symptom_mash.csv")
else:
    symptom_mash = pd.read_csv("saved_search/symptom_mash.csv").set_index("Unnamed: 0")
    symptom_mash["search"] = symptom_mash["search"].apply(from_string_to_list)

print_stats(symptom_mash)

In [None]:
# Run the (slow) search only when no cached result exists; otherwise reload
# the cache, restore the saved index and parse the stored match lists.
if not os.path.isfile("saved_search/procedura_mash.csv"):
    procedura_mash = search_db_mash_optimized(procedura)
    procedura_mash.to_csv("saved_search/procedura_mash.csv")
else:
    procedura_mash = pd.read_csv("saved_search/procedura_mash.csv").set_index("Unnamed: 0")
    procedura_mash["search"] = procedura_mash["search"].apply(from_string_to_list)

print_stats(procedura_mash)

In [151]:
# Run the (slow) search only when no cached result exists; otherwise reload
# the cache, restore the saved index and parse the stored match lists.
if not os.path.isfile("saved_search/medikace_mash.csv"):
    medikace_mash = search_db_mash_optimized(medikace)
    medikace_mash.to_csv("saved_search/medikace_mash.csv")
else:
    medikace_mash = pd.read_csv("saved_search/medikace_mash.csv").set_index("Unnamed: 0")
    medikace_mash["search"] = medikace_mash["search"].apply(from_string_to_list)

print_stats(medikace_mash)

Number of empty: 74 (24.18300653594771 %)
Mean from number of matches: 14.133986928104575
Median from number of matches: 7.0
Maximal of matches: 82


In [150]:
# Run the (slow) search only when no cached result exists; otherwise reload
# the cache, restore the saved index and parse the stored match lists.
if not os.path.isfile("saved_search/ne_os_anamneza_mash.csv"):
    ne_os_anamneza_mash = search_db_mash_optimized(ne_os_anamneza)
    ne_os_anamneza_mash.to_csv("saved_search/ne_os_anamneza_mash.csv")
else:
    ne_os_anamneza_mash = pd.read_csv("saved_search/ne_os_anamneza_mash.csv").set_index("Unnamed: 0")
    ne_os_anamneza_mash["search"] = ne_os_anamneza_mash["search"].apply(from_string_to_list)

print_stats(ne_os_anamneza_mash)

Number of empty: 4 (6.557377049180328 %)
Mean from number of matches: 20.852459016393443
Median from number of matches: 25.0
Maximal of matches: 75


In [None]:
import xml.etree.ElementTree as elt

# Parse the local MeSH 2023 MARC21 XML file and keep its root element.
content = elt.parse('databaze/MeSH2023_Marc21.xml').getroot()

In [None]:
# Placeholder: walks the direct children of the XML root without doing
# anything with them yet. TODO: implement the processing or remove the cell.
for child in content:
#     print(child)
    pass
    