# Linking named entities
Filip Gregora

In [1]:
import pandas as pd

# Load the extracted named entities; the preview below shows a `label` and a `text` column.
data = pd.read_csv("data/NER_entities.csv")

In [2]:
# Preview the first ten rows of the raw entities.
data.head(10)

Unnamed: 0,label,text
0,symptom,jemný fibrózní proužek
1,procedura,neoadjuvantní CHT
2,medikace,Novalgin
3,symptom,Označena SLU v levé axile.
4,procedura,st.p. totální ME + SNB vlevo
5,medikace,NOVALGIN
6,procedura,Založení TE l.sin
7,procedura,Cytostatika
8,NE symptom,"přiměřené echogenity,"
9,NE symptom,nezvětšena


## Data exploration
We start by exploring the data.

- Some texts start with a capital letter (but be careful when the whole first word is written in upper case)
- Some texts end with punctuation
- There are lots of duplicates
- The length of the text varies and the longest one has 20 words (this can be a problem later on)


In [3]:
# Number of rows that are exact duplicates of an earlier row.
int(data.duplicated().sum())

2588

In [4]:
from string import punctuation

def clean_db(db):
    """Return a cleaned copy of `db` with normalised, de-duplicated `text` values.

    Each text is processed as follows:
      1. runs of whitespace are collapsed to a single space,
      2. spaces and ASCII punctuation are stripped from both ends,
      3. the first letter is lower-cased, but only when the second character is
         lower case, so fully upper-case words such as "NOVALGIN" are kept.

    Texts that are missing, or empty after stripping, are dropped. Of several rows
    with the same cleaned text only the first is kept. As before, rows with a
    missing value in any other column are dropped as well. The input is not modified.

    Parameters
    ----------
    db : pandas.DataFrame
        Frame with a "text" column.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy; the original index labels of the kept rows are preserved.
    """
    strip_chars = " " + punctuation

    def _clean_text(text):
        if not isinstance(text, str):
            return text  # missing value; removed by dropna() below
        # Normalise whitespace first so a leading tab/newline cannot hide the first letter.
        text = " ".join(text.split()).strip(strip_chars)
        if not text:
            return None  # nothing left to search for
        # The length guard avoids the IndexError the old `x[1]` raised on one-character texts.
        if len(text) > 1 and text[1].islower():
            text = text[0].lower() + text[1:]
        return text

    db_copy = db.copy()
    db_copy["text"] = db_copy["text"].apply(_clean_text)
    db_copy = db_copy.drop_duplicates(subset=["text"])
    return db_copy.dropna()

# Rebind `data` to its cleaned version; every later cell works on the cleaned frame.
data = clean_db(data)
data.head(10)

Unnamed: 0,label,text
0,symptom,jemný fibrózní proužek
1,procedura,neoadjuvantní CHT
2,medikace,novalgin
3,symptom,označena SLU v levé axile
4,procedura,st.p. totální ME + SNB vlevo
5,medikace,NOVALGIN
6,procedura,založení TE l.sin
7,procedura,cytostatika
8,NE symptom,přiměřené echogenity
9,NE symptom,nezvětšena


In [5]:
import math
def comb_sum(j):
    """Return C(j, 1) + C(j, 2) + ... + C(j, j).

    This is the number of non-empty word combinations that can be drawn from a
    text of `j` words; it is 0 when `j` is not positive.
    """
    return sum(math.comb(j, size) for size in range(1, j + 1))

# Number of combinations to try for texts of 1..20 words.
for size in range(1, 21):
    print(size, comb_sum(size), sep=": ", end=" | ")

# How many texts have at least 7 words (127+ combinations each)?
lenght_data = data["text"].map(lambda text: len(text.split(" ")))
int((lenght_data >= 7).sum())

1: 1 | 2: 3 | 3: 7 | 4: 15 | 5: 31 | 6: 63 | 7: 127 | 8: 255 | 9: 511 | 10: 1023 | 11: 2047 | 12: 4095 | 13: 8191 | 14: 16383 | 15: 32767 | 16: 65535 | 17: 131071 | 18: 262143 | 19: 524287 | 20: 1048575 | 

140

In [6]:
# One frame per entity label, without the (now constant) label column.
symptom = data.loc[data["label"] == "symptom"].drop(columns=["label"])
procedura = data.loc[data["label"] == "procedura"].drop(columns=["label"])
medikace = data.loc[data["label"] == "medikace"].drop(columns=["label"])
ne_symptom = data.loc[data["label"] == "NE symptom"].drop(columns=["label"])
os_anamneza = data.loc[data["label"] == "osobní anamnéza"].drop(columns=["label"])
ne_os_anamneza = data.loc[data["label"] == "NE osobní anamnéza"].drop(columns=["label"])
ne_medikace = data.loc[data["label"] == "NE medikace"].drop(columns=["label"])

In [7]:
# The seven label subsets must cover every row of `data`.
assert sum(map(len, (
    os_anamneza, symptom, procedura, medikace, ne_symptom, ne_os_anamneza, ne_medikace,
))) == len(data)

## Linking to MASH
Mash is an international medical database: https://uts.nlm.nih.gov/uts/.

I tried searching the database for all combinations of words from the text. Longer combinations have higher priority.

There is one big problem: the complexity grows exponentially with the number of words in the text (in the worst case, for a text of 20 words, we have to try around 10^6 combinations). My solution to this problem is to go bottom-up: start with combinations of length 1 and continue only with the words from combinations that succeeded.


In [8]:
import requests
from bs4 import BeautifulSoup
import json
from itertools import combinations

# Skip the database lookup for bare numbers and for strings shorter than 2 characters.
def filter_short(string):
    """Return True when `string` should be skipped (too short or digits only)."""
    if len(string) < 2:
        return True
    return string.isdigit()


def mash_search(string, timeout=30):
    """Search the UTS REST endpoint for `string`, trying the longest word combinations first.

    The text is split on single spaces. All combinations of `k` words (kept in
    their original order) are queried for k = number of words down to 1. As soon
    as one length yields any hit, the hits of that length are returned and shorter
    combinations are not tried. Combinations rejected by `filter_short` are skipped.

    Parameters
    ----------
    string : str
        Text of the entity to look up.
    timeout : float, optional
        Seconds to wait for each HTTP response, so a stalled connection raises
        instead of hanging the whole run.

    Returns
    -------
    list[tuple[str, str]]
        `(ui, name)` pairs in the order the service returned them; empty if
        nothing was found.

    Notes
    -----
    Responses with a status other than 200 are printed and ignored.
    """
    import os  # local import: the notebook imports `os` only in a later cell

    path = 'https://uts-ws.nlm.nih.gov/rest/search/current'
    # NOTE(review): hard-coded credential. Set UMLS_API_KEY in the environment instead;
    # the literal is kept only as a fallback and should be rotated and removed.
    api_key = os.environ.get('UMLS_API_KEY', '6a290909-c0d8-4db9-b531-7387929b334e')

    words = string.split(" ")
    result = []
    for size in range(len(words), 0, -1):
        for combo in combinations(words, size):
            term = " ".join(combo)
            if filter_short(term):
                continue

            res = requests.get(path, params={'string': term, 'apiKey': api_key}, timeout=timeout)
            # Only a 200 response carries a result payload (the old check was `<= 200`).
            if res.status_code == 200:
                hits = json.loads(res.text)["result"]["results"]
                result.extend((hit["ui"], hit["name"]) for hit in hits)
            else:
                print(res.status_code, res.text)

        # Longer combinations have priority: stop at the first length with any hit.
        if result:
            break

    return result
        
    
def mash_search_optimized(string, timeout=30):
    """Bottom-up variant of the combination search that prunes words without hits.

    Starting with single words, every combination of `k` of the remaining words is
    queried. Only words that took part in at least one successful combination are
    kept for length `k + 1`, which avoids the exponential number of queries of the
    top-down search. The hits of the longest length for which any word survived
    are returned.

    Parameters
    ----------
    string : str
        Text of the entity to look up (split on single spaces).
    timeout : float, optional
        Seconds to wait for each HTTP response.

    Returns
    -------
    list[tuple[str, str]]
        `(ui, name)` pairs of the last successful length. Hits of different
        combinations are interleaved by their rank in the service response
        (all first hits, then all second hits, ...).

    Notes
    -----
    Responses with a status other than 200 are printed and treated as "no hit".
    Combinations rejected by `filter_short` still keep their words alive for the
    next length, but their hits are not collected.
    """
    import os  # local import: the notebook imports `os` only in a later cell

    path = 'https://uts-ws.nlm.nih.gov/rest/search/current'
    # NOTE(review): hard-coded credential. Set UMLS_API_KEY in the environment instead;
    # the literal is kept only as a fallback and should be rotated and removed.
    api_key = os.environ.get('UMLS_API_KEY', '6a290909-c0d8-4db9-b531-7387929b334e')

    words = string.split(" ")
    level_hits = []
    last_result = []

    for size in range(1, len(words) + 1):
        # Keyed by word, in original order. The old code keyed the initial entry by the
        # integer level, so surviving words were re-ordered by first success and later
        # queries were sent with the wrong word order.
        found = {word: False for word in words}

        for combo in combinations(words, size):
            term = " ".join(combo)
            res = requests.get(path, params={'string': term, 'apiKey': api_key}, timeout=timeout)

            # Only a 200 response carries a result payload (the old check was `<= 200`).
            if res.status_code == 200:
                hits = json.loads(res.text)["result"]["results"]
                if hits:
                    for word in combo:
                        found[word] = True
                    if filter_short(term):
                        continue
                    level_hits.append([(hit["ui"], hit["name"]) for hit in hits])
            else:
                print(res.status_code, res.text, words)

        words = [word for word, has_hit in found.items() if has_hit]
        if not words:
            # Nothing matched at this length: keep the hits of the previous length.
            break
        last_result, level_hits = level_hits, []

    # Preserve the ranking returned by the database: sort by position within each response.
    ranked = []
    for hits in last_result:
        ranked += list(enumerate(hits))
    return [hit for (_, hit) in sorted(ranked)]


def search_db_mash(db):
    """Return a copy of `db` with a "search" column holding `mash_search(text)` per row."""
    hits = db["text"].apply(mash_search)
    return db.assign(search=hits)


def search_db_mash_optimized(db):
    """Return a copy of `db` with a "search" column holding `mash_search_optimized(text)` per row."""
    hits = db["text"].apply(mash_search_optimized)
    return db.assign(search=hits)
    
    
def print_stats(data):
    """Print summary statistics of the "search" column of `data`.

    Reports how many rows have no match at all (count and percentage) and the
    mean, median and maximum number of matches per row.
    """
    match_counts = data["search"].apply(len)
    without_match = data["search"].apply(lambda hits: len(hits) == 0)
    empty = len(data[without_match])
    empty_share = empty / len(data) * 100

    print(f"Number of empty: {empty} ({empty_share} %)")
    print(f"Mean from number of matches: {match_counts.mean()}")
    print(f"Median from number of matches: {match_counts.median()}")
    print(f"Maximal of matches: {match_counts.max()}")

In [9]:
# Smoke test of the top-down search on a single text (this sends live HTTP requests).
r = mash_search('bolesti patní ostruhy vlevo')
print(r)
# print(json.dumps(r, indent=4))

# Run both search variants on the same subset so their statistics can be compared.
ne_medikace_mash = search_db_mash(ne_medikace)
print_stats(ne_medikace_mash)

print()
ne_medikace_mash_optimized = search_db_mash_optimized(ne_medikace)
print_stats(ne_medikace_mash_optimized)

[('C0149756', 'Fasciitis, Plantar')]
Number of empty: 1 (6.25 %)
Mean from number of matches: 19.3125
Median from number of matches: 15.5
Maximal of matches: 50

Number of empty: 1 (6.25 %)
Mean from number of matches: 19.3125
Median from number of matches: 15.5
Maximal of matches: 50


In [10]:
def from_string_to_list(string):
    """Parse the text form of a "search" cell back into a list of tuples.

    A cached CSV stores each list of `(ui, name)` tuples as its Python repr, e.g.
    "[('C0149756', 'Fasciitis, Plantar')]". The repr is parsed with
    `ast.literal_eval`, which also handles names containing an apostrophe (those
    are written with double quotes and were mis-split by the old hand-written
    parser). If the text is not a valid literal, the old splitting logic is used
    as a fallback.

    Parameters
    ----------
    string : str
        Text form of the list, as read from the CSV.

    Returns
    -------
    list[tuple]
        Parsed tuples; empty list for "[]".
    """
    import ast  # local import: `ast` is not imported elsewhere in the notebook

    try:
        parsed = ast.literal_eval(string)
    except (ValueError, SyntaxError, TypeError):
        parsed = None
    if isinstance(parsed, list) and all(isinstance(item, tuple) for item in parsed):
        return parsed

    # Fallback for malformed text: split on the tuple and field separators.
    result = []
    for chunk in string.strip("[]()").split("), ("):
        if len(chunk) == 0:
            continue
        result.append(tuple(chunk.strip("'").split("', '")))

    return result

In [36]:
import os
if not os.path.isfile("saved_search/os_anamneza_mash.csv"):
    # No cached result yet: query the database and store the result for later runs.
    os_anamneza_mash = search_db_mash_optimized(os_anamneza)
    os_anamneza_mash.to_csv("saved_search/os_anamneza_mash.csv")
else:
    # Reuse the cached search; restore the original row ids and the parsed hit lists.
    os_anamneza_mash = pd.read_csv("saved_search/os_anamneza_mash.csv").set_index("Unnamed: 0")
    os_anamneza_mash["search"] = os_anamneza_mash["search"].apply(from_string_to_list)

print_stats(os_anamneza_mash)

Number of empty: 23 (10.74766355140187 %)
Mean from number of matches: 20.83177570093458
Median from number of matches: 19.5
Maximal of matches: 86


In [35]:
if not os.path.isfile("saved_search/ne_medikace_mash.csv"):
    # No cached result yet: query the database and store the result for later runs.
    ne_medikace_mash = search_db_mash_optimized(ne_medikace)
    ne_medikace_mash.to_csv("saved_search/ne_medikace_mash.csv")
else:
    # Reuse the cached search; restore the original row ids and the parsed hit lists.
    ne_medikace_mash = pd.read_csv("saved_search/ne_medikace_mash.csv").set_index("Unnamed: 0")
    ne_medikace_mash["search"] = ne_medikace_mash["search"].apply(from_string_to_list)

print_stats(ne_medikace_mash)

Number of empty: 1 (6.25 %)
Mean from number of matches: 19.3125
Median from number of matches: 15.5
Maximal of matches: 50


In [34]:
if not os.path.isfile("saved_search/ne_symptom_mash.csv"):
    # No cached result yet: query the database and store the result for later runs.
    ne_symptom_mash = search_db_mash_optimized(ne_symptom)
    ne_symptom_mash.to_csv("saved_search/ne_symptom_mash.csv")
else:
    # Reuse the cached search; restore the original row ids and the parsed hit lists.
    ne_symptom_mash = pd.read_csv("saved_search/ne_symptom_mash.csv").set_index("Unnamed: 0")
    ne_symptom_mash["search"] = ne_symptom_mash["search"].apply(from_string_to_list)

print_stats(ne_symptom_mash)

Number of empty: 113 (11.03515625 %)
Mean from number of matches: 22.505859375
Median from number of matches: 25.0
Maximal of matches: 138


In [33]:
if not os.path.isfile("saved_search/symptom_mash.csv"):
    # No cached result yet: query the database and store the result for later runs.
    symptom_mash = search_db_mash_optimized(symptom)
    symptom_mash.to_csv("saved_search/symptom_mash.csv")
else:
    # Reuse the cached search; restore the original row ids and the parsed hit lists.
    symptom_mash = pd.read_csv("saved_search/symptom_mash.csv").set_index("Unnamed: 0")
    symptom_mash["search"] = symptom_mash["search"].apply(from_string_to_list)

print_stats(symptom_mash)

Number of empty: 36 (6.132879045996593 %)
Mean from number of matches: 23.340715502555366
Median from number of matches: 22.0
Maximal of matches: 125


In [32]:
if not os.path.isfile("saved_search/procedura_mash.csv"):
    # No cached result yet: query the database and store the result for later runs.
    procedura_mash = search_db_mash_optimized(procedura)
    procedura_mash.to_csv("saved_search/procedura_mash.csv")
else:
    # Reuse the cached search; restore the original row ids and the parsed hit lists.
    procedura_mash = pd.read_csv("saved_search/procedura_mash.csv").set_index("Unnamed: 0")
    procedura_mash["search"] = procedura_mash["search"].apply(from_string_to_list)

print_stats(procedura_mash)

Number of empty: 47 (7.885906040268456 %)
Mean from number of matches: 24.10234899328859
Median from number of matches: 25.0
Maximal of matches: 121


In [31]:
if not os.path.isfile("saved_search/medikace_mash.csv"):
    # No cached result yet: query the database and store the result for later runs.
    medikace_mash = search_db_mash_optimized(medikace)
    medikace_mash.to_csv("saved_search/medikace_mash.csv")
else:
    # Reuse the cached search; restore the original row ids and the parsed hit lists.
    medikace_mash = pd.read_csv("saved_search/medikace_mash.csv").set_index("Unnamed: 0")
    medikace_mash["search"] = medikace_mash["search"].apply(from_string_to_list)

print_stats(medikace_mash)

Number of empty: 74 (24.262295081967213 %)
Mean from number of matches: 14.177049180327868
Median from number of matches: 7.0
Maximal of matches: 82


In [30]:
if not os.path.isfile("saved_search/ne_os_anamneza_mash.csv"):
    # No cached result yet: query the database and store the result for later runs.
    ne_os_anamneza_mash = search_db_mash_optimized(ne_os_anamneza)
    ne_os_anamneza_mash.to_csv("saved_search/ne_os_anamneza_mash.csv")
else:
    # Reuse the cached search; restore the original row ids and the parsed hit lists.
    ne_os_anamneza_mash = pd.read_csv("saved_search/ne_os_anamneza_mash.csv").set_index("Unnamed: 0")
    ne_os_anamneza_mash["search"] = ne_os_anamneza_mash["search"].apply(from_string_to_list)

print_stats(ne_os_anamneza_mash)

Number of empty: 4 (6.557377049180328 %)
Mean from number of matches: 20.852459016393443
Median from number of matches: 25.0
Maximal of matches: 75


In [89]:
# Put the per-label search results back together, ordered by the original row ids.
data_mash = pd.concat([
    ne_os_anamneza_mash,
    ne_medikace_mash,
    ne_symptom_mash,
    symptom_mash,
    os_anamneza_mash,
    medikace_mash,
    procedura_mash,
]).sort_index()
print_stats(data_mash)

Number of empty: 298 (10.631466286122013 %)
Mean from number of matches: 21.931858722797003
Median from number of matches: 25.0
Maximal of matches: 138


### Not assigned
If we look at a random sample of 10 texts that were not assigned, we can see that five of them contain a typographical mistake (*"nejsou zn.plicní hpertenze"* = *"nejsou zn. plicní hypertenze"*, *"kumulce a nehomogenity"* = *"kumulace a nehomogenita"*, *"ceriucal"* = *"cerucal"*, *"paitace"* = *"palpitace"*, *"mamily klidné"* = ?). The other five are correct medical terms, but in a non-typical grammatical form.

If we correct them by hand, half of them get a match (a 50 % improvement).

In [92]:
# empty_sample = data_mash[data_mash["search"].apply(lambda x: len(x) == 0)].sample(10, random_state=42)

# The sampling above no longer reproduces the sample that was inspected originally
# (clean_db was worse at that time), so the same ten rows are selected by id instead.
empty_sample = data_mash.loc[[878, 91, 5240, 3728, 1125, 2479, 4981, 1134, 5089, 1129]].copy()

# Hand-corrected texts, keyed by row id.
corrected_texts = {
    878: "nejsou zn. plicní hypertenze",
    91: "hormostenické",
    5240: "kumulace a nehomogenita",
    3728: "biopsie",
    1125: "chemobioterapie",
    2479: "dysmorfické",
    4981: "anikterické",
    1134: "cerucal",
    5089: "palpitace",
    1129: "mamily klidné",
}
# Use .loc instead of chained indexing (`frame["text"][id] = ...`), which writes to a
# temporary object and has no effect when pandas copy-on-write is enabled.
for row_id, fixed_text in corrected_texts.items():
    empty_sample.loc[row_id, "text"] = fixed_text

empty_sample = search_db_mash_optimized(empty_sample)
empty_sample

Unnamed: 0_level_0,text,search,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
878,nejsou zn. plicní hypertenze,"[(C0020542, Pulmonary Hypertension), (C0152171...",NE symptom
91,hormostenické,[],NE symptom
5240,kumulace a nehomogenita,[],symptom
3728,biopsie,"[(C0005558, Biopsy), (C0220797, biopsy charact...",procedura
1125,chemobioterapie,[],procedura
2479,dysmorfické,"[(C0005887, Body Dysmorphic Disorders)]",symptom
4981,anikterické,[],NE symptom
1134,cerucal,"[(C0701450, Cerucal)]",medikace
5089,palpitace,"[(C0030252, Palpitations), (C0549267, Palpitat...",NE symptom
1129,mamily klidné,[],NE symptom


There is one mistake which we can correct automatically: a missing space after a punctuation mark. We can see that when there is a space after the punctuation mark the search finds something, otherwise it finds nothing.

We can see that there are around 150 examples of this mistake.

In [21]:
# Same text with and without a space after the full stop.
for sample_text in ("zn. plicní", "zn.plicní"):
    print(len(mash_search_optimized(sample_text)))

50
0


In [22]:
# NOTE(review): stray expression; it is not the last statement of the cell, so it displays nothing.
data
def is_space_after_punc(string):
    """Return False if any punctuation mark is directly followed by a regular character.

    A mark followed by a space, by another punctuation mark, or standing at the
    very end of the text is considered fine.
    """
    marks = frozenset([".", ",", "!", "?", ":", ";", "+"])
    return all(
        current not in marks or following == " " or following in marks
        for current, following in zip(string, string[1:])
    )

# Rows whose text has a punctuation mark glued to the following word.
has_space_after_punc = data["text"].map(is_space_after_punc)
no_space = data[~has_space_after_punc]
len(no_space)

155

In [91]:
def insert_space_after_punc(string):
    """Insert a space after every punctuation mark that is glued to the next character.

    A mark already followed by a space or by another punctuation mark, or standing
    at the end of the text, is left alone. The text is built in a single pass over
    the original characters; the old version changed the string while iterating
    over a range fixed by its original length, so marks near the end were never
    reached (for example "a.b.c" became "a. b.c").

    Parameters
    ----------
    string : str
        Text to fix.

    Returns
    -------
    str
        Text with the missing spaces inserted.
    """
    punctuation = {".", ",", "!", "?", ":", ";", "+"}
    pieces = []
    last = len(string) - 1
    for i, char in enumerate(string):
        pieces.append(char)
        if char in punctuation and i < last:
            following = string[i + 1]
            if following != " " and following not in punctuation:
                pieces.append(" ")

    return "".join(pieces)
    
data_mash = clean_db(data_mash)
# Re-attach the labels by row id. `.loc` raises a KeyError for an id missing from `data`,
# as the former row-by-row lookup did; `to_numpy()` skips index alignment.
data_mash["label"] = data.loc[data_mash.index, "label"].to_numpy()

if os.path.isfile("saved_search/data_mash_inserted_space.csv"):
    inserted_space_data_mash = pd.read_csv("saved_search/data_mash_inserted_space.csv")
    inserted_space_data_mash.index = inserted_space_data_mash["Unnamed: 0"]
    inserted_space_data_mash.drop(["Unnamed: 0"], axis=1, inplace=True)
    inserted_space_data_mash["search"] = inserted_space_data_mash["search"].apply(from_string_to_list)
else:
    inserted_space_data_mash = data_mash.copy()
    # Build a new frame instead of writing into `no_space`, which is a slice of `data`.
    no_space = no_space.assign(text=no_space["text"].apply(insert_space_after_punc))
    # Search again only the rows that exist in the result frame. `.at` writes the list into
    # the cell itself; the former chained assignment wrote to a temporary Series and has no
    # effect when pandas copy-on-write is enabled.
    for i in no_space.index.intersection(inserted_space_data_mash.index):
        inserted_space_data_mash.at[i, "search"] = mash_search_optimized(no_space.at[i, "text"])
    inserted_space_data_mash.to_csv("saved_search/data_mash_inserted_space.csv")

print_stats(data_mash)
print()
print_stats(inserted_space_data_mash)

Number of empty: 298 (10.631466286122013 %)
Mean from number of matches: 21.931858722797003
Median from number of matches: 25.0
Maximal of matches: 138

Number of empty: 256 (9.133071708883339 %)
Mean from number of matches: 22.383517659650373
Median from number of matches: 25.0
Maximal of matches: 138


In [None]:
import xml.etree.ElementTree as elt

# Parse the local XML dump and keep its root element for further processing.
content = elt.parse('databaze/MeSH2023_Marc21.xml').getroot()

In [None]:
# NOTE(review): placeholder loop; it walks the children of the XML root but does nothing yet.
for child in content:
#     print(child)
    pass
    