# Linking named entities
by Filip Gregora

In [1]:
import pandas as pd
from itertools import combinations
from string import punctuation, ascii_letters
import numpy as np
import math
import random
import os
import xml.etree.ElementTree as elt
from openai import OpenAI
import regex
import requests
import json

In [2]:
# Load the extracted named entities; the frame is expected to provide at least
# the `label` and `text` columns used by the preprocessing below.
data = pd.read_csv("data/NER_entities.csv")
# Preview the first ten rows.
data.head(10)

Unnamed: 0,label,text
0,symptom,jemný fibrózní proužek
1,procedura,neoadjuvantní CHT
2,medikace,Novalgin
3,symptom,Označena SLU v levé axile.
4,procedura,st.p. totální ME + SNB vlevo
5,medikace,NOVALGIN
6,procedura,Založení TE l.sin
7,procedura,Cytostatika
8,NE symptom,"přiměřené echogenity,"
9,NE symptom,nezvětšena


## Preprocessing

In [3]:
def clean(string):
    """Normalise a single entity string.

    Leading/trailing spaces and ASCII punctuation are stripped, a capitalised
    first letter is lowered when the second character is lowercase (so
    abbreviations such as "CHT" stay intact), and runs of whitespace are
    collapsed into single spaces.
    """
    stripped = string.strip(" " + punctuation)
    # "Novalgin" -> "novalgin", but "NOVALGIN" is left as it is.
    if len(stripped) >= 2 and stripped[1].islower():
        stripped = stripped[0].lower() + stripped[1:]
    return " ".join(stripped.split())


def clean_table(db):
    """Return a copy of ``db`` with cleaned, de-duplicated ``text`` values.

    Every text is passed through ``clean``. Rows whose cleaned text repeats an
    earlier one receive NaN through index alignment and are removed by the
    final ``dropna()``, which also removes rows with a missing value in any
    other column.
    """
    result = db.copy()
    cleaned = result["text"].apply(clean)
    # Only first occurrences survive drop_duplicates(); the rest become NaN
    # when the shorter Series is aligned back onto the frame.
    result["text"] = cleaned.drop_duplicates()
    return result.dropna()


def remove_punctioation_table(db):
    """Return a copy of ``db`` whose ``text`` has punctuation replaced by spaces.

    Every ASCII punctuation character except "." is turned into a space (dots
    are kept so abbreviations can be detected later); the result is then
    normalised again with ``clean``.
    """
    to_space = str.maketrans({mark: " " for mark in punctuation if mark != "."})
    result = db.copy()
    result["text"] = result["text"].apply(lambda text: clean(text.translate(to_space)))
    return result


def is_soft_negative_word(string):
    """Tell whether ``string`` is a negation marker.

    After stripping spaces, the token counts as negative when it is one of the
    fixed markers, or when it (or a word inside it) starts with "negat".
    """
    token = string.strip(" ")
    if token in ["0", "O", "bez", "ne", "neguje", "neg"]:
        return True
    # Prefixing a space lets a single substring test cover both "starts with
    # negat" and "contains a word starting with negat".
    return " negat" in " " + token


def make_negation_table(db):
    """Return a copy of ``db`` with a ``neg`` flag and negation words removed.

    For rows whose label contains "NE": ``neg`` is True when at least one word
    of the text is a soft negation marker, and those marker words are removed
    from the text (remaining words also lose surrounding ":"). Other rows get
    ``neg`` False and keep their text unchanged.
    """
    def has_negation(row):
        return "NE" in row.label and any(
            is_soft_negative_word(word) for word in row.text.split(" "))

    def without_negation(row):
        if "NE" not in row.label:
            return row.text
        kept = [word.strip(":") for word in row.text.split(" ")
                if not is_soft_negative_word(word)]
        return " ".join(kept)

    db = db.copy()
    db["neg"] = db.apply(has_negation, axis=1)
    db["text"] = db.apply(without_negation, axis=1)
    return db


def shortcuts(string):
    """Collect the abbreviation parts of ``string``.

    Every space-separated word that contains a dot is stripped of outer dots
    and spaces and split on the remaining dots; all parts are returned joined
    by single spaces ("st.p. totální" -> "st p").
    """
    parts = [part
             for word in string.split(" ") if "." in word
             for part in word.strip(" .").split(".")]
    return " ".join(parts)


def make_shortcuts_table(db):
    """Return a copy of ``db`` with abbreviations moved to a ``shortcuts`` column.

    ``shortcuts`` receives the dotted words of ``text`` (see ``shortcuts``);
    ``text`` itself keeps only the words that contain no dot.
    """
    def drop_dotted_words(text):
        return " ".join(word for word in text.split(" ") if "." not in word)

    result = db.copy()
    # The shortcuts must be extracted before the dotted words are removed.
    result["shortcuts"] = result["text"].apply(shortcuts)
    result["text"] = result["text"].apply(drop_dotted_words)
    return result

In [4]:
# Keep the untouched entity text; it is later used as context for the GPT prompt.
data["original_text"] = data["text"]
former_len = len(data)

# Preprocessing pipeline. The order matters: duplicates are dropped on the
# cleaned text, punctuation is removed before negation words are looked up,
# and abbreviations (words with dots) are split off last.
# NOTE(review): `data` is overwritten in place, so re-running this cell
# without re-running the loading cell processes already-processed data.
data = clean_table(data)
data = remove_punctioation_table(data)   
data = make_negation_table(data)
data = make_shortcuts_table(data)

# Report how many rows were removed by the cleaning step.
print(former_len, "->", len(data), ":", former_len - len(data))
data.head(10)

6034 -> 2803 : 3231


Unnamed: 0,label,text,original_text,neg,shortcuts
0,symptom,jemný fibrózní proužek,jemný fibrózní proužek,False,
1,procedura,neoadjuvantní CHT,neoadjuvantní CHT,False,
2,medikace,novalgin,Novalgin,False,
3,symptom,označena SLU v levé axile,Označena SLU v levé axile.,False,
4,procedura,totální ME SNB vlevo,st.p. totální ME + SNB vlevo,False,st p
5,medikace,NOVALGIN,NOVALGIN,False,
6,procedura,založení TE,Založení TE l.sin,False,l sin
7,procedura,cytostatika,Cytostatika,False,
8,NE symptom,přiměřené echogenity,"přiměřené echogenity,",False,
9,NE symptom,nezvětšena,nezvětšena,False,


In [5]:
# Split the entities into one frame per label.
procedura = data[data["label"] == "procedura"]
medikace = data[data["label"] == "medikace"]
symptom = data[data["label"] == "symptom"]
os_anamneza = data[data["label"] == "osobní anamnéza"]
ne_medikace = data[data["label"] == "NE medikace"]
ne_symptom = data[data["label"] == "NE symptom"]
ne_os_anamneza = data[data["label"] == "NE osobní anamnéza"]

# Sanity check: the seven labels above cover every row, i.e. no other label
# value occurs in the data.
assert (len(procedura) + len(medikace) + len(symptom) + len(os_anamneza)
                       + len(ne_medikace) + len(ne_symptom) + len(ne_os_anamneza)
        == len(data))

## Candidate Entity Generation

In [6]:
def patternize(string):
    """Escape regex metacharacters in ``string`` with a backslash.

    The result can be embedded in a regular expression so that the original
    text is matched literally.
    """
    specials = '<([{\\^-=$!|]})?*+.>]'
    return "".join("\\" + char if char in specials else char for char in string)

In [7]:
def drop_long_searches_len_dep(dictionary, limit=50):
    """Empty the result lists that are too long relative to the key length.

    A key keeps its results only while their number does not exceed
    ``limit * len(key)``; otherwise its value is replaced by ``[]``.
    The dictionary is modified in place and also returned.
    """
    for key, hits in list(dictionary.items()):
        if len(hits) > limit * len(key):
            dictionary[key] = []
    return dictionary


def create_doubles(dictionary):
    """Pair up keys whose result lists share at least one element.

    For every ordered pair of distinct keys with a non-empty intersection of
    results, the entry ``"<first> <second>"`` maps to the shared results.
    A pair is skipped when the reversed key is already present.
    """
    pairs = {}
    for left, left_hits in dictionary.items():
        for right, right_hits in dictionary.items():
            if left == right:
                continue

            shared = list(set(left_hits).intersection(set(right_hits)))
            if shared and f"{right} {left}" not in pairs:
                pairs[f"{left} {right}"] = shared

    return pairs


def combine_searches(dictionary):
    """Combine per-word search results into multi-word results.

    Starting from the pairs produced by ``create_doubles``, the combination
    step is repeated on its own output until nothing new can be combined.

    Returns a dict mapping space-separated word groups to the results shared
    by all words of the group. Groups of three or more words are keyed by
    their words in sorted order.
    """
    combined_results = {}
    result = create_doubles(dictionary)
    combined_results.update(result)
    while result:
        merged = create_doubles(result)
        # Canonicalise the merged key: drop repeated words and sort them.
        # Joining a bare set made the word order depend on per-process string
        # hashing, so keys (and the prompts built from them) were not
        # reproducible between runs.
        result = {" ".join(sorted(set(key.split(" ")))): hits
                  for key, hits in merged.items()}
        combined_results.update(result)

    return combined_results

### Linking to MeSH codes using MSHCZ

In [8]:
def _descriptors_with_category(records, letters):
    """Return the records having a MARC field 072 that starts with one of ``letters``.

    A record qualifies when the text of the first subfield of any of its 072
    datafields begins with one of the given letters. Records for which this
    check raises ``IndexError`` (a 072 field without subfields, or with an
    empty first subfield) are skipped entirely.
    """
    selected = []
    for record in records:
        try:
            fields_072 = [field
                          for field in record.findall("{http://www.loc.gov/MARC21/slim}datafield")
                          if field.attrib["tag"] == "072"]
            # The list is built in full on purpose: any malformed 072 field
            # must exclude the record, even if an earlier field matches.
            if any([True for field in fields_072
                    if field[0].text is not None and field[0].text[0] in letters]):
                selected.append(record)
        except IndexError:
            pass
    return selected


# Parse the whole MeSH-CZ export once; the label-specific subsets below are
# views over the same record elements.
whole_mshcz = elt.parse('databaze/MeSH2023_Marc21_Alma.xml').getroot()

procedura_mshcz = _descriptors_with_category(whole_mshcz, ["E", "A"])
print("Number of descriptors for procedura:", len(procedura_mshcz))

symptomy_mshcz = _descriptors_with_category(whole_mshcz, ["A", "G", "C", "D", "E"])
print("Number of descriptors for symptomy:", len(symptomy_mshcz))

os_anamneza_mshcz = _descriptors_with_category(whole_mshcz, ["A", "C", "E"])
print("Number of descriptors for osobní anamnéza:", len(os_anamneza_mshcz))

Number of descriptors for procedura: 4963
Number of descriptors for symptomy: 22191
Number of descriptors for osobní anamnéza: 9921


In [9]:
def medvik_without_descr_search(test, database):
    """Find the records that have a term accepted by ``test``.

    Only subfields of datafields tagged 150, 450 or 750 are examined. A record
    is reported at most once, as ``(code, name)``: the text of its
    controlfield 001 and of the first subfield of its datafield 150. Records
    that match but lack one of those two fields are silently skipped.

    ``test`` is a predicate called with each non-empty subfield text;
    ``database`` is an iterable of MARC record elements.
    """
    ns = "{http://www.loc.gov/MARC21/slim}"

    def has_matching_term(record):
        for field in record.iter(ns + "datafield"):
            if int(field.attrib["tag"]) not in (150, 450, 750):
                continue
            for subfield in field.iter(ns + "subfield"):
                if subfield.text and test(subfield.text):
                    return True
        return False

    found = []
    for record in database:
        if not has_matching_term(record):
            continue
        try:
            code = [f for f in record.findall(ns + "controlfield")
                    if f.attrib["tag"] == "001"][0].text
            name = [f for f in record.findall(ns + "datafield")
                    if f.attrib["tag"] == "150"][0][0].text
        except IndexError:
            # Matching record without an id or a preferred term: ignore it.
            continue
        found.append((code, name))

    return found

In [10]:
def medvik_combined_without_descr_search(string, database=whole_mshcz):
    """Search MeSH terms for ``string`` with progressively looser matching.

    Tried in order, stopping at the first non-empty result (case-insensitive):
    exact equality (level -2), whole-word occurrence (level -1), plain
    substring (level 0).

    Returns ``(results, level)``; the level is 0 whenever the substring search
    was reached, even if it found nothing.
    """
    needle = string.lower()

    def is_exact(text):
        return needle == text.lower()

    def is_whole_word(text):
        lowered = text.lower()
        return (f" {needle} " in lowered
                or lowered.startswith(f"{needle} ")
                or lowered.endswith(f" {needle}"))

    def is_substring(text):
        return needle in text.lower()

    hits, level = medvik_without_descr_search(is_exact, database), -2
    if len(hits) == 0:
        hits, level = medvik_without_descr_search(is_whole_word, database), -1
    if len(hits) == 0:
        hits, level = medvik_without_descr_search(is_substring, database), 0

    return (hits, level)


def medvik_mistakes_search(string, mistakes, database):
    """Fuzzy MeSH search allowing up to ``mistakes`` errors.

    ``string`` is matched literally (metacharacters escaped), ignoring case,
    anywhere inside a term. Returns ``(results, 0)``.
    """
    escaped = patternize(string)
    fuzzy = regex.compile(f"({escaped}){{e<={mistakes}}}", regex.IGNORECASE)

    def is_close(text):
        return fuzzy.search(text) is not None

    return (medvik_without_descr_search(is_close, database), 0)

In [11]:
def shortcuts_medvik_without_descr_search(string, database=whole_mshcz):
    """Search MeSH terms in which some word starts with the abbreviation ``string``.

    A term matches (case-insensitively) when it begins with ``string`` or when
    ``string`` follows a space inside it. Returns ``(results, 0)``.
    """
    needle = string.lower()

    def starts_a_word(text):
        lowered = text.lower()
        # The previous prefix test compared against a slice one character
        # longer than the abbreviation, so it could only succeed when the term
        # was equal to it; a real starts-with test matches the SUKL shortcut
        # searches.
        return f" {needle}" in lowered or lowered.startswith(needle)

    return (medvik_without_descr_search(starts_a_word, database), 0)


def shortcuts_medvik_mistakes_without_descr_search(string, mistakes, database):
    """Fuzzy variant of the MeSH abbreviation search.

    A term matches when the abbreviation, with up to ``mistakes`` errors, is
    found in the leading part of the term or after a space anywhere in it.
    Returns ``(results, 0)``.
    """
    escaped = patternize(string)
    at_start = regex.compile(f"({escaped}){{e<={mistakes}}}", regex.IGNORECASE)
    after_space = regex.compile(f"( {escaped}){{e<={mistakes}}}", regex.IGNORECASE)
    # The leading window is measured on the escaped pattern, as before.
    window = len(escaped) + mistakes

    def test(text):
        if at_start.search(text[:window]) is not None:
            return True
        return after_space.search(text) is not None

    return (medvik_without_descr_search(test, database), 0)

In [12]:
def interprete_explanation(dictionary):
    """Keep only the most specific answered entries.

    Entries whose value is ``None`` are dropped. Of the remaining ones, a
    single-word key is removed when that word is part of some multi-word key.
    """
    answered = {key: value for key, value in dictionary.items() if value is not None}

    covered_words = set()
    for key in answered:
        words = key.split(" ")
        if len(words) >= 2:
            covered_words.update(words)

    return {key: value for key, value in answered.items() if key not in covered_words}

### Linking to ATC codes using SUKL DLP

In [13]:
# Medicinal products from the SUKL DLP export (cp1250, ';'-separated).
products = pd.read_csv("databaze/DLP20240229/dlp_lecivepripravky.csv",
                       encoding="cp1250", delimiter=";")[["KOD_SUKL", "NAZEV", "SILA", "FORMA", "ATC_WHO", "NAZEV_REG"]]
# Only the product name and its ATC code are needed; drop repeated pairs.
products = products[["NAZEV", "ATC_WHO"]].drop_duplicates()

In [14]:
# Raw SUKL tables: substances, product composition and (again) products.
substances_sukl_code = pd.read_csv("databaze/DLP20240229/dlp_latky.csv", encoding="cp1250", delimiter=";"
           )[["KOD_LATKY", "NAZEV_INN", "NAZEV_EN", "NAZEV"]]
content_drugs = pd.read_csv("databaze/DLP20240229/dlp_slozeni.csv", encoding="cp1250", delimiter=";")
products_whole = pd.read_csv("databaze/DLP20240229/dlp_lecivepripravky.csv",
                       encoding="cp1250", delimiter=";")[["KOD_SUKL", "NAZEV", "SILA", "FORMA", "ATC_WHO", "NAZEV_REG"]]

# Product code -> substance code, indexed by product code (KOD_SUKL).
# Substance codes are rounded to integers after rows with missing values are dropped.
drug_code_to_substance_code = content_drugs[["KOD_SUKL", "KOD_LATKY"]].dropna()
drug_code_to_substance_code["KOD_LATKY"] = drug_code_to_substance_code["KOD_LATKY"].apply(lambda x: round(x))
drug_code_to_substance_code.index = drug_code_to_substance_code["KOD_SUKL"]
drug_code_to_substance_code = drug_code_to_substance_code[["KOD_LATKY"]]

# Product code -> product attributes (including ATC_WHO), indexed by product code.
drug_code_to_atc = products_whole.copy()
drug_code_to_atc.index = drug_code_to_atc["KOD_SUKL"]
drug_code_to_atc = drug_code_to_atc.drop(["KOD_SUKL"], axis=1)

# Join both on the product code, then re-index by substance code: one row per
# (substance, product) pair carrying the product's ATC code.
substance_code_to_atc_duplicated = drug_code_to_substance_code.join(drug_code_to_atc, how="left")[["KOD_LATKY", "NAZEV", "ATC_WHO"]]
substance_code_to_atc_duplicated["KOD_SUKL"] = substance_code_to_atc_duplicated.index
substance_code_to_atc_duplicated.index = substance_code_to_atc_duplicated["KOD_LATKY"]
substance_code_to_atc_duplicated = substance_code_to_atc_duplicated.drop(["KOD_LATKY"], axis=1)

# For every substance keep the ATC code that occurs most often among its products.
# Substance codes -1 and 0 are excluded (presumably placeholder codes - TODO confirm).
# NOTE(review): a substance whose products all lack ATC_WHO would make `.index[0]` fail.
substance_code_to_atc = substance_code_to_atc_duplicated.drop([-1, 0]).groupby(["KOD_LATKY"], group_keys=True).apply(
    lambda x: x["ATC_WHO"].value_counts().sort_values(ascending=False).index[0])

substance_code_to_atc.head(3)

KOD_LATKY
1        V11
2    M02AA10
8    V10XA01
dtype: object

In [15]:
# ATC classification table; stack its local-name and English-name columns so
# that both can be searched through a single NAZEV column.
substances_with_ATC = pd.read_csv("databaze/DLP20240229/dlp_atc.csv", encoding="cp1250", delimiter=";")
temp = substances_with_ATC[["ATC", "NAZEV_EN"]]
temp = temp.rename(columns={"NAZEV_EN":"NAZEV"})
# Shift the index so the English rows do not reuse the labels of the original rows.
temp.index = temp.index + len(substances_with_ATC)
substances_with_ATC_concat = pd.concat([substances_with_ATC[["ATC", "NAZEV"]], temp])

# Substance synonyms, linked to ATC through the substance code; synonyms whose
# substance has no ATC code are discarded, as are repeated (name, ATC) pairs.
synonyms = pd.read_csv("databaze/DLP20240229/dlp_synonyma.csv", encoding="cp1250", delimiter=";")
temp = synonyms["KOD_LATKY"].apply(lambda x: substance_code_to_atc[x] if x in substance_code_to_atc else "N/A")
synonyms_linked = synonyms[temp != "N/A"].copy()
synonyms_linked["ATC"] = temp[temp != "N/A"]
synonyms_linked = synonyms_linked[~(synonyms_linked[["NAZEV", "ATC"]].duplicated())]

# Build the final name -> ATC table, starting from an empty frame with the
# columns NAME and ATC.
name_to_atc = pd.DataFrame()
name_to_atc["NAME"] = "N/A"
name_to_atc["ATC"] = "N/A"

# 1) synonym names
length = len(name_to_atc)
temp = pd.DataFrame()
temp["NAME"] = synonyms_linked["NAZEV"].reset_index(drop=True)
temp["ATC"] = synonyms_linked["ATC"].reset_index(drop=True)
temp.index = temp.index + length
name_to_atc = pd.concat([name_to_atc, temp])

# 2) names from the ATC table itself (local and English)
length = len(name_to_atc)
temp = pd.DataFrame()
temp["NAME"] = substances_with_ATC_concat["NAZEV"].reset_index(drop=True)
temp["ATC"] = substances_with_ATC_concat["ATC"].reset_index(drop=True)
temp.index = temp.index + length
name_to_atc = pd.concat([name_to_atc, temp])

# Rows without a name or a code cannot be searched; the column is renamed to
# NAZEV so the table can be passed to the same search functions as the ATC table.
name_to_atc = name_to_atc.dropna()
name_to_atc = name_to_atc.rename(columns={"NAME": "NAZEV"})

print(len(name_to_atc))
name_to_atc.head(3)

85539


Unnamed: 0,NAZEV,ATC
0,"ARTEMISIA ABSINTHIUM L., HERBA",V11
1,HERBA ARTEMISIAE ABSINTHII,V11
2,ABSINTHII SUMMITAS,V11


In [16]:
def search_words_mistake_bottom(string, func1, func2, output_errors=False, limit=0.5):
    """Search every word of ``string``, falling back to fuzzy search.

    ``func1(word)`` returns ``(results, level)``. While the results are empty,
    the level is increased by one and ``func2(word, level)`` is tried, as long
    as the level before the increase does not exceed
    ``floor(len(word) * limit)``. Because of that, the last attempt uses a
    level one above that bound.

    Returns a dict keyed by word, or by ``(word, level)`` when
    ``output_errors`` is true. A repeated word overwrites its earlier entry.
    """
    found = {}
    for word in string.split(" "):
        hits, level = func1(word)
        while len(hits) == 0 and level <= math.floor(len(word) * (limit)):
            level += 1
            hits, _ = func2(word, level)

        key = (word, level) if output_errors else word
        found[key] = hits

    return found

In [17]:
def count_to_tuple(l):
    """Return ``(value, count)`` pairs for the distinct values of ``l``.

    The pairs follow the sorted order produced by ``np.unique``.
    """
    values, counts = np.unique(l, return_counts=True)
    return list(zip(values, counts))


def link_atc_preserve_count(list_atc):
    """Attach ATC names and hit counts to a list of found ATC codes.

    ``list_atc`` is ``(codes, level)``. Every row of ``substances_with_ATC``
    whose code was found becomes ``(code, name, number of times the code was
    found)``. Returns ``(rows, level)``.
    """
    occurrences = count_to_tuple(list_atc[0])
    wanted = list(set(list_atc[0]))
    rows = substances_with_ATC[substances_with_ATC["ATC"].apply(lambda code: code in wanted)]

    linked = []
    for label in rows.index:
        code = rows["ATC"][label]
        times_found = [count for value, count in occurrences if value == code][0]
        linked.append((code, rows["NAZEV"][label], times_found))

    return (linked, list_atc[1])

In [18]:
# Search chemicals
def sukl_chemicals_search(string, database=substances_with_ATC_concat):
    """Search substance names for ``string`` with progressively looser matching.

    Tried in order until something is found (case-insensitive): exact name
    (level -2), whole word inside the name (level -1), substring (level 0).
    Returns ``(list of ATC codes, level)``.
    """
    needle = string.lower()
    names = database["NAZEV"]

    hits, level = database[names.apply(lambda name: needle == name.lower())], -2

    if len(hits) == 0:
        hits, level = database[names.apply(
            lambda name: (" " + needle + " ") in (" " + name.lower() + " "))], -1

    if len(hits) == 0:
        hits, level = database[names.apply(lambda name: needle in name.lower())], 0

    return (list(hits["ATC"]), level)


def sukl_chemicals_mistake_search(string, mistakes, database=substances_with_ATC_concat):
    """Fuzzy substance-name search allowing up to ``mistakes`` errors.

    Returns the list of ATC codes of all names containing an approximate,
    case-insensitive match of ``string``.
    """
    fuzzy = regex.compile(f"({patternize(string)}){{e<={mistakes}}}", regex.IGNORECASE)
    matching = database["NAZEV"].apply(lambda name: fuzzy.search(name) is not None)
    return list(database[matching]["ATC"])


def sukl_find_chemicals_search(string, output_errors=True,
                               database=substances_with_ATC_concat, limit=0.5):
    """Search substances for every word of ``string`` (exact, then fuzzy).

    Thin wrapper around ``search_words_mistake_bottom``; the found ATC codes
    are enriched with names and hit counts by ``link_atc_preserve_count``.
    """
    def exact_lookup(word):
        return link_atc_preserve_count(sukl_chemicals_search(word, database=database))

    def fuzzy_lookup(word, mistakes):
        codes = sukl_chemicals_mistake_search(word, mistakes, database=database)
        return link_atc_preserve_count((codes, 0))

    return search_words_mistake_bottom(string, exact_lookup, fuzzy_lookup,
                                       limit=limit, output_errors=output_errors)

In [19]:
# Chemical shortcuts search
def sukl_chemicals_shortcuts_search(string, database=substances_with_ATC_concat):
    """Search substance names in which a word starts with the abbreviation.

    Names beginning with ``string`` are preferred; only when there are none,
    names containing ``string`` right after a space are used.
    Returns ``(list of ATC codes, 0)``.
    """
    prefix = string.lower()
    names = database["NAZEV"]

    hits = database[names.apply(lambda name: prefix == name[:len(prefix)].lower())]
    if len(hits) == 0:
        hits = database[names.apply(lambda name: (" " + prefix) in name.lower())]

    return (list(hits["ATC"]), 0)


def sukl_chemicals_shortcuts_mistake_search(string, mistakes, database=substances_with_ATC_concat):
    """Fuzzy variant of the substance abbreviation search.

    A name matches when the abbreviation, with up to ``mistakes`` errors, is
    found in the leading part of the name or after a space anywhere in it.
    Returns the list of ATC codes.
    """
    escaped = patternize(string)
    at_start = regex.compile(f"({escaped}){{e<={mistakes}}}", regex.IGNORECASE)
    after_space = regex.compile(f"( {escaped}){{e<={mistakes}}}", regex.IGNORECASE)
    # The leading window is measured on the escaped pattern, as before.
    window = len(escaped) + mistakes

    def test(name):
        if at_start.search(name[:window]) is not None:
            return True
        return after_space.search(name) is not None

    return list(database[database["NAZEV"].apply(test)]["ATC"])


def sukl_find_chemicals_shortcuts_search(string, output_errors=True,
                                         database=substances_with_ATC_concat, limit=0.5):
    """Search substances for every abbreviation in ``string`` (exact, then fuzzy).

    Thin wrapper around ``search_words_mistake_bottom`` using the abbreviation
    searches; results are enriched by ``link_atc_preserve_count``.
    """
    def exact_lookup(word):
        return link_atc_preserve_count(
            sukl_chemicals_shortcuts_search(word, database=database))

    def fuzzy_lookup(word, mistakes):
        codes = sukl_chemicals_shortcuts_mistake_search(word, mistakes, database=database)
        return link_atc_preserve_count((codes, 0))

    return search_words_mistake_bottom(string, exact_lookup, fuzzy_lookup,
                                       limit=limit, output_errors=output_errors)

In [20]:
# Search drugs
def sukl_drugs_search(string, database):
    """Search product names for ``string`` with progressively looser matching.

    Tried in order until something is found (case-insensitive): exact name
    (level -2), whole word inside the name (level -1), substring (level 0).
    Returns ``(list of ATC_WHO codes, level)``.
    """
    needle = string.lower()
    names = database["NAZEV"]

    hits, level = database[names.apply(lambda name: needle == name.lower())], -2
    if len(hits) == 0:
        hits, level = database[names.apply(
            lambda name: (" " + needle + " ") in (" " + name.lower() + " "))], -1
    if len(hits) == 0:
        hits, level = database[names.apply(lambda name: needle in name.lower())], 0

    return (list(hits["ATC_WHO"]), level)


def sukl_drugs_mistake_search(string, mistakes, database):
    """Fuzzy product-name search allowing up to ``mistakes`` errors.

    Returns the list of ATC_WHO codes of all products whose name contains an
    approximate, case-insensitive match of ``string``.
    """
    fuzzy = regex.compile(f"({patternize(string)}){{e<={mistakes}}}", regex.IGNORECASE)
    matching = database["NAZEV"].apply(lambda name: fuzzy.search(name) is not None)
    return list(database[matching]["ATC_WHO"])


def sukl_find_drugs_search(string, database=products, output_errors=True, limit=0.5):
    """Search products for every word of ``string`` (exact, then fuzzy).

    Thin wrapper around ``search_words_mistake_bottom``; the found ATC codes
    are enriched with names and hit counts by ``link_atc_preserve_count``.
    """
    def exact_lookup(word):
        return link_atc_preserve_count(sukl_drugs_search(word, database))

    def fuzzy_lookup(word, mistakes):
        return link_atc_preserve_count((sukl_drugs_mistake_search(word, mistakes, database), 0))

    return search_words_mistake_bottom(string, exact_lookup, fuzzy_lookup,
                                       limit=limit, output_errors=output_errors)

In [21]:
# Drugs shortcuts search
def sukl_drugs_shortcuts_search(string, database):
    """Search product names in which a word starts with the abbreviation.

    Names beginning with ``string`` are preferred; only when there are none,
    names containing ``string`` right after a space are used.
    Returns a plain list of ATC_WHO codes (no level is attached).
    """
    prefix = string.lower()
    names = database["NAZEV"]

    hits = database[names.apply(lambda name: prefix == name[:len(prefix)].lower())]
    if len(hits) == 0:
        hits = database[names.apply(lambda name: (" " + prefix) in name.lower())]

    return list(hits["ATC_WHO"])
    

def sukl_drugs_shortcuts_mistake_search(string, mistakes, database=None):
    """Fuzzy variant of the product abbreviation search.

    A product matches when the abbreviation, with up to ``mistakes`` errors,
    is found in the leading part of its name or after a space anywhere in it.

    ``database`` is the product table to search (columns NAZEV and ATC_WHO);
    when omitted, the module-level ``products`` table is used. The parameter
    was missing before although the body read ``database`` and the caller
    passed it, so every call failed.

    Returns the list of ATC_WHO codes.
    """
    if database is None:
        # Resolved at call time so the function can be defined before the table.
        database = products

    string = patternize(string)
    pattern_start = regex.compile(f"({string}){{e<={mistakes}}}", regex.IGNORECASE)
    pattern_in = regex.compile(f"( {string}){{e<={mistakes}}}", regex.IGNORECASE)
    test = lambda x: (pattern_start.search(x[:(len(string) + mistakes)]) is not None or
                      pattern_in.search(x) is not None)
    find = database[database["NAZEV"].apply(test)]
    return list(find["ATC_WHO"])


def sukl_find_drugs_shortcuts_search(string, database=products, output_errors=True, limit=0.5):
    """Search products for every abbreviation in ``string`` (exact, then fuzzy).

    Thin wrapper around ``search_words_mistake_bottom`` using the product
    abbreviation searches; results are enriched by ``link_atc_preserve_count``.
    """
    def exact_lookup(word):
        found = sukl_drugs_shortcuts_search(word, database)
        # link_atc_preserve_count expects a (codes, level) pair, but the
        # product abbreviation search returns the bare list of codes. Passing
        # the list through made its first *code* be treated as the code list.
        if not isinstance(found, tuple):
            found = (found, 0)
        return link_atc_preserve_count(found)

    def fuzzy_lookup(word, mistakes):
        codes = sukl_drugs_shortcuts_mistake_search(word, mistakes, database)
        return link_atc_preserve_count((codes, 0))

    return search_words_mistake_bottom(string, exact_lookup, fuzzy_lookup,
                                       limit=limit, output_errors=output_errors)

## Candidate Entity Ranking

### Ranking MeSH codes

In [22]:
def medvik_find_by_code(string):
    """Return the description of the MeSH record with id ``string``.

    The description is the text of the first subfield of the record's
    datafield 680. An empty string is returned when ``string`` is empty, when
    no record with that id has a 680 field, or when that field carries no text.
    """
    if len(string) == 0:
        return ""

    ns = "{http://www.loc.gov/MARC21/slim}"
    for record in whole_mshcz:
        ids = [field.text for field in record.findall(ns + "controlfield")
               if field.attrib["tag"] == "001"]
        if not ids or ids[0] != string:
            continue

        notes = [field for field in record.findall(ns + "datafield")
                 if field.attrib["tag"] == "680"]
        if not notes:
            # Same as before: keep scanning in case another record has the id.
            continue

        # A 680 field without subfields used to leak StopIteration, and an
        # empty subfield returned None, which later breaks string handling.
        first_subfield = next(notes[0].iter(ns + "subfield"), None)
        if first_subfield is None or first_subfield.text is None:
            return ""
        return first_subfield.text

    return ""

In [23]:
def message_for_GPT(string, li, find, context=None):
    """Build the multiple-choice prompt asking which candidate matches ``string``.

    ``li`` holds candidates whose first item is a code and second item a name;
    ``find(code)`` supplies the description shown for each option. When
    ``context`` is given, it is quoted in the question. An empty candidate
    list yields an empty message.
    """
    if len(li) == 0:
        return ""

    if context is None:
        question = f"Který z uvedených lékařských pojmů s jeho popisem nejlépe odpovídá pojmu: \"{string}\":\n"
    else:
        question = f"Který z uvedených lékařských pojmů s jeho popisem nejlépe odpovídá pojmu: \"{string}\" v kontextu:  \"{context}\":\n"

    options = [f"{position}. {find(candidate[0])} (pojem: {candidate[1]})\n"
               for position, candidate in enumerate(li, start=1)]

    closing = "Jako odpověď mi pošli pouze číslo odpovědi. Pokud to nebude žádná z možností, pak odpověz NONE. Pokud to není lékařský pojem odpověz taky NONE."

    return question + "".join(options) + closing

In [24]:
# Read the NIH API key from a local file so it is not hard-coded in the notebook.
# NOTE(review): the value keeps any trailing newline present in the file.
with open("APIkeys/NIH", "r") as f:
    NIH_api = f.read()


def send_to_GPT(message):
    """Send ``message`` as a single user prompt to the OpenAI chat API.

    Returns the completion object of the API, or an empty string when
    ``message`` is empty (nothing is sent in that case). The API key is read
    from the local file ``APIkeys/chatGTP`` on every call.
    """
    if message == "":
        return ""

    with open("APIkeys/chatGTP", "r") as f:
        # Key files usually end with a newline; whitespace is never part of
        # the key and would end up in the authorisation header.
        chatgpt_api = f.read().strip()

    client = OpenAI(api_key=chatgpt_api)
    return client.chat.completions.create(
                model="gpt-3.5-turbo-16k",
                messages=[{"role": "user", "content": message}],
                stream=False)

In [25]:
def find_int(string):
    """Return the integer formed by all digit characters of ``string``.

    Non-digit characters are ignored and the digits are concatenated in order
    ("a1b2" -> 12). Raises ``ValueError`` when there is no digit at all.
    """
    digits = "".join(char for char in string if char.isdigit())
    return int(digits)


def from_GPT(result, li, find):
    """Translate the model's numeric answer into the chosen candidate.

    ``result`` is the completion returned by ``send_to_GPT``; ``li`` is the
    candidate list that was offered (1-based in the prompt); ``find(code)``
    supplies the candidate's description.

    Returns ``(code, name, description)`` of the chosen candidate, or ``None``
    when there is no usable answer: no completion, no number in the reply
    (e.g. "NONE"), or a number outside ``1..len(li)``.
    """
    try:
        answer = result.choices[0].message.content
    except (AttributeError, IndexError):
        # send_to_GPT returns "" when nothing was sent; a completion may also
        # carry no choices.
        return None
    if not answer:
        return None

    try:
        position = find_int(answer) - 1
        # An answer of "0" used to become index -1 and silently select the
        # last candidate; only positions that were actually offered count.
        if not 0 <= position < len(li):
            return None
        return (li[position][0], li[position][1], find(li[position][0]))
    except ValueError:
        pass
    except IndexError:
        pass

    return None

### Ranking ATC codes

In [26]:
def combine_preserve_count(linked_text):
    """Combine per-word ATC results and add up their hit counts.

    ``linked_text`` maps each word to a list of ``(code, name, count)``.
    Words are combined on equal ``(code, name)`` pairs via
    ``combine_searches``. For every combined key the result lists
    ``(code, name, total)``, where ``total`` is the sum of the counts the code
    has under each word of the key.
    """
    # Real lists are required here: combine_searches reads every value many
    # times, and the one-shot ``map`` objects used before were exhausted after
    # their first use, so only the first pair of words could ever combine.
    pairs_only = {word: [(hit[0], hit[1]) for hit in hits]
                  for word, hits in linked_text.items()}
    combined = combine_searches(pairs_only)

    result = {}
    for key, shared in combined.items():
        words = key.split(" ")
        links = []
        for link in shared:
            total = sum(
                next(filter(lambda hit: hit[0] == link[0], linked_text[word]))[2]
                for word in words)
            links.append((link[0], link[1], total))
        result[key] = links

    return result

In [27]:
def combine_2_searched_databaze(priority1, priority2, mistakes1, mistakes2):
    """Merge the results of two searches, preferring the better key per word.

    ``priority1`` / ``priority2`` map keys (space-separated words) to results;
    ``mistakes1`` / ``mistakes2`` map single words to their error level. A
    multi-word key gets the sum of the levels of its words; a key without a
    level counts as 10. Keys are ranked by word count, then fewer mistakes,
    then source (``priority1`` wins), then the key text, and are accepted in
    that order as long as they contain a word not yet covered by an accepted
    key. The mistake dictionaries passed in are not modified.
    """
    def ranked(priority, mistakes, weight):
        totals = mistakes.copy()
        for key in priority:
            words = key.split(" ")
            if len(words) >= 2:
                totals[key] = sum([totals[word] for word in words])
        return {(len(key.split(" ")), -totals.get(key, 10), weight, key): hits
                for key, hits in priority.items()}

    ranking = ranked(priority1, mistakes1, 2) | ranked(priority2, mistakes2, 1)

    chosen = {}
    covered = set()
    for record in sorted(ranking, reverse=True):
        words = record[3].split(" ")
        if all(word in covered for word in words):
            continue
        covered.update(words)
        chosen[record[3]] = ranking[record]
    return chosen

In [28]:
def prioritize_sukl(one_search, complete_search):
    """Build the sort key used to rank one ATC candidate among all candidates.

    The key is ``(hit count of the candidate, number of candidates whose name
    contains the candidate's name, random tie-breaker in 0..100)``.
    """
    name = one_search[1]
    containing = 0
    for other in complete_search:
        if name in other[1]:
            containing += 1
    return (one_search[2], containing, random.randint(0, 100))

## Empty Linking Prediction

In [29]:
def drop_long_searches(dictionary, limit=40):
    """Empty every result list that has more than ``limit`` items.

    The dictionary is modified in place and also returned.
    """
    for key, hits in list(dictionary.items()):
        if len(hits) > limit:
            dictionary[key] = []
    return dictionary

In [30]:
def drop_empty_searches(dictionary):
    """Remove every key whose value is an empty list.

    The dictionary is modified in place and also returned.
    """
    empty_keys = [word for word, hits in dictionary.items() if hits == []]
    for word in empty_keys:
        del dictionary[word]

    return dictionary

## Whole search

In [31]:
def whole_way_MeSH(table, index, database, lenght_dep_limit, drop_limit):
    """Link one entity (row ``index`` of ``table``) to MeSH descriptors.

    Steps: search every word and abbreviation of the entity in ``database``
    (exact, then fuzzy), discard over-long result lists, combine words that
    share candidates, and let GPT pick the best candidate for each word group
    and each remaining single word.

    ``lenght_dep_limit`` bounds result lists relative to the word length
    before combining; ``drop_limit`` is the absolute bound applied to single
    words afterwards.

    Returns a dict mapping the linked text to ``(code, name, description)``.
    """
    def ask_gpt(term, candidates):
        # One GPT round-trip: build the prompt, send it, decode the answer.
        message = message_for_GPT(term, candidates, medvik_find_by_code,
                                  context=table["original_text"][index])
        response = send_to_GPT(message)
        return from_GPT(response, candidates, medvik_find_by_code)

    # Search
    search = {}
    if table["text"][index] != "":
        search.update(search_words_mistake_bottom(table["text"][index],
                                              lambda x: medvik_combined_without_descr_search(x, database),
                                              lambda x, y: medvik_mistakes_search(x, y, database=database),
                                              limit=0.25))
    if table["shortcuts"][index] != "":
        search.update(search_words_mistake_bottom(table["shortcuts"][index],
                                              lambda x: shortcuts_medvik_without_descr_search(x, database),
                                              lambda x, y: shortcuts_medvik_mistakes_without_descr_search(
                                                  x, y, database=database),
                                              limit=0.25))

    # Drop & Combine
    search = drop_long_searches_len_dep(search, lenght_dep_limit)
    combined_search = combine_searches(search)
    search = drop_long_searches(search, drop_limit)
    search = drop_empty_searches(search)

    # Choose best Match
    result = {}
    # np.inf instead of np.infty: the alias was removed in NumPy 2.0.
    combined_level = -np.inf
    is_find = set()
    # Longest word groups first; a shorter group is skipped once all of its
    # words are already covered by an accepted longer group.
    for text in sorted(combined_search, key=lambda x: len(x.split(" ")), reverse=True):
        if combined_level > len(text.split(" ")) and all([w in is_find for w in text.split(" ")]):
            continue
        result[text] = ask_gpt(text, combined_search[text])
        if result[text] is not None:
            is_find.update(text.split(" "))
            combined_level = len(text.split(" "))

    # Single words that were not linked as part of a group.
    for text in search:
        if text in is_find:
            continue
        result[text] = ask_gpt(text, search[text])

    return interprete_explanation(result)


def whole_way_SUKL(table, index, database_chem, database_drugs, lenght_dep_limit_chem, lenght_dep_limit_drugs, drop_limit_chem, drop_limit_drugs):
    """Link one entity (row ``index`` of ``table``) to ATC codes.

    Words and abbreviations are searched both among substances
    (``database_chem``) and products (``database_drugs``). Over-long result
    lists are discarded, words sharing candidates are combined, both sources
    are merged (substances take precedence on ties) and the best-ranked
    candidate is kept for every surviving key.

    Returns a dict mapping the linked text to ``(code, name, count)``.
    """
    text = table["text"][index]
    abbreviations = table["shortcuts"][index]

    # Search; keys of the raw results are (word, error level) pairs.
    chemicals_raw = {}
    if text != "":
        chemicals_raw.update(sukl_find_chemicals_search(
            text, output_errors=True, database=database_chem, limit=0.25))
    if abbreviations != "":
        chemicals_raw.update(sukl_find_chemicals_shortcuts_search(
            abbreviations, output_errors=True, database=database_chem, limit=0.25))

    drugs_raw = {}
    if text != "":
        drugs_raw.update(sukl_find_drugs_search(
            text, output_errors=True, database=database_drugs, limit=0.25))
    if abbreviations != "":
        drugs_raw.update(sukl_find_drugs_shortcuts_search(
            abbreviations, output_errors=True, database=database_drugs, limit=0.25))

    # Split the raw results into word -> level and word -> hits.
    search_chemicals_errors = {key[0]: key[1] for key in chemicals_raw}
    search_chemicals = {key[0]: hits for key, hits in chemicals_raw.items()}
    search_drugs_errors = {key[0]: key[1] for key in drugs_raw}
    search_drugs = {key[0]: hits for key, hits in drugs_raw.items()}

    # Drop & Combine
    search_chemicals = drop_long_searches_len_dep(search_chemicals, lenght_dep_limit_chem)
    search_drugs = drop_long_searches_len_dep(search_drugs, lenght_dep_limit_drugs)
    combined_chemicals = combine_preserve_count(search_chemicals)
    combined_drugs = combine_preserve_count(search_drugs)

    # Choosing best match
    search_chemicals = drop_empty_searches(drop_long_searches(search_chemicals, drop_limit_chem))
    search_drugs = drop_empty_searches(drop_long_searches(search_drugs, drop_limit_drugs))

    comb_searches = combine_2_searched_databaze(search_chemicals | combined_chemicals,
                                                search_drugs | combined_drugs,
                                                search_chemicals_errors,
                                                search_drugs_errors)

    best = {}
    for key, candidates in comb_searches.items():
        if len(candidates) == 0:
            continue
        ranked = sorted(candidates,
                        key=lambda candidate: prioritize_sukl(candidate, candidates),
                        reverse=True)
        best[key] = ranked[0]
    return best

In [32]:
def whole_way_procedura(table, index):
    """Link a 'procedura' entity against the procedure subset of MeSH."""
    return whole_way_MeSH(table, index, procedura_mshcz, 30, 80)


def whole_way_medikace(table, index):
    """Link a 'medikace' entity against the SUKL substance and product tables."""
    return whole_way_SUKL(table, index, name_to_atc, products, 25 , 40, 10, 15)


def whole_way_symptomy(table, index):
    """Link a 'symptom' entity against the symptom subset of MeSH."""
    return whole_way_MeSH(table, index, symptomy_mshcz, 30, 80)


def whole_way_os_anamneza(table, index):
    """Link an 'osobní anamnéza' entity against its MeSH subset."""
    return whole_way_MeSH(table, index, os_anamneza_mshcz, 70, 90)


# Negated entities are linked exactly like their positive counterparts.
whole_way_ne_medikace = whole_way_medikace
whole_way_ne_symptomy = whole_way_symptomy
whole_way_ne_os_anamneza = whole_way_os_anamneza

## Labeling

In [33]:
# DrugBank export mapping ATC codes to drugs; the first CSV column is the index.
# The `atc`, `code` and `description` columns are used by description_atc below.
atc_drug_bank = pd.read_csv("databaze/drugbank_to_atc.csv", index_col=[0])
atc_drug_bank.head(3)

Unnamed: 0,atc,name,description,code
0,B01AE02,Lepirudin,Lepirudin is a recombinant hirudin formed by 6...,DB00001
1,L01FE01,Cetuximab,Cetuximab is a recombinant chimeric human/mous...,DB00002
2,R05CB13,Dornase alfa,Dornase alfa is a biosynthetic form of human d...,DB00003


In [34]:
def link_drug_drugbank(drug_code):
    """Return the DrugBank URL for ``drug_code``.

    Codes containing "DBSALT" point to the salts section, all others to the
    drugs section.
    """
    return (f"https://go.drugbank.com/salts/{drug_code}" if "DBSALT" in drug_code
            else f"https://go.drugbank.com/drugs/{drug_code}")


def description_atc(atc_code):
    """Describe the DrugBank entries recorded under ``atc_code``.

    Returns a text block with one entry per matching row of ``atc_drug_bank``
    (code, DrugBank link, description), or an empty string when the code has
    no entry.
    """
    matches = atc_drug_bank[atc_drug_bank["atc"] == atc_code]
    if len(matches) == 0:
        return ""

    lines = ["Látky vedené v databázi DrugBank spojené s tímto ATC kódem:\n"]
    for code, des in zip(matches["code"], matches["description"]):
        lines.append(f"{code}: {link_drug_drugbank(code)} \n{des}\n")

    return "".join(lines)

In [35]:
def link_medvik(code):
    """Return the Medvik URL of the MeSH record with id ``code``."""
    return f"https://www.medvik.cz/bmc/link.do?id={code}"


def remove_cit(des):
    """Remove citation notes of the form "(Cit. ...)" from a description."""
    # Raw string: the pattern is unchanged, but "\(" and "\." in a normal
    # string literal are invalid escape sequences (SyntaxWarning on 3.12+).
    return regex.sub(r" ?\((C|c)it\.([^\)])*\)?.", "", des)


def link_drugbank(code):
    """Return the DrugBank URL of the ATC category ``code``."""
    return f"https://go.drugbank.com/atc/{code}"

In [36]:
def prepare_labeling_medvik(db, expl_column="explanation"):
    """Prepare the MeSH linking results for manual labeling.

    Rows with an empty explanation are dropped. For the others, every linked
    text becomes ``(text, code, name, description without citation notes,
    Medvik URL)`` in the new column "Link to MSHCZ". Only the label, the
    original text and that column are returned.
    """
    def to_links(explanations):
        links = []
        for term, match in explanations.items():
            links.append((term, match[0], match[1], remove_cit(match[2]), link_medvik(match[0])))
        return links

    linked = db[db[expl_column] != {}].copy()
    linked["Link to MSHCZ"] = linked[expl_column].apply(to_links)
    return linked[["label", "original_text", "Link to MSHCZ"]]


def prepare_labeling_sukl(db, expl_column="explanation"):
    """Build the labeling sheet for entities linked to ATC codes.

    Rows whose `expl_column` value equals `{}` are dropped. For every
    remaining row, each key of the explanation dict becomes a tuple
    `(key, value[0], value[1], description_atc(value[0]), link_drugbank(value[0]))`.

    Returns a new DataFrame with the columns "label", "original_text" and
    "Link to MSHCZ" (the column keeps this name even though the links point
    to DrugBank); `db` itself is not modified.
    """
    def as_entries(explanation):
        entries = []
        for term in explanation:
            match = explanation[term]
            entries.append(
                (term, match[0], match[1], description_atc(match[0]), link_drugbank(match[0]))
            )
        return entries

    has_match = db[expl_column] != {}
    linked = db[has_match].copy()
    linked["Link to MSHCZ"] = linked[expl_column].apply(as_entries)
    return linked[["label", "original_text", "Link to MSHCZ"]]

In [37]:
def predict_for_labeling_table(db, whole_way_func, n=35, random_state=None, negative=False):
    """Link shuffled rows of `db` until `n` of them produce a non-empty result.

    The rows are shuffled (`random_state` makes the order reproducible) and
    processed one by one with `whole_way_func(table, row_label)`. Processing
    stops as soon as `n` rows have an explanation different from `{}` or when
    the rows run out.

    Parameters
    ----------
    db : pandas.DataFrame
        Entities to link; it is not modified.
    whole_way_func : callable
        Called as `whole_way_func(table, row_label)`. With `negative=False`
        its return value is stored as the explanation; with `negative=True`
        it must be indexable, element 0 being the explanation and element 1
        the negation flag.
    n : int
        Number of rows with a non-empty explanation to collect.
    random_state : optional
        Forwarded to `DataFrame.sample`.
    negative : bool
        Whether to also record a "negation" column.

    Returns
    -------
    pandas.DataFrame
        The processed rows in shuffled order (including those whose
        explanation is `{}`), with an added "explanation" column and, when
        `negative` is true, a "negation" column.
    """
    to_link = db.sample(frac=1, random_state=random_state)
    # Create the column with an explicit object dtype: it later stores dicts,
    # which a string-typed column (inferred from the "N/A" scalar by pandas
    # versions with string dtype inference) would refuse.
    to_link["explanation"] = pd.Series("N/A", index=to_link.index, dtype=object)
    if negative:
        to_link["negation"] = False

    count_success, index = 0, 0
    while count_success < n and index < len(to_link):
        j = to_link.index[index]

        result = whole_way_func(to_link, j)
        if negative:
            to_link.at[j, "explanation"] = result[0]
            to_link.at[j, "negation"] = result[1]
        else:
            to_link.at[j, "explanation"] = result

        # Only rows that were actually linked count towards `n`.
        if to_link.at[j, "explanation"] != {}:
            count_success += 1

        index += 1

    # Keep just the rows that were processed.
    return to_link.iloc[:index]

### Generating Results

In [38]:
# temp = predict_for_labeling_table(make_shortcuts_table(procedura),
#                                                     whole_way_procedura,
#                                                     n=35,
#                                                     random_state=124)

# temp.to_json("labeling_doctors/procedura_with_empty.json")
# prepare_labeling_medvik(temp).to_csv("labeling_doctors/procedura_labeling.csv")

In [39]:
# temp = predict_for_labeling_table(make_shortcuts_table(medikace),
#                                                     whole_way_medikace,
#                                                     n=35,
#                                                     random_state=125)

# temp
# temp.to_json("labeling_doctors/medikace_with_empty.json")
# prepare_labeling_sukl(temp).to_csv("labeling_doctors/medikace_labeling.csv")

In [40]:
# temp = predict_for_labeling_table(make_shortcuts_table(symptom),
#                                                     whole_way_symptomy,
#                                                     n=35,
#                                                     random_state=123)

# temp.to_json("labeling_doctors/symptom_with_empty.json")
# prepare_labeling_medvik(temp).to_csv("labeling_doctors/symptom_labeling.csv")

In [41]:
# temp = predict_for_labeling_table(make_shortcuts_table(os_anamneza),
#                                                     whole_way_os_anamneza,
#                                                     n=35,
#                                                     random_state=120)

# temp.to_json("labeling_doctors/osobni_with_empty.json")
# prepare_labeling_medvik(temp).to_csv("labeling_doctors/osobni_labeling.csv")

In [42]:
# temp = predict_for_labeling_table(make_shortcuts_table(ne_medikace),
#                                                     whole_way_ne_medikace,
#                                                     n=35,
#                                                     random_state=121,
#                                                     negative=True)

# temp.to_json("labeling_doctors/ne_medikace_with_empty.json")
# prepare_labeling_sukl(temp).to_csv("labeling_doctors/ne_medikace_labeling.csv")

In [43]:
# temp = predict_for_labeling_table(make_shortcuts_table(ne_symptom),
#                                                     whole_way_ne_symptomy,
#                                                     n=35,
#                                                     random_state=119,
#                                                     negative=True)

# temp.to_json("labeling_doctors/ne_symptom_with_empty.json")
# prepare_labeling_medvik(temp).to_csv("labeling_doctors/ne_symptom_labeling.csv")

In [44]:
# temp = predict_for_labeling_table(make_shortcuts_table(ne_os_anamneza),
#                                                     whole_way_ne_os_anamneza,
#                                                     n=35,
#                                                     random_state=122,
#                                                     negative=True)

# temp.to_json("labeling_doctors/ne_osobni_with_empty.json")
# prepare_labeling_medvik(temp).to_csv("labeling_doctors/ne_osobni_labeling.csv")

## Test

In [45]:
# Widen text columns so the long link tuples below are readable.
pd.set_option("display.max_colwidth", 200)

In [46]:
# Smoke test: link `medikace` rows until 5 have a non-empty explanation.
temp = predict_for_labeling_table(
    make_shortcuts_table(medikace),
    whole_way_medikace,
    n=5,
    random_state=42,
)

prepare_labeling_sukl(temp)

Unnamed: 0,label,original_text,Link to MSHCZ
2069,medikace,Lexaurin 3 mg,"[(lexaurin, N05BA08, BROMAZEPAM, , https://go.drugbank.com/atc/N05BA08), (mg, J01CR02, AMOXICILIN A INHIBITOR BETA-LAKTAMASY, , https://go.drugbank.com/atc/J01CR02), (3, V06XX, POTRAVINY PRO ZVLÁ..."
1703,medikace,Orcal Neo,"[(orcal Neo, C08CA01, AMLODIPIN, , https://go.drugbank.com/atc/C08CA01)]"
1043,medikace,Cerucal,"[(cerucal, A03FA01, METOKLOPRAMID, , https://go.drugbank.com/atc/A03FA01)]"
3987,medikace,Arimidex,"[(arimidex, L02BG03, ANASTROZOL, , https://go.drugbank.com/atc/L02BG03)]"
432,medikace,Zarzio 48 MU,"[(zarzio, L03AA02, FILGRASTIM, Látky vedené v databázi DrugBank spojené s tímto ATC kódem:\nDB00099: https://go.drugbank.com/drugs/DB00099 \nFilgrastim is a short-acting recombinant, non-pegylated..."


In [47]:
# Smoke test: link `ne_medikace` rows until 5 have a non-empty explanation.
temp = predict_for_labeling_table(
    make_shortcuts_table(ne_medikace),
    whole_way_ne_medikace,
    n=5,
    random_state=42,
)

prepare_labeling_sukl(temp)

Unnamed: 0,label,original_text,Link to MSHCZ
53,NE medikace,antikoncepce: 0,"[(antikoncepce, R05CA07, SULFID ANTIMONIČNÝ, , https://go.drugbank.com/atc/R05CA07)]"
557,NE medikace,Tramal,"[(tramal, N02AX02, TRAMADOL, Látky vedené v databázi DrugBank spojené s tímto ATC kódem:\nDBSALT000181: https://go.drugbank.com/salts/DBSALT000181 \nnan\n, https://go.drugbank.com/atc/N02AX02)]"
1620,NE medikace,ajatin,"[(ajatin, S01AX, JINÁ ANTIINFEKTIVA, , https://go.drugbank.com/atc/S01AX)]"
3356,NE medikace,FA: sine,"[(sine, V11, FYTOFARMAKA A ŽIVOČIŠNÉ PRODUKTY (ČESKÁ ATC SKUPINA), , https://go.drugbank.com/atc/V11), (FA, N02AA01, MORFIN, , https://go.drugbank.com/atc/N02AA01)]"
2233,NE medikace,AJATIN,"[(AJATIN, S01AX, JINÁ ANTIINFEKTIVA, , https://go.drugbank.com/atc/S01AX)]"


In [None]:
# Smoke test: link `procedura` rows until 5 have a non-empty explanation.
temp = predict_for_labeling_table(
    make_shortcuts_table(procedura),
    whole_way_procedura,
    n=5,
    random_state=42,
)

prepare_labeling_medvik(temp)

In [None]:
# Smoke test: link `symptom` rows until 5 have a non-empty explanation.
temp = predict_for_labeling_table(
    make_shortcuts_table(symptom),
    whole_way_symptomy,
    n=5,
    random_state=42,
)

prepare_labeling_medvik(temp)

In [None]:
# Smoke test: link `os_anamneza` rows until 5 have a non-empty explanation.
temp = predict_for_labeling_table(
    make_shortcuts_table(os_anamneza),
    whole_way_os_anamneza,
    n=5,
    random_state=42,
)

prepare_labeling_medvik(temp)

In [None]:
# Smoke test: link `ne_symptom` rows until 5 have a non-empty explanation.
temp = predict_for_labeling_table(
    make_shortcuts_table(ne_symptom),
    whole_way_ne_symptomy,
    n=5,
    random_state=42,
)

prepare_labeling_medvik(temp)