# Linking named entities
by Filip Gregora

In [1]:
import pandas as pd
from itertools import combinations
from string import punctuation
import numpy as np
import math
import random
import os
import xml.etree.ElementTree as elt
import regex
import requests
import json

In [2]:
data = pd.read_csv("data/NER_entities.csv")
data.head(10)

Unnamed: 0,label,text
0,symptom,jemný fibrózní proužek
1,procedura,neoadjuvantní CHT
2,medikace,Novalgin
3,symptom,Označena SLU v levé axile.
4,procedura,st.p. totální ME + SNB vlevo
5,medikace,NOVALGIN
6,procedura,Založení TE l.sin
7,procedura,Cytostatika
8,NE symptom,"přiměřené echogenity,"
9,NE symptom,nezvětšena


## Data exploration
At the beginning we want to explore data.

- We can see that somewhere there is big letter at the beginning (but be carefull when whole first word is written in upper case)
- Somewhere at the end is interpuncion
- There are lots of duplicates
- the length of text is variable and the longest has 20 words (this can be problem in the future)


In [3]:
len(data[data.duplicated()])

2588

In [4]:
def clean(string):
    string = string.strip(" " + "".join(punctuation))
    # Remove first upper letter if not all letters are upper
    if len(string) >= 2:
        string = string[0].lower() + string[1:] if string[1].islower() else string
    # Replace multiple whitespaces with one
    return " ".join(string.split())

def clean_table(db):
    db_copy = db.copy()
    db_copy["text"] = db_copy["text"].apply(clean)
    db_copy["text"] = db_copy["text"].drop_duplicates()
    return db_copy.dropna()

data = clean_table(data)
data.head(10)

Unnamed: 0,label,text
0,symptom,jemný fibrózní proužek
1,procedura,neoadjuvantní CHT
2,medikace,novalgin
3,symptom,označena SLU v levé axile
4,procedura,st.p. totální ME + SNB vlevo
5,medikace,NOVALGIN
6,procedura,založení TE l.sin
7,procedura,cytostatika
8,NE symptom,přiměřené echogenity
9,NE symptom,nezvětšena


In [5]:
def comb_sum(j):
    sum = 0
    for i in range(j, 0, -1):
        sum += math.comb(j,i)
    return sum

for i in range(1, 21):
    print(i, comb_sum(i), sep = ": ", end = " | ")
    
lenght_data = data["text"].apply(lambda x: len(x.split(" ")))
len(lenght_data[lenght_data >= 7])

1: 1 | 2: 3 | 3: 7 | 4: 15 | 5: 31 | 6: 63 | 7: 127 | 8: 255 | 9: 511 | 10: 1023 | 11: 2047 | 12: 4095 | 13: 8191 | 14: 16383 | 15: 32767 | 16: 65535 | 17: 131071 | 18: 262143 | 19: 524287 | 20: 1048575 | 

140

## Linking
Our approach to linking entities is:
- Link words to some medicinal database.
- Let some pretrained language model to choose the best from them.

So I can compare I decided to use two approaches for linking. The first one is to use international mesh and access it via web API of NIH (National Institute of Health).

The second is to use czech mesh. I accessed it from predownloaded file.


### Linking to international MESH through NIH
Mash is international medical databaze: https://uts.nlm.nih.gov/uts/.

I tried search all combinations of words from text in databaze. The longer combinations have higher priority. 

There is one big problem, the complexity grows exponentially with the lenght of the words (in the worst case for lenght of 20 we have to try around 10^6 combinations). My solution for this problem is go from bottom up, start with lenght 1 and continue only with combinations which success.

In [6]:
# Do not search in databaze if it number or it is too short (shorter than 2)
def filter_short(string):
    return len(string) < 2 or string.isdigit()
    
    
def print_linking_stats(data_list):
    empty = len(data_list[data_list.apply(lambda x: len(x) == 0)])
    print(f"Number of empty: {empty} ({empty / len(data_list) * 100} %)")

    number_of_matches = data_list.apply(lambda x: len(x))
    print(f"Mean from number of matches: {number_of_matches.mean()}")
    print(f"Median from number of matches: {number_of_matches.median()}")
    print(f"Maximal of matches: {number_of_matches.max()}")

In [7]:
# From string to list which contains tuples
def from_string_to_list(string):
    result = []
    for j in string.strip("[]()").split("), ("):
        if len(j) == 0:
            continue
        result.append(tuple([s.strip("'\" \\") for s in j.split("', ")]))
                
    return result


#From string to tuple
def from_string_to_tuple(string):
    if string == "N/A":
        return
    result = [i.strip("\\\"'(),") for i in string.strip("\\\" )('").split(", ")]
    return (result[0], result[1], ", ".join(result[2:]))


#From string to dictionary
def from_string_to_dict(string):
    result = {}
    for j in string.strip("{} ").split("], "):
        if j == "":
            continue
        i = list(j.split(": ["))
        assert len(i) == 2
        result[i[0].strip("\"\' \\")] = from_string_to_list(i[1])

    return result


#From string to dictionary which contains tuples
def from_string_to_dict_to_tuple(string):
    result = {}
    for j in regex.split("(\)| None), ('|\")", string.strip("{} ")):
        if j in ["", ')', ' None', "'", '"']:
            continue
        i = list(j.split(": ("))
        if len(i) == 1:
            i[0] = i[0].split(": None")[0].strip(": ")
            result[i[0].strip("\"\' \\")] = None
        else:
            result[i[0].strip("\"\' \\")] = from_string_to_tuple(i[1])

    return result

#From string to dictionary with ints
from_string_to_int_dict = (lambda x: {elem.split("': ")[0].strip("' \"") : int(elem.split("': ")[1])
                                      for elem in x.strip("{}\"' ").split(", '")})

In [8]:
with open("APIkeys/NIH", "r") as f:
    NIH_api = f.read()

def mash_slow_search(string):
    splitted_input = (string.split(" "))
    result = []
    for j in range(len(splitted_input), 0, -1):
        for string in combinations(splitted_input, j): 
            if filter_short(" ".join(string)):
                continue
                
            path = 'https://uts-ws.nlm.nih.gov/rest/search/current'
            query = {
                     'string': " ".join(string),
                     'apiKey':NIH_api,
            }
            res = requests.get(path, params=query)

            if res.status_code <= 200:
                data = json.loads(res.text)
                for j in data["result"]["results"]:
                    result.append((j["ui"], j["name"]))
            else:
                print(res.status_code, res.text)
        
        if len(result) != 0:
            break
                        
    return result
        
    
def search_from_bottom(string, func, output_state = False):
    if (output_state):
        global count
        count += 1
        if count % 100 == 0:
            print(count)
    
    splitted_input = (string.split(" "))
    result = []
    last_result = []
    lenght = len(splitted_input)
    
    for j in range(1, lenght + 1):
        splitted_dict={}
        for elem in splitted_input:
            splitted_dict[j] = False
                    
        for words in combinations(splitted_input, j):
            data = func(" ".join(words))
            if len(data) != 0:
                for j in words:
                    splitted_dict[j] = True
                if filter_short(" ".join(words)):
                    continue
                result.append(data)

        splitted_input = [j for j, i in splitted_dict.items() if i]
        if len(splitted_input) == 0:
            break
        else:
            last_result, result = result, []
        
    temp = []
    for j in last_result:
        temp += list(enumerate(j))
    return [j for (i, j) in sorted(temp)]
    
    
def mash_search(string):
    path = 'https://uts-ws.nlm.nih.gov/rest/search/current'
    query = {
             'string': string,
             'apiKey':NIH_api,
    }
    res = requests.get(path, params=query)

    if res.status_code <= 200:
        data = json.loads(res.text)          
        return [(j["ui"], j["name"]) for j in data["result"]["results"]]
    else:
        print(res.status_code, res.text)
        return []
    

def seach_table(db, func):
    db = db.copy()
    db["search"] = db["text"].apply(func)
    return db 
    

def mash_search_table(db):
    return seach_table(db, lambda x: search_from_bottom(x, mash_search))

In [9]:
if os.path.isfile("saved_search/data_mash.csv"):
    mash_linked_data = pd.read_csv("saved_search/data_mash.csv")
    mash_linked_data.index = mash_linked_data["Unnamed: 0"]
    mash_linked_data.drop(["Unnamed: 0"], axis=1, inplace=True)
    mash_linked_data["search"] = mash_linked_data["search"].apply(from_string_to_list)
else:
    mash_linked_data = mash_search_table(data)
    mash_linked_data.to_csv("saved_search/data_mash.csv")
    
print_linking_stats(mash_linked_data["search"])

Number of empty: 298 (10.631466286122013 %)
Mean from number of matches: 21.931858722797003
Median from number of matches: 25.0
Maximal of matches: 138


#### Not assigned
If we look at the random sample of 10 texts, which are not assigned, then we can see that in five of them there is typographical mistake (*"nejsou zn.plicní hpertenze"* = *"nejsou zn. plicní hypertenze"*, *"kumulce a nehomogenity"* = *"kumulace a nehomogenita"*, *"ceriucal"* = *"cerucal"*, *"paitace"* = *"palpitace"*, *"mamily klidné"* = ?). Others five are correct medical term, but in some non-typical grammatical form.

If we try to improve them we get 50 % improvement.

In [10]:
# empty_sample = mash_linked_data[mash_linked_data["search"].apply(lambda x: len(x) == 0)].sample(10, random_state=42)

# Because of my mistake (I had worser clean_table), the code above generate different sample than I have worked with.
# So I have to create the sample by hand:
empty_sample = mash_linked_data.loc[[878, 91, 5240, 3728, 1125, 2479, 4981, 1134, 5089, 1129]]

empty_sample["text"][878] = "nejsou zn. plicní hypertenze"
empty_sample["text"][91] = "hormostenické"
empty_sample["text"][5240] = "kumulace a nehomogenita"
empty_sample["text"][3728] = "biopsie"
empty_sample["text"][1125] = "chemobioterapie"
empty_sample["text"][2479] = "dysmorfické"
empty_sample["text"][4981] = "anikterické"
empty_sample["text"][1134] = "cerucal"
empty_sample["text"][5089] = "palpitace"
empty_sample["text"][1129] = "mamily klidné"

empty_sample = mash_search_table(empty_sample)
empty_sample

Unnamed: 0_level_0,text,search,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
878,nejsou zn. plicní hypertenze,"[(C0020542, Pulmonary Hypertension), (C0152171...",NE symptom
91,hormostenické,[],NE symptom
5240,kumulace a nehomogenita,[],symptom
3728,biopsie,"[(C0005558, Biopsy), (C0220797, biopsy charact...",procedura
1125,chemobioterapie,[],procedura
2479,dysmorfické,"[(C0005887, Body Dysmorphic Disorders)]",symptom
4981,anikterické,[],NE symptom
1134,cerucal,"[(C0701450, Cerucal)]",medikace
5089,palpitace,"[(C0030252, Palpitations), (C0549267, Palpitat...",NE symptom
1129,mamily klidné,[],NE symptom


There is one mistake which we can correct automaticly and it is not having space after punctuation mark. We can see that if we have space after punctuation then it find something, else it didn't.

In [11]:
print(len(search_from_bottom("zn. plicní", mash_search)))
print(len(search_from_bottom("zn.plicní", mash_search)))

50
0


We can see that there is around 150 examples of this mistakes.

In [12]:
def is_space_after_punc(string):
    punctuation = [".", ",", "!", "?", ":", ";", "+"]
    for i in range(len(string) - 1):
        if string[i] in punctuation and string[i+1] != " " and string[i+1] not in punctuation:
            return False
 
    return True

def insert_space_after_punc(string):
    punctuation = [".", ",", "!", "?", ":", ";", "+"]
    for i in range(len(string) - 1):
        if string[i] in punctuation and string[i+1] != " " and string[i+1] not in punctuation:
            string = string[:i+1] + " " + string[i+1:]
 
    return string

inserted_space_data = data.copy()
inserted_space_data["text"] = inserted_space_data["text"].apply(insert_space_after_punc)
no_space = data[~data["text"].apply(is_space_after_punc)]
len(no_space)

155

In [13]:
if os.path.isfile("saved_search/data_mash_inserted_space.csv"):
    inserted_space_mash_linked_data = pd.read_csv("saved_search/data_mash_inserted_space.csv")
    inserted_space_mash_linked_data.index = inserted_space_mash_linked_data["Unnamed: 0"]
    inserted_space_mash_linked_data.drop(["Unnamed: 0"], axis=1, inplace=True)
    inserted_space_mash_linked_data["search"] = inserted_space_mash_linked_data["search"].apply(from_string_to_list)
else:
    inserted_space_mash_linked_data = mash_linked_data.copy()
    no_space["text"] = no_space["text"].apply(insert_space_after_punc)
    for i in no_space.index:
        inserted_space_mash_linked_data["search"][i] = search_from_bottom(no_space["text"][i], mash_search)
        inserted_space_mash_linked_data["text"][i] = no_space["text"][i]
    inserted_space_mash_linked_data.to_csv("saved_search/data_mash_inserted_space.csv")    
        
print_linking_stats(mash_linked_data["search"])
print()
print_linking_stats(inserted_space_mash_linked_data["search"])

Number of empty: 298 (10.631466286122013 %)
Mean from number of matches: 21.931858722797003
Median from number of matches: 25.0
Maximal of matches: 138

Number of empty: 256 (9.133071708883339 %)
Mean from number of matches: 22.350338922582946
Median from number of matches: 25.0
Maximal of matches: 138


Thanks to this upgrade we improved search by finding 40 new matches.

### Linking to CZ Mash through Medvik 

Now we try to link through czech mash, I have downloaded it from NLK (národní lékařská knihovna): https://nlk.cz/pro-knihovny/data/#mesh-cz

I called this linking as Medvik, because there is web service called Medvik: https://www.medvik.cz/bmc/subject.do, where you can search in czech mash.

I used already improved methods from Mash_search. First I experimenced with search, which tests if contains gived text.

In [14]:
offline_mshcz = elt.parse('databaze/MeSH2023_Marc21_Alma.xml').getroot()

In [15]:
def patternize(string):
    result = []
    for i in string:
        if i in '<([{\\^-=$!|]})?*+.>]':
            result.append("\\" + i)
        else:
            result.append(i)
    return "".join(result)


def medvik_search(string, test):
    result = []
    for child in offline_mshcz:
        for subchild in child.iter("{http://www.loc.gov/MARC21/slim}subfield"):
            if subchild.text and test(string, subchild.text):
                try:
                    code = [i for i in child.findall("{http://www.loc.gov/MARC21/slim}controlfield") if i.attrib["tag"] == "001" ][0].text
                    name = [i for i in child.findall("{http://www.loc.gov/MARC21/slim}datafield") if i.attrib["tag"] == "150" ][0][0].text
                    result.append((code, name))
                    break
                except IndexError:
                    break                
    return result


def medvik_exact_search(string):              
    return medvik_search(string, lambda x, y: x.lower() == y.lower())


def medvik_words_search(string):      
    return medvik_search(string, lambda x, y: (" " + x.lower() + " ") in y.lower())


def medvik_match_search(string):
    return medvik_search(string, lambda x, y: x.lower() in y.lower())


def medvik_combined_search(string):
    result = medvik_exact_search(string)
    if len(result) == 0:
        result = medvik_words_search(string)
    if len(result) == 0:
        result = medvik_match_search(string)
    
    return result


def medvik_exact_search_table(db):
    return seach_table(db, lambda x: search_from_bottom(x, medvik_exact_search, output_state=True))


def medvik_words_search_table(db):
    return seach_table(db, lambda x: search_from_bottom(x, medvik_words_search, output_state=True))
    

def medvik_match_search_table(db):
    return seach_table(db, lambda x: search_from_bottom(x, medvik_match_search, output_state=True))


def medvik_combined_search_table(db):
    return seach_table(db, lambda x: search_from_bottom(x, medvik_combined_search, output_state=True))

In [16]:
if os.path.isfile("saved_search/data_medvik_contains.csv"):
    medvik_match_search_data = pd.read_csv("saved_search/data_medvik_contains.csv")
    medvik_match_search_data.index = medvik_match_search_data["Unnamed: 0"]
    medvik_match_search_data.drop(["Unnamed: 0"], axis=1, inplace=True)
    medvik_match_search_data["search"] = medvik_match_search_data["search"].apply(from_string_to_list)
else:
    count = 0
    medvik_match_search_data = medvik_match_search_table(inserted_space_data)
    medvik_match_search_data.to_csv("saved_search/data_medvik_contains.csv")
    
print_linking_stats(inserted_space_mash_linked_data["search"])
print()
print_linking_stats(medvik_match_search_data["search"])

Number of empty: 256 (9.133071708883339 %)
Mean from number of matches: 22.350338922582946
Median from number of matches: 25.0
Maximal of matches: 138

Number of empty: 270 (9.632536567962898 %)
Mean from number of matches: 1670.0117731002497
Median from number of matches: 28.0
Maximal of matches: 43684


In [17]:
print([i for i in medvik_match_search_data["search"].apply(lambda x: len(x)).sample(10, random_state=42)])

temp = medvik_match_search_data["search"].apply(lambda x: len(x))
print(f"number of searches longer than 100 matches in medvik_match_search_data: {len(temp[temp > 100])}")

[1904, 1720, 0, 27715, 1722, 94, 9, 10724, 0, 112]
number of searches longer than 100 matches in medvik_match_search_data: 1017


We can see, that for some examples this is working well. But for some we have really lots of samples whose lenght grows exponentially.

For this reasons it might be better to use some different match method instead:
- First method is contain search (I used it before) - test if contains given text
- Next method is word search - test if contains given text as word (there are spaces around)
- Next method is exact search - test if contains exactly given text
- The last method is combined search - first text exact, then word, then contains (if some success then end).

In [18]:
test_data = inserted_space_data.sample(20, random_state=42)

if os.path.isfile("saved_search/test_medvik.csv"):
    test_data = pd.read_csv("saved_search/test_medvik.csv")
    test_data["search_match"] = test_data["search_match"].apply(from_string_to_list)
    test_data["search_exact"] = test_data["search_exact"].apply(from_string_to_list)
    test_data["search_words"] = test_data["search_words"].apply(from_string_to_list)
    test_data["search_combined"] = test_data["search_combined"].apply(from_string_to_list)
else:
    count = 0
    test_data["search_match"] = medvik_match_search_table(test_data)["search"]
    test_data["search_exact"] = medvik_exact_search_table(test_data)["search"]
    test_data["search_words"] = medvik_words_search_table(test_data)["search"]
    test_data["search_combined"] = medvik_combined_search_table(test_data)["search"]
    test_data.to_csv("saved_search/test_medvik.csv")
    
print("Contains match")
print_linking_stats(test_data["search_match"])
print("\nWords match")
print_linking_stats(test_data["search_words"])
print("\nExact match")
print_linking_stats(test_data["search_exact"])
print("\nCombined match")
print_linking_stats(test_data["search_combined"])

Contains match
Number of empty: 3 (15.0 %)
Mean from number of matches: 2265.2
Median from number of matches: 15.5
Maximal of matches: 27715

Words match
Number of empty: 6 (30.0 %)
Mean from number of matches: 194.0
Median from number of matches: 5.5
Maximal of matches: 1918

Exact match
Number of empty: 11 (55.00000000000001 %)
Mean from number of matches: 3.05
Median from number of matches: 0.0
Maximal of matches: 22

Combined match
Number of empty: 3 (15.0 %)
Mean from number of matches: 261.95
Median from number of matches: 6.5
Maximal of matches: 1904


We can see that using exact match we get rid of the long matches but it have quite low success rate. Using words match is something in the middle (not good in both ways).

As last option we used combined match (first try exact, if don't success then words, then only match). This seems as the best methods (this doesn't create too large lists and has the same number of empty matches as contains match) 

In [19]:
if os.path.isfile("saved_search/data_medvik_combined.csv"):
    medvik_combined_search_data = pd.read_csv("saved_search/data_medvik_combined.csv")
    medvik_combined_search_data.index = medvik_combined_search_data["Unnamed: 0"]
    medvik_combined_search_data.drop(["Unnamed: 0"], axis=1, inplace=True)
    medvik_combined_search_data["search"] = medvik_combined_search_data["search"].apply(from_string_to_list)
else:
    count = 0
    medvik_combined_search_data = medvik_combined_search_table(inserted_space_data)
    medvik_combined_search_data.to_csv("saved_search/data_medvik_combined.csv")
       
print_linking_stats(medvik_combined_search_data["search"])

Number of empty: 270 (9.632536567962898 %)
Mean from number of matches: 488.3999286478773
Median from number of matches: 7.0
Maximal of matches: 28651


#### Duplicates
It is possible to get duplicates in list of matches, when getting the same match from two different words from text (or combinations of the same lenght) 

In mash search there are few duplicates, but in medvik search it can be serious problem - we can see, that maximum of matches in contains search is reduced nearly by 15 000.

In [20]:
len(inserted_space_mash_linked_data[~inserted_space_mash_linked_data["search"].apply(lambda x: len(set(x)) == len(x))])

5

In [21]:
def remove_dup_preserve_order(l):
    seen = set()
    seen_add = seen.add
    return [x for x in l if not (x in seen or seen_add(x))]


print("Before removing duplicates:")
print("Mash search")
print_linking_stats(inserted_space_mash_linked_data["search"])
print("\nMedvik contains search")
print_linking_stats(medvik_match_search_data["search"])
print("\nMedvik combined search")
print_linking_stats(medvik_combined_search_data["search"])

inserted_space_mash_linked_data["search"] = inserted_space_mash_linked_data["search"].apply(remove_dup_preserve_order)
medvik_match_search_data["search"] = medvik_match_search_data["search"].apply(remove_dup_preserve_order)
medvik_combined_search_data["search"] = medvik_combined_search_data["search"].apply(remove_dup_preserve_order)

print("\n\nAfter removing duplicates:")
print("Mash search")
print_linking_stats(inserted_space_mash_linked_data["search"])
print("\nMedvik contains search")
print_linking_stats(medvik_match_search_data["search"])
print("\nMedvik combined search")
print_linking_stats(medvik_combined_search_data["search"])

Before removing duplicates:
Mash search
Number of empty: 256 (9.133071708883339 %)
Mean from number of matches: 22.350338922582946
Median from number of matches: 25.0
Maximal of matches: 138

Medvik contains search
Number of empty: 270 (9.632536567962898 %)
Mean from number of matches: 1670.0117731002497
Median from number of matches: 28.0
Maximal of matches: 43684

Medvik combined search
Number of empty: 270 (9.632536567962898 %)
Mean from number of matches: 488.3999286478773
Median from number of matches: 7.0
Maximal of matches: 28651


After removing duplicates:
Mash search
Number of empty: 256 (9.133071708883339 %)
Mean from number of matches: 22.33927934356047
Median from number of matches: 25.0
Maximal of matches: 138

Medvik contains search
Number of empty: 270 (9.632536567962898 %)
Mean from number of matches: 1625.7213699607564
Median from number of matches: 28.0
Maximal of matches: 29327

Medvik combined search
Number of empty: 270 (9.632536567962898 %)
Mean from number of ma

## Choosing best match with Chat GPT
The idea behind our model is first link term to database and then choose the best one by some pretrained language model.

I use GPT-3.5, because it is free to access with limitations (There are some limits of access per day. And there is limited number of access per account. Then we have to pay.), it is fast and it is well known.

The message to GPT is in this format:

Který z nadcházejících popisů medicínských pojmů nejlépe popisuje text: "[MEDICAL TERM]":

    1. [DESCRIPTION_N.1] (pojem: [TERM_N.1])
    2. [DESCRIPTION_N.2] (pojem: [TERM_N.2])
    ...
    
Jako odpověď mi pošli pouze číslo odpovědi. Pokud to nebude žádná z možností, pak odpověz NONE.

In [22]:
def medvik_find_by_code(string):
    if len(string) == 0:
        return ""
    
    for child in offline_mshcz:
        try:
            code = [i for i in child.findall("{http://www.loc.gov/MARC21/slim}controlfield") if i.attrib["tag"] == "001" ][0].text
            if code == string:
                d = [i.iter("{http://www.loc.gov/MARC21/slim}subfield") for i in child.findall("{http://www.loc.gov/MARC21/slim}datafield") if i.attrib["tag"] == "680"][0]
                return next(d).text
        except IndexError:
            continue      
            
    return ""


def mash_find_by_code(string):
    path = f'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{string}'
    query = {
             'apiKey':NIH_api,
    }
    res = requests.get(path, params=query)
    
    if res.status_code <= 200:
        try:
            data = json.loads(res.text)
            definition = data["result"]["definitions"]

            if "https://uts-ws.nlm.nih.gov/" in definition:
                path = definition
                res = requests.get(path,params=query)
                try:
                    return [i["value"] for i in json.loads(res.text)["result"] if i["rootSource"] == "MSHCZE"][0]
                except IndexError:
                    pass
                try:
                    return [i["value"] for i in json.loads(res.text)["result"] if i["rootSource"] == "MSH"][0]
                except IndexError:
                    definition = "NONE"
        except Exception:
            definition = "NONE"
            print(string, res.text)
        
        if definition == "NONE":
            return data["result"]["name"]
        
        return definition
    else:
        print(string)
        print(res.status_code, res.text)
        
    return ""

In [23]:
from openai import OpenAI
with open("APIkeys/NIH", "r") as f:
    NIH_api = f.read()

def send_to_GPT(message):
    if message == "":
        return ""

    with open("APIkeys/chatGTP", "r") as f:
        chatgpt_api = f.read()

    client = OpenAI(api_key=chatgpt_api)

    for m in ["gpt-3.5-turbo-16k", "gpt-3.5-turbo-16k-0613", "gpt-3.5-turbo", "gpt-3.5-turbo-1106"]:
        try:
            return client.chat.completions.create(
                model=m,
                messages=[{"role": "user", "content": message}],
                stream=False,
            )
        except Exception:
            pass

    return client.chat.completions.create(
                model="gpt-3.5-turbo-0125",
                messages=[{"role": "user", "content": message}],
                stream=False)


def message_for_GPT(string, li, find, context=None):
    if len(li) == 0:
        return ""
            
    if context is None:
        result = [f"Který z uvedených lékařských pojmů s jeho popisem nejlépe odpovídá pojmu: \"{string}\":\n"]
    else:
        result = [f"Který z uvedených lékařských pojmů s jeho popisem nejlépe odpovídá pojmu: \"{string}\" v kontextu:  \"{context}\":\n"]
        
    j = 1
    for i in li:
        result.append(f"{j}. {find(i[0])} (pojem: {i[1]})\n")
        j += 1
        
    result.append("Jako odpověď mi pošli pouze číslo odpovědi. Pokud to nebude žádná z možností, pak odpověz NONE. Pokud to není lékařský pojem odpověz taky NONE.")
    
    return "".join(result)
    

def from_GPT(result, li, find):
    try:
        i = int(result.choices[0].message.content.split(".")[0].strip(" ")) - 1
        return (li[i][0], li[i][1], find(li[i][0]))
    except ValueError:
        pass
    except IndexError:
        pass

    return None

Because ChatGPT has problem with long messages (and sometimes we get really long results with medvik_combined), we have to restrict these messages and drop them.

In [27]:
if os.path.isfile("saved_search/explanation_sample.csv"):
    explanation = pd.read_csv("saved_search/explanation_sample.csv")
    explanation.index = explanation["Unnamed: 0"]
    explanation.drop(["Unnamed: 0"], axis=1, inplace=True)
    explanation["mash_explanation"] = explanation["mash_explanation"].fillna(value="N/A").apply(from_string_to_tuple)
    explanation["medvik_explanation"] = explanation["medvik_explanation"].fillna(value="N/A").apply(from_string_to_tuple)
    explanation["mash_search"] = explanation["mash_search"].apply(from_string_to_list)
    explanation["medvik_search_combined"] = explanation["medvik_search_combined"].apply(from_string_to_list)
else:
    explanation = results.sample(100, random_state=38)
    explanation["mash_explanation"] = "N/A"
    for j in explanation.index:
        message = message_chatGTP(explanation["text"][j], explanation["mash_search"][j], mash_find_by_code)
        response = send_to_GPT(message)
        explanation["mash_explanation"][j] = from_GPT(response, explanation["mash_search"][j], mash_find_by_code)     
        
#         If the message is too long we cannot send it to chatGPT, so we drop shorter messages.
        if len(explanation["medvik_search_combined"][j]) > 50:
            continue
        message = message_for_GPT(explanation["text"][j], explanation["medvik_search_combined"][j], medvik_find_by_code)
        response = send_to_GPT(message)
        explanation["medvik_explanation"][j] = from_GPT(response, explanation["medvik_search_combined"][j], medvik_find_by_code)
    explanation.to_csv("saved_search/explanation_sample.csv")

explanation.head()

Unnamed: 0_level_0,label,text,mash_search,medvik_search_combined,mash_explanation,medvik_explanation
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1633,NE symptom,KI100,[],[],,
5536,procedura,st. p. tru-cut biopsii,"[(C1170898, Companion P/ST 1000ML), (C4015802,...","[(D000039, peritonzilární absces), (D000081182...","(C5700874, Percutaneous pulmonary artery revas...",
412,osobní anamnéza,konizace čípku,"[(C0195324, Conization)]","[(D002583, nádory děložního čípku), (D019092, ...","(C0195324, Conization, Kruhovité kuželovité vy...","(D019092, konizace děložního čípku, Kruhovité ..."
4517,procedura,operace: ITP,"[(C0398650, Immune thrombocytopenic purpura), ...","[(D007293, inosintrifosfát)]","(C3842543, Idiopathic thrombocytopenia (ITP, I...",
5757,symptom,neostře konturované ložisko 7x5mm,"[(C0241148, Cutaneous plaque), (C1533591, Calc...","[(D001253, astrocyty), (D002833, choroiditida)...","(C0235456, Thyroid nodular, Thyroid nodular)","(D002833, choroiditida, Zánět cévnatky, zadní ..."


In [32]:
len(explanation[explanation["medvik_search_combined"].apply(lambda x: len(x) > 50)])

22

## Conclusion Basic Access
There I am going to evaluate the basic access for linking entities.

### Results for Linking
Now we look how we have been successful with linking to database with respect to different labels.

In [33]:
results = inserted_space_data.copy()
results["mash_search"] = inserted_space_mash_linked_data["search"]
results["medvik_search_combined"] = medvik_combined_search_data["search"]
results.head(3)

Unnamed: 0,label,text,mash_search,medvik_search_combined
0,symptom,jemný fibrózní proužek,"[(C0030848, Peyronie Disease), (C0227365, Taen...","[(D000077275, fibrózní dysplazie kraniofaciáln..."
1,procedura,neoadjuvantní CHT,"[(C0600558, Neoadjuvant Therapy), (C1422359, S...","[(D000014, abnormality vyvolané léky), (D00313..."
2,medikace,novalgin,"[(C0917937, Novalgin)]","[(D004177, metamizol)]"


In [34]:
os_anamneza = results[results.label == "osobní anamnéza"]
ne_os_anamneza = results[results.label == "NE osobní anamnéza"]
medikace = results[results.label == "medikace"]
ne_medikace = results[results.label == "NE medikace"]
symptom = results[results.label == "symptom"]
ne_symptom = results[results.label == "NE symptom"]
procedura = results[results.label == "procedura"]
    
print("osobní anamnéza:")    
print_linking_stats(os_anamneza["mash_search"])
print_linking_stats(os_anamneza["medvik_search_combined"])

print("\nNE osobní anamnéza:")
print_linking_stats(ne_os_anamneza["mash_search"])
print_linking_stats(ne_os_anamneza["medvik_search_combined"])

print("\nmedikace:")
print_linking_stats(medikace["mash_search"])
print_linking_stats(medikace["medvik_search_combined"])

print("\nNE medikace:")
print_linking_stats(ne_medikace["mash_search"])
print_linking_stats(ne_medikace["medvik_search_combined"])

print("\nsymptom:")
print_linking_stats(symptom["mash_search"])
print_linking_stats(symptom["medvik_search_combined"])

print("\nNE symptom:")
print_linking_stats(ne_symptom["mash_search"])
print_linking_stats(ne_symptom["medvik_search_combined"])

print("\nprocedura:")
print_linking_stats(procedura["mash_search"])
print_linking_stats(procedura["medvik_search_combined"])

print("\n\nall:")
print_linking_stats(results["mash_search"])
print_linking_stats(results["medvik_search_combined"])
print("Number of empty in both search:", len(results[(results["mash_search"].apply(lambda x: len(x) == 0)) & (results["medvik_search_combined"].apply(lambda x: len(x) == 0))]))

osobní anamnéza:
Number of empty: 11 (5.14018691588785 %)
Mean from number of matches: 22.939252336448597
Median from number of matches: 24.0
Maximal of matches: 86
Number of empty: 18 (8.411214953271028 %)
Mean from number of matches: 459.1588785046729
Median from number of matches: 11.5
Maximal of matches: 13185

NE osobní anamnéza:
Number of empty: 4 (6.557377049180328 %)
Mean from number of matches: 20.83606557377049
Median from number of matches: 25.0
Maximal of matches: 75
Number of empty: 14 (22.950819672131146 %)
Mean from number of matches: 190.34426229508196
Median from number of matches: 1.0
Maximal of matches: 5777

medikace:
Number of empty: 73 (23.934426229508198 %)
Mean from number of matches: 14.462295081967213
Median from number of matches: 8.0
Maximal of matches: 82
Number of empty: 101 (33.114754098360656 %)
Mean from number of matches: 96.88196721311475
Median from number of matches: 1.0
Maximal of matches: 5134

NE medikace:
Number of empty: 1 (6.25 %)
Mean from nu

We can see, that most labels work quite similar to each others. One big exception is label "medikace" (in english medication). This label have significantly higher empty rate, but have less of matches. 

I this is caused by shorter text (have less of words), but these words are usually more concrete (as names of medicine).

### Results for Mash and Medvik search
Now I am going to evaluate results from GPT for linking.

For this I have created a sample of 100 entries, which I have send to GPT to find best match. From them I picked up another sample of 35 entries which I have evaluated manually.

In [35]:
print("Number of examples: {}".format(len(explanation)))

print("Number of empty Linking for medvik_search_combined: {}".format(explanation["medvik_search_combined"].apply(lambda x: len(x) == 0).sum()))
print("Number of empty Linking for Mash_search: {}".format(explanation["mash_search"].apply(lambda x: len(x) == 0).sum()))

print("Number of not assigned for Medvik: {}".format(explanation["medvik_explanation"].apply(lambda x: x is None).sum()))
print("Number of not assigned for Mash: {}".format(explanation["mash_explanation"].apply(lambda x: x is None).sum()))

Number of examples: 100
Number of empty Linking for medvik_search_combined: 11
Number of empty Linking for Mash_search: 10
Number of not assigned for Medvik: 39
Number of not assigned for Mash: 24


We can see, that there is high number of not assigned in both searches. But in Medvik it is much higher. The higher number in Medvik is because we have to drop very long searches. In the future accesses it is important to handle the long.

In [36]:
asign = []
for j in explanation.sample(35, random_state=42).index:   
    if explanation["medvik_explanation"][j] is not None:
        x = explanation["medvik_explanation"][j]
        asign.append((explanation["text"][j], "Medvik", "{} ({})".format(x[1], x[2])))
    if explanation["mash_explanation"][j] is not None:
        x = explanation["mash_explanation"][j]
        asign.append((explanation["text"][j], "Mash", "{} ({})".format(x[1], x[2])))
    if explanation["mash_explanation"][j] is None and explanation["medvik_explanation"][j] is None:
        asign.append((explanation["text"][j], "None", "Empty"))

i = 0
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)

assert len(set([i[3] for i in asign])) == 4

In [37]:
print("Not assigned medvik:", 35 - len([i for i in asign if i[1] == "Medvik"]))
print("Mistakes from medvik:", len([i for i in asign if i[3] == 'Wrong' and i[1] == "Medvik"]))
print("Partially right from medvik:", len([i for i in asign if i[3] == 'Partially' and i[1] == "Medvik"]))
print("Right from medvik:", len([i for i in asign if i[3] == 'Right' and i[1] == "Medvik"]))

print("\nNot assigned mash:", 35 - len([i for i in asign if i[1] == "Mash"]))
print("Mistakes from mash:", len([i for i in asign if i[3] == 'Wrong' and i[1] == "Mash"]))
print("Partially right from mash:", len([i for i in asign if i[3] == 'Partially' and i[1] == "Mash"]))
print("Right from mash:", len([i for i in asign if i[3] == 'Right' and i[1] == "Mash"]))

Not assigned medvik: 15
Mistakes from medvik: 13
Partially right from medvik: 5
Right from medvik: 2

Not assigned mash: 11
Mistakes from mash: 13
Partially right from mash: 5
Right from mash: 6


The results are not so good (but it was base line).

For medvik we have got 20 % for at least partially right linking. And 6 % for completely right linking.

For mash we have got 31 % for at least partially right linking. And 17 % for completely right linking.

These results need improvement. For this reason I create new Improved linking based on different method. But before it I look at the wrong, partially and right results and observe if there is some pattern.


In [38]:
med = [i[0] for i in asign if i[3] == 'Wrong' and i[1] == "Medvik"]
mash = [i[0] for i in asign if i[3] == 'Wrong' and i[1] == "Mash"]
print("\ntext wrong in medvik and mash:", [i for i in med if i in mash])
print("text wrong only in medvik", [i for i in med if i not in mash])
print("text wrong only in mash", [i for i in mash if i not in med])

med = [i[0] for i in asign if i[3] == 'Partially' and i[1] == "Medvik"]
mash = [i[0] for i in asign if i[3] == 'Partially' and i[1] == "Mash"]
print("\ntext partially right in medvik and mash:", [i for i in med if i in mash])
print("text partially right only in medvik", [i for i in med if i not in mash])
print("text partially right only in mash", [i for i in mash if i not in med])

med = [i[0] for i in asign if i[3] == 'Right' and i[1] == "Medvik"]
mash = [i[0] for i in asign if i[3] == 'Right' and i[1] == "Mash"]
print("\ntext right in medvik and mash:", [i for i in med if i in mash])
print("text right only in medvik", [i for i in med if i not in mash])
print("text right only in mash", [i for i in mash if i not in med])


text wrong in medvik and mash: ['MG vlevo', 'bez šelestu', 'neostře konturované ložisko 7x5mm', 'bez poruchy kinetiky myokardu', 'norm. velikost srdeč. oddílů', 'jizevnaté změny v ZDK', 'tumorozní ložisko']
text wrong only in medvik ['benigní verifikované ložisko v HKK', 'játra bez solidních patologických ložiskových změn', 'USG pravé mammy a axilly', 'mírný sekund lymfedém pod axilou, v zadní axil. řase', 'apokrinní metaplazií a místy i adenóza', 'mastitis: ne']
text wrong only in mash ['někdy tahv oblasti jizvy', 'subjektivně bez bolestí', 'močový měchýř hypodenzní homogenní náplně', 'bránice hladká', 'fibropleurální změny', 'dlouhodobě stac. nález']

text partially right in medvik and mash: ['parciální mastektomie vlevo']
text partially right only in medvik ['močový měchýř hypodenzní homogenní náplně', 'bránice hladká', 'zn. krvácení', 'průjmy']
text partially right only in mash ['játra bez solidních patologických ložiskových změn', 'USG pravé mammy a axilly', 'uzliny fyziologické'

- We can see, that there are higher percentage of partially right. This is because we have been looking for one sumarizing linking. But very often such linking doesn't exists. At these cases we only find linking which corresponds to part of the text not for the whole. 
- The completely right one, are usualy very short (I think the reason is same as above).

## Improved Search

Now I am going to try another access, where I try handle former mistakes. The biggest change is not to have one list of links for whole text, but to have one list for each word from the text. And then try to explain this word.

In [451]:
def search_from_bottom_no_drop(string, func):
    splitted_input = (string.split(" "))
    lenght = len(splitted_input)
    
    result_dict = {}
    for word in splitted_input:
        result_dict[word] = []
        
    for j in range(1, lenght + 1):
        splitted_dict={}
        for elem in splitted_input:
            splitted_dict[j] = False
                    
        for words in combinations(splitted_input, j):
            data = func(" ".join(words))
            if len(data) == 0:
                continue
            for j in words:
                splitted_dict[j] = True
                result_dict[j] += [(i[0], i[1], words) for i in data]
                    
        splitted_input = [j for j, i in splitted_dict.items() if i]
        if len(splitted_input) == 0:
            break
            
    for k, v in result_dict.copy().items():
        result_dict[k] = [i for i in v if len(i[2]) == len(v[-1][2])]
        pop_key = True
        for n in set([i[2] for i in result_dict[k]]):
            string = " ".join(n)
            if string == k:
                pop_key = False
            if string in result_dict:
                continue
            result_dict[string] = [(i[0], i[1]) for i in result_dict[k] if " ".join(i[2]) == string]
            
        if pop_key:
            result_dict.pop(k)
    
    return result_dict

First we need to link them to databases.

In [41]:
if os.path.isfile("saved_search/new_access.csv"):
    test_new_access = pd.read_csv("saved_search/new_access.csv")
    test_new_access.index = test_new_access["Unnamed: 0"]
    test_new_access.drop(["Unnamed: 0"], axis=1, inplace=True)
    test_new_access["mash_search"] = test_new_access["mash_search"].apply(from_string_to_dict)
    test_new_access["medvik_search"] = test_new_access["medvik_search"].apply(from_string_to_dict)

else:
    test_new_access = results[["text"]].sample(100, random_state=25)
    test_new_access["mash_search"] = "N/A"
    for j in test_new_access.index:
        test_new_access["mash_search"][j] = search_from_bottom_no_drop(test_new_access["text"][j], mash_search)

    test_new_access["medvik_search"] = "N/A"
    for j in test_new_access.index:
        test_new_access["medvik_search"][j] = search_from_bottom_no_drop(test_new_access["text"][j], medvik_combined_search)

    test_new_access.to_csv("saved_search/new_access.csv")

To send message to GPT we need not to exceed certain length. We try to discover some lenght, by which the medvik search returns only noice (or really probably).

In [42]:
def print_long_searches(db, limit=20):
    temp = db.apply(lambda x: [(i, len(x[i])) for i in x if len(x[i]) > limit])
    print(sorted(list(temp[temp.apply(lambda x: len(x) != 0)]), key=(lambda x: x[0][1])))

print_long_searches(test_new_access["medvik_search"])

[[('vlně', 22)], [('krvácení', 23)], [('mírné', 23)], [('stomatologické', 27), ('vyš', 874)], [('nových', 30)], [('stabilní', 32)], [('strukturou', 36)], [('laloku', 38)], [('susp', 52)], [('i na', 58), ('tlustého střeva', 30)], [('operace', 58)], [('nebol', 66)], [('léčí s', 76)], [('spíš', 77)], [('příl', 84)], [('e', 95)], [('genetické', 99)], [('p.', 119)], [('NACT', 128), ('-', 5334)], [('plicní', 162)], [('klinické', 164)], [('ME s', 166)], [('ME s', 166)], [('léčba', 181)], [('vyšetření', 186)], [('příznaky', 314)], [('není', 327)], [('TAD', 331), ('l. I', 72)], [('- po', 386)], [('pomocí', 393)], [('pm', 548)], [('PM', 548), ('se', 7588)], [('negat', 740)], [('Cor', 1304)], [('patol', 6932)], [('v', 7248)], [('toxicita', 11218)], [('v', 14496)]]


We can see, that for longer length than 40 we get mostly non-medical terms (or general medical terms).

In [43]:
def drop_long_searches(dictionary, limit=40):
    for key in dictionary.copy():
        if len(dictionary[key]) > limit:
            dictionary[key] = []
    return dictionary

test_new_access["medvik_search"] = test_new_access["medvik_search"].apply(drop_long_searches)

The second part is to choose the best one by GPT.

In [44]:
if os.path.isfile("saved_search/new_access_explanation.csv"):
    test_new_access = pd.read_csv("saved_search/new_access_explanation.csv")
    test_new_access.index = test_new_access["Unnamed: 0"]
    test_new_access.drop(["Unnamed: 0"], axis=1, inplace=True)
    test_new_access["mash_search"] = test_new_access["mash_search"].apply(from_string_to_dict)
    test_new_access["medvik_search"] = test_new_access["medvik_search"].apply(from_string_to_dict)
    test_new_access["mash_explanation"] = test_new_access["mash_explanation"].fillna("{}").apply(from_string_to_dict_to_tuple)
    test_new_access["medvik_explanation"] = test_new_access["medvik_explanation"].fillna("{}").apply(from_string_to_dict_to_tuple)

else:   
    test_new_access["medvik_explanation"] = "N/A"
    for i in test_new_access.index:
        l = test_new_access["medvik_search"][i]
        result = {}
        for text in l:
            message = message_for_GPT(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
        test_new_access["medvik_explanation"][i] = result

    test_new_access["mash_explanation"] = "N/A"
    for i in test_new_access.index:
        l = test_new_access["mash_search"][i]
        result = {}
        for text in l:
            message = message_for_GPT(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
        test_new_access["mash_explanation"][i] = result

    test_new_access.to_csv("saved_search/new_access_explanation.csv")

### Results
Now its time to examine the results of improved search.

In [45]:
print("Number of examples: {}".format(len(test_new_access)))

print("Number of empty Linking for Medvik_combined_search: {}".format(test_new_access["medvik_search"].apply(lambda x: len(x) == 0).sum()))
print("Number of empty Linking for Mash_search: {}".format(test_new_access["mash_search"].apply(lambda x: len(x) == 0).sum()))

print("Number of not assigned for Medvik: {}".format(test_new_access["medvik_explanation"].apply(lambda x: len(x) == 0).sum()))
print("Number of not assigned for Mash: {}".format(test_new_access["mash_explanation"].apply(lambda x: len(x) == 0).sum()))

Number of examples: 100
Number of empty Linking for Medvik_combined_search: 12
Number of empty Linking for Mash_search: 10
Number of not assigned for Medvik: 12
Number of not assigned for Mash: 15


There we have got much more better results than in basic access. We were able to choose the best match for nearly everything, what we have been able to find in database.

In [46]:
asign = []
for j in test_new_access.sample(35, random_state=42).index:   
    if test_new_access["medvik_explanation"][j] is not None:
        x = test_new_access["medvik_explanation"][j]
        asign.append((test_new_access["text"][j], "Medvik", "{}".format([(e, x[e]) for e in x])))
    if test_new_access["mash_explanation"][j] is not None:
        x = test_new_access["mash_explanation"][j]
        asign.append((test_new_access["text"][j], "Mash", "{}".format([(e, x[e]) for e in x])))
    if test_new_access["mash_explanation"][j] is None and test_new_access["medvik_explanation"][j] is None:
        asign.append((test_new_access["text"][j], "None", "Empty"))

i = 0
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)

In [47]:
print("Not assigned medvik:", len([i for i in asign if i[1] == "Medvik" and i[3] == ""]))
print("Mistakes from medvik:", len([i for i in asign if i[3] == 'Wrong' and i[1] == "Medvik"]))
print("Partially right from medvik:", len([i for i in asign if i[3] == 'Partially' and i[1] == "Medvik"]))
print("Right from medvik:", len([i for i in asign if i[3] == 'Right' and i[1] == "Medvik"]))

print("\nNot assigned mash:", len([i for i in asign if i[1] == "Mash" and i[3] == ""]))
print("Mistakes from mash:", len([i for i in asign if i[3] == 'Wrong' and i[1] == "Mash"]))
print("Partially right from mash:", len([i for i in asign if i[3] == 'Partially' and i[1] == "Mash"]))
print("Right from mash:", len([i for i in asign if i[3] == 'Right' and i[1] == "Mash"]))

Not assigned medvik: 1
Mistakes from medvik: 20
Partially right from medvik: 6
Right from medvik: 8

Not assigned mash: 2
Mistakes from mash: 18
Partially right from mash: 10
Right from mash: 5


The results are better than previous access:

For medvik we have got 40 % for at least partially right linking. And 23 % for completely right linking.

For mash we have got 43 % for at least partially right linking. And 14 % for completely right linking.

But even these results are very good, they need next improvement. For this reason I would like to try another improved linking. But first I look at the wrong, partially and right results and observe if there is some pattern.


In [48]:
med = [i[0] for i in asign if i[3] == 'Wrong' and i[1] == "Medvik"]
mash = [i[0] for i in asign if i[3] == 'Wrong' and i[1] == "Mash"]
print("text wrong in medvik and mash:", [i for i in med if i in mash])
print("text wrong only in medvik", [i for i in med if i not in mash])
print("text wrong only in mash", [i for i in mash if i not in med])

med = [i[0] for i in asign if i[3] == 'Partially' and i[1] == "Medvik"]
mash = [i[0] for i in asign if i[3] == 'Partially' and i[1] == "Mash"]
print("\ntext partially right in medvik and mash:", [i for i in med if i in mash])
print("text partially right only in medvik", [i for i in med if i not in mash])
print("text partially right only in mash", [i for i in mash if i not in med])

med = [i[0] for i in asign if i[3] == 'Right' and i[1] == "Medvik"]
mash = [i[0] for i in asign if i[3] == 'Right' and i[1] == "Mash"]
print("\ntext right in medvik and mash:", [i for i in med if i in mash])
print("text right only in medvik", [i for i in med if i not in mash])
print("text right only in mash", [i for i in mash if i not in med])

text wrong in medvik and mash: ['DKK: bez otoků', 'kličky tenkého i tlustého střeva na necíleném vyšetření přiměřeného kalibru i norm. šíře stěny', 'fibrocystické změny s mnohočetnými intraduktálními papilomy', 'beze změny zdra. satvu', 'oboustranné totální mastektomii', 'vpačování bradavek 0', 'bez patrných MTS', 'menzes no', 'mírné velikostní progresi', 'tamoxifenu', 'mamila: pravidelné stavby', 'mutace v genu NBN', 'vlevo bez patol', 'bez nových poíží']
text wrong only in medvik ['gynekologické operace', 'AS reg', 'normě', 'váha stabilní', 'kompletní klinické regrese', 'regrese v prsu']
text wrong only in mash ['parc. ME s disekcí axily', 'anastrozol', 'jizva v ZHQ zhojena', 'neurotoxicita']

text partially right in medvik and mash: ['kůže intaktní', 'stolice spíš zácpovitá', 'hysterectomii pro krvácení', 'klinicky lipom při sternu', 'GIT toxicita G1']
text partially right only in medvik ['parc. ME s disekcí axily']
text partially right only in mash ['USG (Mamma, Axilla', 'váha stab

- I have noticed, that most of the words which have been linked correctly have been in basic form (Sg 1).
- Some from the wrong one have mistake or there a punctuation is doing some mess.


### Some Adititional comments on results
The results above aren't pretty good. Some changes and improvements can be done in the medical text. But lots of these changes will be label specific.

During labeling I have noticed, that there can be problem with punctuation (because of punctuation we usualy don't find match). For this reason I tried to discover if there is some punctuation we can remove.

In [49]:
punc_in_data = set()
data["text"].apply(lambda x: punc_in_data.update(set([i for i in x if i in punctuation])))
punc_in_data

{'%', '(', ')', '+', ',', '-', '.', '/', ':', ';', '['}

In [50]:
# temp = data.sample(200, random_state=10)
for p in sorted(list(punc_in_data)):
    print(f"Punctuation: {p}")
    data[data["text"].apply(lambda x: p in x)].sample(frac=1, random_state=10).head(10)["text"].apply(lambda x: print(x))
    print()


Punctuation: %
ki 100%, aloepcie,sliznice prokrveny, oběh. komp
ki 100%,alopecie,sliznice prokrveny, oběh. komp
inf. glucosi 10% 500 ml
růst trabekulárně, solidně, s polarizací do 10 %, jádra s jemným chromatinem
KI 100% lucidní

Punctuation: (
proliferace dle Ki67 (automat) 35
RTG (Plíce
cílenou axilární disekcí (SNB + klipovaná uzlina
biopsie sentinelové uzliny (SNB
vyrážka na kůži (kopřivka
re-resekce - laterální části (kůže + podkoží) + disekce pravé axily - en bloc
adjuvantní radioterapii na hrudní stěnu vpravo + axilu vpravo ( I-IV.etáž
páteře C+ L( diskopatie
USG (Břicho
alergie: Ketazon (urtika

Punctuation: )
proliferace dle Ki67 (automat) 35
odstranění klipované (a ev. sentinelové) uzliny
re-resekce - laterální části (kůže + podkoží) + disekce pravé axily - en bloc

Punctuation: +
adjuv CHt paclitaxel weekly 12x + trastuzuab
ko + trombo + dif
LHRH+IA
H+L nehmatné
hye+ae
UZ+MMG
totální ME + SNB vlevo
cílenou axilární disekcí (SNB + klipovaná uzlina
LDK, st.p. HYE+AE
páteře C+ 

We can see, that most of the punctuation have no specific meaning, so we can substitute them with space (most of them stands there instead of space).

On the other hand colon has specific meaning, its meaning is specific some category and the rest from the text is about it.

And dot have specific meaning, which specify that the word is only shortcut or dash which is part of some words.

In [51]:
data_improved_punctuation = data.copy()
data_improved_punctuation["original_text"] = data["text"]
data_improved_punctuation["text"] = data_improved_punctuation["text"].apply(
    lambda text: "".join([l if l not in punctuation or l in [".", ":"] else " " for l in text]))
data_improved_punctuation["text"] = data_improved_punctuation["text"].apply(clean)

In [52]:
for i in data_improved_punctuation.index:
    if len(data_improved_punctuation["text"][i].split(":")) >= 2:
        print(data_improved_punctuation["text"][i])

uzliny: fyziologické
antikoncepce: 0
operace: 0
alergie: Ketazon urtika
alergie: Červená paprika a kočky
břicho: v niveau
alergie: O
DKK:O
ložiska: l.dx. ZDK solitární 6x6x5 mm
gynekologická onemocnění: ne
operace: krční mandle
mastitis: neguje
perikard:bez výpotku
mastitis: ne
plíce: poklep plný jasný
gynekologické operace: 0
dušnost: 0
srdce: as reg
uzliny: patologické
hlava : poklepově nebolestivá
hormonální léčba: 0
axila i nadkl:O
karnofsky index: 100
alergie: neuvádí
břicho: měkké
mastitis: 0
hormonální léčba: HA HRT
hormonální léčba: HRT
ITP: 1
alergie: 0
antikoncepce: cca 2roky
hormonální léčba: HRT dříve
DKK: bez otoků
trávicí potíže: 0
gynekologická onemocnění: 0
gynekologické operace: neguje
DKK :hybnost volná v plném rozsahu
gynekologické operace: ne
uzliny: nejsou patrné
axilly: 0
krk: nápln žil v normě
antikoncepce: IUD jaydess
FA: sine
ao: trojcípá
operace: neguje
uzliny: suspektní
OPERACE: Mastectomia part. l. sin
cyklus: pravidelný
gynekologická onemocnění: neguje
O IT

In [53]:
data_improved_punctuation["about"] = "N/A"
for i in data_improved_punctuation.index:
    colon_split = data_improved_punctuation["text"][i].split(":")
    if len(colon_split) >= 2 and colon_split[1].strip(" ") not in ["0", "ne", "neguje", "negat", "neuvádí", "O"]:
        data_improved_punctuation.at[i, "about"] = colon_split[0].strip(" ")
        data_improved_punctuation.at[i, "text"] = ": ".join([x.strip(" ") for x in colon_split[1:]])

If we inspect the longer matches, we can see, that many times we've got there nonsense combination (like word with conjunction behind it). For this reason I am going to modify the algorithm, where it will preserve even the shorter matches.

In [54]:
temp = test_new_access["medvik_explanation"].apply(lambda x: {k: x[k][1] for k in x if len(k.split(" ")) >= 2 if x[k] is not None})
print(list(temp[temp != {}].head(5)))
temp = test_new_access["mash_explanation"].apply(lambda x: {k: x[k][1] for k in x if len(k.split(" ")) >= 2 if x[k] is not None})
print(list(temp[temp != {}].head(5)))

[{'beze změny': 'tropismus'}, {'sekund a': 'syndromy spánkové apnoe', 'lymfedém a': 'syndrom žlutých nehtů', 'pod hrudní': 'bederní obratle', 'pod a': 'parotis', 'hrudní a': 'bránice', 'stěnou a': 'sagitální abdominální rozměr'}, {'změny s': 'incontinentia pigmenti', 's mnohočetnými': 'vrozené srdeční vady'}, {'vše v': 'naučená bezmocnost'}, {'v kloubech': 'synoviální cysta', 'v a': 'parciální tromboplastinový čas', 'a kloubech': 'juvenilní artritida', 'a kyčelních': 'artróza kyčelních kloubů'}]
[{'hrudní a': 'Thoracoabdominal aortic aneurysm'}, {'jizva v': 'Vaccination site scar'}, {'Baker. cysta': 'Popliteal Cyst'}, {'změny s': 'Mood alterations with depressive symptoms'}, {'TEN 0': 'WHODAS 2.0 12-item Version Proxy-administered - Concentrating for Ten Minutes'}]


## Label specific Search
The access, where we have been handling all labels the same way have the issues. Because we cannot do label focus improvements. So all improvements we will do, we will only focus on the most common labels.

### New functions used there

Functions for searching with mistake:

In [55]:
def search_words_bottom(string, func):
    splitted_input = (string.split(" "))
    lenght = len(splitted_input)
    
    result_dict = {}
    for j in range(1, lenght + 1):
        splitted_dict={}
        for elem in splitted_input:
            splitted_dict[j] = False
                    
        for words in combinations(splitted_input, j):
            data = func(" ".join(words))

            if len(data) == 0:
                continue
            for j in words:
                splitted_dict[j] = True
            result_dict[" ".join(words)] = [(i[0], i[1]) for i in data]
                    
        splitted_input = [j for j, i in splitted_dict.items() if i]
        if len(splitted_input) == 0:
            break 
            
    return result_dict


def search_words_mistake_bottom(string, func1, func2, output_errors=False, limit=0.5):
    result_dict = {}
    for word in string.split(" "):
        n = 0
        search = func1(word)
        if len(search) == 0:
            while len(search) == 0 and n <= math.floor(len(word) * (limit)):
                n += 1
                search = func2(word, n)

        if output_errors:
            result_dict[(word, n)] = search
        else:
            result_dict[word] = search
        
    return result_dict

In [56]:
def medvik_mistakes_search(string, mistakes, database):
    result = []
    string = patternize(string)
    for_loop_stopped = False
    pattern = regex.compile(f"({string}){{e<={mistakes}}}", regex.IGNORECASE)
    
    for child in database:
        for subchild in child.iter("{http://www.loc.gov/MARC21/slim}datafield"):
            tag = int(subchild.attrib["tag"])
            if tag <= 72 or 690 >= tag >= 680:
                continue
            for subsubchild in subchild.iter("{http://www.loc.gov/MARC21/slim}subfield"):
                if subsubchild.text and pattern.search(subsubchild.text) is not None:
                    try:
                        code = [i for i in child.findall("{http://www.loc.gov/MARC21/slim}controlfield") if i.attrib["tag"] == "001" ][0].text
                        name = [i for i in child.findall("{http://www.loc.gov/MARC21/slim}datafield") if i.attrib["tag"] == "150" ][0][0].text
                        result.append((code, name))
                        for_loop_stopped = True
                        break
                    except IndexError:
                        for_loop_stopped = True
                        break
            if for_loop_stopped:
                for_loop_stopped = False
                break 
            
    return result


def medvik_find_mistakes_search(dictionary, test=False, database=offline_mshcz):
    for w in dictionary:
        if dictionary[w] != []:
            continue

        n = 1
        r = []
        while r == [] and n <= len(w) // 2:
            r = medvik_mistakes_search(w, n, database)
            n += 1
            
        if r != [] and test:
            test_right.append(w)
            print("Find for:", w, ", mistakes:", n - 1)
        elif test:
            print("Don't find for:", w, "mistakes:", n - 1)
            
        dictionary[w] = r
    return dictionary

Function for searching in database

In [57]:
def medvik_without_descr_search(string, test, database):
    result = []
    for_loop_stopped = False
    
    for child in database:
        for subchild in child.iter("{http://www.loc.gov/MARC21/slim}datafield"):
            tag = int(subchild.attrib["tag"])
            if tag <= 72 or 690 >= tag >= 680:
                continue
            for subsubchild in subchild.iter("{http://www.loc.gov/MARC21/slim}subfield"):
                if subsubchild.text and test(string, subsubchild.text):
                    try:
                        code = [i for i in child.findall("{http://www.loc.gov/MARC21/slim}controlfield") if i.attrib["tag"] == "001" ][0].text
                        name = [i for i in child.findall("{http://www.loc.gov/MARC21/slim}datafield") if i.attrib["tag"] == "150" ][0][0].text
                        result.append((code, name))
                        for_loop_stopped = True
                        break
                    except IndexError:
                        for_loop_stopped = True
                        break
            if for_loop_stopped:
                for_loop_stopped = False
                break 
    
    return result


def medvik_combined_without_descr_search(string, database=offline_mshcz):
    result = medvik_without_descr_search(string, lambda x, y: x.lower() == y.lower(), database)
    if len(result) == 0:
        result = medvik_without_descr_search(string, 
                                                   lambda x, y: (f" {x.lower()} " in y.lower()) or
                                                                (f"{x.lower()} " == y.lower()[:(len(x) + 1)]) or
                                                                (f" {x.lower()}" == y.lower()[(-len(x) - 1):]),
                                                   database)
    if len(result) == 0:
        result = medvik_without_descr_search(string, lambda x, y: x.lower() in y.lower(), database)
    
    return result

Help function for testing how we were good at finding mistake

In [58]:
def add_all_searches(string, dictionary):
    for w in string.split(" "):
        if w not in dictionary:
            dictionary[w] = []

    return dictionary


def drop_empty_searches(dictionary):
    for w in dictionary.copy():
        if dictionary[w] == []:
            dictionary.pop(w)

    return dictionary

Function for more precise asking GPT

In [164]:
def message_for_GPT_with_empty(string, li, find, context=None):
    if len(li) == 0:
        return ""
            
    if context is None:
        result = [f"Který z uvedených lékařských pojmů s jeho popisem nejlépe odpovídá pojmu: \"{string}\":\n"]
    else:
        result = [f"Který z uvedených lékařských pojmů s jeho popisem nejlépe odpovídá pojmu: \"{string}\" v kontextu:  \"{context}\":\n"]
        
    j = 1
    for i in li:
        result.append(f"{j}. {find(i[0])} (pojem: {i[1]})\n")
        j += 1

    result.append(f"{j}. Žádná z výše uvedených možností plně nevystihuje daný pojem.\n")
    result.append("Jako odpověď mi pošli pouze číslo odpovědi. Pokud to nebude žádná z možností, pak odpověz NONE. "
                  "Pokud to není lékařský pojem odpověz taky NONE.")
    
    return "".join(result)

Function for combining words

In [60]:
def print_long_searches_len_dep(db, limit=20):
    temp = db.apply(lambda x: [(i, len(x[i]), len(x[i]) / len(i)) for i in x if len(x[i]) > limit * len(i)])
    print(sorted(list(temp[temp.apply(lambda x: len(x) != 0)]), key=(lambda x: x[0][2])))


def drop_long_searches_len_dep(dictionary, limit=50):
    for key in dictionary.copy():
        if len(dictionary[key]) > limit * len(key):
            dictionary[key] = []
    return dictionary


def create_doubles(dictionary):
    result = {}
    for k_1 in dictionary:
        for k_2 in dictionary:
            if k_1 == k_2:
                continue

            r = list(set(dictionary[k_1]).intersection(
                     set(dictionary[k_2])))
            if len(r) > 0 and (k_2 + " " + k_1) not in result:
                result[k_1 + " " + k_2] = r

    return result


def combine_searches(dictionary):
    combined_results = {}
    result = create_doubles(dictionary)
    combined_results.update(result)
    while result != {}:
        result = create_doubles(result)
        result = {" ".join(set(k.split(" "))) :result[k] for k in result}
        combined_results.update(result)

    return combined_results

In [375]:
def is_negative_word(string):
    string = string.strip(" ")
    return string in ["0", "O", "bez"] or " ne" in f" {string}"


def is_soft_negative_word(string):
    string = string.strip(" ")
    return string in ["0", "O", "bez", "ne", "neguje", "neg"] or " negat" in f" {string}"


def make_negation_table(db):
    db = db.copy()
    db["ne"] = db["text"].apply(lambda x: any(True for word in x.split(" ") if is_soft_negative_word(word)))
    db["text"] = db["text"].apply(lambda text: " ".join(
        [x.strip(":") for x in text.split(" ") if not(is_soft_negative_word(x))]))

    return db

In [387]:
def shortcuts(string):
    result = []
    for word in string.split(" "):
        if "." in word:
            result += word.strip(" .").split(".")
    return " ".join(result)


def shortcuts_medvik_without_descr_search(string, database=offline_mshcz):
    return medvik_without_descr_search(
        string, lambda x, y: (f" {x.lower()}" in y.lower()) or
                             (f"{x.lower()}" == y.lower()[:(len(x) + 1)]), database)


def shortcuts_medvik_mistakes_without_descr_search(string, mistakes, database):
    return medvik_mistakes_search(" " + string, mistakes, database)
    

def make_shortcuts_table(db):
    db = db.copy()
    db["shortcuts"] = db["text"].apply(shortcuts)
    db["text"] = db["text"].apply(lambda x: " ".join([word for word in x.split(" ") if "." not in word]))
    return db

Interprets results

In [279]:
def interprete_explanation(dictionary):
    dictionary = {k: dictionary[k] for k in dictionary if dictionary[k] is not None}
    long_words = set()
    [k for k in dictionary if len(k.split(" ")) >= 2 and long_words.update(k.split(" "))]

    return {k : dictionary[k] for k in dictionary if k not in long_words}

In [548]:
def medvik_with_mistake_all_search_table(db, database, output_errors=False):
    db = db.copy()
    db["medvik_search"] = "N/A"
    
    for j in db.index:
        x = {}
        if db["text"][j] != "":
            x.update(search_words_mistake_bottom(
                db["text"][j], 
                lambda x: medvik_combined_without_descr_search(x, database),
                lambda x, y: medvik_mistakes_search(x, y, database=database),
                output_errors=output_errors,
                limit=0.25))
        
        if db["about"][j] != "N/A":
            x.update(search_words_mistake_bottom(
                db["about"][j],
                lambda x: medvik_combined_without_descr_search(x, osobni_a),
                lambda x, y: medvik_mistakes_search(x, y, database=database),
                output_errors=output_errors,
                limit=0.25))

        if db["shortcuts"][j] != "": 
            x.update(search_words_mistake_bottom(
                db["shortcuts"][j],
                lambda x: shortcuts_medvik_without_descr_search(x, database),
                lambda x, y: shortcuts_medvik_mistakes_without_descr_search(x, y, database=database),
                output_errors=output_errors,
                limit=0.25))
            
        db.at[j, "medvik_search"] = x

    if output_errors:
        db["medvik_search_errors"] = db["medvik_search"].apply(lambda x: {k[0]:k[1] for k in x})
        db["medvik_search"] = db["medvik_search"].apply(lambda x: {k[0]:x[k] for k in x})

    return db


def medvik_search_all_read_csv(path):
    db = pd.read_csv(path)
    db.index = db["Unnamed: 0"]
    db.drop(["Unnamed: 0"], axis=1, inplace=True)
    db["medvik_search"] = db["medvik_search"].apply(from_string_to_dict)
    db["about"] = db["about"].fillna("N/A")
    db["shortcuts"] = db["shortcuts"].fillna("")

    return db


def medvik_choose_GPT_table(db):
    db = db.copy()
    db["medvik_explanation"] = "N/A"
    
    for i in db.index:
        result = {}
        combined_level = -np.infty
        is_find = set()
        
        l = db["combined_medvik"][i]
        for text in sorted(l, key=lambda x: len(x.split(" ")), reverse=True):
            if combined_level > len(text.split(" ")) and all([w in is_find for w in text.split(" ")]):
                continue
            message = message_for_GPT(text, l[text], medvik_find_by_code, context=db["original_text"][i])
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
            if result[text] is not None:
                is_find.update(text.split(" "))
                combined_level = len(text.split(" "))

        l = db["medvik_search"][i]
        for text in l:
            if text in is_find:
                continue
            message = message_for_GPT(text, l[text], medvik_find_by_code, context=db["original_text"][i])
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
        db.at[i, "medvik_explanation"] = result
        
    return db


def medvik_explanation_all_read_csv(path):
    db = pd.read_csv(path)
    db.index = db["Unnamed: 0"]
    db.drop(["Unnamed: 0"], axis=1, inplace=True)
    db["medvik_search"] = db["medvik_search"].apply(from_string_to_dict)
    db["combined_medvik"] = db["combined_medvik"].apply(from_string_to_dict)
    db["medvik_explanation"] = db["medvik_explanation"].fillna("{}").apply(from_string_to_dict_to_tuple)
    db["about"] = db["about"].fillna("N/A")
    db["shortcuts"] = db["shortcuts"].fillna("")

    return db

### Label procedura 01

In [66]:
improved_procedura = data_improved_punctuation[(data_improved_punctuation["label"] == "procedura")].copy()
print("Number of 'procedura' label:", len(improved_procedura))
procedura_test = improved_procedura.sample(35, random_state=34)
procedura_test.head(3)

Number of 'procedura' label: 596


Unnamed: 0,label,text,original_text,about
4791,procedura,genet. testování,genet. testování,
1223,procedura,parciální mastektomie,parciální mastektomie,
108,procedura,rekonstrukce,rekonstrukce,


#### Linking and linking with mistake

In [67]:
procedury = []
for child in offline_mshcz:
    try:
        d = [i for i in child.findall("{http://www.loc.gov/MARC21/slim}datafield") if i.attrib["tag"] == "072" ][0][0].text
        if d is not None and d[0] == "E":
            procedury.append(child)
    except IndexError:
        pass                
print(len(procedury))

if os.path.isfile("saved_search/procedura_test.csv"):
    procedura_test = pd.read_csv("saved_search/procedura_test.csv")
    procedura_test.index = procedura_test["Unnamed: 0"]
    procedura_test.drop(["Unnamed: 0"], axis=1, inplace=True)
    procedura_test["mash_search"] = procedura_test["mash_search"].apply(from_string_to_dict)
    procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(from_string_to_dict)

else:
    procedura_test = improved_procedura.sample(35, random_state=34)
    procedura_test["mash_search"] = "N/A"
    for j in procedura_test.index:
        procedura_test.at[j, "mash_search"] = search_words_bottom(procedura_test["text"][j], mash_search)
        if procedura_test["about"][j] != "N/A":
            x = procedura_test["mash_search"][j]
            x.update(search_words_bottom(procedura_test["about"][j], mash_search))
            procedura_test.at[j, "mash_search"] = x
        
    procedura_test["medvik_search"] = "N/A"
    for j in procedura_test.index:
        procedura_test.at[j, "medvik_search"] = search_words_bottom(procedura_test["text"][j], lambda x: medvik_combined_without_descr_search(x, procedury))
        if procedura_test["about"][j] != "N/A":
            x = procedura_test["medvik_search"][j]
            x.update(search_words_bottom(procedura_test["about"][j], lambda x: medvik_combined_without_descr_search(x, procedury)))
            procedura_test.at[j, "medvik_search"] = x

    procedura_test.to_csv("saved_search/procedura_test.csv")

procedura_test.head(3)

3019


Unnamed: 0_level_0,label,text,original_text,about,mash_search,medvik_search
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4791,procedura,genet. testování,genet. testování,,"{'genet.': [('C0325074', 'Genets'), ('C0032246...","{'testování': [('D000067108', 'DTC screening a..."
1223,procedura,parciální mastektomie,parciální mastektomie,,"{'parciální': [('C0020608', 'Hypodontia'), ('C...","{'parciální': [('D010314', 'parciální trombopl..."
108,procedura,rekonstrukce,rekonstrukce,,"{'rekonstrukce': [('C0085076', 'Mammaplasty'),...","{'rekonstrukce': [('D000070638', 'rekonstrukce..."


In [68]:
# for j in procedura_test.index:
#     procedura_test.at[j, "mash_search"] = add_all_searches(procedura_test["text"][j], procedura_test["mash_search"][j])
#     procedura_test.at[j, "medvik_search"] = add_all_searches(procedura_test["text"][j], procedura_test["medvik_search"][j])

# for j in procedura_test.index:
#     procedura_test.at[j, "medvik_search"] = medvik_find_mistakes_search(procedura_test["medvik_search"][j], test=False, database=procedury)

find_with_mistake = [('genet.', 1), ('nechemo', 1), ('adjuv.', 1), ('CT1', 1), ('epigastria', 3), ('disekci', 1),
            ('prsou', 1), ('mastektomii', 1), ('vpravo', 1), ('vyšších', 2), ('etáží', 1), ('parc.', 1), ('SNb', 1),
            ('l.dx', 2), ('st.p.NACT', 4), ('vpravo', 1), ('BRCA1', 2), ('BRCA2', 2), ('PALB2', 2), ('CHEK2', 2), ('TP53', 2),
            ('dalších', 2), ('parc.', 1), ('SNB', 1), ('adjuvantním',              1), ('simplexní', 2), ('vlevo', 1), ('parc.', 1),
            ('mastektomii', 1), ('SNB', 1), ('bilat.', 1), ('SNB', 1), ('vlevo', 1), ('mastectomia', 2), ('simplex', 1),
            ('mammae', 1), ('l.', 1), ('etáži', 1)]
print(find_with_mistake)

[('genet.', 1), ('nechemo', 1), ('adjuv.', 1), ('CT1', 1), ('epigastria', 3), ('disekci', 1), ('prsou', 1), ('mastektomii', 1), ('vpravo', 1), ('vyšších', 2), ('etáží', 1), ('parc.', 1), ('SNb', 1), ('l.dx', 2), ('st.p.NACT', 4), ('vpravo', 1), ('BRCA1', 2), ('BRCA2', 2), ('PALB2', 2), ('CHEK2', 2), ('TP53', 2), ('dalších', 2), ('parc.', 1), ('SNB', 1), ('adjuvantním', 1), ('simplexní', 2), ('vlevo', 1), ('parc.', 1), ('mastektomii', 1), ('SNB', 1), ('bilat.', 1), ('SNB', 1), ('vlevo', 1), ('mastectomia', 2), ('simplex', 1), ('mammae', 1), ('l.', 1), ('etáži', 1)]


#### Combining words

In [69]:
print("Lenght_dependent:")
print_long_searches_len_dep(procedura_test["medvik_search"], limit=40)
print("\nLength_independent:")
print_long_searches(procedura_test["medvik_search"], limit=40)

Lenght_dependent:
[[('TM', 86, 43.0)], [('do', 237, 118.5)], [('s', 129, 129.0)], [('a', 405, 405.0)], [('a', 405, 405.0)], [('a', 405, 405.0)], [('ME', 1076, 538.0)], [('ME', 1076, 538.0)], [('ME', 1076, 538.0)], [('ME', 1076, 538.0)], [('i', 2725, 2725.0), ('s', 129, 129.0)]]

Length_independent:
[[('NAC', 66)], [('HT', 70)], [('TM', 86)], [('s', 129)], [('do', 237)], [('a', 405)], [('a', 405)], [('a', 405)], [('ME', 1076)], [('ME', 1076)], [('ME', 1076)], [('ME', 1076)], [('i', 2725), ('s', 129)]]


We can notice that there is limit, where we can remove single letters or common words, which don't have medical meaning. The limit is around 50. This filtered words we will use to create doubles (or more), where we will have probably real medical terms.

In [70]:
procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(lambda x: drop_long_searches_len_dep(x, limit=40))

In [71]:
temp = procedura_test["medvik_search"].apply(lambda x: {k : x[k] for k in x if not len(k.split(" ")) > 1}).apply(combine_searches)
temp[temp != {}]

Series([], Name: medvik_search, dtype: object)

In [72]:
procedura_test["combined_medvik"] = procedura_test["medvik_search"].apply(
    lambda x: {k : x[k] for k in x if not len(k.split(" ")) > 1}).apply(combine_searches)

procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(lambda x: {k : x[k] for k in x if not len(k.split(" ")) > 1})
procedura_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,mash_search,medvik_search,combined_medvik
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4791,procedura,genet. testování,genet. testování,,"{'genet.': [('C0325074', 'Genets'), ('C0032246...","{'testování': [('D000067108', 'DTC screening a...",{}
1223,procedura,parciální mastektomie,parciální mastektomie,,"{'parciální': [('C0020608', 'Hypodontia'), ('C...","{'parciální': [('D010314', 'parciální trombopl...",{}
108,procedura,rekonstrukce,rekonstrukce,,"{'rekonstrukce': [('C0085076', 'Mammaplasty'),...","{'rekonstrukce': [('D000070638', 'rekonstrukce...",{}


#### Choosing best match

In [73]:
print_long_searches(procedura_test["medvik_search"], limit=10)

[[('rekonstrukce', 12)], [('se', 13)], [('se', 13)], [('RAME', 22)], [('amb', 25)], [('BT', 26)], [('PM', 33)], [('sono', 36), ('axil', 20)], [('NAC', 66)], [('HT', 70)]]


In [74]:
procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(lambda x: drop_long_searches(x, 40))
procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(drop_empty_searches)
procedura_test["mash_search"] = procedura_test["mash_search"].apply(drop_empty_searches)

In [75]:
if os.path.isfile("saved_search/procedura_test_explanation.csv"):
    procedura_test = pd.read_csv("saved_search/procedura_test_explanation.csv")
    procedura_test.index = procedura_test["Unnamed: 0"]
    procedura_test.drop(["Unnamed: 0"], axis=1, inplace=True)
    procedura_test["mash_search"] = procedura_test["mash_search"].apply(from_string_to_dict)
    procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(from_string_to_dict)
    procedura_test["combined_medvik"] = procedura_test["combined_medvik"].apply(from_string_to_dict)
    procedura_test["mash_explanation"] = procedura_test["mash_explanation"].fillna("{}").apply(from_string_to_dict_to_tuple)
    procedura_test["medvik_explanation"] = procedura_test["medvik_explanation"].fillna("{}").apply(from_string_to_dict_to_tuple)
    
else:
    procedura_test["medvik_explanation"] = "N/A"
    for i in procedura_test.index:
        l = procedura_test["medvik_search"][i]
        result = {}
        for text in l:
            message = message_for_GPT(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
        procedura_test.at[i, "medvik_explanation"] = result
        
    procedura_test["mash_explanation"] = "N/A"
    for i in procedura_test.index:
        l = procedura_test["mash_search"][i]
        result = {}
        for text in l:
            message = message_for_GPT(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
            
        l = procedura_test["combined_medvik"][i]
        for text in l:
            message = message_for_GPT(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
        
        procedura_test.at[i, "mash_explanation"] = result
    procedura_test.to_csv("saved_search/procedura_test_explanation.csv")

procedura_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,mash_search,medvik_search,combined_medvik,medvik_explanation,mash_explanation
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4791,procedura,genet. testování,genet. testování,,"{'genet.': [('C0325074', 'Genets'), ('C0032246...","{'testování': [('D000067108', 'DTC screening a...",{},"{'testování': ('D000086742', 'testování na COV...","{'genet.': ('C0314603', 'Genetic', ''), 'testo..."
1223,procedura,parciální mastektomie,parciální mastektomie,,"{'parciální': [('C0020608', 'Hypodontia'), ('C...","{'parciální': [('D010314', 'parciální trombopl...",{},"{'parciální': None, 'mastektomie': ('D015409',...","{'parciální': ('C0080024', 'Piebaldism', ''), ..."
108,procedura,rekonstrukce,rekonstrukce,,"{'rekonstrukce': [('C0085076', 'Mammaplasty'),...","{'rekonstrukce': [('D000070638', 'rekonstrukce...",{},"{'rekonstrukce': ('D001178', 'artroplastika', ...","{'rekonstrukce': ('C0195196', 'Reconstruction ..."


#### Results

In [76]:
asign = []
for j in procedura_test.sample(35, random_state=42).index:   
    if procedura_test["medvik_explanation"][j] is not None:
        x = procedura_test["medvik_explanation"][j]
        asign.append((procedura_test["text"][j], "Medvik", "{}".format([(e, x[e]) for e in x])))
    if procedura_test["mash_explanation"][j] is not None:
        x = procedura_test["mash_explanation"][j]
        asign.append((procedura_test["text"][j], "Mash", "{}".format([(e, x[e]) for e in x])))
    if procedura_test["mash_explanation"][j] is None and procedura_test["medvik_explanation"][j] is None:
        asign.append((procedura_test["text"][j], "None", "Empty"))

i = 0
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = ""
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)

In [77]:
print("Number of examples: {}".format(len(procedura_test)))

print("Number of empty Linking for Medvik_combined_search: {}".format(procedura_test["medvik_search"].apply(lambda x: len(x) == 0).sum()))
print("Number of empty Linking for Mash_search: {}".format(procedura_test["mash_search"].apply(lambda x: len(x) == 0).sum()))

Number of examples: 35
Number of empty Linking for Medvik_combined_search: 4
Number of empty Linking for Mash_search: 3


In [78]:
print("Not assigned medvik:", len([i for i in asign if i[1] == "Medvik" and i[3] == ""]))
print("Mistakes from medvik:", len([i for i in asign if i[3] == 'Wrong' and i[1] == "Medvik"]))
print("Partially right from medvik:", len([i for i in asign if i[3] == 'Partially' and i[1] == "Medvik"]))
print("Right from medvik:", len([i for i in asign if i[3] == 'Right' and i[1] == "Medvik"]))

print("\nNot assigned mash:", len([i for i in asign if i[1] == "Mash" and i[3] == ""]))
print("Mistakes from mash:", len([i for i in asign if i[3] == 'Wrong' and i[1] == "Mash"]))
print("Partially right from mash:", len([i for i in asign if i[3] == 'Partially' and i[1] == "Mash"]))
print("Right from mash:", len([i for i in asign if i[3] == 'Right' and i[1] == "Mash"]))

Not assigned medvik: 5
Mistakes from medvik: 10
Partially right from medvik: 17
Right from medvik: 3

Not assigned mash: 3
Mistakes from mash: 21
Partially right from mash: 9
Right from mash: 2


- Medvik Right: 9 %
- Medvik Partially: 57 %

- Mash Right: 6 %
- Mash Partially: 31 %

Problems
- shorcuts
- too afraid to say don't know
- Lots of parts of body, don't recognize them.

Good
- All combined words were right

### Label procedura 02

In [79]:
procedura_test2 = improved_procedura.sample(35, random_state=123)       
procedura_test2.head(3)

Unnamed: 0,label,text,original_text,about
2769,procedura,z core biopsie,z core biopsie,
45,procedura,krve ve stolici hemokult test,krve ve stolici - hemokult test,
2517,procedura,adjuvance,adjuvance,


#### Linking with mistakes

In [80]:
procedury_anathomy = []
for child in offline_mshcz:
    try:
        d = [i for i in child.findall("{http://www.loc.gov/MARC21/slim}datafield") if i.attrib["tag"] == "072" ][0][0].text
        if d is not None and d[0] in ["E", "A"]:
            procedury_anathomy.append(child)
    except IndexError:
        pass                
print(len(procedury_anathomy))

4915


In [81]:
if os.path.isfile("saved_search/procedura_test2.csv"):
    procedura_test2 = pd.read_csv("saved_search/procedura_test2.csv")
    procedura_test2.index = procedura_test2["Unnamed: 0"]
    procedura_test2.drop(["Unnamed: 0"], axis=1, inplace=True)
    procedura_test2["medvik_search"] = procedura_test2["medvik_search"].apply(from_string_to_dict)

else:
    procedura_test2["medvik_search"] = "N/A"
    for j in procedura_test2.index:
        procedura_test2.at[j, "medvik_search"] = search_words_mistake_bottom(
            procedura_test2["text"][j], lambda x: medvik_combined_without_descr_search(x, procedury_anathomy), 
                                        lambda x, y: medvik_mistakes_search(x, y, database=procedury_anathomy))
        
        if procedura_test2["about"][j] != "N/A":
            x = procedura_test2["medvik_search"][j]
            x.update(search_words_mistake_bottom(procedura_test2["text"][j],
                                              lambda x: medvik_combined_without_descr_search(x, procedury_anathomy), 
                                              lambda x: medvik_find_mistakes_search(x, database=procedury_anathomy)))
            procedura_test2.at[j, "medvik_search"] = x

    procedura_test2.to_csv("saved_search/procedura_test2.csv")

procedura_test2.head(3)

Unnamed: 0_level_0,label,text,original_text,about,medvik_search
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2769,procedura,z core biopsie,z core biopsie,,"{'z': [('D000071936', 'výtok z bradavky'), ('D..."
45,procedura,krve ve stolici hemokult test,krve ve stolici - hemokult test,,"{'krve': [('D000071997', 'kultivační vyšetření..."
2517,procedura,adjuvance,adjuvance,,"{'adjuvance': [('D017358', 'skelet buněčné stě..."


In [82]:
print("Lenght_dependent:")
print_long_searches_len_dep(procedura_test2["medvik_search"], limit=30)
print("\nLength_independent:")
print_long_searches(procedura_test2["medvik_search"], limit=50)

procedura_test2["medvik_search"] = procedura_test2["medvik_search"].apply(lambda x: drop_long_searches_len_dep(x, 40))

Lenght_dependent:
[[('test', 129, 32.25)], [('sin.', 136, 34.0), ('I', 4470, 4470.0)], [('z', 42, 42.0)], [('z', 42, 42.0), ('v', 136, 136.0)], [('oper', 173, 43.25), ('bilt', 295, 73.75)], [('sin', 135, 45.0)], [('SNB', 722, 240.66666666666666), ('sin.', 136, 34.0), ('No', 77, 38.5), ('I', 4470, 4470.0)], [('SNB', 722, 240.66666666666666)], [('op.', 1378, 459.3333333333333)], [('USG', 2178, 726.0), ('a', 609, 609.0)], [('USG', 2178, 726.0)], [('RT', 1472, 736.0)], [('RT', 1472, 736.0), ('na', 149, 74.5)], [('ME', 1805, 902.5), ('s', 140, 140.0)], [('ko', 2050, 1025.0), ('dif', 152, 50.666666666666664)]]

Length_independent:
[[('krve', 61), ('test', 129)], [('terapie', 84)], [('Cov', 86)], [('sin', 135)], [('sin.', 136), ('I', 4470)], [('v', 136)], [('oper', 173), ('bilt', 295)], [('SNB', 722), ('sin.', 136), ('No', 77), ('I', 4470)], [('SNB', 722)], [('op.', 1378)], [('RT', 1472)], [('RT', 1472), ('na', 149)], [('ME', 1805), ('s', 140)], [('ko', 2050), ('dif', 152)], [('USG', 2178), (

In [83]:
temp = procedura_test2["medvik_search"].apply(combine_searches)
temp[temp != {}]

Unnamed: 0
45      {'krve stolici': [('D001794', 'krevní tlak')],...
2520    {'oblast mammy': [('D007034', 'zadní hypotalam...
356     {'mastectomia partialis': [('D015412', 'segmen...
1046    {'adjuvantní chemobioterapi': [('D017024', 'ad...
4692    {'odběry Cov': [('D013048', 'odběr biologickéh...
468     {'konizaci čípku': [('D019092', 'konizace dělo...
30      {'radikální mastektomie': [('D008408', 'mastek...
1548    {'biopsie uzliny': [('D001706', 'biopsie'), ('...
Name: medvik_search, dtype: object

In [84]:
procedura_test2["combined_medvik"] = procedura_test2["medvik_search"].apply(combine_searches)
procedura_test2.head(3)

Unnamed: 0_level_0,label,text,original_text,about,medvik_search,combined_medvik
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2769,procedura,z core biopsie,z core biopsie,,"{'z': [], 'core': [('D000088482', 'denzní váčk...",{}
45,procedura,krve ve stolici hemokult test,krve ve stolici - hemokult test,,"{'krve': [('D000071997', 'kultivační vyšetření...","{'krve stolici': [('D001794', 'krevní tlak')],..."
2517,procedura,adjuvance,adjuvance,,"{'adjuvance': [('D017358', 'skelet buněčné stě...",{}


#### Choosing best match

In [85]:
print_long_searches(procedura_test2["medvik_search"], limit=10)

[[('adjuv', 11), ('oblast', 41), ('mammy', 46)], [('mastectomia', 11), ('partialis', 27)], [('adjuv', 11)], [('doppler', 11)], [('disekcí', 12)], [('biopsie', 15)], [('biopsie', 15)], [('biopsie', 15), ('uzliny', 11), ('axille', 33)], [('bez', 16)], [('carpl', 18)], [('2', 18)], [('se', 19)], [('trombo', 27)], [('chemobioterapi', 29)], [('chemobioterapie', 29)], [('odběry', 35), ('Cov', 86)], [('prsou', 36), ('axill', 32)], [('prsou', 36)], [('krve', 61), ('ve', 19), ('test', 129)], [('terapie', 84)], [('sin.', 136), ('No', 77)], [('sin.', 136)]]


In [86]:
procedura_test2["medvik_search"] = procedura_test2["medvik_search"].apply(lambda x: drop_long_searches(x, 65))
procedura_test2["medvik_search"] = procedura_test2["medvik_search"].apply(drop_empty_searches)

In [87]:
if os.path.isfile("saved_search/procedura_test2_explanation.csv"):
    procedura_test2 = pd.read_csv("saved_search/procedura_test2_explanation.csv")
    procedura_test2.index = procedura_test2["Unnamed: 0"]
    procedura_test2.drop(["Unnamed: 0"], axis=1, inplace=True)
    procedura_test2["medvik_search"] = procedura_test2["medvik_search"].apply(from_string_to_dict)
    procedura_test2["combined_medvik"] = procedura_test2["combined_medvik"].apply(from_string_to_dict)
    procedura_test2["medvik_explanation"] = procedura_test2["medvik_explanation"].fillna("{}").apply(from_string_to_dict_to_tuple)
    
else:
    procedura_test2["medvik_explanation"] = "N/A"
    for i in procedura_test2.index:
        l = procedura_test2["medvik_search"][i]
        result = {}
        for text in l:
            message = message_for_GPT(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
        procedura_test2.at[i, "medvik_explanation"] = result
        
        l = procedura_test2["combined_medvik"][i]
        for text in l:
            message = message_for_GPT(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
            
        procedura_test2.at[i, "medvik_explanation"] = result
    procedura_test2.to_csv("saved_search/procedura_test2_explanation.csv")

procedura_test2.head(3)

Unnamed: 0_level_0,label,text,original_text,about,medvik_search,combined_medvik,medvik_explanation
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
2769,procedura,z core biopsie,z core biopsie,,"{'core': [('D000088482', 'denzní váčky'), ('D0...",{},"{'core': None, 'biopsie': ('D001706', 'biopsie..."
45,procedura,krve ve stolici hemokult test,krve ve stolici - hemokult test,,"{'ve': [('D000067491', 'téměř pochybení ve zdr...","{'krve stolici': [('D001794', 'krevní tlak')],...","{'krve': ('D000094902', 'krevní bankovnictví',..."
2517,procedura,adjuvance,adjuvance,,"{'adjuvance': [('D017358', 'skelet buněčné stě...",{},{'adjuvance': None}


#### Results

In [88]:
asign = []
for j in procedura_test2.sample(35, random_state=42).index:   
    if procedura_test2["medvik_explanation"][j] is not None:
        x = procedura_test2["medvik_explanation"][j]
        asign.append((procedura_test2["text"][j], "Medvik", "{}".format([(e, x[e]) for e in x])))
    else:
        asign.append((procedura_test2["text"][j], "None", "Empty"))

i = 0
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Partially"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Wrong"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)
i += 1
assigment = "Right"
asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)

In [89]:
print("Number of examples: {}".format(len(procedura_test2)))
print("Number of empty Linking for Medvik_combined_search: {}".format(procedura_test2["medvik_search"].apply(lambda x: len(x) == 0).sum()))

Number of examples: 35
Number of empty Linking for Medvik_combined_search: 0


In [90]:
print("Not assigned medvik:", len([i for i in asign if i[1] == "Medvik" and i[3] == ""]))
print("Mistakes from medvik:", len([i for i in asign if i[3] == 'Wrong' and i[1] == "Medvik"]))
print("Partially right from medvik:", len([i for i in asign if i[3] == 'Partially' and i[1] == "Medvik"]))
print("Right from medvik:", len([i for i in asign if i[3] == 'Right' and i[1] == "Medvik"]))

Not assigned medvik: 0
Mistakes from medvik: 13
Partially right from medvik: 10
Right from medvik: 12


Right: 34 %

Partially or Right: 63 %

### Label procedura 03 - Solving shortcuts & Limit for mistake

In [91]:
procedura_test = improved_procedura.sample(35, random_state=9)       
procedura_test.head(3)

Unnamed: 0,label,text,original_text,about
672,procedura,SNB,SNB,
875,procedura,lokalizace TU a uzliny vlevo jod. zrno,lokalizace TU a uzliny vlevo jod. zrno,
2952,procedura,RT CHT,"RT, CHT",


In [92]:
procedura_test[procedura_test["text"].apply(lambda x: "." in x)]

Unnamed: 0,label,text,original_text,about
875,procedura,lokalizace TU a uzliny vlevo jod. zrno,lokalizace TU a uzliny vlevo jod. zrno,
625,procedura,parc. ME s disekcí axily,parc. ME s disekcí axily,
3826,procedura,adjuvantní radioterapie na oblast mammy l.sin....,adjuvantní radioterapie na oblast mammy l.sin....,
5350,procedura,2.čtení,2.čtení,
202,procedura,SNB axillae l.dx,SNB axillae l.dx,
1944,procedura,stomatolog. výkonům,stomatolog. výkonům,
1335,procedura,stp. mastectomia partialis l. sin,stp. mastectomia partialis l. sin,


In [93]:
procedura_test["shortcuts"] = procedura_test["text"].apply(shortcuts)
procedura_test["text"] = procedura_test["text"].apply(lambda x: " ".join([word for word in x.split(" ") if "." not in word]))

In [95]:
if os.path.isfile("saved_search/procedura_test3.csv"):
    procedura_test = pd.read_csv("saved_search/procedura_test3.csv")
    procedura_test.index = procedura_test["Unnamed: 0"]
    procedura_test.drop(["Unnamed: 0"], axis=1, inplace=True)
    procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(from_string_to_dict)
    procedura_test["medvik_search_errors"] = procedura_test["medvik_search_errors"].apply(from_string_to_int_dict)

else:
    procedura_test["medvik_search"] = "N/A"
    for j in procedura_test.index:
        x = {}
        if procedura_test["text"][j] != "":
            x.update(search_words_mistake_bottom(procedura_test["text"][j],
                                              lambda x: medvik_combined_without_descr_search(x, procedury_anathomy), 
                                              lambda x, y: medvik_mistakes_search(x, y, database=procedury_anathomy),
                                              output_errors=True))
        if procedura_test["about"][j] != "N/A":
            x.update(search_words_mistake_bottom(procedura_test["about"][j],
                                              lambda x: medvik_combined_without_descr_search(x, procedury_anathomy), 
                                              lambda x, y: medvik_mistakes_search(x, y, database=procedury_anathomy),
                                              output_errors=True))
        if procedura_test["shortcuts"][j] != "": 
            x.update(search_words_mistake_bottom(procedura_test["shortcuts"][j],
                                              lambda x: shortcuts_medvik_without_descr_search(x, procedury_anathomy), 
                                              lambda x, y: shortcuts_medvik_mistakes_without_descr_search(x, y, database=procedury_anathomy),
                                              output_errors=True))
        procedura_test.at[j, "medvik_search"] = x

    procedura_test["medvik_search_errors"] = procedura_test["medvik_search"].apply(lambda x: {k[0]:k[1] for k in x})
    procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(lambda x: {k[0]:x[k] for k in x})
    procedura_test.to_csv("saved_search/procedura_test3.csv")
    
procedura_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,shortcuts,medvik_search,medvik_search_errors
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
672,procedura,SNB,SNB,,,"{'SNB': [('D000067029', 'fyzický vzhled těla')...",{'SNB': 1}
875,procedura,lokalizace TU a uzliny vlevo zrno,lokalizace TU a uzliny vlevo jod. zrno,,jod,"{'lokalizace': [('D001931', 'mapování mozku')]...","{'lokalizace': 0, 'TU': 0, 'a': 0, 'uzliny': 0..."
2952,procedura,RT CHT,"RT, CHT",,,"{'RT': [('D000009', 'břišní svaly'), ('D000010...","{'RT': 0, 'CHT': 0}"


In [96]:
print_long_searches_len_dep(procedura_test["medvik_search"], limit=30)
procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(lambda x: drop_long_searches_len_dep(x, 40))

[[('výkonům', 256, 36.57142857142857)], [('klip', 155, 38.75)], [('sin', 135, 45.0), ('stp', 1545, 515.0), ('l', 1094, 1094.0)], [('na', 149, 74.5), ('a', 609, 609.0), ('l', 1094, 1094.0)], [('na', 149, 74.5)], [('na', 149, 74.5)], [('2', 81, 81.0)], [('klipu', 488, 97.6)], [('SNB', 722, 240.66666666666666)], [('SNB', 722, 240.66666666666666), ('l', 1094, 1094.0)], [('PHK', 978, 326.0)], [('TU', 1125, 562.5), ('a', 609, 609.0)], [('USG', 2178, 726.0)], [('USG', 2178, 726.0), ('a', 609, 609.0)], [('USG', 2178, 726.0)], [('RT', 1472, 736.0)], [('RT', 1472, 736.0)], [('ME', 1805, 902.5), ('s', 140, 140.0)]]


In [549]:
procedura_test["combined_medvik"] = procedura_test["medvik_search"].apply(combine_searches)
temp = procedura_test["medvik_search"].apply(combine_searches)
temp[temp != {}]

Unnamed: 0
2350    {'adjuvantní lymfatik': [('D011878', 'radioter...
4504    {'totální mastektomie': [('D015413', 'prostá m...
840     {'adjuvantní chemoterapie': [('D004358', 'farm...
1126    {'adjuv stěny': [('D017358', 'skelet buněčné s...
1911    {'indikována adjuv': [('D000084262', 'hyperter...
Name: medvik_search, dtype: object

#### Choosing best match

In [99]:
print_long_searches(procedura_test["medvik_search"], limit=30)

[[('MK', 36)], [('axil', 39)], [('oblast', 41), ('mammy', 46), ('sin', 40), ('reg', 74), ('lymf', 56)], [('oblast', 41), ('mammy', 46)], [('pME', 60)], [('genetické', 63), ('konzultaci', 39)], [('léčba', 70)], [('terapie', 84)], [('vyšetření', 113)], [('vyšetření', 113), ('chirurgické', 206)], [('vyšetření', 113)], [('klip', 155)], [('výkonům', 256)]]


In [101]:
procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(lambda x: drop_long_searches(x, 65))
procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(drop_empty_searches)

if os.path.isfile("saved_search/procedura_test3_explanation.csv"):
    procedura_test = pd.read_csv("saved_search/procedura_test3_explanation.csv")
    procedura_test.index = procedura_test["Unnamed: 0"]
    procedura_test.drop(["Unnamed: 0"], axis=1, inplace=True)
    procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(from_string_to_dict)
    procedura_test["combined_medvik"] = procedura_test["combined_medvik"].apply(from_string_to_dict)
    procedura_test["medvik_explanation"] = procedura_test["medvik_explanation"].fillna("{}").apply(from_string_to_dict_to_tuple)
    procedura_test["medvik_search_errors"] = procedura_test["medvik_search_errors"].apply(from_string_to_int_dict)

else:
    procedura_test["medvik_explanation"] = "N/A"
    for i in procedura_test.index:
        l = procedura_test["medvik_search"][i]
        result = {}
        for text in l:
            message = message_for_GPT(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
        procedura_test.at[i, "medvik_explanation"] = result
        
        l = procedura_test["combined_medvik"][i]
        for text in l:
            message = message_for_GPT(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
            
        procedura_test.at[i, "medvik_explanation"] = result
    procedura_test.to_csv("saved_search/procedura_test3_explanation.csv")

procedura_test["medvik_explanation"] = procedura_test["medvik_explanation"].apply(interprete_explanation)
procedura_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,shortcuts,medvik_search,medvik_search_errors,combined_medvik,medvik_explanation
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
672,procedura,SNB,SNB,,,{},{'SNB': 1},{},{}
875,procedura,lokalizace TU a uzliny vlevo zrno,lokalizace TU a uzliny vlevo jod. zrno,,jod,"{'lokalizace': [('D001931', 'mapování mozku')]...","{'lokalizace': 0, 'TU': 0, 'a': 0, 'uzliny': 0...",{},"{'lokalizace': ('D001931', 'mapování mozku', '..."
2952,procedura,RT CHT,"RT, CHT",,,"{'CHT': [('D000068618', 'výběrové šlechtění'),...","{'RT': 0, 'CHT': 0}",{},{'CHT': None}


#### Results

In [103]:
procedura_test["asign"] = "N/A"

i = 0
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"

# print(procedura_test["original_text"].iloc[i])
# print(procedura_test["medvik_explanation"].iloc[i])

In [104]:
procedura_test["asign"].value_counts()

Right        14
Wrong        13
Partially     8
Name: asign, dtype: int64

Right 40 % | Partially 63 %
- Bad filter
- Bad with established abbreviations (written in upper letters)

#### Testing Limit

In [105]:
print("RIGHT")
for i in procedura_test[procedura_test["asign"] == "Right"].sample(14, random_state=32).index:
    x = procedura_test["medvik_search_errors"][i]
    y = procedura_test["medvik_search"][i]
    print(procedura_test["original_text"][i])
    print(x)
    print(y)
    temp = {k : y[k] for k in y if print(k, type(x)) or x[k] != 0}
    if len(temp) != 0:
        print(temp)
    temp2 = {k: (x[k], len(k), round(x[k]/len(k), 3)) for k in x if x[k] != 0}
    if len(temp2) != 0:
        print(temp2)
    print()

print("WRONG")
for i in procedura_test[procedura_test["asign"] != "Right"].sample(10, random_state=32).index:
    x = procedura_test["medvik_search_errors"][i]
    y = procedura_test["medvik_search"][i]
    temp = {k : y[k] for k in y if x[k] != 0}
    print(procedura_test["original_text"][i])
    if len(temp) != 0:
        print(temp)
    temp2 = {k: (x[k], len(k), round(x[k]/len(k), 3)) for k in x if x[k] != 0}
    if len(temp2) != 0:
        print(temp2)
    print()

RIGHT
gynekologické vyšetření
{'gynekologické': 0, 'vyšetření': 0}
{'gynekologické': [('D000074883', 'vulvektomie'), ('D003127', 'kolposkopie'), ('D003464', 'kuldoskopie'), ('D004107', 'dilatace a kyretáž'), ('D007044', 'hysterektomie'), ('D010052', 'ovarektomie'), ('D010808', 'fyzikální vyšetření'), ('D012489', 'salpingostomie'), ('D013246', 'tubární sterilizace'), ('D013509', 'gynekologické chirurgické výkony'), ('D013519', 'urogenitální chirurgické výkony'), ('D015907', 'hysteroskopie'), ('D019093', 'cirkumcize u žen'), ('D020884', 'kolpotomie'), ('D055356', 'techniky ablace endometria'), ('D055357', 'embolizace děložní tepny'), ('D058869', 'gynekologické vyšetření'), ('D058994', 'salpingektomie'), ('D063186', 'myomektomie')]}
gynekologické <class 'dict'>

PET/CT
{'PET': 0, 'CT': 0}
{'PET': [('D000072078', 'PET/CT'), ('D049268', 'pozitronová emisní tomografie'), ('D056447', 'zooterapie')], 'CT': [('D014057', 'počítačová rentgenová tomografie')]}
PET <class 'dict'>
CT <class 'dict'>


Limit 0.25 percentage of mistakes will filter more trash and let go maximum of results results.

In [106]:
for i in procedura_test[procedura_test["asign"] != "Right"].sample(frac=0.5, random_state=16).index:
    print(procedura_test["original_text"][i])
    print(procedura_test["medvik_explanation"][i])

    y = procedura_test["medvik_search"][i]
    {k for k in y if print("\n", k, ":", y[k][:20], end="")}
    print()
                           
    print(procedura_test["medvik_search_errors"][i])
    print("\n")

zavedení klipu
{'zavedení': ('D003559', 'cystostomie', 'Chirurgické vyústění močového měchýře na povrch břišní stěnou, obv. dočasné. Cílem zákroku je umožnit odtok moči při uzávěru dolní části močového měchýře nebo močové trubice.')}

 zavedení : [('D002404', 'katetrizace'), ('D003559', 'cystostomie'), ('D003608', 'dakryocystorinostomie'), ('D004766', 'enterostomie'), ('D004946', 'ezofagostomie'), ('D005774', 'gastrostomie'), ('D008876', 'ventilace středního ucha'), ('D009403', 'perkutánní nefrostomie'), ('D010030', 'zavedení chirurgických vývodů'), ('D010613', 'faryngostomie'), ('D012599', 'sklerostomie'), ('D013514', 'chirurgie operační'), ('D013907', 'torakostomie'), ('D014139', 'tracheostomie'), ('D014519', 'ureterostomie'), ('D014546', 'katetrizace močového měchýře')]
{'zavedení': 0, 'klipu': 2}


MK
{'MK': ('D000072936', 'fitness náramky', 'Zařízení používaná k měření fyzické aktivity jako údaje ZDRAVOTNÍHO STAVU.')}

 MK : [('D000072936', 'fitness náramky'), ('D000076251', 'nosi

Lots of mistakes is there because GPT didn't know the context.

### Label Procedura 04 - Improving message for GPT

#### Test Init

In [528]:
procedura_test = improved_procedura.sample(35, random_state=8)
procedura_test = make_shortcuts_table(procedura_test)

In [529]:
if os.path.isfile("saved_search/procedura_test4.csv"):
    procedura_test = medvik_search_all_read_csv("saved_search/procedura_test4.csv")
    procedura_test["medvik_search_errors"] = procedura_test["medvik_search_errors"].apply(from_string_to_int_dict)
else:
    procedura_test = medvik_with_mistake_all_search_table(procedura_test, procedury_anathomy, output_errors=True)
    procedura_test.to_csv("saved_search/procedura_test4.csv")

procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(lambda x: drop_long_searches_len_dep(x, 40))
procedura_test["combined_medvik"] = procedura_test["medvik_search"].apply(combine_searches)
procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(lambda x: drop_long_searches(x, 65))
procedura_test["medvik_search"] = procedura_test["medvik_search"].apply(drop_empty_searches)

procedura_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,shortcuts,medvik_search,medvik_search_errors,combined_medvik
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2350,procedura,adjuvantní radioterapii na oblast mammy lymfatik,"st.p. adjuvantní radioterapii, na oblast mammy...",,st p l sin sv,"{'adjuvantní': [('D000084262', 'hypertermická ...","{'adjuvantní': 0, 'radioterapii': 0, 'na': 0, ...","{'adjuvantní lymfatik': [('D011878', 'radioter..."
2674,procedura,verifikováno MTS v axile,verifikováno MTS v axile,,,"{'axile': [('D000080886', 'spatium parapharyng...","{'verifikováno': 4, 'MTS': 1, 'v': 0, 'axile': 1}",{}
2517,procedura,adjuvance,adjuvance,,,"{'adjuvance': [('D017358', 'skelet buněčné stě...",{'adjuvance': 1},{}


#### Testing different messages to choosing by GPT
- The first message is previously used for method for linking.
- The second message has the same form as previous one, but it have additional option: if None of these.
- The last message has also the same form, but it adds also the content, where the term is.

In [526]:
if os.path.isfile("saved_search/procedura_test4_explanation_without_empty.csv"):
    procedura_without_empty = medvik_explanation_all_read_csv("saved_search/procedura_test4_explanation_without_empty.csv")
    procedura_without_empty["medvik_search_errors"] = procedura_without_empty["medvik_search_errors"].apply(from_string_to_int_dict)

else:
    procedura_without_empty = procedura_test.copy()
    procedura_without_empty["medvik_explanation"] = "N/A"
    for i in procedura_without_empty.index:
        result = {}
        combined_level = -np.infty
        is_find = set()
        l = procedura_without_empty["combined_medvik"][i]
        for text in sorted(l, key=lambda x: len(x.split(" ")), reverse=True):
            if combined_level > len(text.split(" ")) and all([w in is_find for w in text.split(" ")]):
                continue
                
            message = message_for_GPT(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
            
            if result[text] is not None:
                is_find.update(text.split(" "))
                combined_level = len(text.split(" "))

        l = procedura_without_empty["medvik_search"][i]
        for text in l:
            if text in is_find:
                continue
                
            message = message_for_GPT(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
            
        procedura_without_empty.at[i, "medvik_explanation"] = result
    procedura_without_empty.to_csv("saved_search/procedura_test4_explanation_without_empty.csv")

procedura_without_empty.head(3)

Unnamed: 0_level_0,label,text,original_text,about,shortcuts,medvik_search,medvik_search_errors,combined_medvik,medvik_explanation
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2350,procedura,adjuvantní radioterapii na oblast mammy lymfatik,"st.p. adjuvantní radioterapii, na oblast mammy...",,st p l sin sv,"{'adjuvantní': [('D000084262', 'hypertermická ...","{'adjuvantní': 0, 'radioterapii': 0, 'na': 0, ...","{'adjuvantní lymfatik': [('D011878', 'radioter...","{'adjuvantní lymfatik': None, 'oblast mammy': ..."
2674,procedura,verifikováno MTS v axile,verifikováno MTS v axile,,,"{'axile': [('D000080886', 'spatium parapharyng...","{'verifikováno': 4, 'MTS': 1, 'v': 0, 'axile': 1}",{},"{'axile': ('D001365', 'axila', 'Oblast lidskéh..."
2517,procedura,adjuvance,adjuvance,,,"{'adjuvance': [('D017358', 'skelet buněčné stě...",{'adjuvance': 1},{},"{'adjuvance': ('D017358', 'skelet buněčné stěn..."


In [525]:
if os.path.isfile("saved_search/procedura_test4_explanation_old_message.csv"):
    procedura_old_message = medvik_explanation_all_read_csv("saved_search/procedura_test4_explanation_old_message.csv")
    procedura_old_message["medvik_search_errors"] = procedura_old_message["medvik_search_errors"].apply(from_string_to_int_dict)

else:
    procedura_old_message = procedura_test.copy()
    procedura_old_message["medvik_explanation"] = "N/A"
    for i in procedura_old_message.index:
        result = {}
        combined_level = -np.infty
        is_find = set()
        l = procedura_old_message["combined_medvik"][i]
        for text in sorted(l, key=lambda x: len(x.split(" ")), reverse=True):
            if combined_level > len(text.split(" ")) and all([w in is_find for w in text.split(" ")]):
                continue
                
            message = message_for_GPT_with_empty(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
            
            if result[text] is not None:
                is_find.update(text.split(" "))
                combined_level = len(text.split(" "))

        l = procedura_old_message["medvik_search"][i]
        for text in l:
            if text in is_find:
                continue
                
            message = message_for_GPT(text, l[text], medvik_find_by_code)
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
            
        procedura_old_message.at[i, "medvik_explanation"] = result
    procedura_old_message.to_csv("saved_search/procedura_test4_explanation_old_message.csv")

procedura_old_message.head(3)

Unnamed: 0_level_0,label,text,original_text,about,shortcuts,medvik_search,medvik_search_errors,combined_medvik,medvik_explanation
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2350,procedura,adjuvantní radioterapii na oblast mammy lymfatik,"st.p. adjuvantní radioterapii, na oblast mammy...",,st p l sin sv,"{'adjuvantní': [('D000084262', 'hypertermická ...","{'adjuvantní': 0, 'radioterapii': 0, 'na': 0, ...","{'adjuvantní lymfatik': [('D011878', 'radioter...","{'adjuvantní lymfatik': None, 'oblast mammy': ..."
2674,procedura,verifikováno MTS v axile,verifikováno MTS v axile,,,"{'axile': [('D000080886', 'spatium parapharyng...","{'verifikováno': 4, 'MTS': 1, 'v': 0, 'axile': 1}",{},"{'axile': ('D000080886', 'spatium parapharynge..."
2517,procedura,adjuvance,adjuvance,,,"{'adjuvance': [('D017358', 'skelet buněčné stě...",{'adjuvance': 1},{},"{'adjuvance': ('D017358', 'skelet buněčné stěn..."


In [539]:
if os.path.isfile("saved_search/procedura_test4_explanation.csv"):
    procedura_test = medvik_explanation_all_read_csv("saved_search/procedura_test4_explanation.csv")
    procedura_test["medvik_search_errors"] = procedura_test["medvik_search_errors"].apply(from_string_to_int_dict)

else:
    procedura_test["medvik_explanation"] = "N/A"
    for i in procedura_test.index:
        result = {}
        combined_level = -np.infty
        is_find = set()
        l = procedura_test["combined_medvik"][i]
        for text in sorted(l, key=lambda x: len(x.split(" ")), reverse=True):
            if combined_level > len(text.split(" ")) and all([w in is_find for w in text.split(" ")]):
                continue
                
            message = message_for_GPT_with_empty(text, l[text], medvik_find_by_code, context=procedura_test["original_text"][i])
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
            
            if result[text] is not None:
                is_find.update(text.split(" "))
                combined_level = len(text.split(" "))

        l = procedura_test["medvik_search"][i]
        for text in l:
            if text in is_find:
                continue
                
            message = message_for_GPT_with_empty(text, l[text], medvik_find_by_code, context=procedura_test["original_text"][i])
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
            
        procedura_test.at[i, "medvik_explanation"] = result
        procedura_test.to_csv("saved_search/procedura_test4_explanation.csv")

procedura_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,shortcuts,medvik_search,medvik_search_errors,combined_medvik,medvik_explanation
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
2350,procedura,adjuvantní radioterapii na oblast mammy lymfatik,"st.p. adjuvantní radioterapii, na oblast mammy...",,st p l sin sv,"{'adjuvantní': [('D000084262', 'hypertermická ...","{'adjuvantní': 0, 'radioterapii': 0, 'na': 0, ...","{'adjuvantní lymfatik': [('D011878', 'radioter...","{'adjuvantní lymfatik': None, 'oblast mammy': ..."
2674,procedura,verifikováno MTS v axile,verifikováno MTS v axile,,,"{'axile': [('D000080886', 'spatium parapharyng...","{'verifikováno': 4, 'MTS': 1, 'v': 0, 'axile': 1}",{},"{'axile': ('D001366', 'arteria axillaris', 'Ar..."
2517,procedura,adjuvance,adjuvance,,,"{'adjuvance': [('D017358', 'skelet buněčné stě...",{'adjuvance': 1},{},"{'adjuvance': ('D017358', 'skelet buněčné stěn..."


In [534]:
procedura_without_empty["medvik_explanation"] = procedura_without_empty["medvik_explanation"].apply(interprete_explanation)
procedura_old_message["medvik_explanation"] = procedura_old_message["medvik_explanation"].apply(interprete_explanation)
procedura_test["medvik_explanation"] = procedura_test["medvik_explanation"].apply(interprete_explanation)

#### Results

In [116]:
procedura_without_empty["asign"] = "N/A"

i = 0
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Partially"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Right"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Empty"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Empty"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Partially"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Partially"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Right"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Empty"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Right"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Partially"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Right"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Right"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Empty"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Empty"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Right"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Wrong"
i += 1
j = procedura_without_empty.index[i]
procedura_without_empty.at[j, "asign"] = "Empty"

# print(procedura_without_empty["original_text"].iloc[i])
# print(procedura_without_empty["medvik_explanation"].iloc[i])

In [117]:
procedura_old_message["asign"] = "N/A"

i = 0
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Partially"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Partially"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Partially"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Empty"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Empty"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Partially"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Partially"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Right"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Empty"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Right"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Partially"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Right"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Right"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Empty"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Empty"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Right"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Wrong"
i += 1
j = procedura_old_message.index[i]
procedura_old_message.at[j, "asign"] = "Empty"

# print(procedura_old_message["original_text"].iloc[i])
# print(procedura_old_message["medvik_explanation"].iloc[i])

In [118]:
procedura_test["asign"] = "N/A"

i = 0
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Empty"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Empty"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Empty"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Empty"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Empty"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Right"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Wrong"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Partially"
i += 1
j = procedura_test.index[i]
procedura_test.at[j, "asign"] = "Empty"

# print(procedura_test["original_text"].iloc[i])
# print(procedura_test["medvik_explanation"].iloc[i])
# print()
# print(procedura_test["medvik_search"].iloc[i])
# print(procedura_test["combined_medvik"].iloc[i])

In [119]:
procedura_without_empty["asign"].value_counts()

Wrong        19
Right         6
Empty         6
Partially     4
Name: asign, dtype: int64

In [120]:
procedura_old_message["asign"].value_counts()

Wrong        18
Partially     6
Empty         6
Right         5
Name: asign, dtype: int64

In [121]:
procedura_test["asign"].value_counts()

Wrong        13
Right        10
Partially     6
Empty         6
Name: asign, dtype: int64

- Not so good results, lots terms we have not been able to link.
- We can see that, using context improves results. As we remove long nonsense, we create space for shorter which are correct.
- We can see that there is no difference between linking with non of these option and without.

### Label osobní anamnéza 

In [122]:
improved_osobni = data_improved_punctuation[(data_improved_punctuation["label"] == "osobní anamnéza")].copy()
print("Number of 'osobní anamnéza' label:", len(improved_osobni))
osobni_test = improved_osobni.sample(35, random_state=10)
osobni_test.head(3)

Number of 'osobní anamnéza' label: 214


Unnamed: 0,label,text,original_text,about
3555,osobní anamnéza,st.p. thyreoidectomiam,st.p. thyreoidectomiam,
2516,osobní anamnéza,cysta v ZHK vlevo,cysta v ZHK vlevo,
2155,osobní anamnéza,st.p. operaci močového měcýře,st.p. operaci močového měcýře,


In [123]:
osobni_test = make_shortcuts_table(osobni_test)

#### Linking with mistakes

In [124]:
osobni_a = []
for child in offline_mshcz:
    try:
        d = [i for i in child.findall("{http://www.loc.gov/MARC21/slim}datafield") if i.attrib["tag"] == "072" ][0][0].text
        if d is not None and d[0] in ["A", "C", "E"]:
            osobni_a.append(child)
    except IndexError:
        pass                

9902


In [125]:
print(len(osobni_a))

if os.path.isfile("saved_search/osobni_test.csv"):
    osobni_test = medvik_search_all_read_csv("saved_search/osobni_test.csv")
else:
    osobni_test = medvik_with_mistake_all_search_table(osobni_test, osobni_a)
    osobni_test.to_csv("saved_search/osobni_test.csv")

symptom_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,shortcuts,medvik_search
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
3555,osobní anamnéza,thyreoidectomiam,st.p. thyreoidectomiam,,st p,"{'thyreoidectomiam': [('D010280', 'paratyreoid..."
2516,osobní anamnéza,cysta v ZHK vlevo,cysta v ZHK vlevo,,,"{'cysta': [('D003560', 'cysty')], 'v': [('D000..."
2155,osobní anamnéza,operaci močového měcýře,st.p. operaci močového měcýře,,st p,"{'operaci': [('D000072656', 'lymfedém souvisej..."


In [126]:
print_long_searches_len_dep(osobni_test["medvik_search"], limit=20)
osobni_test["medvik_search"] = osobni_test["medvik_search"].apply(lambda x: drop_long_searches_len_dep(x, 70))

[[('plic', 85, 21.25)], [('onemocnění', 306, 30.6)], [('oper', 124, 31.0), ('žl', 221, 110.5)], [('typu', 161, 40.25), ('na', 301, 150.5), ('II', 135, 67.5)], [('typ', 139, 46.333333333333336)], [('příl', 317, 79.25)], [('na', 301, 150.5)], [('CHCE', 1087, 271.75)], [('v', 331, 331.0), ('ZHK', 1010, 336.6666666666667)], [('LCHCE', 1857, 371.4), ('p', 5821, 5821.0)], [('st', 1215, 607.5), ('p', 5821, 5821.0)], [('st', 1215, 607.5), ('p', 5821, 5821.0)], [('st', 1215, 607.5), ('p', 5821, 5821.0)], [('st', 1215, 607.5), ('p', 5821, 5821.0)], [('st', 1215, 607.5), ('p', 5821, 5821.0)], [('LDK', 2100, 700.0), ('st', 1215, 607.5), ('p', 5821, 5821.0)], [('stp', 3201, 1067.0)], [('hy', 2459, 1229.5)], [('op', 3394, 1697.0), ('p', 5821, 5821.0)], [('op', 3394, 1697.0), ('st', 1215, 607.5), ('p', 5821, 5821.0)], [('ME', 3566, 1783.0), ('s', 406, 406.0), ('st', 1215, 607.5), ('p', 5821, 5821.0)], [('i', 9160, 9160.0), ('v', 3007, 3007.0), ('s', 6049, 6049.0)]]


In [127]:
temp = osobni_test["medvik_search"].apply(combine_searches)
temp[temp != {}]

Unnamed: 0
2155    {'močového měcýře': [('D001750', 'neurogenní m...
1167    {'DM typu': [('D003920', 'diabetes mellitus')]...
3081    {'štítné strumu': [('D013959', 'nemoci štítné ...
4972    {'konisace čípku': [('D019092', 'konizace dělo...
1344    {'embolie plic': [('D011655', 'plicní embolie')]}
4630    {'konizaci čípku': [('D019092', 'konizace dělo...
2428    {'arteriální hypertenze': [('D001794', 'krevní...
2690    {'ulcerozní kolitis': [('D003092', 'kolitida')...
Name: medvik_search, dtype: object

In [128]:
osobni_test["combined_medvik"] = osobni_test["medvik_search"].apply(combine_searches)
osobni_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,shortcuts,medvik_search,combined_medvik
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
3555,osobní anamnéza,thyreoidectomiam,st.p. thyreoidectomiam,,st p,"{'thyreoidectomiam': [('D010280', 'paratyreoid...",{}
2516,osobní anamnéza,cysta v ZHK vlevo,cysta v ZHK vlevo,,,"{'cysta': [('D003560', 'cysty')], 'v': [], 'ZH...",{}
2155,osobní anamnéza,operaci močového měcýře,st.p. operaci močového měcýře,,st p,"{'operaci': [('D000072656', 'lymfedém souvisej...","{'močového měcýře': [('D001750', 'neurogenní m..."


#### Choosing best match

In [129]:
print_long_searches(osobni_test["medvik_search"], limit=30)

[[('hepatodystrofie', 33)], [('arteriální', 40)], [('močového', 42), ('měcýře', 34)], [('leukémií', 44)], [('kolitis', 46)], [('benigní', 50)], [('vrozená', 53)], [('hypretenze', 53)], [('operace', 58), ('ruky', 34)], [('srsti', 59)], [('plic', 85)], [('strumu', 87), ('oper', 124)], [('typ', 139)], [('typu', 161), ('dietě', 42), ('II', 135)], [('onemocnění', 306)]]


In [130]:
osobni_test["medvik_search"] = osobni_test["medvik_search"].apply(lambda x: drop_long_searches(x, 90))
osobni_test["medvik_search"] = osobni_test["medvik_search"].apply(drop_empty_searches)

In [131]:
if os.path.isfile("saved_search/osobni_test_explanation.csv"):
    osobni_test = pd.read_csv("saved_search/osobni_test_explanation.csv")
    osobni_test.index = osobni_test["Unnamed: 0"]
    osobni_test.drop(["Unnamed: 0"], axis=1, inplace=True)
    osobni_test["medvik_search"] = osobni_test["medvik_search"].apply(from_string_to_dict)
    osobni_test["combined_medvik"] = osobni_test["combined_medvik"].apply(from_string_to_dict)
    osobni_test["medvik_explanation"] = osobni_test["medvik_explanation"].fillna("{}").apply(from_string_to_dict_to_tuple)
    osobni_test["about"] = osobni_test["about"].fillna("N/A")
    osobni_test["shortcuts"] = osobni_test["shortcuts"].fillna("")

else:
    osobni_test["medvik_explanation"] = "N/A"
    for i in osobni_test.index:
        result = {}
        combined_level = -np.infty
        is_find = set()
        l = osobni_test["combined_medvik"][i]
        for text in sorted(l, key=lambda x: len(x.split(" ")), reverse=True):
            if combined_level > len(text.split(" ")) and all([w in is_find for w in text.split(" ")]):
                continue
                
            message = message_for_GPT(text, l[text], medvik_find_by_code, context=osobni_test["original_text"][i])
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
            
            if result[text] is not None:
                is_find.update(text.split(" "))
                combined_level = len(text.split(" "))

        l = osobni_test["medvik_search"][i]
        for text in l:
            if text in is_find:
                continue
                
            message = message_for_GPT(text, l[text], medvik_find_by_code, context=osobni_test["original_text"][i])
            response = send_to_GPT(message)
            result[text] = from_GPT(response, l[text], medvik_find_by_code)
            
        osobni_test.at[i, "medvik_explanation"] = result
        osobni_test.to_csv("saved_search/osobni_test_explanation.csv")

osobni_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,shortcuts,medvik_search,combined_medvik,medvik_explanation,asign
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
3555,osobní anamnéza,thyreoidectomiam,st.p. thyreoidectomiam,,st p,"{'thyreoidectomiam': [('D010280', 'paratyreoid...",{},"{'thyreoidectomiam': ('D013965', 'tyreoidektom...",Right
2516,osobní anamnéza,cysta v ZHK vlevo,cysta v ZHK vlevo,,,"{'cysta': [('D003560', 'cysty')], 'vlevo': [('...",{},"{'cysta': ('D003560', 'cysty', 'Patologická du...",Partially
2155,osobní anamnéza,operaci močového měcýře,st.p. operaci močového měcýře,,st p,"{'operaci': [('D000072656', 'lymfedém souvisej...","{'močového měcýře': [('D053201', 'hyperaktivní...","{'močového měcýře': ('D001743', 'močový měchýř...",Partially


In [132]:
osobni_test["medvik_explanation"] = osobni_test["medvik_explanation"].apply(interprete_explanation)

#### Results

In [133]:
osobni_test["asign"] = "N/A"

i = 0
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Partially"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Partially"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Partially"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Partially"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Wrong"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Empty"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Partially"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Partially"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Wrong"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Wrong"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Partially"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Partially"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Empty"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Partially"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Empty"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Partially"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Wrong"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Empty"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Empty"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"
i += 1
j = osobni_test.index[i]
osobni_test.at[j, "asign"] = "Right"

# print(osobni_test["original_text"].iloc[i])
# print(osobni_test["medvik_explanation"].iloc[i])
# print()
# print(osobni_test["medvik_search"].iloc[i])
# print(osobni_test["combined_medvik"].iloc[i])

In [134]:
osobni_test["asign"].value_counts()

Right        16
Partially    10
Empty         5
Wrong         4
Name: asign, dtype: int64

Right: 46 % | Partially or Right: 74 % | Empty: 14 %

### Label symptom

In [282]:
improved_symptom = data_improved_punctuation[(data_improved_punctuation["label"] == "symptom")].copy()
print("Number of 'symptom' label:", len(improved_symptom))
symptom_test = improved_symptom.sample(35, random_state=1)
symptom_test = make_shortcuts_table(symptom_test)

symptom_test.head(5)

Number of 'symptom' label: 587


Unnamed: 0,label,text,original_text,about
5955,symptom,vpravo prominující tu léze v rozhraní HKK,vpravo prominující tu léze v rozhraní HKK,
2707,symptom,mutace v genu NBN,mutace v genu NBN,
182,symptom,vpravo vtažení kůže vysoko v ZHK laterálně,vpravo vtažení kůže vysoko v ZHK laterálně,
5045,symptom,nevolnosti,nevolnosti,
5719,symptom,karieznímu a defktnímu chrupu,karieznímu a defktnímu chrupu,


#### Linking with mistakes

In [284]:
symptomy = []
for child in offline_mshcz:
    try:
        d = [i for i in child.findall("{http://www.loc.gov/MARC21/slim}datafield") if i.attrib["tag"] == "072" ][0][0].text
        if d is not None and d[0] in ["A", "G", "C", "D", "E"]:
            symptomy.append(child)
    except IndexError:
        pass                
print(len(symptomy))

22086


In [285]:
if os.path.isfile("saved_search/symptom_test.csv"):
    symptom_test = medvik_search_all_read_csv("saved_search/symptom_test.csv")
else:
    symptom_test = medvik_with_mistake_all_search_table(symptom_test, symptomy)
    symptom_test.to_csv("saved_search/symptom_test.csv")

symptom_test.head(3)

Unnamed: 0,label,text,original_text,about,shortcuts,medvik_search
5955,symptom,vpravo prominující tu léze v rozhraní HKK,vpravo prominující tu léze v rozhraní HKK,,,"{'vpravo': [('D000069584', 'jednostranný karci..."
2707,symptom,mutace v genu NBN,mutace v genu NBN,,,"{'mutace': [('D000067552', 'akumulace mutací')..."
182,symptom,vpravo vtažení kůže vysoko v ZHK laterálně,vpravo vtažení kůže vysoko v ZHK laterálně,,,"{'vpravo': [('D000069584', 'jednostranný karci..."


In [291]:
print_long_searches_len_dep(symptom_test["medvik_search"], limit=20)
symptom_test["medvik_search"] = symptom_test["medvik_search"].apply(lambda x: drop_long_searches_len_dep(x, 30))

[[('lož', 174, 58.0)], [('neostře', 643, 91.85714285714286)], [('k', 163, 163.0)], [('mamy', 740, 185.0)], [('7mm', 1269, 423.0), ('vel', 70, 23.333333333333332)], [('v', 481, 481.0), ('HKK', 708, 236.0)], [('v', 481, 481.0), ('NBN', 7622, 2540.6666666666665)], [('v', 481, 481.0), ('ZHK', 1375, 458.3333333333333)], [('v', 481, 481.0), ('HKK', 708, 236.0)], [('v', 481, 481.0), ('obl', 208, 69.33333333333333)], [('v', 481, 481.0), ('až', 227, 113.5), ('do', 2028, 1014.0)], [('DKK', 2230, 743.3333333333334)], [('dkk', 2230, 743.3333333333334)], [('dkk', 2230, 743.3333333333334)], [('s', 964, 964.0), ('lemem', 338, 67.6)], [('s', 964, 964.0)], [('a', 2766, 2766.0)]]


In [292]:
temp = symptom_test["medvik_search"].apply(combine_searches)
temp[temp != {}]

2707    {'mutace genu': [('D013489', 'suprese genetick...
182     {'vpravo laterálně': [('D000069584', 'jednostr...
2743    {'pozitivní resekční': [('D000072662', 'resekč...
4812    {'dosahuje bazi': [('D019465', 'kraniofaciální...
2685    {'pozitivní resekční': [('D000072662', 'resekč...
4716    {'dosahuje krátkém': [('D000074583', 'dehydrog...
Name: medvik_search, dtype: object

In [293]:
symptom_test["combined_medvik"] = symptom_test["medvik_search"].apply(combine_searches)
symptom_test.head(3)

Unnamed: 0,label,text,original_text,about,shortcuts,medvik_search,combined_medvik
5955,symptom,vpravo prominující tu léze v rozhraní HKK,vpravo prominující tu léze v rozhraní HKK,,,"{'vpravo': [('D000069584', 'jednostranný karci...",{}
2707,symptom,mutace v genu NBN,mutace v genu NBN,,,"{'mutace': [('D000067552', 'akumulace mutací')...","{'mutace genu': [('D013489', 'suprese genetick..."
182,symptom,vpravo vtažení kůže vysoko v ZHK laterálně,vpravo vtažení kůže vysoko v ZHK laterálně,,,"{'vpravo': [('D000069584', 'jednostranný karci...","{'vpravo laterálně': [('D000069584', 'jednostr..."


#### Choosing best match

In [294]:
print_long_searches(symptom_test["medvik_search"], limit=20)

[[('léze', 21)], [('vtažená', 21)], [('satelit', 25), ('vel', 70)], [('aktivních', 26)], [('mutace', 27), ('genu', 65)], [('fibrózní', 27)], [('cysty', 30), ('ledvin', 89)], [('kraniodorsální', 32)], [('parciální', 35)], [('vysoko', 38), ('laterálně', 48)], [('mezi', 51), ('stac', 80)], [('vlevo', 56)], [('jater', 71)], [('dosahuje', 88), ('bazi', 62)], [('dosahuje', 88), ('krátkém', 21)], [('bolesi', 101)]]


In [297]:
symptom_test["medvik_search"] = symptom_test["medvik_search"].apply(lambda x: drop_long_searches(x, 80))
symptom_test["medvik_search"] = symptom_test["medvik_search"].apply(drop_empty_searches)

if os.path.isfile("saved_search/symptom_test_explanation.csv"):
    symptom_test = medvik_explanation_all_read_csv("saved_search/symptom_test_explanation.csv")
else:
    symptom_test = medvik_choose_GPT_table(symptom_test)
    symptom_test.to_csv("saved_search/symptom_test_explanation.csv")

symptom_test.head(3)

Unnamed: 0,label,text,original_text,about,shortcuts,medvik_search,combined_medvik,medvik_explanation
5955,symptom,vpravo prominující tu léze v rozhraní HKK,vpravo prominující tu léze v rozhraní HKK,,,"{'vpravo': [('D000069584', 'jednostranný karci...",{},"{'vpravo': ('D006333', 'srdeční selhání', 'Sta..."
2707,symptom,mutace v genu NBN,mutace v genu NBN,,,"{'mutace': [('D000067552', 'akumulace mutací')...","{'mutace genu': [('D013489', 'suprese genetick...","{'mutace genu': ('D009154', 'mutace', 'Jakákol..."
182,symptom,vpravo vtažení kůže vysoko v ZHK laterálně,vpravo vtažení kůže vysoko v ZHK laterálně,,,"{'vpravo': [('D000069584', 'jednostranný karci...","{'vpravo laterálně': [('D000069584', 'jednostr...","{'vpravo laterálně': None, 'vpravo': ('D055456..."


In [298]:
symptom_test["medvik_explanation"] = symptom_test["medvik_explanation"].apply(interprete_explanation)

#### Results

In [343]:
symptom_test["asign"] = "N/A"

i = 0
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Right"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Partially"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Partially"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Partially"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Partially"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Partially"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Partially"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Empty"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Partially"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Partiallly"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Empty"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Right"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Empty"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Right"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Partially"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Empty"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Right"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Empty"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Right"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Right"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Right"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Right"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Right"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = symptom_test.index[i]
symptom_test.at[j, "asign"] = "Partially"

# print(symptom_test["original_text"].iloc[i])
# print(symptom_test["medvik_explanation"].iloc[i])
# print()
# print(symptom_test["medvik_search"].iloc[i])

In [344]:
symptom_test["asign"].value_counts()

Wrong         11
Right          9
Partially      9
Empty          5
Partiallly     1
Name: asign, dtype: int64

Right: 26 % | Partially: 51 % | Empty 15 %

Observations:
- Text with lot of words, mostly not in base form.
- Lots of shortcuts
- Lot of not medical specific words - will help extracting potential entities

### Label NE osobní anamnéza

In [397]:
improved_ne_osobni = data_improved_punctuation[(data_improved_punctuation["label"] == "NE osobní anamnéza")].copy()
print("Number of 'NE osobní label' label:", len(improved_ne_osobni))

Number of 'NE osobní label' label: 61


In [398]:
print( set( [x[0] for x in improved_ne_osobni["text"].apply(
                        lambda x: [y for y in x.split(" ") if is_negative_word(y)])
        if len(x) >= 1 and (len(x) >= 2 and print(x) or True)] ))

['O', 'O']
{'ne', 'negativní', 'O', 'negat', 'neurologická', 'bez', 'neléčí', 'negat.', 'neuvádí', 'neguje', 'nebere', '0'}


In [383]:
print( set( [x[0] for x in improved_ne_osobni["text"].apply(
                        lambda x: [y for y in x.split(" ") if is_soft_negative_word(y)])
        if len(x) >= 1 and (len(x) >= 2 and print(x) or True)] ))

['O', 'O']
{'ne', 'negativní', 'O', 'negat', 'bez', 'negat.', 'neguje', '0'}


In [399]:
ne_osobni_test = improved_ne_osobni.sample(35, random_state=25)
ne_osobni_test.head(3)

Unnamed: 0,label,text,original_text,about
759,NE osobní anamnéza,HAK nebere,HAK nebere,
862,NE osobní anamnéza,transfuze 0,transfuze 0,
923,NE osobní anamnéza,ITP,ITP,


In [400]:
ne_osobni_test["ne"] = False
for i in ne_osobni_test.index:
    words_split = ne_osobni_test["text"][i].split(" ")
    if any([True for x in words_split if is_soft_negative_word(x)]):
        ne_osobni_test.at[i, "ne"] = True

ne_osobni_test["text"] = ne_osobni_test["text"].apply(lambda text: " ".join([x.strip(":") for x in text.split(" ") if not(is_soft_negative_word(x))]))
ne_osobni_test.head(3)

Unnamed: 0,label,text,original_text,about,ne
759,NE osobní anamnéza,HAK nebere,HAK nebere,,False
862,NE osobní anamnéza,transfuze,transfuze 0,,True
923,NE osobní anamnéza,ITP,ITP,,False


In [401]:
ne_osobni_test = make_shortcuts_table(ne_osobni_test)

#### Linking with mistake

In [402]:
print(len(osobni_a))

if os.path.isfile("saved_search/ne_osobni_test.csv"):
    ne_osobni_test = medvik_search_all_read_csv("saved_search/ne_osobni_test.csv")
else:
    ne_osobni_test = medvik_with_mistake_all_search_table(ne_osobni_test, osobni_a)
    ne_osobni_test.to_csv("saved_search/ne_osobni_test.csv")

ne_osobni_test.head(3)

9902


Unnamed: 0,label,text,original_text,about,ne,shortcuts,medvik_search
759,NE osobní anamnéza,HAK nebere,HAK nebere,,False,,"{'HAK': [('D001035', 'afakie'), ('D001036', 'a..."
862,NE osobní anamnéza,transfuze,transfuze 0,,True,,"{'transfuze': [('D000087526', 'podvázání pupeč..."
923,NE osobní anamnéza,ITP,ITP,,False,,"{'ITP': [('D000007', 'poranění břicha'), ('D00..."


In [405]:
print_long_searches_len_dep(ne_osobni_test["medvik_search"], limit=20)
ne_osobni_test["medvik_search"] = ne_osobni_test["medvik_search"].apply(lambda x: drop_long_searches_len_dep(x, limit=70))

[[('onemocnění', 306, 30.6)], [('onemocnění', 306, 30.6)], [('onemocnění', 306, 30.6), ('a', 1296, 1296.0)], [('onemocnění', 306, 30.6)], [('onemocnění', 306, 30.6)], [('je', 757, 378.5)], [('s', 406, 406.0), ('se', 70, 35.0)], [('CA', 2823, 1411.5), ('kompl', 235, 47.0)], [('ITP', 6029, 2009.6666666666667)], [('ITP', 6029, 2009.6666666666667)], [('PE', 4024, 2012.0)], [('fce', 6322, 2107.3333333333335), ('syst', 617, 154.25)]]


In [406]:
temp = ne_osobni_test["medvik_search"].apply(combine_searches)
ne_osobni_test["combined_medvik"] = ne_osobni_test["medvik_search"].apply(combine_searches)
temp[temp != {}]

2193    {'potraty spont': [('D000022', 'samovolný potr...
1882    {'gynekologická onemocnění': [('D000091662', '...
2656    {'gynekologická onemocnění': [('D000091662', '...
1687    {'COVID 19': [('D000086742', 'testování na COV...
4231    {'onemocnění jater': [('D004443', 'echinokokóz...
964     {'ničím se': [('D000077299', 'nozokomiální pne...
961     {'gynekologické operace': [('D013519', 'urogen...
2536    {'neurologická onemocnění': [('D009422', 'nemo...
367     {'gynekologická onemocnění': [('D000091662', '...
Name: medvik_search, dtype: object

#### Choosing best match

In [407]:
print_long_searches(ne_osobni_test["medvik_search"], limit=50)

[[('operace', 58)], [('operace', 58)], [('operace', 58)], [('operace', 58)], [('se', 70)], [('kompl', 235)], [('onemocnění', 306)], [('onemocnění', 306)], [('onemocnění', 306), ('jater', 66), ('ledvin', 85)], [('onemocnění', 306)], [('onemocnění', 306)]]


In [409]:
ne_osobni_test["medvik_search"] = ne_osobni_test["medvik_search"].apply(lambda x: drop_long_searches(x, 90))
ne_osobni_test["medvik_search"] = ne_osobni_test["medvik_search"].apply(drop_empty_searches)

if os.path.isfile("saved_search/ne_osobni_test_explanation.csv"):
    ne_osobni_test = medvik_explanation_all_read_csv("saved_search/ne_osobni_test_explanation.csv")
else:
    ne_osobni_test = medvik_choose_GPT_table(ne_osobni_test)
    ne_osobni_test.to_csv("saved_search/ne_osobni_test_explanation.csv")

ne_osobni_test["medvik_explanation"] = ne_osobni_test["medvik_explanation"].apply(interprete_explanation)
ne_osobni_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,ne,shortcuts,medvik_search,combined_medvik,medvik_explanation
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
759,NE osobní anamnéza,HAK nebere,HAK nebere,,False,,"{'HAK': [('D001035', 'afakie'), ('D001036', 'a...",{},"{'HAK': ('D014402', 'tuberózní skleróza', 'Fak..."
862,NE osobní anamnéza,transfuze,transfuze 0,,True,,"{'transfuze': [('D000087526', 'podvázání pupeč...",{},"{'transfuze': ('D001803', 'krevní transfuze', ..."
923,NE osobní anamnéza,ITP,ITP,,False,,{},{},{}


#### Results

In [449]:
ne_osobni_test["asign"] = "N/A"

i = 0
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Empty"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Partially"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Partially"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Partially"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Partially"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Partially"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Partially"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Empty"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Empty"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Partially"
i += 1
j = ne_osobni_test.index[i]
ne_osobni_test.at[j, "asign"] = "Right"

# print(ne_osobni_test["original_text"].iloc[i])
# print(ne_osobni_test["medvik_explanation"].iloc[i])
# print()
# print(ne_osobni_test["medvik_search"].iloc[i])

In [450]:
ne_osobni_test["asign"].value_counts()

Right        21
Partially     7
Wrong         4
Empty         3
Name: asign, dtype: int64

Right: 60 % | Partially: 80 % | Empty: 9 %

Observations
- Not so good in filtering
- Bad with shortcuts

### Label NE symptom

In [366]:
improved_ne_symptom = data_improved_punctuation[(data_improved_punctuation["label"] == "NE symptom")].copy()
print("Number of 'NE symptom' label:", len(improved_ne_symptom))

ne_symptom_test = improved_ne_symptom.sample(35, random_state=23)
ne_symptom_test.head(3)

Number of 'NE symptom' label: 1024


Unnamed: 0,label,text,original_text,about
4163,NE symptom,kůže klidná,kůže klidná,
1281,NE symptom,KI 100 lucidní,KI 100% lucidní,
2348,NE symptom,spráná systol.i diastol.funkce LK,spráná systol.i diastol.funkce LK,


In [367]:
ne_symptom_test["ne"] = False
for i in ne_symptom_test.index:
    words_split = ne_symptom_test["text"][i].split(" ")
    if any([True for x in words_split if is_soft_negative_word(x)]):
        ne_symptom_test.at[i, "ne"] = True

ne_symptom_test["text"] = ne_symptom_test["text"].apply(lambda text: " ".join([x.strip(":") for x in text.split(" ") if not(is_soft_negative_word(x))]))
ne_symptom_test.sample(3)

Unnamed: 0,label,text,original_text,about,ne
508,NE symptom,volné tekutiny,volné tekutiny,,False
1428,NE symptom,nejsou patrné,nejsou patrné,,False
4163,NE symptom,kůže klidná,kůže klidná,,False


In [368]:
ne_symptom_test = make_shortcuts_table(ne_symptom_test)

In [392]:
print(len(symptomy))

if os.path.isfile("saved_search/ne_symptom_test.csv"):
    ne_symptom_test = medvik_search_all_read_csv("saved_search/ne_symptom_test.csv")
else:
    ne_symptom_test = medvik_with_mistake_all_search_table(ne_symptom_test, symptomy)
    ne_symptom_test.to_csv("saved_search/ne_symptom_test.csv")

ne_symptom_test.head(3)

22086


Unnamed: 0_level_0,label,text,original_text,about,ne,shortcuts,medvik_search
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
4163,NE symptom,kůže klidná,kůže klidná,,False,,"{'kůže': [('D004817', 'epidermis'), ('D012627'..."
1281,NE symptom,KI 100 lucidní,KI 100% lucidní,,False,,"{'KI': [('D000002', 'temefos'), ('D000042', 'a..."
2348,NE symptom,spráná LK,spráná systol.i diastol.funkce LK,,False,systol i diastol funkce,"{'spráná': [('D003951', 'chybná diagnóza'), ('..."


In [393]:
print_long_searches_len_dep(ne_symptom_test["medvik_search"], limit=20)
ne_symptom_test["medvik_search"] = ne_symptom_test["medvik_search"].apply(lambda x: drop_long_searches_len_dep(x, limit=30))

[[('patrné', 188, 31.333333333333332)], [('neg', 145, 48.333333333333336), ('bil', 61, 20.333333333333332)], [('norm', 411, 102.75)], [('k', 163, 163.0), ('v', 481, 481.0), ('normě', 417, 83.4)], [('komp', 846, 211.5)], [('na', 621, 310.5)], [('V', 481, 481.0)], [('v', 481, 481.0), ('normě', 417, 83.4), ('lab', 102, 34.0)], [('v', 481, 481.0)], [('KI', 1416, 708.0)], [('SNB', 2482, 827.3333333333334)], [('je', 1794, 897.0), ('otoku', 104, 20.8)], [('je', 1794, 897.0)], [('do', 2028, 1014.0)], [('a', 2766, 2766.0)], [('i', 20886, 20886.0)]]


In [394]:
temp = ne_symptom_test["medvik_search"].apply(combine_searches)
ne_symptom_test["combined_medvik"] = ne_symptom_test["medvik_search"].apply(combine_searches)
temp[temp != {}]

Unnamed: 0
2348    {'LK systol': [('D018487', 'dysfunkce levé srd...
511     {'zácpu průjem': [('D012817', 'příznaky a symp...
3027     {'axila paže': [('D034941', 'horní končetina')]}
1022    {'covid 19': [('D000086742', 'testování na COV...
884     {'chuť jídlu': [('D001069', 'regulace chuti k ...
492     {'patologické změny': [('D012816', 'příznaky a...
Name: medvik_search, dtype: object

In [395]:
print_long_searches(ne_symptom_test["medvik_search"], limit=30)

[[('tekutiny', 34)], [('niveau', 38)], [('včera', 43)], [('otoků', 50)], [('myokardu', 53)], [('bil', 61)], [('pacientka', 67)], [('funkce', 72)], [('perif', 97)], [('patologické', 102), ('změny', 33)], [('otoku', 104)], [('doresekát', 168)]]


In [452]:
ne_symptom_test["medvik_search"] = ne_symptom_test["medvik_search"].apply(lambda x: drop_long_searches(x, 80))
ne_symptom_test["medvik_search"] = ne_symptom_test["medvik_search"].apply(drop_empty_searches)

if os.path.isfile("saved_search/ne_symptom_test_explanation.csv"):
    ne_symptom_test = medvik_explanation_all_read_csv("saved_search/ne_symptom_test_explanation.csv")
else:
    ne_symptom_test = medvik_choose_GPT_table(ne_symptom_test)
    ne_symptom_test.to_csv("saved_search/ne_symptom_test_explanation.csv")

ne_symptom_test["medvik_explanation"] = ne_symptom_test["medvik_explanation"].apply(interprete_explanation)
ne_symptom_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,ne,shortcuts,medvik_search,combined_medvik,medvik_explanation
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
4163,NE symptom,kůže klidná,kůže klidná,,False,,"{'kůže': [('D004817', 'epidermis'), ('D012627'...",{},"{'kůže': ('D012867', 'kůže', 'Orgán pokrývajíc..."
1281,NE symptom,KI 100 lucidní,KI 100% lucidní,,False,,"{'100': [('D004121', 'dimethylsulfoxid'), ('D0...",{},"{'100': ('D020652', 'elongační faktor 2', 'Pep..."
2348,NE symptom,spráná LK,spráná systol.i diastol.funkce LK,,False,systol i diastol funkce,"{'spráná': [('D003951', 'chybná diagnóza'), ('...","{'LK systol': [('D018487', 'dysfunkce levé srd...","{'diastol LK systol': ('D018487', 'dysfunkce l..."


#### Results

In [496]:
ne_symptom_test["asign"] = "N/A"

i = 0
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Partially"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Partially"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Partially"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Partially"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Partially"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Partially"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Partially"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Partially"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Partially"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Partially"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Partially"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Right"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Empty"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Empty"
i += 1
j = ne_symptom_test.index[i]
ne_symptom_test.at[j, "asign"] = "Partially"


# print(ne_symptom_test["original_text"].iloc[i])
# print(ne_symptom_test["medvik_explanation"].iloc[i])
# print()
# print(ne_symptom_test["medvik_search"].iloc[i])
# print(ne_symptom_test["combined_medvik"].iloc[i])

In [497]:
ne_symptom_test["asign"].value_counts()

Right        14
Partially    12
Wrong         7
Empty         2
Name: asign, dtype: int64

Right: 40 % | Partially: 74 % | Empty: 6 %

Observations
- Shortcuts
- It should filter better

### Label medikace 01 - Linking medikace to MSHCZ

In [542]:
improved_medikace = data_improved_punctuation[(data_improved_punctuation["label"] == "medikace")].copy()
print("Number of 'medikace' label:", len(improved_medikace))
medikace_test = improved_medikace.sample(35, random_state=54)
medikace_test = make_shortcuts_table(medikace_test)

medikace_test.head(3)

Number of 'medikace' label: 305


Unnamed: 0,label,text,original_text,about,shortcuts
5604,medikace,DEPAKINE CHRONO,DEPAKINE CHRONO,,
2539,medikace,NACHT EC,NACHT EC,,
1929,medikace,neoadjuv CHT na bazi antracyklinů a taxanů,neoadjuv CHT na bazi antracyklinů a taxanů,,


In [515]:
medikace = []
for child in offline_mshcz:
    try:
        d = [i for i in child.findall("{http://www.loc.gov/MARC21/slim}datafield") if i.attrib["tag"] == "072" ][0][0].text
        if d is not None and d[0] in ["D", "E"]:
            medikace.append(child)
    except IndexError:
        pass      
        
print(len(medikace))

13410


In [558]:
if os.path.isfile("saved_search/medikace_test.csv"):
    medikace_test = medvik_search_all_read_csv("saved_search/medikace_test.csv")
else:
    medikace_test = medvik_with_mistake_all_search_table(medikace_test, medikace)
    medikace_test.to_csv("saved_search/medikace_test.csv")

medikace_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,shortcuts,medvik_search
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
5604,medikace,DEPAKINE CHRONO,DEPAKINE CHRONO,,,"{'DEPAKINE': [('D014635', 'kyselina valproová'..."
2539,medikace,NACHT EC,NACHT EC,,,"{'NACHT': [('D000071199', 'protein NLRP3')], '..."
1929,medikace,neoadjuv CHT na bazi antracyklinů a taxanů,neoadjuv CHT na bazi antracyklinů a taxanů,,,"{'neoadjuv': [('D003131', 'kombinovaná terapie..."


In [560]:
print_long_searches_len_dep(medikace_test["medvik_search"], limit=20)
medikace_test["medvik_search"] = medikace_test["medvik_search"].apply(lambda x: drop_long_searches_len_dep(x, 30))

[[('D', 114, 114.0)], [('na', 375, 187.5), ('a', 1698, 1698.0)], [('na', 375, 187.5)], [('st', 695, 347.5), ('p', 6735, 6735.0), ('NACT', 938, 234.5)], [('80mg', 1749, 437.25)]]


In [561]:
temp = medikace_test["medvik_search"].apply(combine_searches)
medikace_test["combined_medvik"] = medikace_test["medvik_search"].apply(combine_searches)
temp[temp != {}]

Unnamed: 0
1929    {'bazi taxanů': [('D009930', 'organické látky')]}
5308    {'bazi taxanů': [('D009930', 'organické látky')]}
Name: medvik_search, dtype: object

In [562]:
print_long_searches(medikace_test["medvik_search"], limit=10)

[[('forte', 11)], [('CHT', 13), ('bazi', 35)], [('CHT', 13), ('bazi', 35)], [('analgetika', 20)], [('elicea', 22)], [('ditahiden', 30)], [('10', 37)], [('vigantol', 60)], [('taxHea', 102)], [('detralex', 139)]]


In [563]:
medikace_test["medvik_search"] = medikace_test["medvik_search"].apply(lambda x: drop_long_searches(x, 35))
medikace_test["medvik_search"] = medikace_test["medvik_search"].apply(drop_empty_searches)

if os.path.isfile("saved_search/medikace_test_explanation.csv"):
    medikace_test = medvik_explanation_all_read_csv("saved_search/medikace_test_explanation.csv")
else:
    medikace_test = medvik_choose_GPT_table(medikace_test)
    medikace_test.to_csv("saved_search/medikace_test_explanation.csv")

medikace_test["medvik_explanation"] = medikace_test["medvik_explanation"].apply(interprete_explanation)
medikace_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,shortcuts,medvik_search,combined_medvik,medvik_explanation
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
5604,medikace,DEPAKINE CHRONO,DEPAKINE CHRONO,,,"{'DEPAKINE': [('D014635', 'kyselina valproová'...",{},"{'DEPAKINE': ('D014635', 'kyselina valproová',..."
2539,medikace,NACHT EC,NACHT EC,,,"{'NACHT': [('D000071199', 'protein NLRP3')], '...",{},"{'NACHT': ('D000071199', 'protein NLRP3', 'An ..."
1929,medikace,neoadjuv CHT na bazi antracyklinů a taxanů,neoadjuv CHT na bazi antracyklinů a taxanů,,,"{'neoadjuv': [('D003131', 'kombinovaná terapie...","{'bazi taxanů': [('D009930', 'organické látky')]}","{'neoadjuv': ('D020360', 'neoadjuvantní terapi..."


#### Results

In [603]:
medikace_test["asign"] = "N/A"

i = 0
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Partially"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Partially"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Empty"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Empty"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Empty"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Empty"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Empty"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Empty"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Empty"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Empty"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Empty"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"

# print(medikace_test["original_text"].iloc[i])
# print(medikace_test["medvik_explanation"].iloc[i])
# print()
# print(medikace_test["medvik_search"].iloc[i])

In [604]:
medikace_test["asign"].value_counts()

Right        17
Empty         9
Wrong         7
Partially     2
Name: asign, dtype: int64

Right: 49 % | Partially: 54 % | 26 %
    
Observations
- Not so good filter (for example weekly don't filtered)
- Many medicines not in database
- Lots of right thanks to short text

### Label medikace 02 - Linking medikace to SUKL, Limit for mistake
Many medicines is unknown for MSHCZ, for this reason I will use SUKL databaze, where I will search. Because GPT is not suitable for linking chemicals with drugs I will use only mathematical methods for linking.

In [200]:
products = pd.read_csv("databaze/DLP20240229/dlp_lecivepripravky.csv",
                       encoding="cp1250", delimiter=";")[["KOD_SUKL", "NAZEV", "SILA", "FORMA", "ATC_WHO", "NAZEV_REG"]]
products.head(3)

Unnamed: 0,KOD_SUKL,NAZEV,SILA,FORMA,ATC_WHO,NAZEV_REG
0,9,ACYLCOFFIN,450MG/50MG,TBL NOB,N02BA51,ACYLCOFFIN
1,113,DILURAN,250MG,TBL NOB,S01EC01,DILURAN
2,168,HYDROCHLOROTHIAZID LÉČIVA,25MG,TBL NOB,C03AA03,HYDROCHLOROTHIAZID LÉČIVA


In [201]:
substances_with_ATC = pd.read_csv("databaze/DLP20240229/dlp_atc.csv", encoding="cp1250", delimiter=";")

temp = substances_with_ATC[["ATC", "NAZEV_EN"]]
temp = temp.rename(columns={"NAZEV_EN":"NAZEV"})
temp.index = temp.index + len(substances_with_ATC)
substances_with_ATC_concat = pd.concat([substances_with_ATC[["ATC", "NAZEV"]], temp])

substances_with_ATC_concat.head(3)

Unnamed: 0,ATC,NAZEV
0,A,TRÁVICÍ TRAKT A METABOLISMUS
1,A01,STOMATOLOGICKÉ PŘÍPRAVKY
2,A01A,STOMATOLOGICKÉ PŘÍPRAVKY


In [615]:
# Linking to atc name by code
def count_to_tuple(l):
    un, count = np.unique(l, return_counts=True)
    result = []
    for i, _ in enumerate(un):
        result.append((un[i], count[i]))
    return result


def link_atc_preserve_count(list_atc):
    count = count_to_tuple(list_atc)
    atc = list(set(list_atc))
    r = substances_with_ATC[substances_with_ATC["ATC"].apply(lambda x: x in atc)]
    return [(r["ATC"][i], r["NAZEV"][i], [x[1] for x in count if x[0] == r["ATC"][i]][0]) for i in r.index]
    

# Search chemicals
def sukl_chemicals_search(string, database=substances_with_ATC_concat):
    string = string.lower()

    find = database[database["NAZEV"].apply(
        lambda x: string == x.lower())]
    find = database[database["NAZEV"].apply(
        lambda x: string in x.lower())] if len(find) == 0 else find
    
    return list(find["ATC"])


def sukl_chemicals_mistake_search(string, mistakes, database=substances_with_ATC_concat):
    string = patternize(string)
    pattern = regex.compile(f"({string}){{e<={mistakes}}}", regex.IGNORECASE)
    test = lambda x: pattern.search(x) is not None
    find = database[database["NAZEV"].apply(test)]
    return list(find["ATC"])


def sukl_find_chemicals_search(string, output_errors=True,
                               database=substances_with_ATC_concat, limit=0.5):
    return search_words_mistake_bottom(string,
                         lambda x: link_atc_preserve_count(
                             sukl_chemicals_search(x, database=database)),
                         lambda x, y: link_atc_preserve_count(
                             sukl_chemicals_mistake_search(x, y, database=database)),
                         limit=limit,
                         output_errors=output_errors)

# Search drugs
def sukl_drugs_search(string):
    string = string.lower()
    find = products[products["NAZEV"].apply(lambda x: string == x.lower())]
    find = products[products["NAZEV"].apply(lambda x: string in x.lower())] if len(find) == 0 else find
    return list(find["ATC_WHO"])


def sukl_drugs_mistake_search(string, mistakes):
    string = patternize(string)
    pattern = regex.compile(f"({string}){{e<={mistakes}}}", regex.IGNORECASE)
    test = lambda x: pattern.search(x) is not None
    find = products[products["NAZEV"].apply(test)]
    return list(find["ATC_WHO"])


def sukl_find_drugs_search(string, output_errors=True, limit=0.5):
    return search_words_mistake_bottom(string,
                         lambda x: link_atc_preserve_count(sukl_drugs_search(x)),
                         lambda x, y: link_atc_preserve_count(sukl_drugs_mistake_search(x, y)),
                         limit=limit,
                         output_errors=output_errors)


# Chemical shortcuts search
def sukl_chemicals_shortcuts_search(string, database=substances_with_ATC_concat):
    string = string.lower()
    find = database[database["NAZEV"].apply(lambda x: string == x[:len(string)].lower())]
    find = database[database["NAZEV"].apply(lambda x: (" " + string) in x.lower()
                                           )] if len(find) == 0 else find
    return list(find["ATC"])


def sukl_chemicals_shortcuts_mistake_search(string, mistakes, database=substances_with_ATC_concat):
    string = patternize(string)
    pattern_start = regex.compile(f"({string}){{e<={mistakes}}}", regex.IGNORECASE)
    pattern_in = regex.compile(f"( {string}){{e<={mistakes}}}", regex.IGNORECASE)
    test = lambda x: (pattern_start.search(x[:(len(string) + mistakes)]) is not None or
                      pattern_in.search(x) is not None)
    find = database[database["NAZEV"].apply(test)]
    return list(find["ATC"])


def sukl_find_chemicals_shortcuts_search(string, output_errors=True,
                                         database=substances_with_ATC_concat, limit=0.5):
    return search_words_mistake_bottom(string,
                         lambda x: link_atc_preserve_count(
                             sukl_chemicals_shortcuts_search(x, database=database)),
                         lambda x, y: link_atc_preserve_count(
                             sukl_chemicals_shortcuts_mistake_search(x, y, database=database)),
                         limit=limit,
                         output_errors=output_errors)


# Drugs shortcuts search
def sukl_drugs_shortcuts_search(string):
    string = string.lower()
    find = products[products["NAZEV"].apply(lambda x: string == x[:len(string)].lower())]
    find = products[products["NAZEV"].apply(lambda x: (" " + string) in x.lower())] if len(find) == 0 else find
    return list(find["ATC_WHO"])
    

def sukl_drugs_shortcuts_mistake_search(string, mistakes):
    string = patternize(string)
    pattern_start = regex.compile(f"({string}){{e<={mistakes}}}", regex.IGNORECASE)
    pattern_in = regex.compile(f"( {string}){{e<={mistakes}}}", regex.IGNORECASE)
    test = lambda x: (pattern_start.search(x[:(len(string) + mistakes)]) is not None or
                      pattern_in.search(x) is not None)
    find = products[products["NAZEV"].apply(test)]
    return list(find["ATC_WHO"])


def sukl_find_drugs_shortcuts_search(string, output_errors=True, limit=0.5):
    return search_words_mistake_bottom(string,
                     lambda x: link_atc_preserve_count(sukl_drugs_shortcuts_search(x)),
                     lambda x, y: link_atc_preserve_count(sukl_drugs_shortcuts_mistake_search(x, y)),
                     limit=limit,
                     output_errors=output_errors)

In [616]:
# Statistically choosing best match
def combine_preserve_count(linked_text):
    combined = combine_searches({k: map(lambda x: (x[0], x[1]), linked_text[k]) for k in linked_text})
    result = {}
    for k in combined:
        links = []
        for l in combined[k]:
            sum_ = sum(
                next(filter(lambda y: y[0]==l[0], linked_text[x]))[2]
                    for x in k.split(" "))
            
            links.append((l[0], l[1], sum_))
        result[k] = links

    return result


def combine_2_searched_databaze(priority1, priority2, mistakes1, mistakes2):
    mistakes1 = mistakes1.copy()
    mistakes2 = mistakes2.copy()
    for j in priority1:
        if len(j.split(" ")) >= 2:
            mistakes1[j] = sum([mistakes1[x] for x in j.split(" ")])
    for j in priority2:
        if len(j.split(" ")) >= 2:
            mistakes2[j] = sum([mistakes2[x] for x in j.split(" ")])
    
    dict_with_priority = ({(len(k.split(" ")), -mistakes1.pop(k, 10), 2, k) : priority1[k] for k in priority1} |
                          {(len(k.split(" ")), -mistakes2.pop(k, 10), 1, k) : priority2[k] for k in priority2} )
    result = {}
    in_result = set()   
    for record in sorted(dict_with_priority, reverse=True):
        if any([x not in in_result for x in record[3].split(" ")]):
            for x in record[3].split(" "):
                in_result.add(x)
            result[record[3]] = dict_with_priority[record]
    return result


def prioritize_sukl(one_search, complete_search):
    in_count = sum([1 for k in complete_search if one_search[1] in k[1]])
    return (one_search[2], in_count, random.randint(0, 100))

#### Linking

In [617]:
medikace_test = improved_medikace.sample(35, random_state=21)
medikace_test = make_shortcuts(medikace_test)
medikace_test.head(3)

Unnamed: 0,label,text,original_text,about,shortcuts
3848,medikace,tamoxifenu,tamoxifenu,,
2984,medikace,herceptinu,herceptinu,,
4807,medikace,neoadjuvantní chemoterapie paclitaxel weekly,neoadjuvantní chemoterapie - paclitaxel weekly,,


In [618]:
if os.path.isfile("saved_search/medikace_test2.csv"):
    medikace_test = pd.read_csv("saved_search/medikace_test2.csv")
    medikace_test.index = medikace_test["Unnamed: 0"]
    medikace_test.drop(["Unnamed: 0"], axis=1, inplace=True)
    medikace_test["text"] = medikace["text"].fillna("")
    medikace_test["shortcuts"] = medikace_test["shortcuts"].fillna("")
    medikace_test["about"] = medikace_test["about"].fillna("N/A")
    
    medikace_test["sukl_search_chemicals"] = medikace_test["sukl_search_chemicals"].apply(from_string_to_dict)
    medikace_test["sukl_chemicals_errors"] = medikace_test["sukl_chemicals_errors"].apply(from_string_to_int_dict)
    medikace_test["sukl_search_chemicals"] = medikace_test["sukl_search_chemicals"].apply(lambda x: {k : list(map(lambda e: (e[0], e[1], int(e[2])), x[k])) for k in x})
    
    medikace_test["sukl_search_drugs"] = medikace_test["sukl_search_drugs"].apply(from_string_to_dict)
    medikace_test["sukl_drugs_errors"] = medikace_test["sukl_drugs_errors"].apply(from_string_to_int_dict)
    medikace_test["sukl_search_drugs"] = medikace_test["sukl_search_drugs"].apply(lambda x: {k : list(map(lambda e: (e[0], e[1], int(e[2])), x[k])) for k in x})

else:
    medikace_test["sukl_search_chemicals"] = medikace_test["text"].apply(lambda x: sukl_find_chemicals_search(x, output_errors=True))
    medikace_test["sukl_search_chemicals"] = medikace_test.apply(lambda x: x.sukl_search_chemicals |
                (sukl_find_chemicals_search(x.about ,output_errors=True) if x.about != "N/A" else {}), axis=1)
    medikace_test["sukl_search_chemicals"] = medikace_test.apply(lambda x: x.sukl_search_chemicals |
                (sukl_find_chemicals_shortcuts_search(x.shortcuts ,output_errors=True) if x.shortcuts != "" else {}), axis=1)

    medikace_test["sukl_search_drugs"] = medikace_test["text"].apply(lambda x: sukl_find_drugs_search(x, output_errors=True))
    medikace_test["sukl_search_drugs"] = medikace_test.apply(lambda x: x.sukl_search_drugs |
                (sukl_find_drugs_search(x.about ,output_errors=True) if x.about != "N/A" else {}), axis=1)
    medikace_test["sukl_search_drugs"] = medikace_test.apply(lambda x: x.sukl_search_drugs |
                (sukl_find_drugs_shortcuts_search(x.shortcuts ,output_errors=True) if x.shortcuts != "" else {}), axis=1)

    medikace_test["sukl_chemicals_errors"] = medikace_test["sukl_search_chemicals"].apply(lambda x: {k[0]:k[1] for k in x})
    medikace_test["sukl_search_chemicals"] = medikace_test["sukl_search_chemicals"].apply(lambda x: {k[0]:x[k] for k in x})
    medikace_test["sukl_drugs_errors"] = medikace_test["sukl_search_drugs"].apply(lambda x: {k[0]:k[1] for k in x})
    medikace_test["sukl_search_drugs"] = medikace_test["sukl_search_drugs"].apply(lambda x: {k[0]:x[k] for k in x})
    
    medikace_test.to_csv("saved_search/medikace_test2.csv")

medikace_test.head(3)

Unnamed: 0,label,text,original_text,about,shortcuts,sukl_search_chemicals,sukl_search_drugs,sukl_chemicals_errors,sukl_drugs_errors
3848,medikace,tamoxifenu,tamoxifenu,,,"{'tamoxifenu': [('L02BA01', 'TAMOXIFEN', 2)]}","{'tamoxifenu': [('L02BA01', 'TAMOXIFEN', 6)]}",{'tamoxifenu': 1},{'tamoxifenu': 1}
2984,medikace,herceptinu,herceptinu,,,"{'herceptinu': [('A16AA07', 'METRELEPTIN', 2),...","{'herceptinu': [('L01FD01', 'TRASTUZUMAB', 2)]}",{'herceptinu': 4},{'herceptinu': 1}
4807,medikace,neoadjuvantní chemoterapie paclitaxel weekly,neoadjuvantní chemoterapie - paclitaxel weekly,,,"{'neoadjuvantní': [('B02BD', 'KOAGULAČNÍ FAKTO...","{'neoadjuvantní': [('A05AA01', 'KYSELINA CHENO...","{'neoadjuvantní': 7, 'chemoterapie': 1, 'pacli...","{'neoadjuvantní': 7, 'chemoterapie': 6, 'pacli..."


In [619]:
print_long_searches_len_dep(medikace_test["sukl_search_chemicals"], limit=10)
print()
print_long_searches_len_dep(medikace_test["sukl_search_drugs"], limit=10)

medikace_test["sukl_search_chemicals"] = medikace_test["sukl_search_chemicals"].apply(
    lambda x: drop_long_searches_len_dep(x, limit=25))
medikace_test["sukl_search_drugs"] = medikace_test["sukl_search_drugs"].apply(
    lambda x: drop_long_searches_len_dep(x, limit=40))

[[('weekly', 62, 10.333333333333334)], [('CBDCA', 61, 12.2)], [('režimem', 90, 12.857142857142858), ('AC', 1383, 691.5)], [('LYRICA', 112, 18.666666666666668)], [('vit', 62, 20.666666666666668), ('D', 3422, 3422.0)], [('3', 42, 42.0)], [('COMIRNATY', 516, 57.333333333333336)], [('PTX', 283, 94.33333333333333)], [('AC', 1383, 691.5), ('4x', 960, 480.0), ('st', 39, 19.5), ('p', 693, 693.0), ('NACT', 147, 36.75)], [('v', 1682, 1682.0), ('weekly', 62, 10.333333333333334)]]

[[('10', 25, 12.5)], [('Neo', 41, 13.666666666666666)], [('vit', 65, 21.666666666666668), ('D', 811, 811.0)], [('PTX', 100, 33.333333333333336)], [('3', 34, 34.0)], [('AC', 300, 150.0)], [('AC', 300, 150.0), ('st', 32, 16.0), ('p', 191, 191.0), ('NACT', 71, 17.75)], [('v', 719, 719.0)]]


In [620]:
medikace_test["sukl_chemicals_combine"] = medikace_test["sukl_search_chemicals"].apply(combine_preserve_count)
medikace_test[medikace_test["sukl_chemicals_combine"] != {}]["sukl_chemicals_combine"]

Series([], Name: sukl_chemicals_combine, dtype: object)

In [621]:
medikace_test["sukl_drugs_combine"] = medikace_test["sukl_search_drugs"].apply(combine_preserve_count)
medikace_test[medikace_test["sukl_drugs_combine"] != {}]["sukl_drugs_combine"]

4807    {'neoadjuvantní chemoterapie': [('V06XX', 'POT...
3546    {'calcium vit': [('A12AA04', 'UHLIČITAN VÁPENA...
1703       {'orcal Neo': [('C08CA01', 'AMLODIPIN', 120)]}
Name: sukl_drugs_combine, dtype: object

In [622]:
print_long_searches(medikace_test["sukl_search_chemicals"], limit=5)
print()
print_long_searches(medikace_test["sukl_search_drugs"], limit=10)

medikace_test["sukl_search_chemicals"] = medikace_test["sukl_search_chemicals"].apply(lambda x: drop_long_searches(x, limit=10))
medikace_test["sukl_search_drugs"] = medikace_test["sukl_search_drugs"].apply(lambda x: drop_long_searches(x, limit=15))

[[('adjuvanci', 6), ('podáme', 6), ('weekly', 62)], [('ranisan', 7)], [('kalnormin', 7)], [('dexametason', 16)], [('utrogestan', 18)], [('neoadjvu', 23), ('režimem', 90)], [('herceptinu', 26)], [('detralex', 28)], [('cipralex', 34)], [('st', 39)], [('orcal', 47), ('Neo', 27)], [('CBDCA', 61)], [('vit', 62)], [('flamigel', 63)], [('terapii', 64)], [('neoadjuvantní', 66), ('weekly', 62)], [('ryzodeg', 66)], [('LYRICA', 112)]]

[[('stp1xAC', 11)], [('flamigel', 11)], [('ranisan', 12)], [('podáme', 16), ('weekly', 40)], [('antiemetika', 19)], [('CBDCA', 21)], [('chemoterapie', 22), ('weekly', 40)], [('10', 25), ('mg', 19)], [('neoadjvu', 26), ('režimem', 43)], [('st', 32), ('NACT', 71)], [('3', 34), ('mg', 19)], [('Neo', 41)], [('vit', 65)], [('PTX', 100)]]


In [623]:
medikace_test["sukl_mathematical_explanation"] = "N/A"
for index in medikace_test.index:
    union_chem = drop_empty_searches(medikace_test["sukl_search_chemicals"][index] | medikace_test["sukl_chemicals_combine"][index])
    union_drugs = drop_empty_searches(medikace_test["sukl_search_drugs"][index] | medikace_test["sukl_drugs_combine"][index])

    medikace_test.at[index, "sukl_mathematical_explanation"] = combine_2_searched_databaze(union_chem, union_drugs,
                     medikace_test["sukl_chemicals_errors"][index],
                     medikace_test["sukl_drugs_errors"][index])

random.seed(10)
medikace_test["sukl_mathematical_explanation"] = (
    medikace_test["sukl_mathematical_explanation"].apply(
        lambda x: {k: sorted(x[k], key=lambda y: prioritize_sukl(y, x[k]), reverse=True)[0]
                   for k in x if len(x[k]) != 0}))

medikace_test.head(3)

Unnamed: 0,label,text,original_text,about,shortcuts,sukl_search_chemicals,sukl_search_drugs,sukl_chemicals_errors,sukl_drugs_errors,sukl_chemicals_combine,sukl_drugs_combine,sukl_mathematical_explanation
3848,medikace,tamoxifenu,tamoxifenu,,,"{'tamoxifenu': [('L02BA01', 'TAMOXIFEN', 2)]}","{'tamoxifenu': [('L02BA01', 'TAMOXIFEN', 6)]}",{'tamoxifenu': 1},{'tamoxifenu': 1},{},{},"{'tamoxifenu': ('L02BA01', 'TAMOXIFEN', 2)}"
2984,medikace,herceptinu,herceptinu,,,{'herceptinu': []},"{'herceptinu': [('L01FD01', 'TRASTUZUMAB', 2)]}",{'herceptinu': 4},{'herceptinu': 1},{},{},"{'herceptinu': ('L01FD01', 'TRASTUZUMAB', 2)}"
4807,medikace,neoadjuvantní chemoterapie paclitaxel weekly,neoadjuvantní chemoterapie - paclitaxel weekly,,,"{'neoadjuvantní': [], 'chemoterapie': [('D06',...","{'neoadjuvantní': [('A05AA01', 'KYSELINA CHENO...","{'neoadjuvantní': 7, 'chemoterapie': 1, 'pacli...","{'neoadjuvantní': 7, 'chemoterapie': 6, 'pacli...",{},"{'neoadjuvantní chemoterapie': [('V06XX', 'POT...","{'neoadjuvantní chemoterapie': ('V06XX', 'POTR..."


#### Results

In [627]:
medikace_test["asign"] = "N/A"

i = 0
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Partially" #2 - Wrong neoadjuvantní terapie - not a medicine, drug was right (better filter)
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong" #3 - Wrong  - word taxol is in chemicals BETAXOLOL, but also it is drug TAXOL which we should find
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Partially"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Partially"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"

# print(medikace_test["original_text"].iloc[i])
# print(medikace_test["sukl_mathematical_explanation"].iloc[i])

In [628]:
medikace_test["asign"].value_counts()

Right        21
Wrong        11
Partially     3
Name: asign, dtype: int64

Right 60 % | Partially 69 %

In [629]:
print("RIGHT")
for i in medikace_test[medikace_test["asign"] == "Right"].sample(10, random_state=32).index:
    x = medikace_test["sukl_chemicals_errors"][i]
    y = medikace_test["sukl_search_chemicals"][i]
    temp = {k : y[k] for k in y if x[k] != 0}
    if len(temp) != 0:
        print(temp)
    temp2 = {k: (x[k], len(k), round(x[k]/len(k), 3)) for k in x if x[k] != 0}
    if len(temp2) != 0:
        print(temp2)
    if len(temp) + len(temp2) != 0:
        print()

print("WRONG")
for i in medikace_test[medikace_test["asign"] != "Right"].sample(10, random_state=32).index:
    x = medikace_test["sukl_chemicals_errors"][i]
    y = medikace_test["sukl_search_chemicals"][i]
    temp = {k : y[k] for k in y if x[k] != 0}
    if len(temp) != 0:
        print(temp)
    temp2 = {k: (x[k], len(k), round(x[k]/len(k), 3)) for k in x if x[k] != 0}
    if len(temp2) != 0:
        print(temp2)
    if len(temp) + len(temp2) != 0:
        print()

RIGHT
{'LYRICA': []}
{'LYRICA': (2, 6, 0.333)}

{'tritace': [('A16AX13', 'URIDIN-TRIACETÁT', 2)]}
{'tritace': (1, 7, 0.143)}

{'berodual': [('B06AC06', 'BEROTRALSTAT', 2), ('L04AC12', 'BRODALUMAB', 2)]}
{'berodual': (2, 8, 0.25)}

{'ondasetronem': [('A04AA01', 'ONDANSETRON', 2)]}
{'ondasetronem': (3, 12, 0.25)}

{'dexametason': []}
{'dexametason': (1, 11, 0.091)}

{'trastuzumabu': [('L01FD01', 'TRASTUZUMAB', 2), ('L01FD03', 'TRASTUZUMAB EMTANSIN', 2), ('L01FD04', 'TRASTUZUMAB DERUXTEKAN', 2), ('L01FD05', 'TRASTUZUMAB DUOKARMAZIN', 2), ('L01FY01', 'PERTUZUMAB A TRASTUZUMAB', 2)]}
{'trastuzumabu': (1, 12, 0.083)}

{'tamoxifenu': [('L02BA01', 'TAMOXIFEN', 2)]}
{'tamoxifenu': (1, 10, 0.1)}

{'COMIRNATY': []}
{'COMIRNATY': (3, 9, 0.333)}

{'utrogestan': []}
{'utrogestan': (3, 10, 0.3)}

{'orcal': []}
{'orcal': (1, 5, 0.2)}

WRONG
{'neoadjuvantní': [], 'chemoterapie': [('D06', 'ANTIBIOTIKA A CHEMOTERAPEUTIKA PRO POUŽITÍ V DERMATOLOGII', 1), ('D06B', 'CHEMOTERAPEUTIKA PRO LOKÁLNÍ APLIKACI', 1

Limit 0.25 will be enough

In [630]:
print("RIGHT")
for i in medikace_test[medikace_test["asign"] == "Right"].sample(10, random_state=13).index:
    x = medikace_test["sukl_drugs_errors"][i]
    y = medikace_test["sukl_search_drugs"][i]
    temp = {k : y[k] for k in y if x[k] != 0}
    if len(temp) != 0:
        print(temp)
    temp2 = {k: (x[k], len(k), round(x[k]/len(k), 3)) for k in x if x[k] != 0}
    if len(temp2) != 0:
        print(temp2)
    if len(temp) + len(temp2) != 0:
        print()

print("WRONG")
for i in medikace_test[medikace_test["asign"] != "Right"].sample(10, random_state=13).index:
    x = medikace_test["sukl_drugs_errors"][i]
    y = medikace_test["sukl_search_drugs"][i]
    temp = {k : y[k] for k in y if x[k] != 0}
    if len(temp) != 0:
        print(temp)
    temp2 = {k: (x[k], len(k), round(x[k]/len(k), 3)) for k in x if x[k] != 0}
    if len(temp2) != 0:
        print(temp2)
    if len(temp) + len(temp2) != 0:
        print()

RIGHT
{'ondasetronem': [('A04AA01', 'ONDANSETRON', 61)]}
{'ondasetronem': (3, 12, 0.25)}

{'tamoxifenu': [('L02BA01', 'TAMOXIFEN', 6)]}
{'tamoxifenu': (1, 10, 0.1)}

{'herceptinu': [('L01FD01', 'TRASTUZUMAB', 2)]}
{'herceptinu': (1, 10, 0.1)}

{'trastuzumabu': [('A12BA01', 'CHLORID DRASELNÝ', 2), ('B05XA01', 'CHLORID DRASELNÝ', 9), ('B05XA03', 'CHLORID SODNÝ', 4), ('B05XA07', 'CHLORID VÁPENATÝ', 4), ('C05BA53', 'HEPARIN, KOMBINACE', 3), ('N02AX02', 'TRAMADOL', 3), ('V07AB', 'ROZPOUŠTĚDLA A ŘEDIDLA, VČETNĚ IRIGAČNÍCH ROZTOKŮ', 4), ('V11', 'FYTOFARMAKA A ŽIVOČIŠNÉ PRODUKTY (ČESKÁ ATC SKUPINA)', 2), ('V12', 'HOMEOPATIKA (ČESKÁ ATC SKUPINA)', 2)]}
{'trastuzumabu': (6, 12, 0.5)}

WRONG
{'bisulepin': [('N05AL01', 'SULPIRID', 4)]}
{'bisulepin': (3, 9, 0.333)}

{'NACT': []}
{'NACT': (1, 4, 0.25)}

{'neoadjvu': [], 'režimem': []}
{'neoadjvu': (4, 8, 0.5), 'režimem': (3, 7, 0.429)}

{'CBDCA': []}
{'CBDCA': (2, 5, 0.4)}

{'PTX': []}
{'PTX': (1, 3, 0.333)}

{'antiHER2': [('J06AA03', 'SÉRUM PROTI H

limit 0.25 will be enough

### Label medikace 03 - Extended SUKL

#### New chemicals database

In [219]:
substances_sukl_code = pd.read_csv("databaze/DLP20240229/dlp_latky.csv", encoding="cp1250", delimiter=";"
           )[["KOD_LATKY", "NAZEV_INN", "NAZEV_EN", "NAZEV"]]
content_drugs = pd.read_csv("databaze/DLP20240229/dlp_slozeni.csv", encoding="cp1250", delimiter=";")

drug_code_to_substance_code = content_drugs[["KOD_SUKL", "KOD_LATKY"]].dropna()
drug_code_to_substance_code["KOD_LATKY"] = drug_code_to_substance_code["KOD_LATKY"].apply(lambda x: round(x))
drug_code_to_substance_code.index = drug_code_to_substance_code["KOD_SUKL"]
drug_code_to_substance_code = drug_code_to_substance_code[["KOD_LATKY"]]

drug_code_to_atc = products.copy()
drug_code_to_atc.index = drug_code_to_atc["KOD_SUKL"]
drug_code_to_atc = drug_code_to_atc.drop(["KOD_SUKL"], axis=1)

substance_code_to_atc_duplicated = drug_code_to_substance_code.join(drug_code_to_atc, how="left")[["KOD_LATKY", "NAZEV", "ATC_WHO"]]
substance_code_to_atc_duplicated["KOD_SUKL"] = substance_code_to_atc_duplicated.index
substance_code_to_atc_duplicated.index = substance_code_to_atc_duplicated["KOD_LATKY"]
substance_code_to_atc_duplicated = substance_code_to_atc_duplicated.drop(["KOD_LATKY"], axis=1)

substance_code_to_substance_name = substances_sukl_code.copy()
substance_code_to_substance_name.index = substance_code_to_substance_name["KOD_LATKY"]
substance_code_to_substance_name = substance_code_to_substance_name.drop(["KOD_LATKY"], axis=1)

substance_code_with_name_to_atc_duplicated = substance_code_to_atc_duplicated.join(
    substance_code_to_substance_name, how="left", lsuffix="_L", rsuffix="_R").drop([-1,0])

substance_code_to_atc = substance_code_with_name_to_atc_duplicated.groupby(["KOD_LATKY"], group_keys=True).apply(
    lambda x: x["ATC_WHO"].value_counts().sort_values(ascending=False).index[0])

substance_code_to_atc.head(3)

KOD_LATKY
1        V11
2    M02AA10
8    V10XA01
dtype: object

In [220]:
synonyms = pd.read_csv("databaze/DLP20240229/dlp_synonyma.csv", encoding="cp1250", delimiter=";")
temp = synonyms["KOD_LATKY"].apply(lambda x: substance_code_to_atc[x] if x in substance_code_to_atc else "N/A")
synonyms_linked = synonyms[temp != "N/A"].copy()
synonyms_linked["ATC"] = temp
synonyms_linked

temp = substances_sukl_code.apply(lambda x: substance_code_to_atc[x.KOD_LATKY]
                           if x.KOD_LATKY in substance_code_to_atc else "N/A", axis=1)
substances_to_atc = substances_sukl_code[temp != "N/A"].copy()
substances_to_atc["ATC"] = temp


name_to_atc = pd.DataFrame()
name_to_atc["NAME"] = "N/A"
name_to_atc["ATC"] = "N/A"

length = len(name_to_atc)
temp = pd.DataFrame()
temp["NAME"] = substances_to_atc["NAZEV_INN"].reset_index(drop=True)
temp["ATC"] = substances_to_atc["ATC"].reset_index(drop=True)
temp.index = temp.index + length
name_to_atc = pd.concat([name_to_atc, temp])

length = len(name_to_atc)
temp = pd.DataFrame()
temp["NAME"] = substances_to_atc["NAZEV_EN"].reset_index(drop=True)
temp["ATC"] = substances_to_atc["ATC"].reset_index(drop=True)
temp.index = temp.index + length
name_to_atc = pd.concat([name_to_atc, temp])

length = len(name_to_atc)
temp = pd.DataFrame()
temp["NAME"] = substances_to_atc["NAZEV"].reset_index(drop=True)
temp["ATC"] = substances_to_atc["ATC"].reset_index(drop=True)
temp.index = temp.index + length
name_to_atc = pd.concat([name_to_atc, temp])

length = len(name_to_atc)
temp = pd.DataFrame()
temp["NAME"] = synonyms_linked["NAZEV"].reset_index(drop=True)
temp["ATC"] = synonyms_linked["ATC"].reset_index(drop=True)
temp.index = temp.index + length
name_to_atc = pd.concat([name_to_atc, temp])

length = len(name_to_atc)
temp = pd.DataFrame()
temp["NAME"] = substances_with_ATC_concat["NAZEV"].reset_index(drop=True)
temp["ATC"] = substances_with_ATC_concat["ATC"].reset_index(drop=True)
temp.index = temp.index + length
name_to_atc = pd.concat([name_to_atc, temp])

name_to_atc = name_to_atc.dropna()
name_to_atc = name_to_atc[~name_to_atc.duplicated()]
name_to_atc = name_to_atc.rename(columns={"NAME": "NAZEV"})
name_to_atc.head(3)

Unnamed: 0,NAZEV,ATC
0,ABSINTHII HERBA,V11
1,AURANTII AMARI FLORIS AROMA,M02AA10
2,ACIDUM ACETICUM 98%,V10XA01


#### Linking

In [638]:
medikace_test = improved_medikace.sample(35, random_state=21)
medikace_test = make_shortcuts_table(medikace_test)
medikace_test.head(3)

Unnamed: 0,label,text,original_text,about,shortcuts
3848,medikace,tamoxifenu,tamoxifenu,,
2984,medikace,herceptinu,herceptinu,,
4807,medikace,neoadjuvantní chemoterapie paclitaxel weekly,neoadjuvantní chemoterapie - paclitaxel weekly,,


In [639]:
if os.path.isfile("saved_search/medikace_test3.csv"):
    medikace_test = pd.read_csv("saved_search/medikace_test3.csv")
    medikace_test.index = medikace_test["Unnamed: 0"]
    medikace_test.drop(["Unnamed: 0"], axis=1, inplace=True)
    medikace_test["text"] = medikace["text"].fillna("")
    medikace_test["shortcuts"] = medikace_test["shortcuts"].fillna("")
    medikace_test["about"] = medikace_test["about"].fillna("N/A")
    
    medikace_test["sukl_search_chemicals"] = medikace_test["sukl_search_chemicals"].apply(from_string_to_dict)
    medikace_test["sukl_chemicals_errors"] = medikace_test["sukl_chemicals_errors"].apply(from_string_to_int_dict)
    medikace_test["sukl_search_chemicals"] = medikace_test["sukl_search_chemicals"].apply(lambda x: {k : list(map(lambda e: (e[0], e[1], int(e[2])), x[k])) for k in x})
    
    medikace_test["sukl_search_drugs"] = medikace_test["sukl_search_drugs"].apply(from_string_to_dict)
    medikace_test["sukl_drugs_errors"] = medikace_test["sukl_drugs_errors"].apply(from_string_to_int_dict)
    medikace_test["sukl_search_drugs"] = medikace_test["sukl_search_drugs"].apply(lambda x: {k : list(map(lambda e: (e[0], e[1], int(e[2])), x[k])) for k in x})

else:
    medikace_test["sukl_search_chemicals"] = medikace_test["text"].apply(
        lambda x: sukl_find_chemicals_search(x, output_errors=True, database=name_to_atc, limit=0.15))
    medikace_test["sukl_search_chemicals"] = medikace_test.apply(
        lambda x: x.sukl_search_chemicals | 
        (sukl_find_chemicals_search(x.about ,output_errors=True, database=name_to_atc, limit=0.15)
                 if x.about != "N/A" else {}), axis=1)
    medikace_test["sukl_search_chemicals"] = medikace_test.apply(
        lambda x: x.sukl_search_chemicals | 
        (sukl_find_chemicals_shortcuts_search(x.shortcuts, output_errors=True, database=name_to_atc, limit=0.15)
                 if x.shortcuts != "" else {}), axis=1)

    medikace_test["sukl_search_drugs"] = medikace_test["text"].apply(
        lambda x: sukl_find_drugs_search(x, output_errors=True, limit=0.25))
    medikace_test["sukl_search_drugs"] = medikace_test.apply(lambda x: x.sukl_search_drugs |
                (sukl_find_drugs_search(x.about ,output_errors=True, limit=0.25)
                         if x.about != "N/A" else {}), axis=1)
    medikace_test["sukl_search_drugs"] = medikace_test.apply(lambda x: x.sukl_search_drugs |
                (sukl_find_drugs_shortcuts_search(x.shortcuts ,output_errors=True, limit=0.25)
                         if x.shortcuts != "" else {}), axis=1)

    medikace_test["sukl_chemicals_errors"] = medikace_test["sukl_search_chemicals"].apply(lambda x: {k[0]:k[1] for k in x})
    medikace_test["sukl_search_chemicals"] = medikace_test["sukl_search_chemicals"].apply(lambda x: {k[0]:x[k] for k in x})
    medikace_test["sukl_drugs_errors"] = medikace_test["sukl_search_drugs"].apply(lambda x: {k[0]:k[1] for k in x})
    medikace_test["sukl_search_drugs"] = medikace_test["sukl_search_drugs"].apply(lambda x: {k[0]:x[k] for k in x})
    
    medikace_test.to_csv("saved_search/medikace_test3.csv")

medikace_test.head(3)

Unnamed: 0,label,text,original_text,about,shortcuts,sukl_search_chemicals,sukl_search_drugs,sukl_chemicals_errors,sukl_drugs_errors
3848,medikace,tamoxifenu,tamoxifenu,,,"{'tamoxifenu': [('L02BA01', 'TAMOXIFEN', 3)]}","{'tamoxifenu': [('L02BA01', 'TAMOXIFEN', 6)]}",{'tamoxifenu': 0},{'tamoxifenu': 1}
2984,medikace,herceptinu,herceptinu,,,{'herceptinu': []},"{'herceptinu': [('L01FD01', 'TRASTUZUMAB', 2)]}",{'herceptinu': 2},{'herceptinu': 1}
4807,medikace,neoadjuvantní chemoterapie paclitaxel weekly,neoadjuvantní chemoterapie - paclitaxel weekly,,,"{'neoadjuvantní': [], 'chemoterapie': [('D06',...","{'neoadjuvantní': [], 'chemoterapie': [], 'pac...","{'neoadjuvantní': 2, 'chemoterapie': 1, 'pacli...","{'neoadjuvantní': 4, 'chemoterapie': 4, 'pacli..."


In [640]:
print_long_searches_len_dep(medikace_test["sukl_search_chemicals"], limit=10)
print()
print_long_searches_len_dep(medikace_test["sukl_search_drugs"], limit=10)

medikace_test["sukl_search_chemicals"] = medikace_test["sukl_search_chemicals"].apply(
    lambda x: drop_long_searches_len_dep(x, limit=25))
medikace_test["sukl_search_drugs"] = medikace_test["sukl_search_drugs"].apply(
    lambda x: drop_long_searches_len_dep(x, limit=40))

[[('orcal', 123, 24.6), ('Neo', 52, 17.333333333333332)], [('vit', 160, 53.333333333333336), ('D', 4333, 4333.0)], [('PTX', 657, 219.0)], [('10', 808, 404.0), ('mg', 70, 35.0)], [('AC', 2211, 1105.5)], [('AC', 2211, 1105.5), ('st', 163, 81.5), ('p', 1207, 1207.0), ('NACT', 562, 140.5)], [('3', 1588, 1588.0), ('mg', 70, 35.0)], [('v', 2446, 2446.0)]]

[[('10', 25, 12.5)], [('Neo', 41, 13.666666666666666)], [('vit', 65, 21.666666666666668), ('D', 811, 811.0)], [('PTX', 100, 33.333333333333336)], [('3', 34, 34.0)], [('AC', 300, 150.0)], [('AC', 300, 150.0), ('st', 32, 16.0), ('p', 191, 191.0), ('NACT', 71, 17.75)], [('v', 719, 719.0)]]


In [641]:
medikace_test["sukl_chemicals_combine"] = medikace_test["sukl_search_chemicals"].apply(combine_preserve_count)
medikace_test[medikace_test["sukl_chemicals_combine"] != {}]["sukl_chemicals_combine"]

1703    {'orcal Neo': [('G01AA51', 'NYSTATIN, KOMBINAC...
Name: sukl_chemicals_combine, dtype: object

In [642]:
medikace_test["sukl_drugs_combine"] = medikace_test["sukl_search_drugs"].apply(combine_preserve_count)
medikace_test[medikace_test["sukl_drugs_combine"] != {}]["sukl_drugs_combine"]

3546    {'calcium vit': [('A12AA04', 'UHLIČITAN VÁPENA...
1703       {'orcal Neo': [('C08CA01', 'AMLODIPIN', 120)]}
Name: sukl_drugs_combine, dtype: object

In [643]:
print_long_searches(medikace_test["sukl_search_chemicals"], limit=5)
print()
print_long_searches(medikace_test["sukl_search_drugs"], limit=10)

medikace_test["sukl_search_chemicals"] = medikace_test["sukl_search_chemicals"].apply(lambda x: drop_long_searches(x, limit=10))
medikace_test["sukl_search_drugs"] = medikace_test["sukl_search_drugs"].apply(lambda x: drop_long_searches(x, limit=15))

[[('adjuvanci', 6)], [('ryzodeg', 10)], [('antiHER2', 12), ('terapii', 64)], [('lexaurin', 15)], [('tritace', 25)], [('CHT', 29)], [('stilnox', 30)], [('ranisan', 64)], [('orcal', 123), ('Neo', 52)]]

[[('flamigel', 11)], [('ranisan', 12)], [('podáme', 16)], [('CBDCA', 21)], [('10', 25), ('mg', 19)], [('st', 32), ('NACT', 71)], [('3', 34), ('mg', 19)], [('Neo', 41)], [('vit', 65)], [('PTX', 100)]]


In [644]:
medikace_test["sukl_mathematical_explanation"] = "N/A"
for index in medikace_test.index:
    union_chem = drop_empty_searches(medikace_test["sukl_search_chemicals"][index] | medikace_test["sukl_chemicals_combine"][index])
    union_drugs = drop_empty_searches(medikace_test["sukl_search_drugs"][index] | medikace_test["sukl_drugs_combine"][index])

    medikace_test.at[index, "sukl_mathematical_explanation"] = combine_2_searched_databaze(union_chem, union_drugs,
                     medikace_test["sukl_chemicals_errors"][index],
                     medikace_test["sukl_drugs_errors"][index])

random.seed(10)
medikace_test["sukl_mathematical_explanation"] = (
    medikace_test["sukl_mathematical_explanation"].apply(
        lambda x: {k: sorted(x[k], key=lambda y: prioritize_sukl(y, x[k]), reverse=True)[0]
                   for k in x if len(x[k]) != 0}))

medikace_test.head(3)

Unnamed: 0,label,text,original_text,about,shortcuts,sukl_search_chemicals,sukl_search_drugs,sukl_chemicals_errors,sukl_drugs_errors,sukl_chemicals_combine,sukl_drugs_combine,sukl_mathematical_explanation
3848,medikace,tamoxifenu,tamoxifenu,,,"{'tamoxifenu': [('L02BA01', 'TAMOXIFEN', 3)]}","{'tamoxifenu': [('L02BA01', 'TAMOXIFEN', 6)]}",{'tamoxifenu': 0},{'tamoxifenu': 1},{},{},"{'tamoxifenu': ('L02BA01', 'TAMOXIFEN', 3)}"
2984,medikace,herceptinu,herceptinu,,,{'herceptinu': []},"{'herceptinu': [('L01FD01', 'TRASTUZUMAB', 2)]}",{'herceptinu': 2},{'herceptinu': 1},{},{},"{'herceptinu': ('L01FD01', 'TRASTUZUMAB', 2)}"
4807,medikace,neoadjuvantní chemoterapie paclitaxel weekly,neoadjuvantní chemoterapie - paclitaxel weekly,,,"{'neoadjuvantní': [], 'chemoterapie': [('D06',...","{'neoadjuvantní': [], 'chemoterapie': [], 'pac...","{'neoadjuvantní': 2, 'chemoterapie': 1, 'pacli...","{'neoadjuvantní': 4, 'chemoterapie': 4, 'pacli...",{},{},"{'paclitaxel': ('L01CD01', 'PAKLITAXEL', 1), '..."


#### Result

In [684]:
medikace_test["asign"] = "N/A"

i = 0
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Empty"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Empty"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Partially"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"
i += 1
j = medikace_test.index[i]
medikace_test.at[j, "asign"] = "Right"

# print(medikace_test["original_text"].iloc[i])
# print(medikace_test["sukl_mathematical_explanation"].iloc[i])

In [685]:
medikace_test["asign"].value_counts()

Right        27
Wrong         5
Empty         2
Partially     1
Name: asign, dtype: int64

Right: 77 % | Partially 80 % | 6 % Empty

### Label NE medikace

In [690]:
improved_ne_medikace = data_improved_punctuation[(data_improved_punctuation["label"] == "NE medikace")].copy()
print("Number of 'NE medikace' label:", len(improved_ne_medikace))
ne_medikace_test = improved_ne_medikace.sample(frac=1, random_state=235)
ne_medikace_test.head(3)

Number of 'NE medikace' label: 16


Unnamed: 0,label,text,original_text,about
53,NE medikace,antikoncepce: 0,antikoncepce: 0,
5463,NE medikace,bez HRT,bez HRT,
2463,NE medikace,HRT dříve,hormonální léčba: HRT dříve,hormonální léčba


In [691]:
ne_medikace_test["ne"] = False
for i in ne_medikace_test.index:
    words_split = ne_medikace_test["text"][i].split(" ")
    if any([True for x in words_split if is_soft_negative_word(x)]):
        ne_medikace_test.at[i, "ne"] = True

ne_medikace_test["text"] = ne_medikace_test["text"].apply(lambda text: " ".join([x.strip(":") for x in text.split(" ") if not(is_soft_negative_word(x))]))
ne_medikace_test.sample(3, random_state=32)

Unnamed: 0,label,text,original_text,about,ne
53,NE medikace,antikoncepce,antikoncepce: 0,,True
1620,NE medikace,ajatin,ajatin,,False
4570,NE medikace,antikoagulační léčbu,antikoagulační léčbu,,False


In [692]:
ne_medikace_test = make_shortcuts_table(ne_medikace_test)

In [693]:
if os.path.isfile("saved_search/ne_medikace_test.csv"):
    ne_medikace_test = pd.read_csv("saved_search/ne_medikace_test.csv")
    ne_medikace_test.index = ne_medikace_test["Unnamed: 0"]
    ne_medikace_test.drop(["Unnamed: 0"], axis=1, inplace=True)
    ne_medikace_test["about"] = ne_medikace_test["about"].fillna("N/A")
    
    ne_medikace_test["sukl_search_chemicals"] = ne_medikace_test["sukl_search_chemicals"].apply(from_string_to_dict)
    ne_medikace_test["sukl_chemicals_errors"] = ne_medikace_test["sukl_chemicals_errors"].apply(from_string_to_int_dict)
    ne_medikace_test["sukl_search_chemicals"] = ne_medikace_test["sukl_search_chemicals"].apply(
        lambda x: {k : list(map(lambda e: (e[0], e[1], int(e[2])), x[k])) for k in x})
    
    ne_medikace_test["sukl_search_drugs"] = ne_medikace_test["sukl_search_drugs"].apply(from_string_to_dict)
    ne_medikace_test["sukl_drugs_errors"] = ne_medikace_test["sukl_drugs_errors"].apply(from_string_to_int_dict)
    ne_medikace_test["sukl_search_drugs"] = ne_medikace_test["sukl_search_drugs"].apply(
        lambda x: {k : list(map(lambda e: (e[0], e[1], int(e[2])), x[k])) for k in x})

else:
    ne_medikace_test["sukl_search_chemicals"] = ne_medikace_test["text"].apply(
        lambda x: sukl_find_chemicals_search(x, output_errors=True, database=name_to_atc, limit=0.15))
    ne_medikace_test["sukl_search_chemicals"] = ne_medikace_test.apply(
        lambda x: x.sukl_search_chemicals | 
        (sukl_find_chemicals_search(x.about ,output_errors=True, database=name_to_atc, limit=0.15)
                 if x.about != "N/A" else {}), axis=1)
    ne_medikace_test["sukl_search_chemicals"] = ne_medikace_test.apply(
        lambda x: x.sukl_search_chemicals | 
        (sukl_find_chemicals_shortcuts_search(x.shortcuts, output_errors=True, database=name_to_atc, limit=0.15)
                 if x.shortcuts != "" else {}), axis=1)

    ne_medikace_test["sukl_search_drugs"] = ne_medikace_test["text"].apply(
        lambda x: sukl_find_drugs_search(x, output_errors=True, limit=0.25))
    ne_medikace_test["sukl_search_drugs"] = ne_medikace_test.apply(lambda x: x.sukl_search_drugs |
                (sukl_find_drugs_search(x.about ,output_errors=True, limit=0.25)
                         if x.about != "N/A" else {}), axis=1)
    ne_medikace_test["sukl_search_drugs"] = ne_medikace_test.apply(lambda x: x.sukl_search_drugs |
                (sukl_find_drugs_shortcuts_search(x.shortcuts ,output_errors=True, limit=0.25)
                         if x.shortcuts != "" else {}), axis=1)
    
    ne_medikace_test["sukl_chemicals_errors"] = ne_medikace_test["sukl_search_chemicals"].apply(lambda x: {k[0]:k[1] for k in x})
    ne_medikace_test["sukl_search_chemicals"] = ne_medikace_test["sukl_search_chemicals"].apply(lambda x: {k[0]:x[k] for k in x})
    ne_medikace_test["sukl_drugs_errors"] = ne_medikace_test["sukl_search_drugs"].apply(lambda x: {k[0]:k[1] for k in x})
    ne_medikace_test["sukl_search_drugs"] = ne_medikace_test["sukl_search_drugs"].apply(lambda x: {k[0]:x[k] for k in x})
    ne_medikace_test.to_csv("saved_search/ne_medikace_test.csv")

ne_medikace_test.head(3)

Unnamed: 0,label,text,original_text,about,ne,shortcuts,sukl_search_chemicals,sukl_search_drugs,sukl_chemicals_errors,sukl_drugs_errors
53,NE medikace,antikoncepce,antikoncepce: 0,,True,,{'antikoncepce': []},{'antikoncepce': []},{'antikoncepce': 2},{'antikoncepce': 4}
5463,NE medikace,HRT,bez HRT,,True,,"{'HRT': [('A01A', 'STOMATOLOGICKÉ PŘÍPRAVKY', ...","{'HRT': [('A01AD02', 'BENZYDAMIN', 3), ('A02AD...",{'HRT': 1},{'HRT': 1}
2463,NE medikace,HRT dříve,hormonální léčba: HRT dříve,hormonální léčba,False,,"{'HRT': [('A01A', 'STOMATOLOGICKÉ PŘÍPRAVKY', ...","{'HRT': [('A01AD02', 'BENZYDAMIN', 3), ('A02AD...","{'HRT': 1, 'dříve': 0, 'hormonální': 0, 'léčba...","{'HRT': 1, 'dříve': 2, 'hormonální': 3, 'léčba..."


In [694]:
print_long_searches_len_dep(ne_medikace_test["sukl_search_chemicals"], limit=10)
print()
print_long_searches_len_dep(ne_medikace_test["sukl_search_drugs"], limit=10)

ne_medikace_test["sukl_search_chemicals"] = ne_medikace_test["sukl_search_chemicals"].apply(
    lambda x: drop_long_searches_len_dep(x, limit=25))
ne_medikace_test["sukl_search_drugs"] = ne_medikace_test["sukl_search_drugs"].apply(
    lambda x: drop_long_searches_len_dep(x, limit=40))

[[('sine', 106, 26.5), ('FA', 754, 377.0)], [('akt', 309, 103.0), ('již', 342, 114.0)], [('HRT', 1244, 414.6666666666667)], [('HRT', 1244, 414.6666666666667)], [('HRT', 1244, 414.6666666666667)]]

[[('FA', 95, 47.5)], [('HRT', 170, 56.666666666666664)], [('HRT', 170, 56.666666666666664)], [('HRT', 170, 56.666666666666664)]]


In [235]:
ne_medikace_test["sukl_chemicals_combine"] = ne_medikace_test["sukl_search_chemicals"].apply(combine_preserve_count)
ne_medikace_test[ne_medikace_test["sukl_chemicals_combine"] != {}]["sukl_chemicals_combine"]

Series([], Name: sukl_chemicals_combine, dtype: object)

In [236]:
ne_medikace_test["sukl_drugs_combine"] = ne_medikace_test["sukl_search_drugs"].apply(combine_preserve_count)
ne_medikace_test[ne_medikace_test["sukl_drugs_combine"] != {}]["sukl_drugs_combine"]

Series([], Name: sukl_drugs_combine, dtype: object)

In [237]:
print_long_searches(ne_medikace_test["sukl_search_chemicals"], limit=5)
print()
print_long_searches(ne_medikace_test["sukl_search_drugs"], limit=10)

ne_medikace_test["sukl_search_chemicals"] = ne_medikace_test["sukl_search_chemicals"].apply(lambda x: drop_long_searches(x, limit=10))
ne_medikace_test["sukl_search_drugs"] = ne_medikace_test["sukl_search_drugs"].apply(lambda x: drop_long_searches(x, limit=15))

[[('vysazeno', 12)], [('léčbu', 13)], [('tramal', 16)], [('zometa', 54)]]

[[('snesla', 18)], [('léčbu', 35)], [('léčba', 36)], [('léčba', 36)], [('léčba', 36)]]


In [238]:
ne_medikace_test["sukl_mathematical_explanation"] = "N/A"
for index in ne_medikace_test.index:
    union_chem = drop_empty_searches(ne_medikace_test["sukl_search_chemicals"][index] | ne_medikace_test["sukl_chemicals_combine"][index])
    union_drugs = drop_empty_searches(ne_medikace_test["sukl_search_drugs"][index] | ne_medikace_test["sukl_drugs_combine"][index])

    ne_medikace_test.at[index, "sukl_mathematical_explanation"] = combine_2_searched_databaze(union_chem, union_drugs,
                     ne_medikace_test["sukl_chemicals_errors"][index],
                     ne_medikace_test["sukl_drugs_errors"][index])

random.seed(23)
ne_medikace_test["sukl_mathematical_explanation"] = (
    ne_medikace_test["sukl_mathematical_explanation"].apply(
        lambda x: {k: sorted(x[k], key=lambda y: prioritize_sukl(y, x[k]), reverse=True)[0]
                   for k in x if len(x[k]) != 0}))

ne_medikace_test.head(3)

Unnamed: 0_level_0,label,text,original_text,about,ne,sukl_search_chemicals,sukl_chemicals_errors,sukl_search_drugs,sukl_drugs_errors,sukl_drugs_combine,sukl_chemicals_combine,sukl_mathematical_explanation,asign
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
53,NE medikace,antikoncepce,antikoncepce: 0,,True,{'antikoncepce': []},{'antikoncepce': 2},{'antikoncepce': []},{'antikoncepce': 4},{},{},{},Wrong
5463,NE medikace,HRT,bez HRT,,True,{'HRT': []},{'HRT': 1},{'HRT': []},{'HRT': 1},{},{},{},Wrong
2463,NE medikace,HRT dříve,hormonální léčba: HRT dříve,hormonální léčba,False,"{'HRT': [], 'dříve': [('J07BX02', 'VAKCÍNY PRO...","{'HRT': 1, 'dříve': 0, 'hormonální': 0, 'léčba...","{'HRT': [], 'dříve': [('B05XA31', 'ELEKTROLYTY...","{'HRT': 1, 'dříve': 2, 'hormonální': 3, 'léčba...",{},{},"{'léčba': ('V10B', 'PALIATIVNÍ LÉČBA BOLESTI (...",Wrong


#### Result

In [239]:
ne_medikace_test["asign"] = "N/A"

i = 0
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Right"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Right"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Right"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Right"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Right"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Wrong"
i += 1
j = ne_medikace_test.index[i]
ne_medikace_test.at[j, "asign"] = "Right"

# print(ne_medikace_test["original_text"].iloc[i])
# print(ne_medikace_test["sukl_mathematical_explanation"].iloc[i])
# print(ne_medikace_test["sukl_search_drugs"].iloc[i])
# print(ne_medikace_test["sukl_drugs_errors"].iloc[i])
# print(ne_medikace_test["sukl_search_chemicals"].iloc[i])
# print(ne_medikace_test["sukl_chemicals_errors"].iloc[i])

In [240]:
ne_medikace_test["asign"].value_counts()

Wrong    10
Right     6
Name: asign, dtype: int64

Right: 37.5 % | Partially  37.5%

- Lots of shortcuts
- Lots of wrongly labeled data.

### Whole search
Whole search by label

In [386]:
def whole_way(table, index, database, lenght_dep_limit, drop_limit):
    # Search
    search = {}
    if table["text"][index] != "":
        search.update(search_words_mistake_bottom(table["text"][index],
                                              lambda x: medvik_combined_without_descr_search(x, database), 
                                              lambda x, y: medvik_mistakes_search(x, y, database=database),
                                              limit=0.25))
    if table["about"][index] != "N/A":
        search.update(search_words_mistake_bottom(table["about"][index],
                                              lambda x: medvik_combined_without_descr_search(x, database), 
                                              lambda x, y: medvik_mistakes_search(x, y, database=database),
                                              limit=0.25))
    if table["shortcuts"][index] != "":
        search.update(search_words_mistake_bottom(table["shortcuts"][index],
                                              lambda x: shortcuts_medvik_without_descr_search(x, database), 
                                              lambda x, y: shortcuts_medvik_mistakes_without_descr_search(
                                                  x, y, database=database),
                                              limit=0.25))

    # Drop & Combine
    search = drop_long_searches_len_dep(search, lenght_dep_limit)
    combined_search = combine_searches(search)
    search = drop_long_searches(search, drop_limit)
    search = drop_empty_searches(search)

    # Choose best Match
    result = {}
    combined_level = -np.infty
    is_find = set()
    for text in sorted(combined_search, key=lambda x: len(x.split(" ")), reverse=True):
        if combined_level > len(text.split(" ")) and all([w in is_find for w in text.split(" ")]):
            continue
        message = message_for_GPT(text, combined_search[text], medvik_find_by_code, context=table["original_text"][i])
        response = send_to_GPT(message)
        result[text] = from_GPT(response, combined_search[text], medvik_find_by_code)
        if result[text] is not None:
            is_find.update(text.split(" "))
            combined_level = len(text.split(" "))

    for text in search:
        if text in is_find:
            continue
        message = message_for_GPT(text, search[text], medvik_find_by_code, context=table["original_text"][i])
        response = send_to_GPT(message)
        result[text] = from_GPT(response, search[text], medvik_find_by_code)
        
    return interprete_explanation(result)

#### Procedura
Hyperparameters:
- Search only Anatomy and Analytical (A), diagnostic and therapeutic techniques and devices (E)
- drop length dependent before combining limit: 40
- drop not word limit: 65

In [242]:
whole_way_procedura = (lambda table, index: whole_way(table, index, procedury_anathomy, 40, 65))

#### Os. Anamnéza
Hyperparameters:
- Search only: Anatomy and Analytical (A), Diseases (C), diagnostic and therapeutic techniques and devices (E)
- drop length dependent before combining limit: 70
- drop not word limit: 90

In [243]:
whole_way_os_anamneza = (lambda table, index: whole_way(table, index, osobni_a, 70, 90))

#### Symptomy
Hyperparameters:
- Search only: Anatomy (A), Diseases (C), Chemicals and Drugs (D), Analytical, Diagnostic and Therapeutic Techniques and Equipment (E), Phenomena and Processes (G)
- drop length dependent before combining limit: 30
- drop not word limit: 90

In [244]:
whole_way_symptomy = (lambda table, index: whole_way(table, index, symptomy, 30, 90))

#### NE Os. Anamnéza
Hyperparameters:
- Search only: Anatomy and Analytical (A), Diseases (C), diagnostic and therapeutic techniques and devices (E)
- drop length dependent before combining limit: 33
- drop not word limit: 70

In [245]:
def whole_way_ne_os_anamneza(table, index):
    is_negation = False
    if any([True for x in table["text"][index].split(" ") if is_soft_negative_word(x)]):
        is_negation = True

    table.at[index, "text"] = " ".join([x.strip(":") for x in table["text"][index].split(" ") if not(is_negative_word(x))])

    return (whole_way(table, index, osobni_a, 33, 70), is_negation)

#### NE Symptomy
Hyperparameters:
- Search only: Anatomy and Analytical (A), Diseases (C), diagnostic and therapeutic techniques and devices (E)
- drop length dependent before combining limit: 33
- drop not word limit: 60

In [246]:
def whole_way_ne_symptomy(table, index):
    is_negation = False
    if any([True for x in table["text"][index].split(" ") if is_soft_negative_word(x)]):
        is_negation = True

    table.at[index, "text"] = " ".join([x.strip(":") for x in table["text"][index].split(" ") if not(is_negative_word(x))])

    return (whole_way(table, index, symptomy, 33, 60), is_negation)

#### Medikace

Hyperparameters:

- Search: SUKL drugs, SUKL atc, SUKL synonyms, SUKL substances
- drop length dependent before combining limit: 25 (chemicals), 40 (drugs)
- drop not word limit: 10 (chemicals), 15 (drugs)


In [250]:
def whole_way_medikace(table, index):
    search_chemicals = {}
    search_chemicals = (search_chemicals | (sukl_find_chemicals_search(
        table["text"][index], output_errors=True, database=name_to_atc, limit=0.15)
                        if table["text"][index] != "" else {}))
    search_chemicals = (search_chemicals |  (sukl_find_chemicals_search(
        table["about"][index], output_errors=True, database=name_to_atc, limit=0.15)
                                             if table["about"][index] != "N/A" else {}))
    search_chemicals = (search_chemicals |  (sukl_find_chemicals_search(
        table["shortcuts"][index], output_errors=True, database=name_to_atc, limit=0.15)
                                             if table["shortcuts"][index] != "" else {}))
    
    search_chemicals_errors = (lambda x: {k[0]:k[1] for k in x})(search_chemicals)
    search_chemicals = (lambda x: {k[0]:x[k] for k in x})(search_chemicals)
    
    search_drugs = sukl_find_drugs_search(table["text"][index], output_errors=True, limit=0.25)
    search_drugs = (search_drugs | (sukl_find_drugs_search(table["about"][index],
                                                     output_errors=True,
                                                     limit=0.25)
                                    if table["about"][index] != "N/A" else {}))
    search_drugs_errors = (lambda x: {k[0]:k[1] for k in x})(search_drugs)
    search_drugs = (lambda x: {k[0]:x[k] for k in x})(search_drugs)

    search_chemicals = drop_long_searches_len_dep(search_chemicals, 25)
    search_drugs = drop_long_searches_len_dep(search_drugs, 40)
    combined_chemicals = combine_preserve_count(search_chemicals)
    combined_drugs = combine_preserve_count(search_drugs)

    # Choosing best match
    search_chemicals = drop_long_searches(search_chemicals, 10)
    search_drugs = drop_long_searches(search_drugs, 15)
    search_chemicals = drop_empty_searches(search_chemicals)
    search_drugs = drop_empty_searches(search_drugs)

    union_chem = search_chemicals | combined_chemicals
    union_drugs = search_drugs | combined_drugs

    comb_searches = combine_2_searched_databaze(union_chem, union_drugs, search_chemicals_errors, search_drugs_errors)
    
    return {k: sorted(comb_searches[k], key=lambda y: prioritize_sukl(y, comb_searches[k]), reverse=True)[0] 
              for k in comb_searches 
              if len(comb_searches[k]) != 0}

#### NE Medikace

In [248]:
def whole_way_ne_medikace(table, index):
    is_negation = False
    if any([True for x in table["text"][index].split(" ") if is_soft_negative_word(x)]):
        is_negation = True

    table.at[index, "text"] = " ".join([x.strip(":") for x in table["text"][index].split(" ") if not(is_soft_negative_word(x))])
    
    return (whole_way_medikace(table, index), is_negation)

#### Example of usage

In [251]:
i = improved_medikace.sample(frac=1, random_state=23).index[0]
print(improved_medikace["original_text"][i])
whole_way_medikace(make_shortcuts_table(improved_medikace), i)

depakine Crono


{'depakine Crono': ('N03AG01', 'KYSELINA VALPROOVÁ', 42)}

In [252]:
i = improved_ne_medikace.sample(frac=1, random_state=21).index[0]
print(improved_ne_medikace["original_text"][i])
whole_way_ne_medikace(make_shortcuts_table(improved_ne_medikace), i)

zometě


({'zometě': ('M05BA08', 'KYSELINA ZOLEDRONOVÁ', 9)}, False)

In [253]:
i = improved_symptom.sample(frac=1, random_state=20).index[0]
print(improved_symptom["original_text"][i])
whole_way_symptomy(make_shortcuts_table(improved_symptom), i)

DKK brnění prstů


{'brnění': ('D007103',
  'imobilizace',
  'Omezení pohybu celého těla nebo jeho části fyzickými prostředky (FYZICKÉ OMEZENÍ) nebo chemicky analgezií, případně užitím trankvilizérů nebo myorelaxačních látek nedepolarizujících. Patří sem experimentální protokoly používané k hodnocení fyziologických účinků imobility (nepohyblivosti).')}

## Labelling

In [254]:
atc_drug_bank = pd.read_csv("databaze/drugbank_to_atc.csv", index_col=[0])
atc_drug_bank.head(3)

Unnamed: 0,atc,name,description,code
0,B01AE02,Lepirudin,Lepirudin is a recombinant hirudin formed by 6...,DB00001
1,L01FE01,Cetuximab,Cetuximab is a recombinant chimeric human/mous...,DB00002
2,R05CB13,Dornase alfa,Dornase alfa is a biosynthetic form of human d...,DB00003


In [255]:
temp = atc_drug_bank[atc_drug_bank["atc"].duplicated(keep=False)]
temp.index = temp["atc"]
temp2 = substances_with_ATC.copy()
temp2.index = temp2["ATC"]
temp.join(temp2).drop(["ATC", "code", "NT", "NAZEV_EN"], axis=1).sort_index().head(20)

Unnamed: 0,atc,name,description,NAZEV
A02BD01,A02BD01,Omeprazole,"Originally approved by the FDA in 1989, omepra...","OMEPRAZOL, AMOXICILIN A METRONIDAZOL"
A02BD01,A02BD01,Amoxicillin,"Amoxicillin, or BRL-2333, is a penicillin G de...","OMEPRAZOL, AMOXICILIN A METRONIDAZOL"
A02BD01,A02BD01,Metronidazole hydrochloride,,"OMEPRAZOL, AMOXICILIN A METRONIDAZOL"
A02BD02,A02BD02,Lansoprazole,Lansoprazole marketed under the brand Prevacid...,"LANSOPRAZOL, TETRACYKLIN A METRONIDAZOL"
A02BD02,A02BD02,Tetracycline phosphate,,"LANSOPRAZOL, TETRACYKLIN A METRONIDAZOL"
A02BD02,A02BD02,Metronidazole hydrochloride,,"LANSOPRAZOL, TETRACYKLIN A METRONIDAZOL"
A02BD03,A02BD03,Lansoprazole,Lansoprazole marketed under the brand Prevacid...,"LANSOPRAZOL, AMOXICILIN A METRONIDAZOL"
A02BD03,A02BD03,Metronidazole hydrochloride,,"LANSOPRAZOL, AMOXICILIN A METRONIDAZOL"
A02BD03,A02BD03,Amoxicillin,"Amoxicillin, or BRL-2333, is a penicillin G de...","LANSOPRAZOL, AMOXICILIN A METRONIDAZOL"
A02BD04,A02BD04,Pantoprazole,Pantoprazole is a first-generation proton pump...,"PANTOPRAZOL, AMOXICILIN A KLARITHROMYCIN"


In [256]:
def link_drug_bank(drug_code):
    if "DBSALT" in drug_code:
        return f"https://go.drugbank.com/salts/{drug_code}"
    return f"https://go.drugbank.com/drugs/{drug_code}"


def description_atc(atc_code):
    list_des = atc_drug_bank[atc_drug_bank["atc"] == atc_code].apply(
        lambda x: (x.code, x.description, link_drug_bank(x.code)), axis=1)
    
    if len(list_des) == 0:
        return ""
        
    result_string = ["Látky vedené v databázi DrugBank spojené s tímto ATC kódem:\n"]
    for code, des, link in list_des:
        result_string.append(f"{code}: {link} \n{des}\n")

    return "".join(result_string)
            

print(description_atc("L02BA01"))

Látky vedené v databázi DrugBank spojené s tímto ATC kódem:
DB00675: https://go.drugbank.com/drugs/DB00675 
Tamoxifen is a non-steroidal antiestrogen used to treat estrogen receptor positive breast cancers as well as prevent the incidence of breast cancer in high risk populations.[A1025,L7799,L7802] Tamoxifen is used alone or as an adjuvant in these treatments.[L7799,L7802] Tamoxifen may no longer be the preferred treatment for these types of cancers as patients generally have better survival, side effect profiles, and compliance with [anastrozole].[A1026]

Tamoxifen was granted FDA approval on 30 December 1977.[L7799]



In [257]:
link_medvik = lambda code: f"https://www.medvik.cz/bmc/link.do?id={code}"
remove_cit = lambda des: regex.sub(" ?\((C|c)it\.([^\)])*\)?.", "",  des)
link_drugbank = lambda code: f"https://go.drugbank.com/atc/{code}"


def prepare_labeling_medvik(db, expl_column="explanation"):
    result = db[db[expl_column] != {}].copy()
    result["Link to MSHCZ"] = result[expl_column].apply(
        lambda dictr: [(k, dictr[k][0], dictr[k][1], remove_cit(dictr[k][2]), link_medvik(dictr[k][0])) for k in dictr ]
    )
    result = result[["label", "original_text", "Link to MSHCZ"]]    
    return result.reset_index(drop=True)


def prepare_labeling_sukl(db, expl_column="explanation"):
    result = db[db[expl_column] != {}].copy()
    result["Link to MSHCZ"] = result[expl_column].apply(
        lambda dictr: [(k, dictr[k][1], dictr[k][0], description_atc(dictr[k][0]), link_drugbank(dictr[k][0])) for k in dictr]
    )
    result = result[["label", "original_text", "Link to MSHCZ"]]    
    return result.reset_index(drop=True)


prepare_labeling_medvik(procedura_test, expl_column="medvik_explanation").head(3)
prepare_labeling_sukl(medikace_test, expl_column="sukl_mathematical_explanation").head(3)

Unnamed: 0,label,original_text,Link to MSHCZ
0,medikace,tamoxifenu,"[(tamoxifenu, TAMOXIFEN, L02BA01, Látky vedené..."
1,medikace,herceptinu,"[(herceptinu, TRASTUZUMAB, L01FD01, Látky vede..."
2,medikace,neoadjuvantní chemoterapie - paclitaxel weekly,"[(paclitaxel, PAKLITAXEL, L01CD01, Látky veden..."


In [258]:
def predict_for_labeling_table(db, whole_way_func, n=35, random_state=None):
    to_link = db.sample(frac=1, random_state=random_state)
    count_success, index = 0, 0
    to_link["explanation"] = "N/A"
    
    while count_success < n:
        j = to_link.index[index]
        to_link.at[j, "explanation"] = whole_way_func(to_link, j)
        if to_link["explanation"][j] != {}:
            count_success += 1
        index += 1
    
    return to_link.iloc[:index]


# temp = prepare_labeling_medvik(predict_for_labeling_table(make_shortcuts_table(improved_procedura),
#                                                           whole_way_procedura,
#                                                           n=5,
#                                                           random_state=12))
# temp

## TODO
- What label? 7x label, 1x MASH procedury, 1x SUKL medikace, 1x MEDVIK whole access, 1x MEDVIK part access
- MASH procedury, MEDVIK medikace, DB
- MEDVIK part access, MEDVIK whole access, access
- DONE Write Vít problem with number of samples for doctors with regards to confidence interval.
- 
- DONE Complete medvik labeling
- DONE Do sukl labeling
- DONE Make functions clear - functions for working with table: ends with _table
- DONE Make limit clear, and shortcuts clear
- DONE Improve filter
- DONE NE A the same as A (A in {procedura, medikace, symptomy})?
- DONE Investigate message ChatGPT on procedury (move from procedury 3 - add limit, shortcuts - elsewhere it isn't)

- What is finally done: MEDIKACE, NE MEDIKACE, PROCEDURA, OSOBNI_A, NE OSOBNI, NE SYMPTOMY, SYMPTOMY