# Linking named entities
by Filip Gregora

In [527]:
import ast
import json
import math
import os
import re
import xml.etree.ElementTree as elt
from itertools import combinations
from string import punctuation

import pandas as pd
import requests

In [4]:
# Load the raw NER entities and preview the first rows.
data = pd.read_csv("data/NER_entities.csv")
data.head(10)

Unnamed: 0,label,text
0,symptom,jemný fibrózní proužek
1,procedura,neoadjuvantní CHT
2,medikace,Novalgin
3,symptom,Označena SLU v levé axile.
4,procedura,st.p. totální ME + SNB vlevo
5,medikace,NOVALGIN
6,procedura,Založení TE l.sin
7,procedura,Cytostatika
8,NE symptom,"přiměřené echogenity,"
9,NE symptom,nezvětšena


## Data exploration
At the beginning we want to explore the data.

- Some texts start with a capital letter (but be careful when the whole first word is written in upper case).
- Some texts end with punctuation.
- There are a lot of duplicates.
- The length of the text varies and the longest one has 20 words (this can become a problem later).


In [5]:
# Number of rows that repeat an earlier row exactly (the first occurrence is not counted).
len(data[data.duplicated()])

2588

In [768]:
def clean(string):
    """Normalise one entity text.

    Spaces and ASCII punctuation are stripped from both ends, the first
    character is lower-cased when the second character is lower case (so
    fully upper-case words such as abbreviations stay untouched), and every
    run of whitespace is collapsed into a single space.
    """
    trimmed = string.strip(" " + "".join(punctuation))
    # Only an ordinary capitalised word loses its capital; "NOVALGIN" is kept as is.
    if len(trimmed) >= 2 and trimmed[1].islower():
        trimmed = trimmed[0].lower() + trimmed[1:]
    words = trimmed.split()
    return " ".join(words)

def clean_db(db):
    """Return a cleaned copy of ``db``.

    Every value of the ``text`` column is normalised with ``clean``; rows whose
    cleaned text repeats an earlier row are removed (the first occurrence is
    kept), and so are rows with any missing value. ``db`` itself is not modified.
    """
    cleaned = db.copy()
    cleaned["text"] = cleaned["text"].apply(clean)
    return cleaned.drop_duplicates(subset="text").dropna()

# NOTE: this rebinds `data` to the cleaned frame; the raw frame is not kept under another name.
data = clean_db(data)
data.head(10)

Unnamed: 0,label,text
0,symptom,jemný fibrózní proužek
1,procedura,neoadjuvantní CHT
2,medikace,novalgin
3,symptom,označena SLU v levé axile
4,procedura,st.p. totální ME + SNB vlevo
5,medikace,NOVALGIN
6,procedura,založení TE l.sin
7,procedura,cytostatika
8,NE symptom,přiměřené echogenity
9,NE symptom,nezvětšena


In [7]:
def comb_sum(j):
    """Return how many non-empty combinations can be drawn from ``j`` items.

    This is the sum of ``math.comb(j, i)`` for ``i`` from 1 to ``j``; the result
    is 0 when ``j`` is not positive.
    """
    return sum(math.comb(j, size) for size in range(1, j + 1))

# Print, for texts of 1 to 20 words, how many word combinations would have to be tried.
for i in range(1, 21):
    print(i, comb_sum(i), sep = ": ", end = " | ")
    
# Count the texts that consist of at least 7 space-separated words.
lenght_data = data["text"].apply(lambda x: len(x.split(" ")))
len(lenght_data[lenght_data >= 7])

1: 1 | 2: 3 | 3: 7 | 4: 15 | 5: 31 | 6: 63 | 7: 127 | 8: 255 | 9: 511 | 10: 1023 | 11: 2047 | 12: 4095 | 13: 8191 | 14: 16383 | 15: 32767 | 16: 65535 | 17: 131071 | 18: 262143 | 19: 524287 | 20: 1048575 | 

140

## Linking
Our approach to linking entities is:
- Link the words to a medical database.
- Let a pretrained language model choose the best of the candidates.

To be able to compare, I decided to use two approaches for linking. The first one uses the international MeSH and accesses it via the web API of the NIH (National Institutes of Health).

The second one uses the Czech MeSH, which I accessed from a pre-downloaded file.

### Linking to international MESH through NIH
MeSH is an international medical database: https://uts.nlm.nih.gov/uts/.

I tried searching the database for all combinations of words from the text. Longer combinations have higher priority.

There is one big problem: the number of combinations grows exponentially with the number of words (in the worst case, for a text of 20 words, we would have to try around 10^6 combinations). My solution is to go bottom-up: start with combinations of length 1 and continue only with the words whose combinations succeeded.


In [587]:
# Texts that are not worth keeping as a query: shorter than two characters or purely numeric.
def filter_short(string):
    """Return True when ``string`` has fewer than two characters or consists only of digits."""
    if len(string) < 2:
        return True
    return string.isdigit()
    
    
def print_stats(data_list):
    """Print summary statistics of a Series whose values are lists of matches.

    Reports how many lists are empty (absolute and as a percentage of all
    entries) and the mean, median and maximum list length.
    """
    match_counts = data_list.apply(len)
    empty = int((match_counts == 0).sum())

    print(f"Number of empty: {empty} ({empty / len(data_list) * 100} %)")
    print(f"Mean from number of matches: {match_counts.mean()}")
    print(f"Median from number of matches: {match_counts.median()}")
    print(f"Maximal of matches: {match_counts.max()}")
    
    
def from_string_to_list(string):
    """Parse the text form of a list of tuples (as written to the cache CSV) back into a list.

    The string is first parsed as a Python literal, which handles names that
    contain commas, quotes or parentheses correctly. When it is not a valid
    literal list of tuples (e.g. a fragment produced by ``from_string_to_dict``
    or a string with unquoted names), the original split-based parser is used
    as a fallback.

    Returns a list of tuples; an empty list for ``"[]"`` or an empty string.
    """
    try:
        parsed = ast.literal_eval(string)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None

    if isinstance(parsed, list) and all(isinstance(item, tuple) for item in parsed):
        return parsed

    # Legacy fallback: split on the tuple separator, then on the quote-comma
    # between the fields, and strip quotes, spaces and backslashes.
    result = []
    for chunk in string.strip("[]()").split("), ("):
        if len(chunk) == 0:
            continue
        result.append(tuple(field.strip("'\" \\") for field in chunk.split("', ")))

    return result


def from_string_to_tuple(string):
    """Parse the text form of a ``(code, name, description)`` tuple.

    Returns ``None`` for the placeholder ``"N/A"``. The string is first parsed
    as a Python literal so that names and descriptions containing ``", "`` stay
    intact; when that does not yield a 3-tuple, the original split-based parser
    is used, which treats the first two comma-separated pieces as code and name
    and joins the rest into the description.
    """
    if string == "N/A":
        return None

    try:
        parsed = ast.literal_eval(string)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        parsed = None
    if isinstance(parsed, tuple) and len(parsed) == 3:
        return parsed

    # Legacy fallback for strings that are not a clean literal.
    result = [i.strip("\\\"'()") for i in string.strip("\\\" )('").split(", ")]
    return (result[0], result[1], ", ".join(result[2:]))


def from_string_to_dict(string):
    """Parse the text form of a dict that maps a key to a list of tuples.

    Entries are separated on ``"], "`` and each entry is divided into key and
    list on ``": ["``; the list part is parsed with ``from_string_to_list``.
    An entry that does not split into exactly two parts fails the assertion.
    """
    parsed = {}
    body = string.strip("{} ")
    for entry in body.split("], "):
        if entry == "":
            continue
        parts = list(entry.split(": ["))
        assert len(parts) == 2
        key, raw_list = parts
        parsed[key.strip("\"\' \\")] = from_string_to_list(raw_list)

    return parsed


def from_string_to_dict_to_tuple(string):
    """Parse the text form of a dict that maps a key to a tuple or to ``None``.

    Tuple values are parsed with ``from_string_to_tuple``; entries whose value
    is ``None`` map to ``None``.
    """
    result = {}
    # Two entries are separated by the end of a value (")" or " None"), a comma
    # and the opening quote of the next key. The pattern has capturing groups,
    # so re.split also returns the captured fragments; they are skipped below.
    # Raw literal: the escaped parenthesis must reach the regex engine unchanged.
    separator = r'''(\)| None), ('|")'''
    for fragment in re.split(separator, string.strip("{} ")):
        if fragment in ["", ')', ' None', "'", '"']:
            continue
        parts = list(fragment.split(": ("))
        if len(parts) == 1:
            # No tuple follows the key, so the value is None.
            key = parts[0].split(": None")[0].strip(": ")
            result[key.strip("\"\' \\")] = None
        else:
            result[parts[0].strip("\"\' \\")] = from_string_to_tuple(parts[1])

    return result

# Quick check of the parser. Note that the doubled quotes are adjacent string literals,
# which Python concatenates, so the parsed input contains Mee's line without surrounding quotes.
print(from_string_to_list("[('C0240430', ""Mee's line""), ('C0259779', 'Fibrous Dysplasia')]"))

[('C0240430', "Mee's line"), ('C0259779', 'Fibrous Dysplasia')]


In [562]:
# Check that stripping spaces, quotes and backslashes removes the escaped quotes around a word.
"\\\'kalubru\\\'".strip(" \'\\")

'kalubru'

In [9]:
# Read the NIH API key from a local file. Surrounding whitespace is removed so
# that a trailing newline in the file is not sent as part of the key.
with open("APIkeys/NIH", "r") as f:
    NIH_api = f.read().strip()

def mash_search_basic(string):
    """Search the NIH database top-down over all word combinations of ``string``.

    Combinations are tried from the longest to the shortest. All combinations of
    one length are queried with ``mash_search`` (combinations rejected by
    ``filter_short`` are skipped) and the search stops at the first length that
    produced at least one match.

    Returns a list of ``(ui, name)`` tuples, possibly empty.
    """
    splitted_input = string.split(" ")
    result = []
    for size in range(len(splitted_input), 0, -1):
        for words in combinations(splitted_input, size):
            query = " ".join(words)
            if filter_short(query):
                continue
            # mash_search reports request errors itself and returns [] for them.
            result.extend(mash_search(query))

        if len(result) != 0:
            break

    return result
        
    
def search_with_inclusion(string, func, output_state = False):
    """Search bottom-up over the word combinations of ``string``.

    Starting with single words, every combination of the current length is
    passed to ``func`` (a callable that takes a query string and returns a list
    of matches). Only words that took part in at least one successful
    combination are kept for the next, longer round; the words keep their order
    from the text. The matches of the longest round that still had a successful
    combination are returned, interleaved by their rank within each result list.

    Queries rejected by ``filter_short`` are still searched (so that the word
    can take part in longer combinations) but their matches are not collected.

    When ``output_state`` is true, the global ``count`` is incremented and
    printed every 100 calls; it must be initialised by the caller.
    """
    global count
    if output_state:
        count += 1
        if count % 100 == 0:
            print(count)

    candidate_words = string.split(" ")
    last_result = []

    for size in range(1, len(candidate_words) + 1):
        # Keyed by word, in text order (a dict keeps insertion order). The flag
        # records whether the word occurred in a successful combination.
        matched = dict.fromkeys(candidate_words, False)
        level_result = []

        for words in combinations(candidate_words, size):
            query = " ".join(words)
            data = func(query)
            if len(data) == 0:
                continue
            for word in words:
                matched[word] = True
            if filter_short(query):
                continue
            level_result.append(data)

        candidate_words = [word for word, hit in matched.items() if hit]
        if len(candidate_words) == 0:
            break
        last_result = level_result

    # Interleave the result lists: all first matches, then all second matches, ...
    ranked = []
    for hits in last_result:
        ranked += list(enumerate(hits))
    return [hit for (_, hit) in sorted(ranked)]
    
    
def mash_search(string):
    """Query the NIH search endpoint for ``string``.

    Returns a list of ``(ui, name)`` tuples taken from the response. When the
    request does not return HTTP 200, the status code and body are printed and
    an empty list is returned.
    """
    path = 'https://uts-ws.nlm.nih.gov/rest/search/current'
    query = {
             'string': string,
             'apiKey':NIH_api,
    }
    res = requests.get(path, params=query)

    if res.status_code != 200:
        print(res.status_code, res.text)
        return []

    data = res.json()
    return [(item["ui"], item["name"]) for item in data["result"]["results"]]
    

def search_db(db, func):
    """Return a copy of ``db`` with a ``search`` column holding ``func(text)`` for every row."""
    return db.assign(search=db["text"].apply(func))
    
    
def search_db_mash(db):
    """Return a copy of ``db`` whose ``search`` column holds the ``mash_search_basic`` result of each text."""
    return search_db(db, mash_search_basic)


def search_db_mash_optimized(db):
    """Return a copy of ``db`` whose ``search`` column holds the bottom-up NIH search result of each text."""
    def lookup(text):
        return search_with_inclusion(text, mash_search)

    return search_db(db, lookup)


In [10]:
# Reuse the cached NIH search results when they exist; otherwise run the (slow)
# search over the cleaned data and cache the result.
if os.path.isfile("saved_search/data_mash.csv"):
    # The saved index comes back as the column "Unnamed: 0"; restore it as the index.
    data_mash = pd.read_csv("saved_search/data_mash.csv").set_index("Unnamed: 0")
    data_mash["search"] = data_mash["search"].apply(from_string_to_list)
else:
    # The search has to start from `data`: `data_mash` does not exist yet in this branch.
    data_mash = search_db_mash_optimized(data)
    os.makedirs("saved_search", exist_ok=True)
    data_mash.to_csv("saved_search/data_mash.csv")
    
print_stats(data_mash["search"])

Number of empty: 298 (10.631466286122013 %)
Mean from number of matches: 21.931858722797003
Median from number of matches: 25.0
Maximal of matches: 138


#### Not assigned
If we look at a random sample of 10 texts that were not assigned, we can see that five of them contain a typographical mistake (*"nejsou zn.plicní hpertenze"* = *"nejsou zn. plicní hypertenze"*, *"kumulce a nehomogenity"* = *"kumulace a nehomogenita"*, *"ceriucal"* = *"cerucal"*, *"paitace"* = *"palpitace"*, *"mamily klidné"* = ?). The other five are correct medical terms, but in a non-typical grammatical form.

If we correct them by hand, 5 of the 10 texts get a match (a 50 % improvement).

In [11]:
# empty_sample = data_mash[data_mash["search"].apply(lambda x: len(x) == 0)].sample(10, random_state=42)

# Because of an earlier mistake (a worse version of clean_db), the line above generates a
# different sample than the one I worked with, so the sample is selected by hand here.
empty_sample = data_mash.loc[[878, 91, 5240, 3728, 1125, 2479, 4981, 1134, 5089, 1129]].copy()

# Manually corrected texts, keyed by row label.
corrected_texts = {
    878: "nejsou zn. plicní hypertenze",
    91: "hormostenické",
    5240: "kumulace a nehomogenita",
    3728: "biopsie",
    1125: "chemobioterapie",
    2479: "dysmorfické",
    4981: "anikterické",
    1134: "cerucal",
    5089: "palpitace",
    1129: "mamily klidné",
}
# Write with a single .loc call per cell; chained indexing may assign to a temporary copy.
for row_label, corrected in corrected_texts.items():
    empty_sample.loc[row_label, "text"] = corrected

empty_sample = search_db_mash_optimized(empty_sample)
empty_sample

Unnamed: 0_level_0,text,search,label
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
878,nejsou zn. plicní hypertenze,"[(C0020542, Pulmonary Hypertension), (C0152171...",NE symptom
91,hormostenické,[],NE symptom
5240,kumulace a nehomogenita,[],symptom
3728,biopsie,"[(C0005558, Biopsy), (C0220797, biopsy charact...",procedura
1125,chemobioterapie,[],procedura
2479,dysmorfické,"[(C0005887, Body Dysmorphic Disorders)]",symptom
4981,anikterické,[],NE symptom
1134,cerucal,"[(C0701450, Cerucal)]",medikace
5089,palpitace,"[(C0030252, Palpitations), (C0549267, Palpitat...",NE symptom
1129,mamily klidné,[],NE symptom


There is one mistake which we can correct automatically: a missing space after a punctuation mark. We can see that with a space after the punctuation mark the search finds something, while without it the search finds nothing.

In [12]:
# Compare the number of matches for the same text with and without a space after the full stop.
print(len(search_with_inclusion("zn. plicní", mash_search)))
print(len(search_with_inclusion("zn.plicní", mash_search)))

50
0


We can see that there are around 150 examples of this mistake.

In [13]:
def is_space_after_punc(string):
    """Return False when some punctuation mark is directly followed by text.

    A mark from ``. , ! ? : ; +`` is fine when it is the last character or is
    followed by a space or by another of these marks.
    """
    marks = [".", ",", "!", "?", ":", ";", "+"]
    for current, following in zip(string, string[1:]):
        if current in marks and following != " " and following not in marks:
            return False

    return True

def insert_space_after_punc(string):
    """Insert a space after every punctuation mark that is directly followed by text.

    A mark from ``. , ! ? : ; +`` gets a space appended unless it is the last
    character or is already followed by a space or by another of these marks.
    The whole string is processed, however many spaces are inserted.
    """
    marks = ".,!?:;+"
    pieces = []
    for position, char in enumerate(string):
        pieces.append(char)
        if char not in marks or position + 1 == len(string):
            continue
        following = string[position + 1]
        if following != " " and following not in marks:
            pieces.append(" ")

    return "".join(pieces)

# Copy of the cleaned data in which every text has the missing spaces inserted.
inserted_space_data = data.copy()
inserted_space_data["text"] = inserted_space_data["text"].apply(insert_space_after_punc)
# Rows of the cleaned data whose text lacks a space after some punctuation mark.
no_space = data[~data["text"].apply(is_space_after_punc)]
len(no_space)

155

In [14]:
# Reuse the cached results when they exist; otherwise repeat the NIH search only
# for the texts that were changed by inserting spaces and cache the result.
if os.path.isfile("saved_search/data_mash_inserted_space.csv"):
    inserted_space_data_mash = pd.read_csv("saved_search/data_mash_inserted_space.csv").set_index("Unnamed: 0")
    inserted_space_data_mash["search"] = inserted_space_data_mash["search"].apply(from_string_to_list)
else:
    inserted_space_data_mash = data_mash.copy()
    # Rebind instead of writing into `no_space`, which is a selection of `data`.
    no_space = no_space.assign(text=no_space["text"].apply(insert_space_after_punc))
    for i in no_space.index:
        fixed_text = no_space.at[i, "text"]
        # .at writes one cell in place (the value here is a list); chained
        # indexing may write to a temporary copy instead.
        inserted_space_data_mash.at[i, "search"] = search_with_inclusion(fixed_text, mash_search)
        inserted_space_data_mash.at[i, "text"] = fixed_text
    os.makedirs("saved_search", exist_ok=True)
    inserted_space_data_mash.to_csv("saved_search/data_mash_inserted_space.csv")

print_stats(data_mash["search"])
print()
print_stats(inserted_space_data_mash["search"])

Number of empty: 298 (10.631466286122013 %)
Mean from number of matches: 21.931858722797003
Median from number of matches: 25.0
Maximal of matches: 138

Number of empty: 256 (9.133071708883339 %)
Mean from number of matches: 22.350338922582946
Median from number of matches: 25.0
Maximal of matches: 138


Thanks to this improvement, 42 previously unmatched texts now have a match (the number of empty results dropped from 298 to 256).

### Linking to CZ Mash through Medvik 

Now we try to link through the Czech MeSH, which I downloaded from NLK (Národní lékařská knihovna): https://nlk.cz/pro-knihovny/data/#mesh-cz

I call this linking Medvik, because there is a web service called Medvik: https://www.medvik.cz/bmc/subject.do, where you can search the Czech MeSH.

I reused the already improved methods from the MeSH search. First I experimented with a search that tests whether an entry contains the given text.

In [15]:
# Parse the downloaded XML file once; `content` is its root element, which the Medvik searches iterate over.
content = elt.parse('databaze/MeSH2023_Marc21_Alma.xml').getroot()

In [16]:
def patternize(string):
    """Return ``string`` with a backslash placed before each of the characters ``<([{\\^-=$!|]})?*+.>``.

    The result can be embedded in a regular expression as literal text.
    """
    special = '<([{\\^-=$!|]})?*+.>]'
    return "".join("\\" + char if char in special else char for char in string)


def medvik_search(string, init_pattern, last_pattern):
    """Search the records of the global ``content`` for ``string``.

    The text is escaped with ``patternize``, wrapped between ``init_pattern`` and
    ``last_pattern`` and matched case-insensitively, from the start of the
    value, against the text of every ``subfield`` element of each record.
    A record is reported at most once, as ``(code, name)``: the text of its
    ``001`` control field and the text of the first child of its ``150`` data
    field. Matching records that lack either of them are skipped.
    """
    marc = "{http://www.loc.gov/MARC21/slim}"
    pattern = re.compile(f"{init_pattern}{patternize(string)}{last_pattern}",re.IGNORECASE)
    result = []

    for record in content:
        for subfield in record.iter(marc + "subfield"):
            if not subfield.text or pattern.match(subfield.text) is None:
                continue
            # The first matching subfield decides the record; stop scanning it either way.
            ids = [f for f in record.findall(marc + "controlfield") if f.attrib["tag"] == "001"]
            if ids:
                headings = [f for f in record.findall(marc + "datafield") if f.attrib["tag"] == "150"]
                if headings and len(headings[0]) > 0:
                    result.append((ids[0].text, headings[0][0].text))
            break
    return result


def medvik_search_match(string):
    """Contains search: a field matches when ``string`` occurs anywhere in it."""
    return medvik_search(string, init_pattern=".*", last_pattern=".*")


def medvik_search_words(string):
    """Word search: a field matches when ``string`` occurs in it with a space on both sides."""
    return medvik_search(string, init_pattern=".* ", last_pattern=" .*")


def medvik_search_exact(string):
    """Exact search: a field matches when it consists of ``string`` only."""
    return medvik_search(string, init_pattern="^", last_pattern="$")


def medvik_search_combined(string):
    """Try the exact, word and contains searches in this order and return the first non-empty result.

    An empty list is returned when none of them finds anything.
    """
    strategies = (("^", "$"), (".* ", " .*"), (".*", ".*"))
    result = []
    for init_pattern, last_pattern in strategies:
        result = medvik_search(string, init_pattern, last_pattern)
        if len(result) != 0:
            break

    return result


def search_db_medvik_match(db):
    """Bottom-up contains search of every text; the global ``count`` must be initialised first."""
    def lookup(text):
        return search_with_inclusion(text, medvik_search_match, output_state=True)

    return search_db(db, lookup)


def search_db_medvik_exact(db):
    """Bottom-up exact search of every text; the global ``count`` must be initialised first."""
    def lookup(text):
        return search_with_inclusion(text, medvik_search_exact, output_state=True)

    return search_db(db, lookup)


def search_db_medvik_words(db):
    """Bottom-up word search of every text; the global ``count`` must be initialised first."""
    def lookup(text):
        return search_with_inclusion(text, medvik_search_words, output_state=True)

    return search_db(db, lookup)


def search_db_medvik_combined(db):
    """Bottom-up combined search of every text; the global ``count`` must be initialised first."""
    def lookup(text):
        return search_with_inclusion(text, medvik_search_combined, output_state=True)

    return search_db(db, lookup)

In [17]:
# Reuse the cached contains-search results when they exist; otherwise run the search and cache it.
if os.path.isfile("saved_search/data_medvik_contains.csv"):
    data_medvik_contains = pd.read_csv("saved_search/data_medvik_contains.csv")
    # The saved index comes back as the column "Unnamed: 0"; restore it as the index.
    data_medvik_contains.index = data_medvik_contains["Unnamed: 0"]
    data_medvik_contains.drop(["Unnamed: 0"], axis=1, inplace=True)
    data_medvik_contains["search"] = data_medvik_contains["search"].apply(from_string_to_list)
else:
    # Progress counter used by search_with_inclusion(output_state=True).
    count = 0
    data_medvik_contains = search_db_medvik_match(inserted_space_data)
    data_medvik_contains.to_csv("saved_search/data_medvik_contains.csv")
    
# Compare the NIH search with the Medvik contains search.
print_stats(inserted_space_data_mash["search"])
print()
print_stats(data_medvik_contains["search"])

Number of empty: 256 (9.133071708883339 %)
Mean from number of matches: 22.350338922582946
Median from number of matches: 25.0
Maximal of matches: 138

Number of empty: 270 (9.632536567962898 %)
Mean from number of matches: 1670.0117731002497
Median from number of matches: 28.0
Maximal of matches: 43684


In [18]:
# Number of matches for a fixed random sample of 10 texts.
print([i for i in data_medvik_contains["search"].apply(lambda x: len(x)).sample(10, random_state=42)])

# How many texts received more than 100 matches.
temp = data_medvik_contains["search"].apply(lambda x: len(x))
print(f"number of searches longer than 100 matches in data_medvik_contains: {len(temp[temp > 100])}")

[1904, 1720, 0, 27715, 1722, 94, 9, 10724, 0, 112]
number of searches longer than 100 matches in data_medvik_contains: 1017


We can see that for some examples this works well. But for others we get a really large number of matches, and the length of the result list grows exponentially.

For this reason it might be better to use a different matching method instead:
- The first method is contains search (used above) - tests whether an entry contains the given text.
- The next method is word search - tests whether an entry contains the given text as a word (with spaces around it).
- The next method is exact search - tests whether an entry is exactly the given text.
- The last method is combined search - first tries exact, then word, then contains (and stops at the first one that succeeds).

In [19]:
# Fixed random sample of 20 texts used to compare the four Medvik matching methods.
test_data = inserted_space_data.sample(20, random_state=42)

# Reuse the cached comparison when it exists (this replaces the sample drawn above);
# otherwise run all four searches on the sample and cache them.
if os.path.isfile("saved_search/test_medvik.csv"):
    test_data = pd.read_csv("saved_search/test_medvik.csv")
    test_data["search_match"] = test_data["search_match"].apply(from_string_to_list)
    test_data["search_exact"] = test_data["search_exact"].apply(from_string_to_list)
    test_data["search_words"] = test_data["search_words"].apply(from_string_to_list)
    test_data["search_combined"] = test_data["search_combined"].apply(from_string_to_list)
else:
    # Progress counter used by search_with_inclusion(output_state=True); shared by all four searches.
    count = 0
    test_data["search_match"] = search_db_medvik_match(test_data)["search"]
    test_data["search_exact"] = search_db_medvik_exact(test_data)["search"]
    test_data["search_words"] = search_db_medvik_words(test_data)["search"]
    test_data["search_combined"] = search_db_medvik_combined(test_data)["search"]
    test_data.to_csv("saved_search/test_medvik.csv")
    
print("Contains match")
print_stats(test_data["search_match"])
print("\nWords match")
print_stats(test_data["search_words"])
print("\nExact match")
print_stats(test_data["search_exact"])
print("\nCombined match")
print_stats(test_data["search_combined"])

Contains match
Number of empty: 3 (15.0 %)
Mean from number of matches: 2265.2
Median from number of matches: 15.5
Maximal of matches: 27715

Words match
Number of empty: 6 (30.0 %)
Mean from number of matches: 194.0
Median from number of matches: 5.5
Maximal of matches: 1918

Exact match
Number of empty: 11 (55.00000000000001 %)
Mean from number of matches: 3.05
Median from number of matches: 0.0
Maximal of matches: 22

Combined match
Number of empty: 3 (15.0 %)
Mean from number of matches: 261.95
Median from number of matches: 6.5
Maximal of matches: 1904


We can see that with exact match we get rid of the long match lists, but it has quite a low success rate. Words match is somewhere in the middle (not good in either respect).

As the last option we used combined match (first try exact; if it does not succeed, try words; then contains). This seems to be the best method: it does not create overly large lists and it has the same number of empty results as contains match.

In [20]:
# Reuse the cached combined-search results when they exist; otherwise run the search and cache it.
if os.path.isfile("saved_search/data_medvik_combined.csv"):
    data_medvik_combined = pd.read_csv("saved_search/data_medvik_combined.csv")
    # The saved index comes back as the column "Unnamed: 0"; restore it as the index.
    data_medvik_combined.index = data_medvik_combined["Unnamed: 0"]
    data_medvik_combined.drop(["Unnamed: 0"], axis=1, inplace=True)
    data_medvik_combined["search"] = data_medvik_combined["search"].apply(from_string_to_list)
else:
    # Progress counter used by search_with_inclusion(output_state=True).
    count = 0
    data_medvik_combined = search_db_medvik_combined(inserted_space_data)
    data_medvik_combined.to_csv("saved_search/data_medvik_combined.csv")
       
print_stats(data_medvik_combined["search"])

Number of empty: 270 (9.632536567962898 %)
Mean from number of matches: 488.3999286478773
Median from number of matches: 7.0
Maximal of matches: 28651


#### Duplicates
It is possible to get duplicates in the list of matches when the same match is found through two different words of the text (or through two combinations of the same length).

In the MeSH search there are only a few duplicates, but in the Medvik search this can be a serious problem - we can see that the maximum number of matches in the contains search is reduced by nearly 15 000.

In [21]:
# Number of texts whose NIH match list contains at least one repeated entry.
len(inserted_space_data_mash[~inserted_space_data_mash["search"].apply(lambda x: len(set(x)) == len(x))])

5

In [22]:
def remove_dup_preserve_order(l):
    """Return the items of ``l`` without repetitions, keeping the first occurrence of each in its original position.

    The items must be hashable.
    """
    # A dict keeps its keys unique and in insertion order.
    return list(dict.fromkeys(l))

print("Before removing duplicates:")
print("Mash search")
print_stats(inserted_space_data_mash["search"])
print("\nMedvik contains search")
print_stats(data_medvik_contains["search"])
print("\nMedvik combined search")
print_stats(data_medvik_combined["search"])

# NOTE: the three result frames are overwritten here, so the outputs displayed
# by earlier cells describe the state before de-duplication.
inserted_space_data_mash["search"] = inserted_space_data_mash["search"].apply(remove_dup_preserve_order)
data_medvik_contains["search"] = data_medvik_contains["search"].apply(remove_dup_preserve_order)
data_medvik_combined["search"] = data_medvik_combined["search"].apply(remove_dup_preserve_order)

print("\n\nAfter removing duplicates:")
print("Mash search")
print_stats(inserted_space_data_mash["search"])
print("\nMedvik contains search")
print_stats(data_medvik_contains["search"])
print("\nMedvik combined search")
print_stats(data_medvik_combined["search"])

Before removing duplicates:
Mash search
Number of empty: 256 (9.133071708883339 %)
Mean from number of matches: 22.350338922582946
Median from number of matches: 25.0
Maximal of matches: 138

Medvik contains search
Number of empty: 270 (9.632536567962898 %)
Mean from number of matches: 1670.0117731002497
Median from number of matches: 28.0
Maximal of matches: 43684

Medvik combined search
Number of empty: 270 (9.632536567962898 %)
Mean from number of matches: 488.3999286478773
Median from number of matches: 7.0
Maximal of matches: 28651


After removing duplicates:
Mash search
Number of empty: 256 (9.133071708883339 %)
Mean from number of matches: 22.33927934356047
Median from number of matches: 25.0
Maximal of matches: 138

Medvik contains search
Number of empty: 270 (9.632536567962898 %)
Mean from number of matches: 1625.7213699607564
Median from number of matches: 28.0
Maximal of matches: 29327

Medvik combined search
Number of empty: 270 (9.632536567962898 %)
Mean from number of ma

## Choosing best match with Chat GPT
The idea behind our model is to first link the term to a database and then let a pretrained language model choose the best candidate.

I use GPT-3.5 because it is free to access within some limits (there is a limit on the number of requests per day and a limited number of requests per account; after that we have to pay), it is fast and it is well known.

The message to GPT is in this format:

Který z nadcházejících popisů medicínských pojmů nejlépe popisuje text: "[MEDICAL TERM]":

    1. [DESCRIPTION_N.1] (pojem: [TERM_N.1])
    2. [DESCRIPTION_N.2] (pojem: [TERM_N.2])
    ...
    
Jako odpověď mi pošli pouze číslo odpovědi. Pokud to nebude žádná z možností, pak odpověz NONE.

In [23]:
from openai import OpenAI
# Read the NIH API key again for this part of the notebook. Surrounding
# whitespace is removed so that a trailing newline is not sent as part of the key.
with open("APIkeys/NIH", "r") as f:
    NIH_api = f.read().strip()

def find_by_code_medvik(string):
    """Return the description text stored for the record with code ``string``.

    The records of the global ``content`` are scanned for one whose ``001``
    control field equals ``string``; the text of the first ``subfield`` found
    inside its first ``680`` data field is returned. An empty string is
    returned when ``string`` is empty, when no such record or field exists, or
    when the field carries no text.
    """
    if len(string) == 0:
        return ""

    marc = "{http://www.loc.gov/MARC21/slim}"
    for record in content:
        ids = [f for f in record.findall(marc + "controlfield") if f.attrib["tag"] == "001"]
        if not ids or ids[0].text != string:
            continue

        notes = [f for f in record.findall(marc + "datafield") if f.attrib["tag"] == "680"]
        if not notes:
            # No 680 field in this record; keep scanning the remaining records.
            continue

        first_subfield = next(notes[0].iter(marc + "subfield"), None)
        if first_subfield is None or first_subfield.text is None:
            # Callers concatenate the result with other strings, so never return None.
            return ""
        return first_subfield.text

    return ""


def find_by_code_mash(string):
    """Return a description of the concept with identifier ``string`` from the NIH API.

    The concept is requested first. When its ``definitions`` entry is a link to
    the API, the linked definitions are requested and the value of the first
    one whose ``rootSource`` is ``MSHCZE`` is returned, otherwise the first one
    from ``MSH``. When no definition is available, the concept name is returned
    instead. An empty string is returned when the request fails or the response
    cannot be interpreted; such problems are printed.
    """
    path = f'https://uts-ws.nlm.nih.gov/rest/content/current/CUI/{string}'
    query = {
             'apiKey':NIH_api,
    }
    res = requests.get(path, params=query)

    if res.status_code != 200:
        print(string)
        print(res.status_code, res.text)
        return ""

    concept = None
    try:
        concept = json.loads(res.text)["result"]
        definition = concept["definitions"]

        if isinstance(definition, str) and definition.startswith("https://uts-ws.nlm.nih.gov/"):
            res = requests.get(definition, params=query)
            entries = json.loads(res.text)["result"]
            # Prefer the MSHCZE definition, then the MSH one.
            for source in ("MSHCZE", "MSH"):
                for entry in entries:
                    if entry["rootSource"] == source:
                        return entry["value"]
            definition = "NONE"
    except Exception:
        definition = "NONE"
        print(string, res.text)

    if definition == "NONE" or not isinstance(definition, str):
        # Fall back to the concept name; it may be unavailable when parsing failed.
        if isinstance(concept, dict) and "name" in concept:
            return concept["name"]
        return ""

    return definition

In [24]:
def send_to_chat(message):
    """Send ``message`` as a single user message to ``gpt-3.5-turbo`` and return the completion object.

    The API key is read from the file ``APIkeys/chatGTP`` on every call;
    surrounding whitespace (such as a trailing newline) is removed from it.
    """
    with open("APIkeys/chatGTP", "r") as f:
        chatgpt_api = f.read().strip()

    client = OpenAI(api_key=chatgpt_api)

    return client.chat.completions.create(
        model="gpt-3.5-turbo",

        messages=[{"role": "user", "content": message}],
        stream=False,
    )


def create_message_chatGTP(string, li):
    """Build the prompt that asks which of the options in ``li`` best describes ``string``.

    The options are numbered from 1, one per line, between a fixed question and
    a fixed closing instruction.
    """
    question = f"Který z nadcházejících popisů medicínských pojmů nejlépe popisuje text: \"{string}\":\n"
    options = "".join(f"{number}. {option}\n" for number, option in enumerate(li, start=1))
    instruction = "Jako odpověď mi pošli pouze číslo odpovědi. Pokud to nebude žádná z možností, pak odpověz NONE."
    return question + options + instruction


def message_chatGPT(string, li, find):
    """Build the prompt for ``string`` from the candidate list ``li``.

    Each candidate is a ``(code, name)`` pair; its option text is the
    description returned by ``find(code)`` followed by the name.
    """
    described = []
    for candidate in li:
        description = find(candidate[0])
        described.append(description + f" (pojem: {candidate[1]})")
    return create_message_chatGTP(string, described)


def from_chatGPT(result, li, find):
    """Translate a ChatGPT answer back into the chosen candidate.

    The answer text is expected to start with the 1-based number of the
    chosen option (optionally followed by ``.`` and more text).

    Parameters
    ----------
    result :
        Response object; the text is read from
        ``result.choices[0].message.content``.
    li : sequence
        The candidates that were offered, in the order they were numbered.
    find : callable
        Called with ``candidate[0]`` of the chosen candidate.

    Returns
    -------
    tuple or None
        ``(candidate[0], candidate[1], find(candidate[0]))`` for a valid
        choice. ``None`` when the answer has no text, is not a number
        (e.g. ``NONE``), or lies outside ``1..len(li)``.
    """
    try:
        answer = result.choices[0].message.content
        if answer is None:
            return None
        position = int(answer.split(".")[0].strip(" ")) - 1
        # Reject 0 and negative answers explicitly: a negative index would
        # otherwise silently select a candidate from the end of the list.
        if not 0 <= position < len(li):
            return None
        chosen = li[position]
        return (chosen[0], chosen[1], find(chosen[0]))
    except (ValueError, IndexError):
        return None

Because ChatGPT has problems with long messages (and the medvik_combined search sometimes returns very long result lists), we have to limit the message length: entries whose candidate list is too long are skipped.

In [28]:
# Load the cached ChatGPT explanations, or compute them for a 100-row sample.
# NOTE: the compute branch needs `results`, which this notebook builds in a
# later cell ("Results for Linking"); on a fresh kernel run that cell first.

# Candidate lists longer than this are not sent to ChatGPT.
MAX_MEDVIK_CANDIDATES = 50

if os.path.isfile("saved_search/explanation_sample.csv"):
    explanation = pd.read_csv("saved_search/explanation_sample.csv").set_index("Unnamed: 0")
    for column in ("mash_explanation", "medvik_explanation"):
        explanation[column] = explanation[column].fillna(value="N/A").apply(from_string_to_tuple)
    for column in ("mash_search", "medvik_search_combined"):
        explanation[column] = explanation[column].apply(from_string_to_list)
else:
    explanation = results.sample(100, random_state=38)
    mash_explanations = []
    medvik_explanations = []
    for text, mash_candidates, medvik_candidates in zip(
        explanation["text"], explanation["mash_search"], explanation["medvik_search_combined"]
    ):
        message = message_chatGPT(text, mash_candidates, find_by_code_mash)
        response = send_to_chat(message)
        mash_explanations.append(from_chatGPT(response, mash_candidates, find_by_code_mash))

        # If the candidate list is too long we cannot send it to ChatGPT, so
        # such rows are skipped and stay unassigned (None).
        if len(medvik_candidates) > MAX_MEDVIK_CANDIDATES:
            medvik_explanations.append(None)
            continue
        message = message_chatGPT(text, medvik_candidates, find_by_code_medvik)
        response = send_to_chat(message)
        medvik_explanations.append(from_chatGPT(response, medvik_candidates, find_by_code_medvik))

    # Assign whole columns at once instead of chained `df[col][j] = ...`
    # writes; dtype=object keeps each tuple as a single cell value.
    explanation["mash_explanation"] = pd.Series(mash_explanations, index=explanation.index, dtype=object)
    explanation["medvik_explanation"] = pd.Series(medvik_explanations, index=explanation.index, dtype=object)
    explanation.to_csv("saved_search/explanation_sample.csv")

In [30]:
# Preview the first rows of the explanation sample.
explanation.head()

Unnamed: 0_level_0,label,text,mash_search,medvik_search_combined,mash_explanation,medvik_explanation
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1633,NE symptom,KI100,[],[],,
5536,procedura,st. p. tru-cut biopsii,"[(C1170898, Companion P/ST 1000ML), (C4015802,...","[(D000039, peritonzilární absces), (D000081182...","(C5700874, Percutaneous pulmonary artery revas...",
412,osobní anamnéza,konizace čípku,"[(C0195324, Conization)]","[(D002583, nádory děložního čípku), (D019092, ...","(C0195324, Conization, Kruhovité kuželovité vy...","(D019092, konizace děložního čípku, Kruhovité ..."
4517,procedura,operace: ITP,"[(C0398650, Immune thrombocytopenic purpura), ...","[(D007293, inosintrifosfát)]","(C3842543, Idiopathic thrombocytopenia (ITP), ...",
5757,symptom,neostře konturované ložisko 7x5mm,"[(C0241148, Cutaneous plaque), (C1533591, Calc...","[(D001253, astrocyty), (D002833, choroiditida)...","(C0235456, Thyroid nodular, Thyroid nodular)","(D002833, choroiditida, Zánět cévnatky, zadní ..."


In [31]:
# Number of sampled rows whose medvik candidate list has more than 50 entries.
len(explanation[explanation["medvik_search_combined"].apply(lambda x: len(x) > 50)])

22

## Conclusion: Basic Approach
Here I evaluate the basic approach to linking entities.

### Results for Linking
Now we look at how successful the linking to the databases has been for the different labels.

In [297]:
# Combine the cleaned entities with the candidate lists of both searches.
results = inserted_space_data.assign(
    mash_search=inserted_space_data_mash["search"],
    medvik_search_combined=data_medvik_combined["search"],
)
results.head(3)

Unnamed: 0,label,text,mash_search,medvik_search_combined
0,symptom,jemný fibrózní proužek,"[(C0030848, Peyronie Disease), (C0227365, Taen...","[(D000077275, fibrózní dysplazie kraniofaciáln..."
1,procedura,neoadjuvantní CHT,"[(C0600558, Neoadjuvant Therapy), (C1422359, S...","[(D000014, abnormality vyvolané léky), (D00313..."
2,medikace,novalgin,"[(C0917937, Novalgin)]","[(D004177, metamizol)]"


In [306]:
# One sub-frame of `results` per label.
os_anamneza, ne_os_anamneza, medikace, ne_medikace, symptom, ne_symptom, procedura = (
    results[results.label == label_name]
    for label_name in (
        "osobní anamnéza",
        "NE osobní anamnéza",
        "medikace",
        "NE medikace",
        "symptom",
        "NE symptom",
        "procedura",
    )
)

# (heading, frame) pairs in report order; the headings carry their own
# leading newlines so the printed layout stays unchanged.
report_sections = [
    ("osobní anamnéza:", os_anamneza),
    ("\nNE osobní anamnéza:", ne_os_anamneza),
    ("\nmedikace:", medikace),
    ("\nNE medikace:", ne_medikace),
    ("\nsymptom:", symptom),
    ("\nNE symptom:", ne_symptom),
    ("\nprocedura:", procedura),
    ("\n\nall:", results),
]
for heading, subset in report_sections:
    print(heading)
    print_stats(subset["mash_search"])
    print_stats(subset["medvik_search_combined"])

# Rows for which neither search returned any candidate.
no_mash_match = results["mash_search"].apply(lambda found: len(found) == 0)
no_medvik_match = results["medvik_search_combined"].apply(lambda found: len(found) == 0)
print("Number of empty in both search:", len(results[no_mash_match & no_medvik_match]))

osobní anamnéza:
Number of empty: 11 (5.14018691588785 %)
Mean from number of matches: 22.939252336448597
Median from number of matches: 24.0
Maximal of matches: 86
Number of empty: 18 (8.411214953271028 %)
Mean from number of matches: 459.1588785046729
Median from number of matches: 11.5
Maximal of matches: 13185

NE osobní anamnéza:
Number of empty: 4 (6.557377049180328 %)
Mean from number of matches: 20.83606557377049
Median from number of matches: 25.0
Maximal of matches: 75
Number of empty: 14 (22.950819672131146 %)
Mean from number of matches: 190.34426229508196
Median from number of matches: 1.0
Maximal of matches: 5777

medikace:
Number of empty: 73 (23.934426229508198 %)
Mean from number of matches: 14.462295081967213
Median from number of matches: 8.0
Maximal of matches: 82
Number of empty: 101 (33.114754098360656 %)
Mean from number of matches: 96.88196721311475
Median from number of matches: 1.0
Maximal of matches: 5134

NE medikace:
Number of empty: 1 (6.25 %)
Mean from nu

We can see that most labels behave quite similarly to each other. One big exception is the label "medikace" (medication in English). This label has a significantly higher rate of empty results, but fewer matches.

I think this is caused by shorter texts (fewer words) whose words are usually more specific (such as names of medicines).

### Results for Mash and Medvik search
Now I am going to evaluate the results of the linking done by GPT.

For this I created a sample of 100 entries, which I sent to GPT to find the best match. From them I picked another sample of 35 entries, which I evaluated manually.

In [322]:
# Size of the sample, rows without any candidate, and rows without an
# assigned explanation (stored as None).
def _is_empty(found):
    return len(found) == 0


def _is_missing(value):
    return value is None


print(f"Number of examples: {len(explanation)}")

print(f"Number of empty Linking for Medvik_combined_search: {explanation['medvik_search_combined'].apply(_is_empty).sum()}")
print(f"Number of empty Linking for Mash_search: {explanation['mash_search'].apply(_is_empty).sum()}")

print(f"Number of not assigned for Medvik: {explanation['medvik_explanation'].apply(_is_missing).sum()}")
print(f"Number of not assigned for Mash: {explanation['mash_explanation'].apply(_is_missing).sum()}")

Number of examples: 100
Number of empty Linking for Medvik_combined_search: 11
Number of empty Linking for Mash_search: 10
Number of not assigned for Medvik: 39
Number of not assigned for Mash: 24


We can see that the number of unassigned entries is high in both searches, and much higher for Medvik. The reason for the higher Medvik number is that we have to drop very long candidate lists. In future approaches it is important to handle these long lists.

In [310]:
# Build the rows to review: for a fixed 35-row subsample, one row per source
# (Medvik / Mash) that produced an explanation, or a single "None" row when
# neither did. Each row is (text, source, "<name> (<definition>)").
asign = []
for j in explanation.sample(35, random_state=42).index:
    medvik_hit = explanation["medvik_explanation"][j]
    mash_hit = explanation["mash_explanation"][j]
    if medvik_hit is not None:
        asign.append((explanation["text"][j], "Medvik", "{} ({})".format(medvik_hit[1], medvik_hit[2])))
    if mash_hit is not None:
        asign.append((explanation["text"][j], "Mash", "{} ({})".format(mash_hit[1], mash_hit[2])))
    if mash_hit is None and medvik_hit is None:
        asign.append((explanation["text"][j], "None", "Empty"))

# Manual verdicts, one per row of `asign`, in row order.
# "" marks a row that received no verdict.
manual_labels = [
    "Wrong", "", "Wrong", "Wrong", "",                          # rows 0-4
    "", "Wrong", "Wrong", "Partially", "",                      # rows 5-9
    "Wrong", "", "Partially", "Wrong", "Wrong",                 # rows 10-14
    "Wrong", "", "Wrong", "Partially", "Right",                 # rows 15-19
    "Right", "Wrong", "Wrong", "Partially", "Wrong",            # rows 20-24
    "Wrong", "Wrong", "Wrong", "Partially", "",                 # rows 25-29
    "Right", "Wrong", "Wrong", "Right", "Right",                # rows 30-34
    "Partially", "Wrong", "Partially", "Right", "Wrong",        # rows 35-39
    "Right", "Wrong", "Partially", "Partially", "Partially",    # rows 40-44
    "Wrong", "Wrong", "Wrong", "Wrong", "Wrong",                # rows 45-49
    "Right",                                                    # row 50
]

# The verdicts are positional, so they are only valid for exactly this sample.
assert len(manual_labels) == len(asign), "manual labels do not match the reviewed rows"
asign = [row + (label,) for row, label in zip(asign, manual_labels)]

assert len(set([i[3] for i in asign])) == 4

In [311]:
# Per-source verdict counts; 35 is the size of the reviewed subsample.
for position, (source, shown_name) in enumerate((("Medvik", "medvik"), ("Mash", "mash"))):
    source_rows = [row for row in asign if row[1] == source]
    lead = "" if position == 0 else "\n"
    print(f"{lead}Not assigned {shown_name}:", 35 - len(source_rows))
    print(f"Mistakes from {shown_name}:", len([row for row in source_rows if row[3] == 'Wrong']))
    print(f"Partially right from {shown_name}:", len([row for row in source_rows if row[3] == 'Partially']))
    print(f"Right from {shown_name}:", len([row for row in source_rows if row[3] == 'Right']))

Not assigned medvik: 15
Mistakes from medvik: 13
Partially right from medvik: 5
Right from medvik: 2

Not assigned mash: 11
Mistakes from mash: 13
Partially right from mash: 5
Right from mash: 6


In [315]:
# Texts judged 'Wrong', per source, and how the two sources overlap.
med = [row[0] for row in asign if row[3] == 'Wrong' and row[1] == "Medvik"]
mash = [row[0] for row in asign if row[3] == 'Wrong' and row[1] == "Mash"]

wrong_in_both = [text for text in med if text in mash]
wrong_only_medvik = [text for text in med if text not in mash]
wrong_only_mash = [text for text in mash if text not in med]

print("text wrong in medvik and mash:", wrong_in_both)

print("\ntext wrong only in medvik", wrong_only_medvik)

print("\ntext wrong only in mash", wrong_only_mash)

text wrong in medvik and mash: ['MG vlevo', 'bez šelestu', 'neostře konturované ložisko 7x5mm', 'bez poruchy kinetiky myokardu', 'norm. velikost srdeč. oddílů', 'jizevnaté změny v ZDK', 'tumorozní ložisko']

text wrong only in medvik ['benigní verifikované ložisko v HKK', 'játra bez solidních patologických ložiskových změn', 'USG pravé mammy a axilly', 'mírný sekund lymfedém pod axilou, v zadní axil. řase', 'apokrinní metaplazií a místy i adenóza', 'mastitis: ne']

text wrong only in mash ['někdy tahv oblasti jizvy', 'subjektivně bez bolestí', 'močový měchýř hypodenzní homogenní náplně', 'bránice hladká', 'fibropleurální změny', 'dlouhodobě stac. nález']


In [316]:
# Texts judged 'Partially', per source, and how the two sources overlap.
med = [row[0] for row in asign if row[3] == 'Partially' and row[1] == "Medvik"]
mash = [row[0] for row in asign if row[3] == 'Partially' and row[1] == "Mash"]

partial_in_both = [text for text in med if text in mash]
partial_only_medvik = [text for text in med if text not in mash]
partial_only_mash = [text for text in mash if text not in med]

print("text partially right in medvik and mash:", partial_in_both)

print("\ntext partially right only in medvik", partial_only_medvik)

print("\ntext partially right only in mash", partial_only_mash)

text partially right in medvik and mash: ['parciální mastektomie vlevo']

text partially right only in medvik ['močový měchýř hypodenzní homogenní náplně', 'bránice hladká', 'zn. krvácení', 'průjmy']

text partially right only in mash ['játra bez solidních patologických ložiskových změn', 'USG pravé mammy a axilly', 'uzliny fyziologické', 'operace: 0']


In [590]:
# Texts judged 'Right', per source, and how the two sources overlap.
med = [row[0] for row in asign if row[3] == 'Right' and row[1] == "Medvik"]
mash = [row[0] for row in asign if row[3] == 'Right' and row[1] == "Mash"]

right_in_both = [text for text in med if text in mash]
right_only_medvik = [text for text in med if text not in mash]
right_only_mash = [text for text in mash if text not in med]

print("text right in medvik and mash:", right_in_both)

print("\ntext right only in medvik", right_only_medvik)

print("\ntext right only in mash", right_only_mash)

text right in medvik and mash: ['parestezie nepozoruje', 'bolesti na hrudi 0', 'axilla volná', 'jaterní testy']

text right only in medvik ['USG (Mamma, Axilla', 'anastrozol', 'jizva v ZHQ zhojena', 'CLEXANE']

text right only in mash ['gynekologické operace']


## Improved Approach

Now I am going to try another approach that tries to address the earlier mistakes. The biggest change is that we no longer keep one list of links for the whole text, but one list for each word (or group of words) of the text, and then try to explain each of them separately.

In [326]:
def search_with_inclusion_modified(string, func):
    """Search word combinations of ``string`` and group the hits by phrase.

    The text is split on single spaces. Combinations of 1, 2, ... words are
    joined with a space and passed to ``func``; a word stays in play for the
    next (larger) round only if at least one combination containing it
    returned a hit. Words that stay in play keep their order from the text,
    so every query is a subsequence of the original text.

    For every word only the hits of its largest matching combinations are
    kept, and the result is keyed by the joined combination.

    Parameters
    ----------
    string : str
        Text to search, words separated by single spaces.
    func : callable
        Search function; called with a query string and expected to return a
        sized collection of items indexable at positions 0 and 1.

    Returns
    -------
    dict
        Maps a phrase to its list of hits. Multi-word phrases map to
        ``(hit[0], hit[1])`` tuples; a word whose best match is the word
        itself keeps ``(hit[0], hit[1], (word,))`` tuples. Words without any
        hit do not appear.
    """
    words_in_play = string.split(" ")
    word_count = len(words_in_play)

    result_dict = {word: [] for word in words_in_play}

    for size in range(1, word_count + 1):
        # Keyed by word, in text order (the original keyed this by the loop
        # counter, which scrambled the word order of later queries).
        matched = dict.fromkeys(words_in_play, False)

        for words in combinations(words_in_play, size):
            data = func(" ".join(words))
            if len(data) == 0:
                continue
            for word in words:
                matched[word] = True
                result_dict[word] += [(i[0], i[1], words) for i in data]

        words_in_play = [word for word, was_hit in matched.items() if was_hit]
        if len(words_in_play) == 0:
            break

    for k, v in result_dict.copy().items():
        # Hits are appended in order of growing combination size, so the last
        # hit belongs to the largest combination that matched for this word.
        longest = [i for i in v if len(i[2]) == len(v[-1][2])]
        result_dict[k] = longest
        pop_key = True
        # dict.fromkeys de-duplicates while keeping first-seen order, which
        # makes the key order of the result deterministic.
        for n in dict.fromkeys(i[2] for i in longest):
            phrase = " ".join(n)
            if phrase == k:
                pop_key = False
            if phrase in result_dict:
                continue
            result_dict[phrase] = [(i[0], i[1]) for i in longest if " ".join(i[2]) == phrase]

        # The single-word key is dropped unless the word itself is its own
        # best match.
        if pop_key:
            result_dict.pop(k)

    return result_dict
        
    
def search_db_mash_optimized_modified(db):
    """Return a copy of ``db`` with a ``search`` column added.

    The new column holds, for every value of ``db["text"]``, the result of
    ``search_with_inclusion_modified(text, mash_search)``. ``db`` itself is
    left unchanged.
    """
    searches = db["text"].apply(lambda text: search_with_inclusion_modified(text, mash_search))
    return db.assign(search=searches)

In [327]:
# Fixed-seed sample of 100 texts used to evaluate the improved approach.
test_new_access = results[["text"]].sample(100, random_state=25)

First we need to link them to databases.

In [529]:
# Load the cached per-word search results, or compute and cache them.
if os.path.isfile("saved_search/new_access.csv"):
    test_new_access = pd.read_csv("saved_search/new_access.csv").set_index("Unnamed: 0")
    for column in ("mash_search", "medvik_search"):
        test_new_access[column] = test_new_access[column].apply(from_string_to_dict)

else:
    # Build each column in one step instead of chained `df[col][j] = ...`
    # writes, which pandas does not guarantee to write through.
    test_new_access["mash_search"] = test_new_access["text"].apply(
        lambda text: search_with_inclusion_modified(text, mash_search)
    )
    test_new_access["medvik_search"] = test_new_access["text"].apply(
        lambda text: search_with_inclusion_modified(text, medvik_search_combined)
    )

    test_new_access.to_csv("saved_search/new_access.csv")

To send a message to GPT we must not exceed a certain length. We try to find a length above which the Medvik search returns (most probably) only noise.

In [405]:
# For every row, list the phrases with more than 20 medvik hits, then show
# the non-empty lists ordered by the hit count of their first phrase.
print()
temp = test_new_access["medvik_search"].apply(
    lambda found: [(phrase, len(hits)) for phrase, hits in found.items() if len(hits) > 20]
)
rows_with_long_lists = temp[temp.apply(lambda pairs: len(pairs) != 0)]
print(sorted(list(rows_with_long_lists), key=lambda pairs: pairs[0][1]))


[[('vlně', 22)], [('krvácení', 23)], [('mírné', 23)], [('stomatologické', 27)], [('tlustého střeva', 30)], [('nových', 30)], [('stabilní', 32)], [('strukturou', 36)], [('laloku', 38)]]


We can see that result lists longer than 40 entries belong mostly to non-medical terms (or to very general medical terms).

In [412]:
def drop_long(dictionary, max_len=40):
    """Return a copy of ``dictionary`` with over-long hit lists emptied.

    Parameters
    ----------
    dictionary : dict
        Maps a phrase to its sized collection of hits.
    max_len : int, optional
        Largest number of hits that is kept; longer collections are replaced
        by an empty list. Defaults to 40.

    Returns
    -------
    dict
        A new dictionary with the same keys; ``dictionary`` is not modified,
        so applying the function repeatedly is safe.
    """
    return {
        key: (hits if len(hits) <= max_len else [])
        for key, hits in dictionary.items()
    }

# Empty the over-long medvik hit lists of every row.
test_new_access["medvik_search"] = test_new_access["medvik_search"].apply(drop_long)

The second step is to let GPT choose the best candidate.

In [563]:
def _explain_candidates(candidates_by_text, find):
    """Let ChatGPT pick the best candidate for every phrase of one row.

    ``candidates_by_text`` maps a phrase to its candidate list and ``find``
    looks up the description of a candidate code. Returns a dict mapping
    every phrase to the value returned by ``from_chatGPT``.
    """
    chosen = {}
    for text, candidates in candidates_by_text.items():
        if not candidates:
            # Nothing to choose from: an empty candidate list can only lead
            # to None, so the API call is skipped.
            chosen[text] = None
            continue
        message = message_chatGPT(text, candidates, find)
        response = send_to_chat(message)
        chosen[text] = from_chatGPT(response, candidates, find)
    return chosen


# Load the cached explanations, or compute and cache them.
if os.path.isfile("saved_search/new_access_explanation.csv"):
    test_new_access = pd.read_csv("saved_search/new_access_explanation.csv").set_index("Unnamed: 0")
    for column in ("mash_search", "medvik_search"):
        test_new_access[column] = test_new_access[column].apply(from_string_to_dict)
    for column in ("mash_explanation", "medvik_explanation"):
        test_new_access[column] = test_new_access[column].fillna("{}").apply(from_string_to_dict_to_tuple)

else:
    test_new_access["medvik_explanation"] = test_new_access["medvik_search"].apply(
        lambda found: _explain_candidates(found, find_by_code_medvik)
    )
    # Mash candidates must be described with the Mash lookup (the original
    # cell used the Medvik lookup here).
    test_new_access["mash_explanation"] = test_new_access["mash_search"].apply(
        lambda found: _explain_candidates(found, find_by_code_mash)
    )

    test_new_access.to_csv("saved_search/new_access_explanation.csv")

### Results

In [548]:
# Size of the sample and, per column, the number of rows whose dictionary
# is empty.
def _count_empty(column):
    return test_new_access[column].apply(lambda found: len(found) == 0).sum()


print(f"Number of examples: {len(test_new_access)}")

print(f"Number of empty Linking for Medvik_combined_search: {_count_empty('medvik_search')}")
print(f"Number of empty Linking for Mash_search: {_count_empty('mash_search')}")

print(f"Number of not assigned for Medvik: {_count_empty('medvik_explanation')}")
print(f"Number of not assigned for Mash: {_count_empty('mash_explanation')}")

Number of examples: 100
Number of empty Linking for Medvik_combined_search: 12
Number of empty Linking for Mash_search: 10
Number of not assigned for Medvik: 12
Number of not assigned for Mash: 15


In [589]:
# Build the rows to review: for a fixed 35-row subsample, one row per source
# (Medvik / Mash) whose explanation is not None, or a single "None" row when
# both are None. Each row is (text, source, "<list of (phrase, explanation)>").
asign = []
for j in test_new_access.sample(35, random_state=42).index:
    medvik_hits = test_new_access["medvik_explanation"][j]
    mash_hits = test_new_access["mash_explanation"][j]
    if medvik_hits is not None:
        asign.append((test_new_access["text"][j], "Medvik", "{}".format([(e, medvik_hits[e]) for e in medvik_hits])))
    if mash_hits is not None:
        asign.append((test_new_access["text"][j], "Mash", "{}".format([(e, mash_hits[e]) for e in mash_hits])))
    if mash_hits is None and medvik_hits is None:
        asign.append((test_new_access["text"][j], "None", "Empty"))

# Manual verdicts, one per row of `asign`, in row order.
# "" marks a row that received no verdict.
manual_labels = [
    "Wrong", "Right", "Partially", "Wrong", "Wrong",            # rows 0-4
    "", "Wrong", "Wrong", "Wrong", "Wrong",                     # rows 5-9
    "Wrong", "", "Wrong", "Wrong", "Partially",                 # rows 10-14
    "Partially", "Right", "Partially", "Wrong", "Wrong",        # rows 15-19
    "Wrong", "Wrong", "Wrong", "Wrong", "Wrong",                # rows 20-24
    "Wrong", "Right", "Right", "Right", "Right",                # rows 25-29
    "Right", "Right", "Partially", "Partially", "Wrong",        # rows 30-34
    "Wrong", "Wrong", "Partially", "Partially", "Partially",    # rows 35-39
    "Right", "Right", "Wrong", "Partially", "Right",            # rows 40-44
    "Wrong", "Wrong", "Wrong", "Partially", "Partially",        # rows 45-49
    "Right", "Wrong", "Wrong", "Wrong", "Wrong",                # rows 50-54
    "Partially", "Right", "Partially", "Partially", "Partially",  # rows 55-59
    "Wrong", "Wrong", "Wrong", "Wrong", "Wrong",                # rows 60-64
    "Wrong", "", "Wrong", "Wrong", "Wrong",                     # rows 65-69
]

# The verdicts are positional, so they are only valid for exactly this sample.
assert len(manual_labels) == len(asign), "manual labels do not match the reviewed rows"
asign = [row + (label,) for row, label in zip(asign, manual_labels)]

In [792]:
# Verdict counts per source; a row with an empty verdict counts as
# "not assigned".
for position, (source, shown_name) in enumerate((("Medvik", "medvik"), ("Mash", "mash"))):
    source_rows = [row for row in asign if row[1] == source]
    lead = "" if position == 0 else "\n"
    print(f"{lead}Not assigned {shown_name}:", len([row for row in source_rows if row[3] == ""]))
    print(f"Mistakes from {shown_name}:", len([row for row in source_rows if row[3] == 'Wrong']))
    print(f"Partially right from {shown_name}:", len([row for row in source_rows if row[3] == 'Partially']))
    print(f"Right from {shown_name}:", len([row for row in source_rows if row[3] == 'Right']))

# For every verdict: texts that got it from both sources or from only one.
for verdict, phrase in (("Wrong", "wrong"), ("Partially", "partially right"), ("Right", "right")):
    med = [row[0] for row in asign if row[3] == verdict and row[1] == "Medvik"]
    mash = [row[0] for row in asign if row[3] == verdict and row[1] == "Mash"]
    print(f"\ntext {phrase} in medvik and mash:", [text for text in med if text in mash])
    print(f"text {phrase} only in medvik", [text for text in med if text not in mash])
    print(f"text {phrase} only in mash", [text for text in mash if text not in med])

Not assigned medvik: 1
Mistakes from medvik: 20
Partially right from medvik: 6
Right from medvik: 8

Not assigned mash: 2
Mistakes from mash: 18
Partially right from mash: 10
Right from mash: 5

text wrong in medvik and mash: ['DKK: bez otoků', 'kličky tenkého i tlustého střeva na necíleném vyšetření přiměřeného kalibru i norm. šíře stěny', 'fibrocystické změny s mnohočetnými intraduktálními papilomy', 'beze změny zdra. satvu', 'oboustranné totální mastektomii', 'vpačování bradavek 0', 'bez patrných MTS', 'menzes no', 'mírné velikostní progresi', 'tamoxifenu', 'mamila: pravidelné stavby', 'mutace v genu NBN', 'vlevo bez patol', 'bez nových poíží']
text wrong only in medvik ['gynekologické operace', 'AS reg', 'normě', 'váha stabilní', 'kompletní klinické regrese', 'regrese v prsu']
text wrong only in mash ['parc. ME s disekcí axily', 'anastrozol', 'jizva v ZHQ zhojena', 'neurotoxicita']

text partially right in medvik and mash: ['kůže intaktní', 'stolice spíš zácpovitá', 'hysterectomii 

I noticed that most of the words that were linked correctly were in their base form (Sg 1, i.e. singular, first case).

#### Some additional comments on the results
During labeling I noticed that punctuation can be a problem: when a word carries punctuation, we usually do not find a match. We therefore try to find out whether there is some punctuation we can remove.

In [795]:
# Collect the distinct combinations of punctuation characters that occur in a text.
# The characters are sorted before joining: iterating a set has no guaranteed
# order, so the same combination could otherwise be reported under several
# spellings (e.g. both ',(' and '(,').
set(
    test_new_access["text"].apply(
        lambda text: "".join(sorted({ch for ch in text if ch in punctuation}))
    )
)

{'', ',', ',(', ',+./', '-', '.', '.-', ':'}

In [651]:
def print_punctuation_matches(frame, pattern):
    """Show how words containing the given punctuation fared in the Medvik search.

    For every row of `frame` whose "text" contains at least one character of
    `pattern`, prints a `(word, hits)` tuple followed by the whole text.
    `word` is the first space-separated word containing such a character and
    `hits` is the length of that word's entry in the row's "medvik_search"
    mapping (0 when the word has no entry).

    Parameters
    ----------
    frame : DataFrame with a "text" column (str) and a "medvik_search" column
        (mapping word -> sequence of results).
    pattern : str whose individual characters are the punctuation to look for.
    """
    def has_pattern(chunk):
        return any(ch in pattern for ch in chunk)

    matching = frame[frame["text"].apply(has_pattern)]
    # Iterate the two columns directly instead of looking rows up by index label.
    for text, search in zip(matching["text"], matching["medvik_search"]):
        # A matching text always has at least one word with a pattern character,
        # because the pattern characters are never the split character (space).
        word = next(w for w in text.split(" ") if has_pattern(w))
        print((word, len(search.get(word, []))), ":", text)


# One report per punctuation group; groups are separated by a blank line.
for position, pattern in enumerate([",(", ":", "-/+", "."]):
    print(("\n" if position else "") + f"Patterns: '{pattern}'")
    print_punctuation_matches(test_new_access, pattern)

Patterns: ',('
('(Mamma,', 0) : USG (Mamma, Axilla
('vlevo,', 0) : patní ostruha vlevo, po podání rázové vlny
('denzitometir,', 0) : denzitometir, e
('71/reg,', 0) : sr. 71/reg, osa + 9, převodní časy v normě, QTc 414 mm, bez ložisk. ischem změn
('břicho,', 0) : břicho, měkké, prohmanté, nebol
('hepar,', 0) : hepar, lien nehmatám

Patterns: ':'
('mamila:', 0) : mamila: pravidelné stavby
('DKK:', 0) : DKK: bez otoků
('FA:', 0) : FA: sine

Patterns: '-/+'
('kolene-', 0) : kolene- Baker. cysta
('tru-cut', 0) : st. p. tru-cut biopsii
('COVID-19', 4) : příznaky COVID-19 nevykazuje
('-', 0) : NACT - paclitaxel weekly
('-', 0) : mammy - vpravo jizva po ablaci
('71/reg,', 0) : sr. 71/reg, osa + 9, převodní časy v normě, QTc 414 mm, bez ložisk. ischem změn

Patterns: '.'
('zdra.', 5) : beze změny zdra. satvu
('ovar.', 0) : prezervace oocytů pomocí ovar. ablace
('patol.', 0) : kůže bez patol. efloresc
('hmat.', 1) : bez hmat. patol. resistence
('bilat.', 0) : homans bilat. negat
('neoadj.', 0) :

We can see that commas and parentheses carry no specific meaning, so we can remove them.

The colon, on the other hand, does carry meaning: the part before it names a category and the rest of the text describes it.

The other punctuation marks carry meaning as well. For example, a dot marks the word as an abbreviation, and a dash can be part of a word (e.g. `tru-cut`, `COVID-19`).

In [780]:
def _strip_edge_punctuation(text):
    """Strip spaces, commas and parentheses from both ends of every word."""
    return " ".join(word.strip(" ,()") for word in text.split(" "))


# New frame: "text" holds the stripped version, "original_text" the text as it
# was in `data`.
data_improved_punctuation = data.assign(
    original_text=data["text"],
    text=data["text"].apply(_strip_edge_punctuation),
)

# List every text that still contains a colon.
for text in data_improved_punctuation["text"]:
    if ":" in text:
        print(text)

uzliny: fyziologické
antikoncepce: 0
operace: 0
alergie: Ketazon urtika
alergie: Červená paprika a kočky
břicho: v niveau
alergie: O
DKK:O
ložiska: l.dx. ZDK solitární 6x6x5 mm
gynekologická onemocnění: ne
operace: krční mandle
mastitis: neguje
perikard:bez výpotku
mastitis: ne
plíce: poklep plný jasný
gynekologické operace: 0
dušnost: 0
srdce: as reg
uzliny: patologické
hlava : poklepově nebolestivá
hormonální léčba: 0
axila i nadkl:O
karnofsky index: 100
alergie: neuvádí
břicho: měkké
mastitis: 0
hormonální léčba: HA + HRT
hormonální léčba: HRT
ITP: 1
alergie: 0
antikoncepce: cca 2roky
hormonální léčba: HRT dříve
DKK: bez otoků
trávicí potíže: 0
gynekologická onemocnění: 0
gynekologické operace: neguje
DKK :hybnost volná v plném rozsahu
gynekologické operace: ne
uzliny: nejsou patrné
axilly: 0
krk: nápln žil v normě
antikoncepce: IUD jaydess
FA: sine
ao: trojcípá
operace: neguje
uzliny: suspektní
OPERACE: Mastectomia part. l. sin
cyklus: pravidelný
gynekologická onemocnění: neguje
O 

While handling the colon I noticed that it has another use: when it is followed by a 0 at the end of the text, it expresses negation. There are also other ways to express negation:

In [781]:
# Texts of the form "<category>: 0".
for text in data_improved_punctuation["text"]:
    parts = text.split(":")
    if len(parts) > 1 and parts[1].strip(" ") == "0":
        print(text)
print()
# Texts containing a standalone "0" or "ne" word.
for text in data_improved_punctuation["text"]:
    if {"0", "ne"} & {word.strip(" ") for word in text.split(" ")}:
        print(text)

antikoncepce: 0
operace: 0
gynekologické operace: 0
dušnost: 0
hormonální léčba: 0
mastitis: 0
alergie: 0
trávicí potíže: 0
gynekologická onemocnění: 0
axilly: 0
parestezie: 0
AA:0

astma 0
bolesti na hrudi 0
antikoncepce: 0
dušnost - 0 - bez obtíží
operace: 0
arytmie 0
IM 0
bolest - 0 - žádná
thyreopatie 0
DM 0
axile bez evid patol uzlin nadklíček 0
gynekologická onemocnění: ne
axila a nadkl. 0
TT 0
palpitace a stenokardie 0
zvracení - 0 - nezvrací
nadklíček 0
zvracení - 0 - nezvracíí
regurgitace 0
slabost nebo únava 0
infekce 0
hepar a lien 0
otoky DKK 0
transfuze 0
mastitis: ne
ŠŽ 0
gynekologické operace: 0
CHOPN 0
DVT 0
dušnost: 0
dušnost ani kašel také ne
dušnost - 0
VCHGD 0
hormonální léčba: 0
vpačování bradavek 0
mastitis: 0
žádná dušnost - 0
dušnosta kašel 0
axila 0
CMP 0
TBC 0
alergie: 0
TEN 0
dušnosta akšel 0
vředová choroba 0
gynekologické potíže 0
trávicí potíže: 0
gynekologická onemocnění: 0
výtoky z bradavek 0
gynekologické operace: ne
kašel 0
axilly: 0
pálení žáhy 0
potr

Now we remove the negation markers from the text and create a new column for them.

In [793]:
# NOTE: this cell rewrites "text" in place and resets "negation" and "about", so
# it is not idempotent. Re-run the cell that builds `data_improved_punctuation`
# before re-running this one; otherwise the markers have already been stripped
# from "text" and every "negation" flag is reset to False.

# Step 1: "<category>: <marker>" -> negated category, marker removed.
data_improved_punctuation["negation"] = False
for i in data_improved_punctuation.index:
    colon_split = data_improved_punctuation.at[i, "text"].split(":")
    if len(colon_split) >= 2 and colon_split[1].strip(" ") in ["0", "ne", "O"]:
        data_improved_punctuation.at[i, "negation"] = True
        data_improved_punctuation.at[i, "text"] = colon_split[0]

# Step 2: standalone negation words anywhere in the text, or a leading "bez".
for i in data_improved_punctuation.index:
    words_split = data_improved_punctuation.at[i, "text"].split(" ")
    if any(word.strip(" ") in ["0", "ne"] for word in words_split):
        data_improved_punctuation.at[i, "negation"] = True
        data_improved_punctuation.at[i, "text"] = " ".join(
            [word for word in words_split if word not in ["0", "ne"]]
        )
    # If there is bez on the beginning, it is mostly related to whole text, but in the middle of the text
    # it can only be related to part of it
    elif words_split[0].strip(" ") == "bez":
        data_improved_punctuation.at[i, "negation"] = True
        data_improved_punctuation.at[i, "text"] = " ".join(words_split[1:])

# Step 3: "<category>: <rest>" -> category goes to "about", rest stays in "text".
# partition() splits on the FIRST colon only, so a text with several colons
# keeps everything after the first one instead of silently losing the tail.
data_improved_punctuation["about"] = "N/A"
for i in data_improved_punctuation.index:
    category, colon, remainder = data_improved_punctuation.at[i, "text"].partition(":")
    if colon:
        data_improved_punctuation.at[i, "about"] = category.strip(" ")
        data_improved_punctuation.at[i, "text"] = remainder.strip(" ")

# Normalise the remaining text with the notebook's `clean` helper.
data_improved_punctuation["text"] = data_improved_punctuation["text"].apply(clean)

In [796]:
# Preview the first five rows, including the new "negation" and "about" columns.
data_improved_punctuation.iloc[:5]

Unnamed: 0,label,text,original_text,negation,about
0,symptom,jemný fibrózní proužek,jemný fibrózní proužek,False,
1,procedura,neoadjuvantní CHT,neoadjuvantní CHT,False,
2,medikace,novalgin,novalgin,False,
3,symptom,označena SLU v levé axile,označena SLU v levé axile,False,
4,procedura,st.p. totální ME + SNB vlevo,st.p. totální ME + SNB vlevo,False,


## Improved Search with preprocessing
Because we noticed that the previous model worked well for words in their base form, we try to preprocess all words into that form.

In [790]:
# Display the frame that the preprocessing below starts from.
data_improved_punctuation

Unnamed: 0,label,text,original_text,negation,about
0,symptom,jemný fibrózní proužek,jemný fibrózní proužek,False,
1,procedura,neoadjuvantní CHT,neoadjuvantní CHT,False,
2,medikace,novalgin,novalgin,False,
3,symptom,označena SLU v levé axile,označena SLU v levé axile,False,
4,procedura,st.p. totální ME + SNB vlevo,st.p. totální ME + SNB vlevo,False,
...,...,...,...,...,...
6017,symptom,hypovit D,hypovit D,False,
6024,symptom,velikostní progresi,velikostní progresi,False,
6026,symptom,DKK brnění prstů,DKK brnění prstů,False,
6031,procedura,nukleární medicína,nukleární medicína,False,


In [413]:
def basic_access_whole_way(string):
    pass