# Linking named entities
by Filip Gregora

In [1]:
import pandas as pd
from itertools import combinations
from string import punctuation, ascii_letters
import numpy as np
import math
import random
import os
import xml.etree.ElementTree as elt
from openai import OpenAI
import regex
import requests
import json

In [2]:
# Load the table of NER entities; the preview below shows a `label` and a `text` column.
data = pd.read_csv("../data/NER_entities.csv")
# Display the first ten rows as the cell output.
data.head(10)

Unnamed: 0,label,text
0,symptom,jemný fibrózní proužek
1,procedura,neoadjuvantní CHT
2,medikace,Novalgin
3,symptom,Označena SLU v levé axile.
4,procedura,st.p. totální ME + SNB vlevo
5,medikace,NOVALGIN
6,procedura,Založení TE l.sin
7,procedura,Cytostatika
8,NE symptom,"přiměřené echogenity,"
9,NE symptom,nezvětšena


In [3]:
# Number of rows that are exact duplicates of an earlier row.
int(data.duplicated().sum())

2588

In [4]:
def clean(string):
    """Normalise one entity text.

    Strips spaces and ASCII punctuation from both ends, lower-cases the first
    character when the second character is lower-case (so all-caps
    abbreviations are left alone), and collapses runs of whitespace into a
    single space.
    """
    trimmed = string.strip(" " + punctuation)
    # Only the second character decides whether the leading capital is dropped.
    if len(trimmed) >= 2 and trimmed[1].islower():
        trimmed = trimmed[0].lower() + trimmed[1:]
    words = trimmed.split()
    return " ".join(words)

def clean_table(db):
    """Return a cleaned copy of ``db`` with one row per distinct cleaned text.

    The ``text`` column is normalised with ``clean``; only the first row of
    each distinct cleaned text keeps its value, and every row containing a
    missing value in any column is then dropped. ``db`` itself is not modified.
    """
    cleaned_text = db["text"].apply(clean)
    # Later duplicates become NaN through index alignment and are removed by dropna().
    deduplicated = db.assign(text=cleaned_text.drop_duplicates())
    return deduplicated.dropna()

# Clean the table and report how many rows the cleaning removed.
former_len = len(data)
data = clean_table(data)
removed_rows = former_len - len(data)
print(former_len, "->", len(data), ":", removed_rows)
data.head(10)

6034 -> 2803 : 3231


Unnamed: 0,label,text
0,symptom,jemný fibrózní proužek
1,procedura,neoadjuvantní CHT
2,medikace,novalgin
3,symptom,označena SLU v levé axile
4,procedura,st.p. totální ME + SNB vlevo
5,medikace,NOVALGIN
6,procedura,založení TE l.sin
7,procedura,cytostatika
8,NE symptom,přiměřené echogenity
9,NE symptom,nezvětšena



### Linking to international MESH through NIH
MeSH (called "Mash" in this notebook) is an international medical database: https://uts.nlm.nih.gov/uts/.

I tried searching the database for all combinations of words from the text. Longer combinations have higher priority.

There is one big problem: the number of combinations grows exponentially with the number of words in the text (in the worst case, a text of 20 words gives around 10^6 combinations). My solution is to go bottom-up: start with combinations of length 1 and continue only with the words whose combinations were found.

In [5]:
def filter_short(string):
    """Return True when ``string`` should not be searched in the database.

    That is the case when it is shorter than two characters or consists of
    digits only.
    """
    if len(string) < 2:
        return True
    return string.isdigit()

In [6]:
def from_string_to_list(string):
    """Parse the text form of a list of tuples back into a list of tuples.

    The outer brackets are stripped, the text is cut at every "), (" and each
    chunk is cut at every "', "; quotes, spaces and backslashes are stripped
    from the resulting items. Empty chunks are skipped.
    """
    chunks = string.strip("[]()").split("), (")
    return [
        tuple(item.strip("'\" \\") for item in chunk.split("', "))
        for chunk in chunks
        if len(chunk) != 0
    ]


def from_string_to_tuple(string):
    """Parse the text form of a (code, name, description) tuple.

    Returns None for the placeholder "N/A". Otherwise returns a 3-tuple whose
    third item is everything after the second ", " separator, re-joined with
    ", ".
    """
    if string == "N/A":
        return None

    trimmed = string.strip("\\\" '")
    # Remove a single opening and closing parenthesis, if present.
    first_char = trimmed[0].strip("(")
    last_char = trimmed[-1].strip(")")
    inner = first_char + trimmed[1:-1] + last_char

    parts = [piece.strip("\\\"'") for piece in inner.split(", ")]
    description = ", ".join(parts[2:])
    return (parts[0], parts[1], description)


def from_string_to_dict(string):
    """Parse the text form of a ``{key: [tuples]}`` dictionary.

    The text is cut at every "], "; each entry must contain exactly one ": ["
    separating the key from its list, which is parsed with
    ``from_string_to_list``.
    """
    parsed = {}
    entries = string.strip("{} ").split("], ")
    for entry in (e for e in entries if e != ""):
        pieces = list(entry.split(": ["))
        assert len(pieces) == 2
        raw_key, raw_values = pieces
        parsed[raw_key.strip("\"\' \\")] = from_string_to_list(raw_values)

    return parsed


def from_string_to_dict_to_tuple(string):
    """Parse the text form of a ``{key: tuple-or-None}`` dictionary.

    The text is cut between entries, i.e. after a closing parenthesis or a
    ``None`` value that is followed by ", " and an opening quote. Entries
    whose value is ``None`` map to None; all other values are parsed with
    ``from_string_to_tuple``.
    """
    result = {}
    # Raw string literal: the pattern contains "\)", which is an invalid
    # escape sequence in a normal string literal.
    pieces = regex.split(r"(\)| None), ('|\")", string.strip("{} "))
    for piece in pieces:
        # The capture groups put the separators themselves into the result.
        if piece in ["", ')', ' None', "'", '"']:
            continue
        key_and_value = list(piece.split(": ("))
        if len(key_and_value) == 1:
            # No tuple value: either the separator already consumed " None"
            # or the text ends with ": None".
            raw_key = key_and_value[0].split(": None")[0].strip(": ")
            result[raw_key.strip("\"\' \\")] = None
        else:
            result[key_and_value[0].strip("\"\' \\")] = from_string_to_tuple(key_and_value[1])

    return result

def from_string_to_int_dict(x):
    """Parse the text form of a ``{key: int}`` dictionary."""
    parsed = {}
    for elem in x.strip("{}\"' ").split(", '"):
        pieces = elem.split("': ")
        key = pieces[0].strip("' \"")
        parsed[key] = int(pieces[1])
    return parsed

In [7]:
# Parse the offline MeSH MARC21 XML export once and keep its root element;
# the medvik_* helpers below iterate over the children of this root.
offline_mshcz = elt.parse('../databaze/MeSH2023_Marc21_Alma.xml').getroot()

In [8]:
def medvik_search(string, test):
    """Scan the offline MeSH export for records matching ``string``.

    ``test(string, text)`` is called on the non-empty text of each subfield
    of a record until it returns a truthy value. A matching record
    contributes ``(code, name)``, taken from control field 001 and from the
    first child of data field 150; matching records lacking either one are
    skipped.
    """
    marc = "{http://www.loc.gov/MARC21/slim}"
    found = []
    for record in offline_mshcz:
        matched = any(
            field.text and test(string, field.text)
            for field in record.iter(marc + "subfield")
        )
        if not matched:
            continue
        try:
            codes = [f for f in record.findall(marc + "controlfield") if f.attrib["tag"] == "001"]
            code = codes[0].text
            names = [f for f in record.findall(marc + "datafield") if f.attrib["tag"] == "150"]
            name = names[0][0].text
        except IndexError:
            # Record without a code or a name: ignore it.
            continue
        found.append((code, name))
    return found


def medvik_exact_search(string):
    """Find records with a subfield equal to ``string``, ignoring case."""
    def same_text(query, candidate):
        return query.lower() == candidate.lower()

    return medvik_search(string, same_text)


def medvik_words_search(string):
    """Find records with a subfield containing ``string`` as whole word(s).

    Matching ignores case and treats spaces as word boundaries. The candidate
    text is padded with a space on both sides so that a word at the very
    start or end of a field also counts as a whole word.
    """
    def contains_words(query, candidate):
        return (" " + query.lower() + " ") in (" " + candidate.lower() + " ")

    return medvik_search(string, contains_words)


def medvik_match_search(string):
    """Find records with a subfield containing ``string`` as a substring, ignoring case."""
    def contains_text(query, candidate):
        return query.lower() in candidate.lower()

    return medvik_search(string, contains_text)


def medvik_combined_search(string):
    """Search from the strictest to the loosest strategy.

    Tries the exact, whole-word and substring searches in that order and
    returns the first non-empty result, or the empty result of the last
    strategy when nothing matches.
    """
    strategies = (medvik_exact_search, medvik_words_search, medvik_match_search)
    result = []
    for search in strategies:
        result = search(string)
        if len(result) != 0:
            break

    return result

## Choosing best match with Chat GPT
The idea behind our model is to first link a term to candidate entries in the database and then let a pretrained language model choose the best one.

I use GPT-3.5 because it is fast, well known, and free to access within limits (there is a daily access limit and a limited number of accesses per account; beyond that we have to pay).

The message to GPT is in this format:

Který z uvedených lékařských pojmů s jeho popisem nejlépe odpovídá pojmu: "[MEDICAL TERM]":

    1. [DESCRIPTION_N.1] (pojem: [TERM_N.1])
    2. [DESCRIPTION_N.2] (pojem: [TERM_N.2])
    ...
    
Jako odpověď mi pošli pouze číslo odpovědi. Pokud to nebude žádná z možností, pak odpověz NONE.

In [9]:
def medvik_find_by_code(string):
    """Return the text of the first subfield of data field 680 for a record code.

    ``string`` is compared with control field 001 of every record in the
    offline MeSH export. An empty string is returned when the code is empty,
    when no record with that code carries a usable field 680, or when that
    field has no subfield or an empty one.
    """
    if len(string) == 0:
        return ""

    marc = "{http://www.loc.gov/MARC21/slim}"
    for record in offline_mshcz:
        code = next(
            (field.text for field in record.findall(marc + "controlfield")
             if field.attrib.get("tag") == "001"),
            None,
        )
        if code != string:
            continue

        notes = [field for field in record.findall(marc + "datafield")
                 if field.attrib.get("tag") == "680"]
        if not notes:
            # Matching code but no field 680: keep scanning.
            continue

        # A field 680 without any subfield used to raise StopIteration here.
        first_subfield = next(notes[0].iter(marc + "subfield"), None)
        if first_subfield is None:
            continue
        # Always return a string so callers can embed the result in a prompt.
        return first_subfield.text or ""

    return ""

In [10]:
# Read the NIH API key. Surrounding whitespace is stripped so that a trailing
# newline in the key file does not become part of the key.
with open("../APIkeys/NIH", "r") as f:
    NIH_api = f.read().strip()

def send_to_GPT(message):
    """Send ``message`` as a single user message to gpt-3.5-turbo-16k.

    Returns the chat completion object produced by the OpenAI client, or an
    empty string when ``message`` is empty (no request is made in that case).
    The API key is read from ``../APIkeys/chatGTP`` on every call.
    """
    if message == "":
        return ""

    with open("../APIkeys/chatGTP", "r") as f:
        # Strip whitespace: a trailing newline in the key file must not end
        # up inside the key.
        chatgpt_api = f.read().strip()

    client = OpenAI(api_key=chatgpt_api)
    return client.chat.completions.create(
                model="gpt-3.5-turbo-16k",
                messages=[{"role": "user", "content": message}],
                stream=False)


def message_for_GPT(string, li, find, context=None):
    """Build the prompt asking which candidate best matches ``string``.

    ``li`` holds the candidates; for each one ``find(candidate[0])`` supplies
    the description and ``candidate[1]`` the term. Candidates are numbered
    from 1. When ``context`` is given it is quoted in the question. Returns
    an empty string when there are no candidates.
    """
    if len(li) == 0:
        return ""

    if context is None:
        question = f"Který z uvedených lékařských pojmů s jeho popisem nejlépe odpovídá pojmu: \"{string}\":\n"
    else:
        question = f"Který z uvedených lékařských pojmů s jeho popisem nejlépe odpovídá pojmu: \"{string}\" v kontextu:  \"{context}\":\n"

    options = [
        f"{number}. {find(candidate[0])} (pojem: {candidate[1]})\n"
        for number, candidate in enumerate(li, start=1)
    ]
    instructions = "Jako odpověď mi pošli pouze číslo odpovědi. Pokud to nebude žádná z možností, pak odpověz NONE. Pokud to není lékařský pojem odpověz taky NONE."

    return question + "".join(options) + instructions
    
def find_int(string):
    """Return the first run of consecutive digits in ``string`` as an int.

    Only the first number is used, so digits from separate numbers in the
    answer are not glued together.

    Raises:
        ValueError: if ``string`` contains no digits.
    """
    digits = []
    for char in string:
        if char.isdigit():
            digits.append(char)
        elif digits:
            # The first number has ended.
            break

    return int("".join(digits))


def from_GPT(result, li, find):
    """Translate a GPT answer into the chosen candidate.

    ``result`` is what ``send_to_GPT`` returned: a chat completion, or an
    empty string when no request was sent. The answer is expected to contain
    the 1-based number of a candidate from ``li``.

    Returns ``(code, name, find(code))`` for the chosen candidate, or None
    when there is no usable response, the answer contains no number, or the
    number does not refer to a candidate.
    """
    # No request was sent, or the response carries no choices.
    if not result or not getattr(result, "choices", None):
        return None

    try:
        index = find_int(result.choices[0].message.content or "") - 1
    except ValueError:
        # No number in the answer (for example "NONE").
        return None

    # Reject 0 and out-of-range numbers explicitly; a negative index would
    # otherwise silently pick a candidate from the end of the list.
    if not 0 <= index < len(li):
        return None

    try:
        return (li[index][0], li[index][1], find(li[index][0]))
    except IndexError:
        return None

## Improved Search

Now I am going to try another approach, in which I try to handle the former mistakes. The biggest change is to keep one list of links for each word of the text instead of one list for the whole text, and then to try to explain each word.

In [11]:
def search_from_bottom_no_drop(string, func):
    """Link the words of ``string`` to database entries, bottom-up.

    ``func(phrase)`` must return a list of hits whose first two items are the
    code and the name. Round ``n`` queries every ``n``-word combination of the
    words that produced a hit in round ``n - 1`` (round 1 uses all words of
    ``string``, split on single spaces). Words keep their order from the
    text, so multi-word phrases are always queried in text order.

    Returns a dictionary keyed by phrase. For every word only the hits of its
    longest matching combinations are kept; words without any hit are
    dropped. Multi-word phrases map to ``(code, name)`` pairs, while a word
    whose longest match is the word itself keeps ``(code, name, (word,))``
    triples.
    """
    candidate_words = string.split(" ")
    max_size = len(candidate_words)

    result_dict = {word: [] for word in candidate_words}

    for size in range(1, max_size + 1):
        # Pre-register every word so that the survivors of this round keep
        # their order from the text instead of the order of their first hit.
        has_hit = {word: False for word in candidate_words}

        for words in combinations(candidate_words, size):
            hits = func(" ".join(words))
            if len(hits) == 0:
                continue
            for word in words:
                has_hit[word] = True
                result_dict[word] += [(hit[0], hit[1], words) for hit in hits]

        candidate_words = [word for word, found in has_hit.items() if found]
        if len(candidate_words) == 0:
            break

    # Iterate over a snapshot: phrase keys are added and word keys removed below.
    for key, hits in result_dict.copy().items():
        # Hits are appended in order of growing combination size, so the last
        # hit belongs to the longest combination. (hits[-1] is never
        # evaluated when hits is empty.)
        longest = [hit for hit in hits if len(hit[2]) == len(hits[-1][2])]
        result_dict[key] = longest

        keep_key = False
        for words in set(hit[2] for hit in longest):
            phrase = " ".join(words)
            if phrase == key:
                keep_key = True
            if phrase in result_dict:
                continue
            result_dict[phrase] = [(hit[0], hit[1]) for hit in longest if " ".join(hit[2]) == phrase]

        if not keep_key:
            result_dict.pop(key)

    return result_dict

First we need to link them to databases.

In [12]:
# Load the cached search results when they exist; otherwise run both searches
# on a fixed random sample of 100 texts and cache the result.
if os.path.isfile("../saved_search/new_access.csv"):
    test_new_access = pd.read_csv("../saved_search/new_access.csv").set_index("Unnamed: 0")
    # The dictionaries were saved as text and have to be parsed back.
    test_new_access["mash_search"] = test_new_access["mash_search"].apply(from_string_to_dict)
    test_new_access["medvik_search"] = test_new_access["medvik_search"].apply(from_string_to_dict)

else:
    # NOTE(review): `results` and `mash_search` are not defined in the visible
    # part of this notebook, so this branch only runs if they are defined
    # elsewhere - confirm before deleting the cache file.
    test_new_access = results[["text"]].sample(100, random_state=25)
    # Whole-column assignment instead of chained `df[col][j] = ...` indexing,
    # which is not guaranteed to write into the frame.
    test_new_access["mash_search"] = test_new_access["text"].apply(
        lambda text: search_from_bottom_no_drop(text, mash_search))
    test_new_access["medvik_search"] = test_new_access["text"].apply(
        lambda text: search_from_bottom_no_drop(text, medvik_combined_search))

    test_new_access.to_csv("../saved_search/new_access.csv")

To send a message to GPT we must not exceed a certain length. We try to find a number of search results above which the Medvik search returns only noise (or very probably only noise).

In [13]:
def print_long_searches(db, limit=20):
    """Print the search terms that returned more than ``limit`` candidates.

    ``db`` is a Series of ``{term: candidates}`` dictionaries. For every row
    with at least one such term a list of ``(term, candidate count)`` pairs
    is built; the lists are printed sorted by the count of their first pair.
    """
    def oversized(search):
        return [(term, len(search[term])) for term in search if len(search[term]) > limit]

    per_row = db.apply(oversized)
    flagged = [row for row in per_row if len(row) != 0]
    flagged.sort(key=lambda row: row[0][1])
    print(flagged)

# List the Medvik search terms that returned more than 20 candidates (the default limit).
print_long_searches(test_new_access["medvik_search"])

[[('vlně', 22)], [('krvácení', 23)], [('mírné', 23)], [('stomatologické', 27), ('vyš', 874)], [('nových', 30)], [('stabilní', 32)], [('strukturou', 36)], [('laloku', 38)], [('susp', 52)], [('i na', 58), ('tlustého střeva', 30)], [('operace', 58)], [('nebol', 66)], [('léčí s', 76)], [('spíš', 77)], [('příl', 84)], [('e', 95)], [('genetické', 99)], [('p.', 119)], [('NACT', 128), ('-', 5334)], [('plicní', 162)], [('klinické', 164)], [('ME s', 166)], [('ME s', 166)], [('léčba', 181)], [('vyšetření', 186)], [('příznaky', 314)], [('není', 327)], [('TAD', 331), ('l. I', 72)], [('- po', 386)], [('pomocí', 393)], [('pm', 548)], [('PM', 548), ('se', 7588)], [('negat', 740)], [('Cor', 1304)], [('patol', 6932)], [('v', 7248)], [('toxicita', 11218)], [('v', 14496)]]


We can see that for more than 40 results we get mostly non-medical terms (or very general medical terms).

In [14]:
def drop_long_searches(dictionary, limit=40):
    """Empty every candidate list longer than ``limit``.

    The dictionary is modified in place (keys are kept, their lists are
    replaced by empty lists) and is also returned.
    """
    too_long = [key for key, candidates in dictionary.items() if len(candidates) > limit]
    for key in too_long:
        dictionary[key] = []
    return dictionary

# Empty the Medvik candidate lists longer than the default limit of 40; the terms stay as keys.
test_new_access["medvik_search"] = test_new_access["medvik_search"].apply(drop_long_searches)

The second part is to choose the best one by GPT.

In [15]:
def _explain_with_gpt(searches, find):
    """Ask GPT to choose the best candidate for every term of one text.

    ``searches`` maps a term to its candidate list; ``find`` turns a candidate
    code into its description. Returns ``{term: chosen candidate or None}``.
    """
    explained = {}
    for text, candidates in searches.items():
        if len(candidates) == 0:
            # Nothing to choose from: no request is sent.
            explained[text] = None
            continue
        message = message_for_GPT(text, candidates, find)
        response = send_to_GPT(message)
        explained[text] = from_GPT(response, candidates, find)
    return explained


# Load the cached GPT choices when they exist; otherwise ask GPT and cache.
if os.path.isfile("../saved_search/new_access_explanation.csv"):
    test_new_access = pd.read_csv("../saved_search/new_access_explanation.csv").set_index("Unnamed: 0")
    # The dictionaries were saved as text and have to be parsed back.
    test_new_access["mash_search"] = test_new_access["mash_search"].apply(from_string_to_dict)
    test_new_access["medvik_search"] = test_new_access["medvik_search"].apply(from_string_to_dict)
    test_new_access["mash_explanation"] = test_new_access["mash_explanation"].fillna("{}").apply(from_string_to_dict_to_tuple)
    test_new_access["medvik_explanation"] = test_new_access["medvik_explanation"].fillna("{}").apply(from_string_to_dict_to_tuple)

else:
    # Whole-column assignment instead of chained `df[col][i] = ...` indexing,
    # which is not guaranteed to write into the frame.
    test_new_access["medvik_explanation"] = test_new_access["medvik_search"].apply(
        lambda searches: _explain_with_gpt(searches, medvik_find_by_code))

    # NOTE(review): the Mash candidates are also described with
    # medvik_find_by_code, as in the original cell - confirm that this lookup
    # is meant to be used for Mash codes.
    test_new_access["mash_explanation"] = test_new_access["mash_search"].apply(
        lambda searches: _explain_with_gpt(searches, medvik_find_by_code))

    test_new_access.to_csv("../saved_search/new_access_explanation.csv")

### Results
Now its time to examine the results of improved search.

In [16]:
# Summary: sample size, then for each column the number of rows whose dictionary is empty.
print("Number of examples: {}".format(len(test_new_access)))

for template, column in (
    ("Number of empty Linking for Medvik_combined_search: {}", "medvik_search"),
    ("Number of empty Linking for Mash_search: {}", "mash_search"),
    ("Number of not assigned for Medvik: {}", "medvik_explanation"),
    ("Number of not assigned for Mash: {}", "mash_explanation"),
):
    empty_rows = test_new_access[column].apply(lambda x: len(x) == 0).sum()
    print(template.format(empty_rows))

Number of examples: 100
Number of empty Linking for Medvik_combined_search: 12
Number of empty Linking for Mash_search: 10
Number of not assigned for Medvik: 12
Number of not assigned for Mash: 15


Here we have got much better results than with the basic approach. We were able to choose the best match for nearly everything that we were able to find in the database.

In [17]:
# Collect, for 35 sampled texts, the links chosen from each database as
# (text, source, links rendered as text).
asign = []
for j in test_new_access.sample(35, random_state=42).index:
    sample_text = test_new_access["text"][j]
    any_source = False
    for column, source in (("medvik_explanation", "Medvik"), ("mash_explanation", "Mash")):
        x = test_new_access[column][j]
        if x is not None:
            asign.append((sample_text, source, "{}".format([(e, x[e]) for e in x])))
            any_source = True
    if not any_source:
        asign.append((sample_text, "None", "Empty"))

# Manual verdicts, one per entry of `asign`, in the same order.
# "" means that nothing was assigned.
manual_labels = [
    "Wrong", "Right", "Partially", "Wrong", "Wrong", "", "Wrong", "Wrong", "Wrong", "Wrong",
    "Wrong", "", "Wrong", "Wrong", "Partially", "Partially", "Right", "Partially", "Wrong", "Wrong",
    "Wrong", "Wrong", "Wrong", "Wrong", "Wrong", "Wrong", "Right", "Right", "Right", "Right",
    "Right", "Right", "Partially", "Partially", "Wrong", "Wrong", "Wrong", "Partially", "Partially", "Partially",
    "Right", "Right", "Wrong", "Partially", "Right", "Wrong", "Wrong", "Wrong", "Partially", "Partially",
    "Right", "Wrong", "Wrong", "Wrong", "Wrong", "Partially", "Right", "Partially", "Partially", "Partially",
    "Wrong", "Wrong", "Wrong", "Wrong", "Wrong", "Wrong", "", "Wrong", "Wrong", "Wrong",
]

# The verdicts are positional, so a different number of entries would attach
# them to the wrong texts; fail loudly before changing anything.
assert len(manual_labels) == len(asign), (
    "expected {} entries to label, got {}".format(len(manual_labels), len(asign)))

# Extend every entry with its verdict: (text, source, links, verdict).
for i, assigment in enumerate(manual_labels):
    asign[i] = (asign[i][0], asign[i][1], asign[i][2], assigment)

In [18]:
def _count_verdicts(source, verdict):
    """Count the labelled entries of ``asign`` for one source and one verdict."""
    return len([entry for entry in asign if entry[1] == source and entry[3] == verdict])


print("Not assigned medvik:", _count_verdicts("Medvik", ""))
print("Mistakes from medvik:", _count_verdicts("Medvik", 'Wrong'))
print("Partially right from medvik:", _count_verdicts("Medvik", 'Partially'))
print("Right from medvik:", _count_verdicts("Medvik", 'Right'))

print("\nNot assigned mash:", _count_verdicts("Mash", ""))
print("Mistakes from mash:", _count_verdicts("Mash", 'Wrong'))
print("Partially right from mash:", _count_verdicts("Mash", 'Partially'))
print("Right from mash:", _count_verdicts("Mash", 'Right'))

Not assigned medvik: 1
Mistakes from medvik: 20
Partially right from medvik: 6
Right from medvik: 8

Not assigned mash: 2
Mistakes from mash: 18
Partially right from mash: 10
Right from mash: 5


The results are better than previous access:

For medvik we have got 40 % for at least partially right linking. And 23 % for completely right linking.

For mash we have got 43 % for at least partially right linking. And 14 % for completely right linking.

Although these results are an improvement, they still need to be improved further. For this reason I would like to try another improved linking. But first I look at the wrong, partially right and right results to see whether there is some pattern.


In [19]:
# For every verdict, list the texts that got it from both databases, only
# from Medvik, and only from Mash.
for verdict, header_both, header_medvik, header_mash in (
    ('Wrong',
     "text wrong in medvik and mash:",
     "text wrong only in medvik",
     "text wrong only in mash"),
    ('Partially',
     "\ntext partially right in medvik and mash:",
     "text partially right only in medvik",
     "text partially right only in mash"),
    ('Right',
     "\ntext right in medvik and mash:",
     "text right only in medvik",
     "text right only in mash"),
):
    med = [entry[0] for entry in asign if entry[3] == verdict and entry[1] == "Medvik"]
    mash = [entry[0] for entry in asign if entry[3] == verdict and entry[1] == "Mash"]
    print(header_both, [text for text in med if text in mash])
    print(header_medvik, [text for text in med if text not in mash])
    print(header_mash, [text for text in mash if text not in med])

text wrong in medvik and mash: ['DKK: bez otoků', 'kličky tenkého i tlustého střeva na necíleném vyšetření přiměřeného kalibru i norm. šíře stěny', 'fibrocystické změny s mnohočetnými intraduktálními papilomy', 'beze změny zdra. satvu', 'oboustranné totální mastektomii', 'vpačování bradavek 0', 'bez patrných MTS', 'menzes no', 'mírné velikostní progresi', 'tamoxifenu', 'mamila: pravidelné stavby', 'mutace v genu NBN', 'vlevo bez patol', 'bez nových poíží']
text wrong only in medvik ['gynekologické operace', 'AS reg', 'normě', 'váha stabilní', 'kompletní klinické regrese', 'regrese v prsu']
text wrong only in mash ['parc. ME s disekcí axily', 'anastrozol', 'jizva v ZHQ zhojena', 'neurotoxicita']

text partially right in medvik and mash: ['kůže intaktní', 'stolice spíš zácpovitá', 'hysterectomii pro krvácení', 'klinicky lipom při sternu', 'GIT toxicita G1']
text partially right only in medvik ['parc. ME s disekcí axily']
text partially right only in mash ['USG (Mamma, Axilla', 'váha stab

- I have noticed that most of the words that were linked correctly were in their basic form (Sg 1, i.e. nominative singular).
- Some of the wrong ones contain a mistake, or punctuation interferes with the match.


### Some additional comments on results
The results above aren't very good. Some changes and improvements can be made to the medical text, but many of these changes will be label-specific.

During labeling I noticed that punctuation can be a problem (because of punctuation we usually don't find a match). For this reason I tried to find out whether there is some punctuation we can remove.

In [20]:
# Every punctuation character that occurs in at least one text.
punc_in_data = {char for text in data["text"] for char in text if char in punctuation}
punc_in_data

{'%', '(', ')', '+', ',', '-', '.', '/', ':', ';', '['}

In [21]:
# For every punctuation character print up to ten shuffled texts that contain it.
for p in sorted(punc_in_data):
    print(f"Punctuation: {p}")
    texts_with_p = data[data["text"].apply(lambda x: p in x)]
    shuffled = texts_with_p.sample(frac=1, random_state=10)
    for example in shuffled.head(10)["text"]:
        print(example)
    print()


Punctuation: %
ki 100%, aloepcie,sliznice prokrveny, oběh. komp
ki 100%,alopecie,sliznice prokrveny, oběh. komp
inf. glucosi 10% 500 ml
růst trabekulárně, solidně, s polarizací do 10 %, jádra s jemným chromatinem
KI 100% lucidní

Punctuation: (
proliferace dle Ki67 (automat) 35
RTG (Plíce
cílenou axilární disekcí (SNB + klipovaná uzlina
biopsie sentinelové uzliny (SNB
vyrážka na kůži (kopřivka
re-resekce - laterální části (kůže + podkoží) + disekce pravé axily - en bloc
adjuvantní radioterapii na hrudní stěnu vpravo + axilu vpravo ( I-IV.etáž
páteře C+ L( diskopatie
USG (Břicho
alergie: Ketazon (urtika

Punctuation: )
proliferace dle Ki67 (automat) 35
odstranění klipované (a ev. sentinelové) uzliny
re-resekce - laterální části (kůže + podkoží) + disekce pravé axily - en bloc

Punctuation: +
adjuv CHt paclitaxel weekly 12x + trastuzuab
ko + trombo + dif
LHRH+IA
H+L nehmatné
hye+ae
UZ+MMG
totální ME + SNB vlevo
cílenou axilární disekcí (SNB + klipovaná uzlina
LDK, st.p. HYE+AE
páteře C+ 

We can see that most of the punctuation has no specific meaning, so we can substitute it with a space (most of it stands there instead of a space).

On the other hand, the colon has a specific meaning: it names a category, and the rest of the text is about that category.

The dot and the dash also have a specific meaning: the dot marks a word as an abbreviation, and the dash is a part of some words.

In [22]:
# Keep the original text in its own column and replace every punctuation
# character except "." with a space before cleaning the text again.
data_improved_punctuation = data.assign(original_text=data["text"])
data_improved_punctuation["text"] = data_improved_punctuation["text"].apply(
    lambda text: clean(text.translate(
        str.maketrans({char: " " for char in punctuation if char != "."}))))

If we inspect the longer matches, we can see that we often get a nonsense combination (such as a word with a conjunction behind it). For this reason I am going to modify the algorithm so that it also preserves the shorter matches.

In [23]:
def _multiword_matches(explanation):
    """Map every explained key of two or more words to the name of its chosen entry."""
    return {key: chosen[1]
            for key, chosen in explanation.items()
            if len(key.split(" ")) >= 2 and chosen is not None}


# Show the first five rows with at least one multi-word match, for each database.
for column in ("medvik_explanation", "mash_explanation"):
    temp = test_new_access[column].apply(_multiword_matches)
    print(list(temp[temp.apply(len) != 0].head(5)))

[{'beze změny': 'tropismus'}, {'sekund a': 'syndromy spánkové apnoe', 'lymfedém a': 'syndrom žlutých nehtů', 'pod hrudní': 'bederní obratle', 'pod a': 'parotis', 'hrudní a': 'bránice', 'stěnou a': 'sagitální abdominální rozměr'}, {'změny s': 'incontinentia pigmenti', 's mnohočetnými': 'vrozené srdeční vady'}, {'vše v': 'naučená bezmocnost'}, {'v kloubech': 'synoviální cysta', 'v a': 'parciální tromboplastinový čas', 'a kloubech': 'juvenilní artritida', 'a kyčelních': 'artróza kyčelních kloubů'}]
[{'hrudní a': 'Thoracoabdominal aortic aneurysm'}, {'jizva v': "Vaccination site scar',"}, {'Baker. cysta': "Popliteal Cyst',"}, {'změny s': 'Mood alterations with depressive symptoms\',",'}, {'TEN 0': "WHODAS 2.0 12-item Version Proxy-administered - Concentrating for Ten Minutes',"}]
