In [1]:
import math

import gensim
from tqdm import tqdm

from UMLS import UMLSMapper, UMLSEvaluator
from embeddings import Embeddings
from evaluate_embeddings import Benchmark

from vectorize_data import lines_from_file, preprocess

# General

In [2]:
# Name of this embedding run. It is passed to Embeddings.save_medical and is
# interpolated into the load path f"data/{embedding_name}.kv" in the benchmark section.
# "no_prep" presumably stands for "no preprocessing" — confirm with the author.
embedding_name = "no_prep_vecs"
# Shared UMLS mapper used by the vectorization, benchmark and visualization cells.
# NOTE(review): absolute, machine-specific path; the same directory is passed again
# to UMLSEvaluator further down. Consider a single configurable data directory.
umls_mapper = UMLSMapper(from_dir='E:/AML4DH-DATA/UMLS')

initialize UMLSMapper...


# Vectorization

In [3]:
# Read the guideline sentences from the plain-text corpus file; the recorded output
# shows the result is a list of raw sentence strings.
# NOTE(review): absolute, machine-specific path.
cpg_sentences_file = "E:/AML4DH-DATA/CPG-AMIA2020/Plain Text/cpg-sentences.txt"
cpq_sentences = lines_from_file(path=cpg_sentences_file)
# Preview the first two sentences.
print(cpq_sentences[:2])

['Helicobacter pylori ist der wesentliche Risikofaktor für das Magenkarzinom.', 'Die H. pylori-Eradikation mit dem Ziel der Magenkarzinomprävention sollte bei den folgenden Risikopersonen durchgeführt werden (siehe Tabelle unten).']


In [4]:
# Tokenization: split every sentence on whitespace only, so punctuation stays attached
# to its token (e.g. 'Magenkarzinom.' in the recorded output).
# NOTE(review): this overwrites the raw sentences, so the cell cannot be re-run without
# re-running the loading cell first (lists have no .split()).
cpq_sentences = list(map(str.split, cpq_sentences))
# Preview the first two tokenized sentences.
print(cpq_sentences[:2])

[['Helicobacter', 'pylori', 'ist', 'der', 'wesentliche', 'Risikofaktor', 'für', 'das', 'Magenkarzinom.'], ['Die', 'H.', 'pylori-Eradikation', 'mit', 'dem', 'Ziel', 'der', 'Magenkarzinomprävention', 'sollte', 'bei', 'den', 'folgenden', 'Risikopersonen', 'durchgeführt', 'werden', '(siehe', 'Tabelle', 'unten).']]


In [5]:
# Optional step, currently disabled: standardize the documents through the mapper
# before concept replacement.
# cpq_sentences = umls_mapper.standardize_documents(cpq_sentences)
# Replace tokens with UMLS concept identifiers where the mapper finds a match; in the
# recorded output 'Helicobacter' became 'C0085508' while the other tokens of the first
# two sentences were left unchanged.
# NOTE(review): the result is written back to the same name, so re-running this cell
# feeds already-replaced tokens into the mapper again.
cpq_sentences = umls_mapper.replace_documents_with_umls(cpq_sentences)
# Optional step, currently disabled: lemmatization and stop-word removal.
# cpq_sentences = preprocess(documents=cpq_sentences, lemmatize=True, remove_stopwords=True)
# Preview the first two sentences after replacement.
print(cpq_sentences[:2])

[['C0085508', 'pylori', 'ist', 'der', 'wesentliche', 'Risikofaktor', 'für', 'das', 'Magenkarzinom.'], ['Die', 'H.', 'pylori-Eradikation', 'mit', 'dem', 'Ziel', 'der', 'Magenkarzinomprävention', 'sollte', 'bei', 'den', 'folgenden', 'Risikopersonen', 'durchgeführt', 'werden', '(siehe', 'Tabelle', 'unten).']]


In [6]:
# Compute vectors from the tokenized, UMLS-replaced sentences, with phrase handling
# switched off (use_phrases=False).
# Presumably this trains word embeddings (gensim is imported above) — confirm in embeddings.py.
vecs = Embeddings.calculate_vectors(cpq_sentences, use_phrases=False)
# Persist the vectors under the run name, handing over the mapper as well.
# Presumably this writes the file later read from f"data/{embedding_name}.kv" — TODO confirm.
Embeddings.save_medical(vecs, embedding_name, umls_mapper)

 ... (more hidden) ...


# Benchmarks

In [4]:
# Load the vectors for this run from the data/ directory; this rebinds `vecs`, so the
# benchmark section can start here without recomputing the embedding.
# NOTE(review): the recorded execution count of this cell is out of sequence — verify
# the notebook with "Restart Kernel -> Run All".
embedding_file = f"data/{embedding_name}.kv"
vecs = Embeddings.load(path=embedding_file)

load embedding...


In [7]:
# Sanity check: look up the vocabulary key that "Cisplatin" maps to and print its
# nearest neighbours, each translated back to a readable term, with its score.
cisplatin_key = umls_mapper.umls_dict["Cisplatin"]
for neighbour, score in vecs.most_similar(cisplatin_key):
    print(umls_mapper.un_umls(neighbour), score)

# Subset of the vectors as selected by the mapper's get_umls_vectors_only.
# NOTE(review): concept_vecs is not referenced by any other visible cell — confirm it
# is still needed.
concept_vecs = umls_mapper.get_umls_vectors_only(vecs)

Paclitaxel 0.9073265790939331
Carboplatin 0.9063575267791748
Etoposid 0.8836759924888611
Docetaxel 0.8660497665405273
Capecitabin 0.8628433346748352
Oxaliplatin 0.8613467216491699
Bevacizumab 0.8208531737327576
Rituximab 0.8187896609306335
Vinorelbin 0.8170721530914307
Bleomycin 0.8145763874053955


In [8]:
# Evaluator built from the UMLS directory and the loaded vectors; it is handed to
# Benchmark in the next cell.
# NOTE(review): repeats the absolute, machine-specific path already passed to UMLSMapper.
evaluator = UMLSEvaluator(from_dir='E:/AML4DH-DATA/UMLS', vectors=vecs)

initialize UMLSEvaluator...


In [9]:
# Bundle the vectors, the UMLS mapper and the evaluator into one benchmark object,
# which the following cell reuses.
benchmark = Benchmark(vecs, umls_mapper, evaluator)
# Run the "choi" benchmark; in the recorded output it prints one score per semantic
# category (e.g. "Pharmacologic Substance", "Clinical Drug").
benchmark.choi_benchmark()

Pharmacologic Substance: 3.9181550275259163
Disease or Syndrome: 1.8205877458826087
Neoplastic Process: 2.454774462747514
Clinical Drug: 0
Finding: 0.6427943747281554
Injury or Poisoning: 0.08356520320690844


In [20]:
# Run the benchmark over all categories. In the recorded output each category line
# reports "within", "out" and "distance" (distance equals within minus out), followed
# by a single overall float that is both printed and returned as the cell result.
benchmark.all_categories_benchmark()

 ... (more hidden) ...

Organic Chemical: within 0.4047955686860371, out 0.3272494287251759, distance 0.07754613996086124


 ... (more hidden) ...

Pharmacologic Substance: within 0.4028166942319427, out 0.3246679605345326, distance 0.0781487336974101


 ... (more hidden) ...

Hazardous or Poisonous Substance: within 0.3601047635513578, out 0.33224244602521114, distance 0.02786231752614665


 ... (more hidden) ...

Body Location or Region: within 0.3390124184052935, out 0.300114256724426, distance 0.038898161680867516


 ... (more hidden) ...

Congenital Abnormality: within 0.21600400134921074, out 0.2743047607047167, distance -0.058300759355505966


 ... (more hidden) ...

Pathologic Function: within 0.3577277729504769, out 0.3236791052552097, distance 0.03404866769526721


 ... (more hidden) ...

Phenomenon or Process: within 0.38312225017164436, out 0.3331698758681513, distance 0.049952374303493086


 ... (more hidden) ...

Indicator, Reagent, or Diagnostic Aid: within 0.319217589226636, out 0.3158613446842207, distance 0.0033562445424152876


 ... (more hidden) ...

Biologically Active Substance: within 0.4187097965113715, out 0.3599078740933325, distance 0.058801922418039


 ... (more hidden) ...

Amino Acid, Peptide, or Protein: within 0.3746652771904849, out 0.33498055916154096, distance 0.039684718028943966


 ... (more hidden) ...

Enzyme: within 0.38978989183784946, out 0.34447448728317687, distance 0.04531540455467259


 ... (more hidden) ...

Behavior: within 0.3901875838637352, out 0.3302508038757388, distance 0.0599367799879964
Chemical: within 0.26886695623397827, out 0.35725723215095456, distance -0.08839027591697629


 ... (more hidden) ...

Disease or Syndrome: within 0.3518351337455603, out 0.3241291970990333, distance 0.027705936646526985


 ... (more hidden) ...

Occupational Activity: within 0.3385116277055608, out 0.32616768394308976, distance 0.012343943762471055


 ... (more hidden) ...

Neoplastic Process: within 0.29051146212836887, out 0.2610372338340724, distance 0.029474228294296467


 ... (more hidden) ...

Nucleic Acid, Nucleoside, or Nucleotide: within 0.5095622754272293, out 0.38772655425719516, distance 0.12183572117003416


 ... (more hidden) ...

Tissue: within 0.4270461845580499, out 0.3353582534182252, distance 0.09168793113982465


 ... (more hidden) ...

Mental or Behavioral Dysfunction: within 0.4479726084313769, out 0.35227113572995894, distance 0.09570147270141793


 ... (more hidden) ...

Immunologic Factor: within 0.39098162686669846, out 0.2996347365071448, distance 0.09134689035955368


 ... (more hidden) ...

Temporal Concept: within 0.21461383622282973, out 0.25443841639461073, distance -0.039824580171781004


 ... (more hidden) ...

Hormone: within 0.438483803906923, out 0.3591434887951506, distance 0.07934031511177236


 ... (more hidden) ...

Body Part, Organ, or Organ Component: within 0.37356629573422295, out 0.3201767047587413, distance 0.05338959097548163


 ... (more hidden) ...

Therapeutic or Preventive Procedure: within 0.3005198697733791, out 0.2851689773441934, distance 0.015350892429185692


 ... (more hidden) ...

Age Group: within 0.4342701807618141, out 0.31910429929685147, distance 0.11516588146496265


 ... (more hidden) ...

Intellectual Product: within 0.32291661292106727, out 0.30004805819492947, distance 0.022868554726137802


 ... (more hidden) ...

Bacterium: within 0.5758251628869182, out 0.4117670126330439, distance 0.16405815025387432


 ... (more hidden) ...

Mental Process: within 0.364898445985224, out 0.3201339773263056, distance 0.044764468658918366


 ... (more hidden) ...

Geographic Area: within 0.4446920288544813, out 0.31101994430631114, distance 0.13367208454817014


 ... (more hidden) ...

Health Care Activity: within 0.25492109481510633, out 0.2713321054146167, distance -0.01641101059951039
Fungus: within 0.34906188646952313, out 0.3526686115424809, distance -0.0036067250729577838


 ... (more hidden) ...

Quantitative Concept: within 0.26310396188577895, out 0.26924749661513625, distance -0.006143534729357303


 ... (more hidden) ...

Organism Function: within 0.38493569020730356, out 0.3376213115372298, distance 0.047314378670073765


 ... (more hidden) ...

Substance: within 0.3666912150653926, out 0.3514280352772509, distance 0.015263179788141679
Environmental Effect of Humans: within 0.30557212233543396, out 0.31987728997154286, distance -0.014305167636108895


 ... (more hidden) ...

Individual Behavior: within 0.32686239489494195, out 0.3044376450117449, distance 0.022424749883197037


 ... (more hidden) ...

Biomedical or Dental Material: within 0.3829471029163826, out 0.335281978073513, distance 0.04766512484286961


 ... (more hidden) ...

Gene or Genome: within 0.3359297186136246, out 0.3300844598955505, distance 0.0058452587180740645


 ... (more hidden) ...

Plant: within 0.4119572660705377, out 0.3575382448088179, distance 0.05441902126171977


 ... (more hidden) ...

Social Behavior: within 0.25000224564047085, out 0.2698099972385943, distance -0.019807751598123424


 ... (more hidden) ...

Finding: within 0.31428306695051533, out 0.3078153126540944, distance 0.006467754296420936


 ... (more hidden) ...

Antibiotic: within 0.4165744306458757, out 0.3333402788841421, distance 0.0832341517617336


 ... (more hidden) ...

Inorganic Chemical: within 0.4823627831548258, out 0.3785492004877973, distance 0.1038135826670285


 ... (more hidden) ...

Biomedical Occupation or Discipline: within 0.34957715077541085, out 0.305214719517335, distance 0.044362431258075874


 ... (more hidden) ...

Diagnostic Procedure: within 0.33122343735267773, out 0.28575477493892015, distance 0.04546866241375758


 ... (more hidden) ...

Occupation or Discipline: within 0.361173888342455, out 0.340014938276847, distance 0.02115895006560803
Biologic Function: within 0.2053917944431305, out 0.28142290315254975, distance -0.07603110870941926


 ... (more hidden) ...

Sign or Symptom: within 0.39704506520229, out 0.3306880701042806, distance 0.06635699509800941


 ... (more hidden) ...

Element, Ion, or Isotope: within 0.4588092557258076, out 0.3769434276132241, distance 0.08186582811258353


 ... (more hidden) ...

Injury or Poisoning: within 0.2869607746400512, out 0.2989403032114906, distance -0.011979528571439413


 ... (more hidden) ...

Natural Phenomenon or Process: within 0.30829709226838053, out 0.3094667319732678, distance -0.0011696397048872598
Machine Activity: within 0.2791351030270259, out 0.28917633584739966, distance -0.01004123282037378


 ... (more hidden) ...

Daily or Recreational Activity: within 0.38997776364232156, out 0.3496648511790053, distance 0.04031291246331625


 ... (more hidden) ...

Medical Device: within 0.4339772335902895, out 0.3631240446189018, distance 0.0708531889713877


 ... (more hidden) ...

Manufactured Object: within 0.3195514083217756, out 0.3146034254294815, distance 0.004947982892294089


 ... (more hidden) ...

Food: within 0.5712416891761014, out 0.40681436263765997, distance 0.16442732653844144


 ... (more hidden) ...

Idea or Concept: within 0.32114183592790435, out 0.3148258099797145, distance 0.006316025948189863


 ... (more hidden) ...

Body Substance: within 0.29398395139233846, out 0.2939546464244374, distance 2.9304967901044865e-05


 ... (more hidden) ...

Vitamin: within 0.3818649800324982, out 0.3241910881564779, distance 0.05767389187602029


 ... (more hidden) ...

Organism Attribute: within 0.4297139300887162, out 0.3596279486840658, distance 0.07008598140465044


 ... (more hidden) ...

Laboratory Procedure: within 0.3507550207783079, out 0.33024339011806547, distance 0.02051163066024242


 ... (more hidden) ...

Cell: within 0.460407746931491, out 0.37709851961180346, distance 0.08330922731968754


 ... (more hidden) ...

Organ or Tissue Function: within 0.2906977299371383, out 0.303948880048197, distance -0.013251150111058652


 ... (more hidden) ...

Clinical Attribute: within 0.40344896001923947, out 0.34994650264705335, distance 0.05350245737218612


 ... (more hidden) ...

Research Activity: within 0.29819182699671376, out 0.25847462963164825, distance 0.039717197365065515


 ... (more hidden) ...

Cell Function: within 0.4520566812293096, out 0.363440062827442, distance 0.08861661840186758


 ... (more hidden) ...

Cell Component: within 0.40402088775521233, out 0.36100536748280293, distance 0.0430155202724094


 ... (more hidden) ...

Health Care Related Organization: within 0.37195969488133085, out 0.31769129229978815, distance 0.054268402581542696
Mammal: within 0.6347400546073914, out 0.4432473351849052, distance 0.19149271942248614


 ... (more hidden) ...

Cell or Molecular Dysfunction: within 0.2679698342191322, out 0.2894006595186173, distance -0.021430825299485146


 ... (more hidden) ...

Classification: within 0.32787467366437506, out 0.31664193085923914, distance 0.01123274280513592


 ... (more hidden) ...

Embryonic Structure: within 0.08227378688752651, out 0.1907486420134271, distance -0.10847485512590058
Nucleotide Sequence: within 0.130400151014328, out 0.18381244679944425, distance -0.05341229578511625


 ... (more hidden) ...

Anatomical Abnormality: within 0.24630807916678132, out 0.26559392293135436, distance -0.01928584376457304


 ... (more hidden) ...

Qualitative Concept: within 0.30979206792816716, out 0.3135466545352594, distance -0.003754586607092214


 ... (more hidden) ...

Professional or Occupational Group: within 0.37544904653279565, out 0.3172672892420807, distance 0.05818175729071495
Group: within 0.41867607831954956, out 0.36974655934156486, distance 0.0489295189779847


 ... (more hidden) ...

Body System: within 0.2357291164142745, out 0.2394139479758651, distance -0.003684831561590607


 ... (more hidden) ...

Educational Activity: within 0.367653689568951, out 0.33880444755218525, distance 0.028849242016765753


 ... (more hidden) ...

Spatial Concept: within 0.3784589394927025, out 0.359647689742493, distance 0.01881124975020948


 ... (more hidden) ...

Virus: within 0.2086123449727893, out 0.25379359652495176, distance -0.04518125155216246


 ... (more hidden) ...

Organization: within 0.30415075942873954, out 0.27562479993838834, distance 0.028525959490351194


 ... (more hidden) ...

Genetic Function: within 0.4043901190161705, out 0.348883128804463, distance 0.05550699021170752


 ... (more hidden) ...

Family Group: within 0.3778652229479381, out 0.31823815548869816, distance 0.05962706745923996
Eukaryote: within 0.10145740086833636, out 0.21884517044946245, distance -0.11738776958112608


 ... (more hidden) ...

Governmental or Regulatory Activity: within 0.26438333590825397, out 0.3245517685107595, distance -0.06016843260250554


 ... (more hidden) ...

Regulation or Law: within 0.4303968608379364, out 0.38133614422378365, distance 0.04906071661415273


 ... (more hidden) ...

Physiologic Function: within 0.3854515825482932, out 0.32590920251038613, distance 0.05954238003790707


 ... (more hidden) ...

Body Space or Junction: within 0.2564708764354388, out 0.27428788537142257, distance -0.017817008935983758


 ... (more hidden) ...

Activity: within 0.28729649260640144, out 0.319978195667008, distance -0.03268170306060658


 ... (more hidden) ...

Human-caused Phenomenon or Process: within 0.2949708513915539, out 0.3166748973873766, distance -0.021704045995822707


 ... (more hidden) ...

Population Group: within 0.309857558068775, out 0.27244950880725144, distance 0.03740804926152358


 ... (more hidden) ...

Molecular Function: within 0.43335805584987, out 0.3564678173174114, distance 0.07689023853245863


 ... (more hidden) ...

Patient or Disabled Group: within 0.23249473209892, out 0.2579443582285464, distance -0.025449626129626396
Molecular Biology Research Technique: within 0.5643023749192556, out 0.3625813801387244, distance 0.20172099478053124


 ... (more hidden) ...

Bird: within 0.4409221410751343, out 0.4001682532168579, distance 0.04075388785827638


 ... (more hidden) ...

Functional Concept: within 0.33309671804308894, out 0.34312298386271123, distance -0.010026265819622293


 ... (more hidden) ...

Receptor: within 0.2777404352091253, out 0.29326709964005576, distance -0.015526664430930481
Chemical Viewed Structurally: within 0.3042570650577545, out 0.2871724414787943, distance 0.017084623578960234


 ... (more hidden) ...

Chemical Viewed Functionally: within 0.2149998595317205, out 0.28677115088016764, distance -0.07177129134844715
Acquired Abnormality: within 0.4173906147480011, out 0.3631991917941964, distance 0.05419142295380469


 ... (more hidden) ...

Laboratory or Test Result: within 0.16570983193814753, out 0.23367634118860464, distance -0.06796650925045711
0.027127933515222327





0.027127933515222327

# Visualization

In [21]:
from whatlies.embedding import Embedding
from whatlies.embeddingset import EmbeddingSet

In [24]:
# Build a whatlies EmbeddingSet keyed by the readable name of every vocabulary entry.
# The readable name is resolved once per entry and reused for both the dict key and
# the Embedding label, so the two always match and the mapper lookup is not repeated.
# NOTE(review): `vecs.vocab` looks like the gensim < 4 KeyedVectors attribute — confirm
# the installed version before upgrading gensim.
named_embeddings = {}
for concept in vecs.vocab:
    readable_name = umls_mapper.un_umls(concept, single_return=True)
    # Entries that resolve to the same readable name overwrite each other (last one wins).
    named_embeddings[readable_name] = Embedding(readable_name, vecs[concept])
emb = EmbeddingSet(named_embeddings)

# Alternative, currently disabled: key the set by the raw vocabulary entries instead
# of their readable names.
# emb = EmbeddingSet({c: Embedding(c, vecs[c]) for c in vecs.vocab})

# Interactive plot with the "Cisplatin" and "Carboplatin" embeddings passed as the
# two axis arguments; kept as the last expression so the chart is rendered.
emb.plot_interactive("Cisplatin","Carboplatin")