In [2]:
import math

import gensim
from tqdm import tqdm

from UMLS import UMLSMapper, UMLSEvaluator
from embeddings import Embeddings
from evaluate_embeddings import Benchmark

from vectorize_data import lines_from_file, preprocess

# General

In [3]:
# Run configuration.
# `embedding_name` is reused further down when the vectors are saved and when
# they are loaded again from f"data/{embedding_name}.kv".
embedding_name = "no_prep_vecs_test"

# Mapper built from the local UMLS data directory.
umls_dir = 'E:/AML4DH-DATA/UMLS'
umls_mapper = UMLSMapper(from_dir=umls_dir)

initialize UMLSMapper...


# Vectorization

In [3]:
# Read the guideline sentences from disk and print the first two entries as a
# quick sanity check.
cpg_sentence_path = "E:/AML4DH-DATA/CPG-AMIA2020/Plain Text/cpg-sentences.txt"
cpq_sentences = lines_from_file(path=cpg_sentence_path)
print(cpq_sentences[:2])

['Helicobacter pylori ist der wesentliche Risikofaktor für das Magenkarzinom.', 'Die H. pylori-Eradikation mit dem Ziel der Magenkarzinomprävention sollte bei den folgenden Risikopersonen durchgeführt werden (siehe Tabelle unten).']


In [4]:
# Tokenization: split every sentence on whitespace only, so punctuation stays
# attached to the neighbouring token.
# NOTE: the variable is overwritten with its tokenized form, so this cell can
# only be run once per load (lists have no .split()).
tokenized_sentences = []
for sentence in cpq_sentences:
    tokenized_sentences.append(sentence.split())
cpq_sentences = tokenized_sentences
print(cpq_sentences[:2])

[['Helicobacter', 'pylori', 'ist', 'der', 'wesentliche', 'Risikofaktor', 'für', 'das', 'Magenkarzinom.'], ['Die', 'H.', 'pylori-Eradikation', 'mit', 'dem', 'Ziel', 'der', 'Magenkarzinomprävention', 'sollte', 'bei', 'den', 'folgenden', 'Risikopersonen', 'durchgeführt', 'werden', '(siehe', 'Tabelle', 'unten).']]


In [5]:
# Let the UMLS mapper rewrite the tokenized documents, then print the first two
# documents for inspection (in the recorded run, 'Helicobacter' became 'C0085508').
#
# Optional steps that are currently switched off:
#   before the replacement:
#     cpq_sentences = umls_mapper.standardize_documents(cpq_sentences)
#   after the replacement:
#     cpq_sentences = preprocess(documents=cpq_sentences, lemmatize=True, remove_stopwords=True)
cpq_sentences = umls_mapper.replace_documents_with_umls(
    cpq_sentences
)
print(cpq_sentences[:2])

[['C0085508', 'pylori', 'ist', 'der', 'wesentliche', 'Risikofaktor', 'für', 'das', 'Magenkarzinom.'], ['Die', 'H.', 'pylori-Eradikation', 'mit', 'dem', 'Ziel', 'der', 'Magenkarzinomprävention', 'sollte', 'bei', 'den', 'folgenden', 'Risikopersonen', 'durchgeführt', 'werden', '(siehe', 'Tabelle', 'unten).']]


In [6]:
# Compute the embedding vectors from the prepared documents (phrase handling
# disabled via use_phrases=False) ...
vecs = Embeddings.calculate_vectors(
    cpq_sentences,
    use_phrases=False,
)
# ... and store them under the configured embedding name.
Embeddings.save_medical(
    vecs,
    embedding_name,
    umls_mapper,
)

 ... (more hidden) ...


# Benchmarks

In [4]:
# Load the stored vectors for the configured embedding name.
embedding_path = f"data/{embedding_name}.kv"
vecs = Embeddings.load(path=embedding_path)

load embedding...


In [5]:
# Sanity check: print the entries most similar to "Cisplatin".
# The query key is looked up in the mapper's umls_dict; each hit is passed
# through un_umls before printing (the recorded output shows readable names).
cisplatin_key = umls_mapper.umls_dict["Cisplatin"]
for neighbour_key, similarity in vecs.most_similar(cisplatin_key):
    print(umls_mapper.un_umls(neighbour_key), similarity)

# NOTE(review): judging by the method name this keeps only the vectors that
# belong to UMLS concepts -- confirm against UMLSMapper.
concept_vecs = umls_mapper.get_umls_vectors_only(vecs)

Carboplatin 0.8703843355178833
Paclitaxel 0.8661205768585205
Etoposid 0.8592477440834045
Docetaxel 0.8568588495254517
Capecitabin 0.8505868911743164
Oxaliplatin 0.8471230268478394
Rituximab 0.8007310628890991
Cyclophosphamid 0.7927492260932922
Bendamustin 0.7917323708534241
Bleomycin 0.7808853387832642


In [6]:
# Evaluator built from the same UMLS directory as the mapper, bound to the
# loaded vectors.
evaluator = UMLSEvaluator(
    from_dir='E:/AML4DH-DATA/UMLS',
    vectors=vecs,
)

initialize UMLSEvaluator...


In [7]:
# Bundle vectors, mapper and evaluator into one benchmark object and run the
# Choi benchmark; the recorded run prints one score per semantic category.
benchmark = Benchmark(
    vecs,
    umls_mapper,
    evaluator,
)
benchmark.choi_benchmark()

Pharmacologic Substance: 4.020107598754371
Disease or Syndrome: 1.9983221957602162
Neoplastic Process: 2.453319315938585
Clinical Drug: 0
Finding: 0.6869696229689413
Injury or Poisoning: 0.11424715174873927


In [None]:
# Run the benchmark over all categories.
# Long-running: the recorded progress bar counts 113 categories, some of them
# taking 20+ seconds each, and the saved output stops after 13 of them.
benchmark.all_categories_benchmark()

  1%|▊                                                                                             | 1/113 [00:22<42:00, 22.50s/it]

Organic Chemical: within 0.3915169490481498, out 0.305338682827962, distance 0.08617826622018782


  2%|█▋                                                                                            | 2/113 [00:49<44:13, 23.90s/it]

Pharmacologic Substance: within 0.3789551859901992, out 0.29874639015246496, distance 0.08020879583773421


  3%|██▍                                                                                           | 3/113 [00:51<31:50, 17.37s/it]

Hazardous or Poisonous Substance: within 0.390809279159996, out 0.3314034615557065, distance 0.059405817604289535


  4%|███▎                                                                                          | 4/113 [00:53<22:59, 12.66s/it]

Nucleic Acid, Nucleoside, or Nucleotide: within 0.44585926748311566, out 0.3401739987439942, distance 0.10568526873912149


  4%|████▏                                                                                         | 5/113 [00:55<17:06,  9.51s/it]

Body Location or Region: within 0.3416425808878808, out 0.27627016634603657, distance 0.06537241454184423


  5%|████▉                                                                                         | 6/113 [00:56<12:15,  6.87s/it]

Congenital Abnormality: within 0.35169234362741314, out 0.31561236441217433, distance 0.03607997921523881


  6%|█████▊                                                                                        | 7/113 [00:56<08:43,  4.94s/it]

Embryonic Structure: within 0.09719399275879065, out 0.17036290808770294, distance -0.07316891532891229


  7%|██████▋                                                                                       | 8/113 [01:22<19:31, 11.16s/it]

Disease or Syndrome: within 0.3477938972100944, out 0.3084560679601076, distance 0.0393378292499868


  8%|███████▍                                                                                      | 9/113 [01:29<17:12,  9.93s/it]

Pathologic Function: within 0.3330391943957378, out 0.2961669953195642, distance 0.03687219907617362


  9%|████████▏                                                                                    | 10/113 [01:30<12:15,  7.14s/it]

Phenomenon or Process: within 0.2660910255379147, out 0.272190202521457, distance -0.006099176983542287


 10%|█████████                                                                                    | 11/113 [01:41<14:24,  8.47s/it]

Body Part, Organ, or Organ Component: within 0.365569148944823, out 0.2956578052258535, distance 0.06991134371896951


 11%|█████████▉                                                                                   | 12/113 [01:42<10:27,  6.21s/it]

Indicator, Reagent, or Diagnostic Aid: within 0.35262886360765267, out 0.3102688798836499, distance 0.04235998372400279


 12%|██████████▋                                                                                  | 13/113 [01:48<09:55,  5.95s/it]

Biologically Active Substance: within 0.4229842477407426, out 0.34728987612326495, distance 0.07569437161747766


# Visualization

In [21]:
from whatlies.embedding import Embedding
from whatlies.embeddingset import EmbeddingSet

In [24]:
# Build a whatlies EmbeddingSet labelled with readable concept names and open
# the interactive plot, passing "Cisplatin" and "Carboplatin" as the two
# plot_interactive arguments.

# gensim >= 4 exposes the vocabulary of KeyedVectors as `key_to_index`, older
# releases as `vocab`; accept either so the cell survives a gensim upgrade.
# NOTE(review): assumes `vecs` is a gensim KeyedVectors-like object -- confirm.
vocabulary = getattr(vecs, "key_to_index", None)
if vocabulary is None:
    vocabulary = vecs.vocab

named_embeddings = {}
for concept in vocabulary:
    # Resolve the display name once and reuse it for both the dict key and the
    # Embedding label instead of calling un_umls twice per entry.
    display_name = umls_mapper.un_umls(concept, single_return=True)
    named_embeddings[display_name] = Embedding(display_name, vecs[concept])
emb = EmbeddingSet(named_embeddings)

# Variant that keeps the raw vocabulary keys as labels:
# emb = EmbeddingSet({c: Embedding(c, vecs[c]) for c in vecs.vocab})
emb.plot_interactive("Cisplatin","Carboplatin")