In [1]:
import math

import gensim
from tqdm import tqdm

from UMLS import UMLSMapper, UMLSEvaluator
from embeddings import Embeddings
from evaluate_embeddings import Benchmark

from vectorize_data import lines_from_file, preprocess

# General

In [2]:
embedding_name = "no_prep_vecs_test_all"
umls_mapper = UMLSMapper(from_dir='E:/AML4DH-DATA/UMLS')

initialize UMLSMapper...


# Vectorization

In [3]:
cpq_sentences = lines_from_file(path="E:/AML4DH-DATA/CPG-AMIA2020/Plain Text/cpg-sentences.txt")
print(cpq_sentences[:2])

['Helicobacter pylori ist der wesentliche Risikofaktor für das Magenkarzinom.', 'Die H. pylori-Eradikation mit dem Ziel der Magenkarzinomprävention sollte bei den folgenden Risikopersonen durchgeführt werden (siehe Tabelle unten).']


In [4]:
# tokenization
cpq_sentences = [sentence.split() for sentence in cpq_sentences]
print(cpq_sentences[:2])

[['Helicobacter', 'pylori', 'ist', 'der', 'wesentliche', 'Risikofaktor', 'für', 'das', 'Magenkarzinom.'], ['Die', 'H.', 'pylori-Eradikation', 'mit', 'dem', 'Ziel', 'der', 'Magenkarzinomprävention', 'sollte', 'bei', 'den', 'folgenden', 'Risikopersonen', 'durchgeführt', 'werden', '(siehe', 'Tabelle', 'unten).']]


In [5]:
# cpq_sentences = umls_mapper.standardize_documents(cpq_sentences)
cpq_sentences = umls_mapper.replace_documents_with_umls(cpq_sentences)
# cpq_sentences = preprocess(documents=cpq_sentences, lemmatize=True, remove_stopwords=True)
print(cpq_sentences[:2])

[['C0085508', 'pylori', 'ist', 'der', 'wesentliche', 'Risikofaktor', 'für', 'das', 'Magenkarzinom.'], ['Die', 'H.', 'pylori-Eradikation', 'mit', 'dem', 'Ziel', 'der', 'Magenkarzinomprävention', 'sollte', 'bei', 'den', 'folgenden', 'Risikopersonen', 'durchgeführt', 'werden', '(siehe', 'Tabelle', 'unten).']]


In [6]:
vecs = Embeddings.calculate_vectors(cpq_sentences, use_phrases=False)
Embeddings.save_medical(vecs, embedding_name, umls_mapper)

 ... (more hidden) ...


# Benchmarks

In [3]:
vecs = Embeddings.load(path=f"data/{embedding_name}.kv")

load embedding...


In [6]:
for c, v in vecs.most_similar(umls_mapper.umls_dict["Cisplatin"]):
    print(umls_mapper.un_umls(c), v)
concept_vecs = umls_mapper.get_umls_vectors_only(vecs)

Carboplatin 0.8703843355178833
Paclitaxel 0.8661205768585205
Etoposid 0.8592477440834045
Docetaxel 0.8568588495254517
Capecitabin 0.8505868911743164
Oxaliplatin 0.8471230268478394
Gemcitabin 0.8366334438323975
Rituximab 0.8007310628890991
Cyclophosphamid 0.7927492260932922
Bendamustin 0.7917323708534241


In [4]:
evaluator = UMLSEvaluator(from_dir='E:/AML4DH-DATA/UMLS', vectors=vecs)

initialize UMLSEvaluator...


In [8]:
benchmark = Benchmark(vecs, umls_mapper, evaluator)
benchmark.choi_benchmark()

Pharmacologic Substance: 1.5737781924665222
Disease or Syndrome: 0.3958950505912594
Neoplastic Process: 0.8788805300906921
Clinical Drug: 0
Finding: 0.20033652598776291
Injury or Poisoning: 0.017509969002479572


In [9]:
benchmark.all_categories_benchmark()

Clinical Drug: 0.0000|0.0000|0.0000: 100%|███████████████████████████████████████████████████████| 113/113 [05:24<00:00,  2.87s/it]

0.04218759376296209





0.04218759376296209

In [None]:
import math
from typing import Tuple

import gensim
from tqdm import tqdm

from UMLS import UMLSMapper, UMLSEvaluator
from embeddings import Embeddings
class Benchmark:
    def __init__(self, embeddings: gensim.models.KeyedVectors, umls_mapper: UMLSMapper, umls_evaluator: UMLSEvaluator):
        self.embeddings = embeddings
        self.umls_mapper = umls_mapper
        self.umls_evaluator = umls_evaluator

    def cosine(self, word1=None, word2=None, c1=None, c2=None):
        if word1:
            return self.embeddings.similarity(self.umls_mapper.umls_dict[word1], self.umls_mapper.umls_dict[word2])
        else:
            return self.embeddings.similarity(c1, c2)

    def pairwise_cosine(self, concepts1, concepts2=None):
        if concepts2 is None:
            concepts2 = concepts1
            s = 0
            count = -1
            for i, concept1 in enumerate(concepts1):
                for j, concept2 in enumerate(concepts2):
                    if j > i:
                        c = self.cosine(c1=concept1, c2=concept2)
                        if c < 0:
                            c = -c
                        s += c
                        count += 1

            return s / count
        else:
            s = 0
            count = 0
            for i, concept1 in enumerate(concepts1):
                for j, concept2 in enumerate(concepts2):
                    c = self.cosine(c1=concept1, c2=concept2)
                    if c < 0:
                        c = -c
                    s += c
                    count += 1
            return s / count

    def silhouette(self, term, category):
        def mean_between_distance(datapoint, same_cluster):
            sigma_ai = 0
            for reference_point in same_cluster:
                if datapoint == reference_point:
                    continue
                sigma_ai += self.cosine(c1=datapoint, c2=reference_point)

            return sigma_ai / (len(same_cluster)-1)

        def smallest_mean_out_distance(datapoint, other_clusters):
            distances = []
            for other_cluster in other_clusters:
                sigma_bi = 0
                for other_reference_point in other_cluster:
                    sigma_bi += self.cosine(c1=datapoint, c2=other_reference_point)
                sigma_bi = sigma_bi / len(other_cluster)
                distances.append(sigma_bi)
            return min(distances)

        cluster_name = category
        cluster = self.umls_evaluator.category2concepts[cluster_name]
        other_cluster_names = set(self.umls_evaluator.category2concepts.keys()).difference(cluster_name)
        other_clusters = [self.umls_evaluator.category2concepts[category] for category in other_cluster_names]

        a_i = mean_between_distance(term, cluster)
        b_i = smallest_mean_out_distance(term, other_clusters)
        print(a_i, b

In [71]:
import math
from typing import Tuple

import gensim
from tqdm import tqdm

from UMLS import UMLSMapper, UMLSEvaluator
from embeddings import Embeddings
class Benchmark:
    def __init__(self, embeddings: gensim.models.KeyedVectors, umls_mapper: UMLSMapper, umls_evaluator: UMLSEvaluator):
        self.embeddings = embeddings
        self.umls_mapper = umls_mapper
        self.umls_evaluator = umls_evaluator

    def cosine(self, word1=None, word2=None, c1=None, c2=None):
    
        if word1:
            cos = self.embeddings.similarity(self.umls_mapper.umls_dict[word1], self.umls_mapper.umls_dict[word2])
        else:
            cos = self.embeddings.similarity(c1, c2)
        if cos < 0:
            return -cos
        else:
            return cos

    def pairwise_cosine(self, concepts1, concepts2=None):
        if concepts2 is None:
            concepts2 = concepts1
            s = 0
            count = -1
            for i, concept1 in enumerate(concepts1):
                for j, concept2 in enumerate(concepts2):
                    if j > i:
                        c = self.cosine(c1=concept1, c2=concept2)
                        if c < 0:
                            c = -c
                        s += c
                        count += 1

            return s / count
        else:
            s = 0
            count = 0
            for i, concept1 in enumerate(concepts1):
                for j, concept2 in enumerate(concepts2):
                    c = self.cosine(c1=concept1, c2=concept2)
                    if c < 0:
                        c = -c
                    s += c
                    count += 1
            return s / count

    def silhouette(self, term, category):
        def mean_between_distance(datapoint, same_cluster):
            sigma_ai = 0
            for reference_point in same_cluster:
                if datapoint == reference_point:
                    continue
                sigma_ai += self.cosine(c1=datapoint, c2=reference_point)

            return sigma_ai / (len(same_cluster)-1)

        def smallest_mean_out_distance(datapoint, other_clusters):
            distances = []
            for other_cluster in other_clusters:
                sigma_bi = 0
                for other_reference_point in other_cluster:
                    sigma_bi += self.cosine(c1=datapoint, c2=other_reference_point)
                sigma_bi = sigma_bi / len(other_cluster)
                distances.append(sigma_bi)
            return min(distances)

        
        cluster = self.umls_evaluator.category2concepts[category]
        other_cluster_names = set(self.umls_evaluator.category2concepts.keys()).difference(category)
        other_clusters = [self.umls_evaluator.category2concepts[category] for category in other_cluster_names]

        a_i = mean_between_distance(term, cluster)
        b_i = smallest_mean_out_distance(term, other_clusters)
        
        if a_i < b_i:
            s_i = 1 - a_i / b_i
        elif a_i == b_i:
            s_i = 0
        else:
            s_i = b_i / a_i - 1
        print(s_i)
        return s_i

    def silhouette_coefficient(self):
        s_is = []
        categories = tqdm(self.umls_evaluator.category2concepts.keys())
        for category in categories:
            category_concepts = self.umls_evaluator.category2concepts[category]
            if len(category_concepts) < 2:
                continue
                
            category_s_is = []
            for term in category_concepts:
                category_s_is.append(self.silhouette(term, category))
                
            mean_category_s_i = sum(category_s_is)/len(category_s_is)
            s_is.append(mean_category_s_i)
            categories.set_description(f"{category}: {mean_category_s_i:.4f}")
            categories.refresh()  # to show immediately the update
        return max(s_is)

    def category_benchmark(self, choosen_category):
        other_categories = self.umls_evaluator.category2concepts.keys()
        choosen_concepts = self.umls_evaluator.category2concepts[choosen_category]
        if len(choosen_concepts) <= 1:
            return 0, 0, 0
        p1 = self.pairwise_cosine(choosen_concepts)

        p2s = []
        for other_category in other_categories:
            if other_category == choosen_category:
                continue

            other_concepts = self.umls_evaluator.category2concepts[other_category]
            if len(choosen_concepts) == 0 or len(other_concepts) == 0:
                continue
            p2 = self.pairwise_cosine(choosen_concepts, other_concepts)
            p2s.append(p2)

        avg_p2 = sum(p2s) / len(p2s)
        return p1, avg_p2, p1 - avg_p2

    def all_categories_benchmark(self):

        distances = []
        categories = tqdm(self.umls_evaluator.category2concepts.keys())
        for category in categories:
            within, out, distance = self.category_benchmark(category)
            distances.append(distance)
            categories.set_description(f"{category}: {within:.4f}|{out:.4f}|{distance:.4f}")
            categories.refresh()  # to show immediately the update

        benchmark_value = sum(distances) / len(distances)
        print(benchmark_value)
        return benchmark_value

    def mcsm(self, category, k=40):
        def category_true(concept, category):
            if category in self.umls_evaluator.concept2category[concept]:
                return 1
            else:
                return 0

        v_t = self.umls_evaluator.category2concepts[category]
        if len(v_t) == 0:
            return 0

        sigma = 0
        for v in v_t:
            for i in range(0, k):
                neighbors = self.embeddings.most_similar(v, topn=k)
                v_i = neighbors[i][0]
                sigma += category_true(v_i, category) / math.log((i + 1) + 1, 2)
        return sigma / len(v_t)

    def mrm(self, r, seed_pair: Tuple[str, str], k=40):
        def relation_true(concepts, relation):
            for concept in concepts:
                if concept in relation:
                    return 1
            return 0

        s = self.embeddings.get_vector(seed_pair[0])-self.embeddings.get_vector(seed_pair[1])
        # todo: choose v_star correctly and add ndf_rt source
        v_star = self.umls_evaluator.category2concepts["category"]
        sigma = 0
        for v in v_star:
            union = {}
            for i in range(0, k):
                union.update(self.embeddings.get_vector(v)-self.embeddings.get_vector(s))
            sigma += relation_true(union, r)
        return sigma / len(v_star)

    def choi_benchmark(self):
        categories = ['Pharmacologic Substance',
                      'Disease or Syndrome',
                      'Neoplastic Process',
                      'Clinical Drug',
                      'Finding',
                      'Injury or Poisoning'
                      ]

        for category in categories:
            print(f'{category}: {self.mcsm(category)}')

In [72]:
benchmark = Benchmark(vecs, umls_mapper, evaluator)
benchmark.silhouette_coefficient()

  0%|                                                                                                      | 0/112 [00:00<?, ?it/s]

-0.8224483387402368
-0.9494653332114419
-0.7653840183583714
-0.9002091877577203
-0.8821249114040953
-0.8398321647351656
-0.8897993079988924
-0.9088868028141113
-0.8968104383231753
-0.9904321670373168
-0.789819651226996
-0.8537875325859774
-0.9395122215766099
-0.94776857182935
-0.9949362535788326
-0.9857135202519929
-0.9254226784064608
-0.9000307869527762
-0.9334629422537019
-0.9471922169603378
-0.8132496462148953
-0.9809434667236335
-0.9252236554022195
-0.8532204160688422
-0.8807414813826049
-0.8007915995389475
-0.9481412537969961
-0.9329217465646198
-0.7155202503276163
-0.8797363431250207
-0.9777483036948237
-0.8827037861775373
-0.9479733043568066
-0.9957594734958314
-0.8236633903993927
-0.970849537290695
-0.8071824714780562
-0.9427757662121005
-0.8076039589613591
-0.9773402221084012
-0.9937648901557639
-0.979907592809577
-0.9998977345243386
-0.7874402788607728
-0.8957511672044188
-0.9994555940433997
-0.9431080527937445
-0.7468016016282164
-0.7764326103350835
-0.9712138749382658
-0.80

Organic Chemical: -0.8896:   1%|▌                                                                  | 1/112 [00:27<50:05, 27.07s/it]

-0.9161820051153077
-0.825500362580996
-0.8553846103369677
-0.8968161080502636
-0.9147538337467446
-0.881338920138247
-0.9380687941007244
-0.9987192976560455
-0.8367419282985975
-0.9089754049756039
-0.8013428366011052
-0.8940715511129199
-0.9391053743893069
-0.8496746246351123
-0.6903197535136607
-0.9470885529434286
-0.9949121707874609
-0.9854882652394971
-0.8123723764933004
-0.9316238088058759
-0.9243609950820554
-0.7427128824036
-0.8237383882334657
-0.8971847528188165
-0.9538415235382165
-0.8530196256022066
-0.9916604820553546
-0.8730186974872127
-0.919948133466504
-0.9832440418524202
-0.9429273732029835
-0.9860886157581029
-0.8936890373210511
-0.6820988844548126
-0.9082053643438179
-0.9063045435227957
-0.820117969798501
-0.9636380754742017
-0.7737835089624138
-0.9899249468943994
-0.8499982153641364
-0.8750303030697254
-0.6714438041391066
-0.9340304142422216
-0.9460884013099212
-0.8095230024802376
-0.9095692229963984
-0.980553761066496
-0.849180290439665
-0.8783179758535886
-0.796636

Organic Chemical: -0.8896:   1%|▌                                                                | 1/112 [00:38<1:11:11, 38.48s/it]


KeyboardInterrupt: 

# Visualization

In [21]:
from whatlies.embedding import Embedding
from whatlies.embeddingset import EmbeddingSet

In [24]:
emb = EmbeddingSet({umls_mapper.un_umls(c, single_return=True): Embedding(umls_mapper.un_umls(c, single_return=True), vecs[c]) for c in vecs.vocab})
# emb = EmbeddingSet({c: Embedding(c, vecs[c]) for c in vecs.vocab})
emb.plot_interactive("Cisplatin","Carboplatin")