<a href="https://colab.research.google.com/github/HodaMemar/Patient-Similarity-through-Representation/blob/main/Semantic_Similarity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
!pip install transformers
from transformers import AutoConfig, AutoModel,AutoTokenizer



In [5]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
#import sent2vec

from numpy import dot
from numpy.linalg import norm

In [6]:



def get_bert_based_similarity(sentence_pairs, model, tokenizer):
    """
    computes the embeddings of each sentence and its similarity with its corresponding pair
    Args:
        sentence_pairs(dict): dictionary of lists with the similarity type as key and a list of two sentences as value
        model: the language model
        tokenizer: the tokenizer to consider for the computation

    Returns:
        similarities(dict): dictionary with similarity type as key and the similarity measure as value
    """
    similarities = dict()
    for sim_type, sent_pair in sentence_pairs.items():
        inputs_1 = tokenizer(sent_pair[0], return_tensors='pt')
        inputs_2 = tokenizer(sent_pair[1], return_tensors='pt')
        sent_1_embed = np.mean(model(**inputs_1).last_hidden_state[0].detach().numpy(), axis=0)
        sent_2_embed = np.mean(model(**inputs_2).last_hidden_state[0].detach().numpy(), axis=0)
        similarities[sim_type] = dot(sent_1_embed, sent_2_embed)/(norm(sent_1_embed)* norm(sent_2_embed))
    return similarities





#Load Data

In [7]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [8]:
cd /content/drive/MyDrive/csv

/content/drive/MyDrive/csv


In [9]:
df=pd.read_csv('df_canonical_name_TFIDF_l2.csv')

In [7]:
df.head()

Unnamed: 0.1,Unnamed: 0,HADM_ID,canonical_name,len
0,0,183196,"Empyema Aortic Aneurysm, Abdominal gallbladder...",677
1,1,113704,Mediastinal Emphysema Duodenal Ulcer Erythema ...,138
2,2,184752,Cellulitis Obesity Panus Malignant Vaginal Neo...,482
3,3,141137,Thyroid dysfunction Diabetes Bundle-Branch Blo...,166
4,4,130564,Lung Abscess Pyelonephritis Pressure Ulcer Sho...,189


# UmlsBERT model

https://github.com/gmichalo/UmlsBERT

In [11]:
!wget -O umlsbert.tar.xz https://www.dropbox.com/s/kziiuyhv9ile00s/umlsbert.tar.xz?dl=0

--2023-11-09 06:09:57--  https://www.dropbox.com/s/kziiuyhv9ile00s/umlsbert.tar.xz?dl=0
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: /s/raw/kziiuyhv9ile00s/umlsbert.tar.xz [following]
--2023-11-09 06:09:57--  https://www.dropbox.com/s/raw/kziiuyhv9ile00s/umlsbert.tar.xz
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://uc920029866c142dab81fe486c71.dl.dropboxusercontent.com/cd/0/inline/CHPvXGCo4pSIx6wYJoQw_UOiat2iGu5MmghWAuODhlkyezVN5t9RGEfAd2fz4CThgqDZq2P8XabhrnK-tSXMisGLpZKIL79fr3l30vFjyaD-AA7kv5zBE7oRqiTFtulXgmlgnvTc6akBr33kxbG9Oapw/file# [following]
--2023-11-09 06:09:57--  https://uc920029866c142dab81fe486c71.dl.dropboxusercontent.com/cd/0/inline/CHPvXGCo4pSIx6wYJoQw_UOiat2iGu5MmghWAuODhlkyezVN5t9RGEfAd2fz4CThgqDZq2P8XabhrnK-tSXMisGL

In [13]:
!tar -xvf  "/content/umlsbert.tar.xz"

tar: /content/umlsbert.tar.xz: Cannot open: No such file or directory
tar: Error is not recoverable: exiting now


In [12]:
if __name__ == "__main__":
    sentence_pairs = {'similar':['mild', 'mild'],
                      'dissimilar': ['mild', 'mild']}
    umls_bert_model = AutoModel.from_pretrained('/content/umlsbert')
    umls_bert_tokenizer = AutoTokenizer.from_pretrained('/content/umlsbert')
    inputs_1 = umls_bert_tokenizer('mild scattered paranasal sinus mucosal thickening is observed', return_tensors='pt')
    sent_1_embed = np.mean(umls_bert_model(**inputs_1).last_hidden_state[0].detach().numpy(), axis=0)
    inputs_2 = umls_bert_tokenizer('mild scattered paranasal sinus  is observed', return_tensors='pt')
    sent_2_embed = np.mean(umls_bert_model(**inputs_2).last_hidden_state[0].detach().numpy(), axis=0)
    #print(get_bert_based_similarity(sentence_pairs, umls_bert_model, umls_bert_tokenizer))


HFValidationError: ignored

In [None]:
%%time
umls_bert_model = AutoModel.from_pretrained('/content/umlsbert')
umls_bert_tokenizer = AutoTokenizer.from_pretrained('/content/umlsbert')
ls_HADM_ID=df.HADM_ID.unique().tolist()
ls_embedding_UmlsBert_Results=[]
for i in ls_HADM_ID:

    ls=[]
    print('\n')
    print(i)
    a=df[df['HADM_ID']==i].canonical_name.tolist()[0]


    try:
        inputs_1 = umls_bert_tokenizer(a, return_tensors='pt')
        sent_1_embed = np.mean(umls_bert_model(**inputs_1).last_hidden_state[0].detach().numpy(), axis=0)
        ls_embedding_UmlsBert_Results.append(['UmlsBert',i,sent_1_embed.tolist()])

    except:
        print("An exception occurred")
        print(i)
        print('\n')

In [None]:
df_umls_bert=pd.DataFrame(ls_embedding_UmlsBert_Results,columns=['Model','HADM_ID','embedding'])
df_umls_bert
df_umls_bert.to_csv('/content/drive/MyDrive/csv/Embeddiing_UmlsBert_TFIDF.csv')

#PubMedBERT

In [10]:
if __name__ == "__main__":
    sentence_pairs = {'similar': ["""Tachycardia Ramus Intermedius Artery Gastrointestinal Hemorrhage Autoimmune hemolytic anemia Anemia, Hemolytic Restenosis Coronary Arteriosclerosis Ventricular hypertrophy Reduced Myocardial Infarction Stenosis Morphology Tachypnea Hypoxia Weakness Respiratory distress Diabetes Mellitus Hypothyroidism Hypotension Congestive heart failure Pain Hypertensive disease Dyspnea""",
                                """Amputation Cardiomyopathies Diverticular Diseases Peripheral Vascular Diseases Diabetes Muscle Weakness Left-Sided Hepatitis B Benign prostatic hypertrophy Diverticulitis Hypothyroidism Kidney Failure, Chronic Hypercholesterolemia Congestive heart failure Chills Abdominal Pain Dyspnea"""],
                      'dissimilar': ['mild scattered paranasal sinus mucosal thickening is observed',
                                   'deformity of the ventral thecal sac is observed']}

    pubmed_bert_model = AutoModel.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
    pubmed_bert_tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
    print(get_bert_based_similarity(sentence_pairs, pubmed_bert_model, pubmed_bert_tokenizer))

{'similar': 0.9793204, 'dissimilar': 0.96878797}


In [11]:
pubmed_bert_model = AutoModel.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')
pubmed_bert_tokenizer = AutoTokenizer.from_pretrained('microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract-fulltext')

In [12]:
%%time

ls_HADM_ID=df.HADM_ID.unique().tolist()
ls_embedding_pubmed_bert_Results=[]
for i in ls_HADM_ID[0:10]:

    ls=[]
    print(i)
    a=df[df['HADM_ID']==i].canonical_name.tolist()[0]


    try:
        inputs_1 = pubmed_bert_tokenizer(a, return_tensors='pt')
        sent_1_embed = np.mean(pubmed_bert_model(**inputs_1).last_hidden_state[0].detach().numpy(), axis=0)
        ls_embedding_pubmed_bert_Results.append(['pubmed_bert',i,sent_1_embed.tolist()])

    except:
        print("An exception occurred")
        print(i)
        print('\n')

183196
113704
184752
141137
130564
195782
162248
197204
177047
104141
CPU times: user 2.3 s, sys: 11.4 ms, total: 2.31 s
Wall time: 2.31 s


In [14]:
df_pubmed_bert=pd.DataFrame(ls_embedding_pubmed_bert_Results,columns=['Model','HADM_ID','embedding'])
df_pubmed_bert
df_pubmed_bert.to_csv('/content/drive/MyDrive/csv/Embeddiing_pubmed_bert_TFIDF.csv')
df_pubmed_bert

Unnamed: 0,Model,HADM_ID,embedding
0,pubmed_bert,183196,"[-0.12977764010429382, -0.09786634147167206, -..."
1,pubmed_bert,113704,"[-0.1491001844406128, -0.06807370483875275, -0..."
2,pubmed_bert,184752,"[-0.10457468032836914, -0.08365943282842636, 0..."
3,pubmed_bert,141137,"[-0.03541595861315727, -0.03448089584708214, -..."
4,pubmed_bert,130564,"[-0.14866535365581512, 0.03329657018184662, -0..."
5,pubmed_bert,195782,"[-0.15756647288799286, -0.02009391412138939, 0..."
6,pubmed_bert,162248,"[-0.2147568315267563, -0.14330296218395233, 0...."
7,pubmed_bert,197204,"[-0.19898676872253418, -0.13567709922790527, 0..."
8,pubmed_bert,177047,"[-0.1807640939950943, -0.029346080496907234, -..."
9,pubmed_bert,104141,"[-0.22416454553604126, -0.0297127366065979, -0..."


#SciBERT

In [16]:
sci_bert_model = AutoModel.from_pretrained('allenai/scibert_scivocab_uncased')
sci_bert_tokenizer = AutoTokenizer.from_pretrained('allenai/scibert_scivocab_uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/228k [00:00<?, ?B/s]

In [17]:
if __name__ == "__main__":
    sentence_pairs = {'similar': ["""Tachycardia Ramus Intermedius Artery Gastrointestinal Hemorrhage Autoimmune hemolytic anemia Anemia, Hemolytic Restenosis Coronary Arteriosclerosis Ventricular hypertrophy Reduced Myocardial Infarction Stenosis Morphology Tachypnea Hypoxia Weakness Respiratory distress Diabetes Mellitus Hypothyroidism Hypotension Congestive heart failure Pain Hypertensive disease Dyspnea""",
                                """Amputation Cardiomyopathies Diverticular Diseases Peripheral Vascular Diseases Diabetes Muscle Weakness Left-Sided Hepatitis B Benign prostatic hypertrophy Diverticulitis Hypothyroidism Kidney Failure, Chronic Hypercholesterolemia Congestive heart failure Chills Abdominal Pain Dyspnea"""],
                       'dissimilar': ['mild scattered paranasal sinus mucosal thickening is observed',
                                   'deformity of the ventral thecal sac is observed']}


    print(get_bert_based_similarity(sentence_pairs, sci_bert_model, sci_bert_tokenizer))
    # output:
    # {'similar': 0.84465593, 'dissimilar': 0.76393396}

{'similar': 0.92050713, 'dissimilar': 0.7639339}


In [None]:
ls_HADM_ID=df.HADM_ID.unique().tolist()
ls_embedding_sci_bert_Results=[]
for i in ls_HADM_ID:

    ls=[]
    print(i)
    a=df[df['HADM_ID']==i].canonical_name.tolist()[0]


    try:
        inputs_1 = sci_bert_tokenizer(a, return_tensors='pt')
        sent_1_embed = np.mean(sci_bert_model(**inputs_1).last_hidden_state[0].detach().numpy(), axis=0)
        ls_embedding_sci_bert_Results.append(['sci_bert',i,sent_1_embed.tolist()])

    except:
        print("An exception occurred")
        print(i)
        print('\n')

NameError: ignored

In [None]:
df_sci_bert=pd.DataFrame(ls_embedding_sci_bert_Results,columns=['Model','HADM_ID','embedding'])
df_sci_bert
df_sci_bert.to_csv('/content/drive/MyDrive/csv/Embeddiing_sci_bert_TFIDF.csv')

#Bio_Clinical BERT

In [None]:
if __name__ == "__main__":
  sentence_pairs = {'similar': ["""Tachycardia Ramus Intermedius Artery Gastrointestinal Hemorrhage Autoimmune hemolytic anemia Anemia, Hemolytic Restenosis Coronary Arteriosclerosis Ventricular hypertrophy Reduced Myocardial Infarction Stenosis Morphology Tachypnea Hypoxia Weakness Respiratory distress Diabetes Mellitus Hypothyroidism Hypotension Congestive heart failure Pain Hypertensive disease Dyspnea""",
                                """Amputation Cardiomyopathies Diverticular Diseases Peripheral Vascular Diseases Diabetes Muscle Weakness Left-Sided Hepatitis B Benign prostatic hypertrophy Diverticulitis Hypothyroidism Kidney Failure, Chronic Hypercholesterolemia Congestive heart failure Chills Abdominal Pain Dyspnea"""],
                       'dissimilar': ['mild scattered paranasal sinus mucosal thickening is observed',
                               'deformity of the ventral thecal sac is observed']}


  print(get_bert_based_similarity(sentence_pairs, bio_clinical_bert_model, bio_clinical_bert_tokenizer))

NameError: ignored

In [None]:
bio_clinical_bert_model = AutoModel.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')
bio_clinical_bert_tokenizer = AutoTokenizer.from_pretrained('emilyalsentzer/Bio_ClinicalBERT')

Downloading:   0%|          | 0.00/385 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/416M [00:00<?, ?B/s]

Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

In [None]:
ls_HADM_ID=df.HADM_ID.unique().tolist()
ls_embedding_bio_clinical_bert_Results=[]
for i in ls_HADM_ID:

    ls=[]
    print(i)
    a=df[df['HADM_ID']==i].canonical_name.tolist()[0]


    try:
        inputs_1 = bio_clinical_bert_tokenizer(a, return_tensors='pt')
        sent_1_embed = np.mean(bio_clinical_bert_model(**inputs_1).last_hidden_state[0].detach().numpy(), axis=0)
        ls_embedding_bio_clinical_bert_Results.append(['bio_clinical_bert',i,sent_1_embed.tolist()])

    except:
        print("An exception occurred")
        print(i)
        print('\n')

In [None]:
df_bio_clinical_bert=pd.DataFrame(ls_embedding_bio_clinical_bert_Results,columns=['Model','HADM_ID','embedding'])
df_bio_clinical_bert
df_bio_clinical_bert.to_csv('/content/drive/MyDrive/csv/Embeddiing_bio_clinical_TFIDF.csv')

#BlueBERT

In [None]:
if __name__ == "__main__":
  sentence_pairs = {'similar': ["""Tachycardia Ramus Intermedius Artery Gastrointestinal Hemorrhage Autoimmune hemolytic anemia Anemia, Hemolytic Restenosis Coronary Arteriosclerosis Ventricular hypertrophy Reduced Myocardial Infarction Stenosis Morphology Tachypnea Hypoxia Weakness Respiratory distress Diabetes Mellitus Hypothyroidism Hypotension Congestive heart failure Pain Hypertensive disease Dyspnea""",
                                """Amputation Cardiomyopathies Diverticular Diseases Peripheral Vascular Diseases Diabetes Muscle Weakness Left-Sided Hepatitis B Benign prostatic hypertrophy Diverticulitis Hypothyroidism Kidney Failure, Chronic Hypercholesterolemia Congestive heart failure Chills Abdominal Pain Dyspnea"""],
                       'dissimilar': ['mild scattered paranasal sinus mucosal thickening is observed',
                               'deformity of the ventral thecal sac is observed']}

  blue_bert_model = AutoModel.from_pretrained('bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12')
  blue_bert_tokenizer = AutoTokenizer.from_pretrained('bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12')
  print(get_bert_based_similarity(sentence_pairs, blue_bert_model, blue_bert_tokenizer))

Downloading:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12 were not used when initializing BertModel: ['cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

{'similar': 0.94773, 'dissimilar': 0.700207}


In [None]:
blue_bert_model = AutoModel.from_pretrained('bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12')
blue_bert_tokenizer = AutoTokenizer.from_pretrained('bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12')

Downloading:   0%|          | 0.00/313 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bionlp/bluebert_pubmed_mimic_uncased_L-12_H-768_A-12 were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

In [None]:
ls_HADM_ID=df.HADM_ID.unique().tolist()
ls_embedding_blue_bert_Results=[]
for i in ls_HADM_ID:

    ls=[]
    print(i)
    a=df[df['HADM_ID']==i].canonical_name.tolist()[0]


    try:
        inputs_1 = blue_bert_tokenizer(a, return_tensors='pt')
        sent_1_embed = np.mean(blue_bert_model(**inputs_1).last_hidden_state[0].detach().numpy(), axis=0)
        ls_embedding_blue_bert_Results.append(['blue_bert',i,sent_1_embed.tolist()])

    except:
        print("An exception occurred")
        print(i)
        print('\n')

In [None]:
df_blue_bert=pd.DataFrame(ls_embedding_blue_bert_Results,columns=['Model','HADM_ID','embedding'])
df_blue_bert

df_blue_bert.to_csv('/content/drive/MyDrive/csv/Embeddiing_blue_bert_TFIDF.csv')

#BioBERT

In [None]:
if __name__ == "__main__":
    sentence_pairs = {'similar': ["""Tachycardia Ramus Intermedius Artery Gastrointestinal Hemorrhage Autoimmune hemolytic anemia Anemia, Hemolytic Restenosis Coronary Arteriosclerosis Ventricular hypertrophy Reduced Myocardial Infarction Stenosis Morphology Tachypnea Hypoxia Weakness Respiratory distress Diabetes Mellitus Hypothyroidism Hypotension Congestive heart failure Pain Hypertensive disease Dyspnea""",
                                """Amputation Cardiomyopathies Diverticular Diseases Peripheral Vascular Diseases Diabetes Muscle Weakness Left-Sided Hepatitis B Benign prostatic hypertrophy Diverticulitis Hypothyroidism Kidney Failure, Chronic Hypercholesterolemia Congestive heart failure Chills Abdominal Pain Dyspnea"""],
                        'dissimilar': ['mild scattered paranasal sinus mucosal thickening is observed',
                                   'deformity of the ventral thecal sac is observed']}


    print(get_bert_based_similarity(sentence_pairs, bio_bert_model, bio_bert_tokenizer))

Downloading:   0%|          | 0.00/462 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

{'similar': 0.96376234, 'dissimilar': 0.87311345}


In [None]:
bio_bert_model = AutoModel.from_pretrained('dmis-lab/biobert-v1.1')
bio_bert_tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-v1.1')

Downloading:   0%|          | 0.00/462 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/413M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [None]:
ls_HADM_ID=df.HADM_ID.unique().tolist()
ls_embedding_bio_bert_Results=[]
for i in ls_HADM_ID:

    ls=[]
    print(i)
    a=df[df['HADM_ID']==i].canonical_name.tolist()[0]


    try:
        inputs_1 = bio_bert_tokenizer(a, return_tensors='pt')
        sent_1_embed = np.mean(bio_bert_model(**inputs_1).last_hidden_state[0].detach().numpy(), axis=0)
        ls_embedding_bio_bert_Results.append(['bio_bert',i,sent_1_embed.tolist()])

    except:
        print("An exception occurred")
        print(i)
        print('\n')

In [None]:
df_BioBert=pd.DataFrame(ls_embedding_bio_bert_Results,columns=['Model','HADM_ID','embedding'])
df_BioBert
df_BioBert.to_csv('/content/drive/MyDrive/csv/Embeddiing_Bio_bert_TFIDF.csv')

#BERT

In [None]:
from numpy import dot
from numpy.linalg import norm
from transformers import BertTokenizer, BertModel


def get_bert_similarity(sentence_pairs):
    """
    computes the embeddings of each sentence and its similarity with its corresponding pair
    Args:
        sentence_pairs(dict): dictionary of lists with the similarity type as key and a list of two sentences as value

    Returns:
        similarities(dict): dictionary with similarity type as key and the similarity measure as value
    """
    similarities = dict()
    for sim_type, sent_pair in sentence_pairs.items():
        inputs_1 = tokenizer(sent_pair[0], return_tensors='pt')
        inputs_2 = tokenizer(sent_pair[1], return_tensors='pt')
        sent_1_embed = model(**inputs_1).last_hidden_state[0][0].detach().numpy()
        sent_2_embed = model(**inputs_2).last_hidden_state[0][0].detach().numpy()
        similarities[sim_type] = dot(sent_1_embed, sent_2_embed)/(norm(sent_1_embed)* norm(sent_2_embed)) # computes the average of all the tokens' last_hidden_state
    return similarities



In [None]:
if __name__ == "__main__":
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')
    sentence_pairs = {'similar': ["""Tachycardia Ramus Intermedius Artery Gastrointestinal Hemorrhage Autoimmune hemolytic anemia Anemia, Hemolytic Restenosis Coronary Arteriosclerosis Ventricular hypertrophy Reduced Myocardial Infarction Stenosis Morphology Tachypnea Hypoxia Weakness Respiratory distress Diabetes Mellitus Hypothyroidism Hypotension Congestive heart failure Pain Hypertensive disease Dyspnea""",
                                """Amputation Cardiomyopathies Diverticular Diseases Peripheral Vascular Diseases Diabetes Muscle Weakness Left-Sided Hepatitis B Benign prostatic hypertrophy Diverticulitis Hypothyroidism Kidney Failure, Chronic Hypercholesterolemia Congestive heart failure Chills Abdominal Pain Dyspnea"""]
                       , 'dissimilar': ['mild scattered paranasal sinus mucosal thickening is observed',
                                  'deformity of the ventral thecal sac is observed']}
    print(get_bert_similarity(sentence_pairs))

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/420M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


{'similar': 0.96359426, 'dissimilar': 0.9286201}


In [None]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
ls_HADM_ID=df.HADM_ID.unique().tolist()
ls_embedding_bert_Results=[]
for i in ls_HADM_ID:

    ls=[]
    print(i)
    a=df[df['HADM_ID']==i].canonical_name.tolist()[0]


    try:
        inputs_1 = tokenizer(a, return_tensors='pt')
        sent_1_embed = np.mean(model(**inputs_1).last_hidden_state[0].detach().numpy(), axis=0)
        ls_embedding_bert_Results.append(['BERT',i,sent_1_embed.tolist()])

    except:
        print("An exception occurred")
        print(i)
        print('\n')

In [None]:
df_BERT=pd.DataFrame(ls_embedding_bert_Results,columns=['Model','HADM_ID','embedding'])
df_BERT
df_BERT.to_csv('/content/drive/MyDrive/csv/Embeddiing_BERT_TFIDF.csv')

#USE

In [None]:
import numpy as np
import tensorflow.compat.v1 as tf
import tensorflow_hub as hub

from numpy import dot
from numpy.linalg import norm

def start_use_session(module):
    with tf.Graph().as_default():
      sentences = tf.placeholder(tf.string)
      embed = hub.load(module)
      embeddings = embed(sentences)
      session = tf.train.MonitoredSession()
    return lambda x: session.run(embeddings, {sentences: x})

def get_use_similarity(sentence_pairs):
    """
    computes the USE embeddings of each sentence and its similarity with its corresponding pair
    Args:
        sentence_pairs(dict): dictionary of lists with the similarity type as key and a list of two sentences as value

    Returns:
        similarities(dict): dictionary with similarity type as key and the similarity measure as value
    """
    similarities = dict()
    for sim_type, sent_pair in sentence_pairs.items():
        sent_1_embed = model([sent_pair[0]])[0]

        sent_2_embed = model([sent_pair[1]])[0]
        similarities[sim_type] = dot(sent_1_embed, sent_2_embed)/(norm(sent_1_embed)* norm(sent_2_embed))
    return similarities,[sent_pair[0]]





In [None]:
model(['Tachycardia Ramus Intermedius Artery '])

array([[ 2.30481289e-02,  1.87599976e-02,  4.63330336e-02,
         2.36791614e-02,  5.40048964e-02, -6.66214600e-02,
         4.67160940e-02, -1.71915106e-02, -2.26923767e-02,
         7.66097456e-02,  8.42459723e-02, -1.66938896e-03,
         3.67288780e-03,  1.81711018e-02, -8.69028084e-03,
         2.36422252e-02, -8.43953863e-02, -2.48459950e-02,
         7.19748624e-03,  2.03643069e-02, -4.97180447e-02,
         7.24908803e-03,  1.45427119e-02, -7.80116245e-02,
        -5.18221110e-02, -1.87576320e-02,  5.95559292e-02,
         5.49741164e-02, -6.36846349e-02,  1.99013148e-02,
         1.78192016e-02,  8.68044496e-02,  3.49112526e-02,
         7.09203631e-02,  2.40371432e-02, -3.82480305e-03,
        -5.10989651e-02,  5.63463494e-02, -7.18116164e-02,
        -5.00995703e-02, -3.76599580e-02,  4.19051386e-02,
         1.59949455e-02,  1.39849922e-02,  3.53941210e-02,
        -2.64824927e-02, -6.43908978e-02,  5.00836410e-02,
        -6.53285533e-02,  4.01891768e-03, -4.86019142e-0

In [None]:
if __name__ == "__main__":
    embed_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
    model = start_use_session(embed_url)
    sentence_pairs = {'similar': ["""Tachycardia Ramus Intermedius Artery Gastrointestinal Hemorrhage Autoimmune hemolytic anemia Anemia, Hemolytic Restenosis Coronary Arteriosclerosis Ventricular hypertrophy Reduced Myocardial Infarction Stenosis Morphology Tachypnea Hypoxia Weakness Respiratory distress Diabetes Mellitus Hypothyroidism Hypotension Congestive heart failure Pain Hypertensive disease Dyspnea""",
                                """Amputation Cardiomyopathies Diverticular Diseases Peripheral Vascular Diseases Diabetes Muscle Weakness Left-Sided Hepatitis B Benign prostatic hypertrophy Diverticulitis Hypothyroidism Kidney Failure, Chronic Hypercholesterolemia Congestive heart failure Chills Abdominal Pain Dyspnea"""],
                      'dissimilar': ['mild scattered paranasal sinus mucosal thickening is observed',
                                  'deformity of the ventral thecal sac is observed']}
    print(get_use_similarity(sentence_pairs))

INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


({'similar': 0.71995556, 'dissimilar': 0.41754285}, ['mild scattered paranasal sinus mucosal thickening is observed'])


In [None]:
embed_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
model = start_use_session(embed_url)

In [None]:
%%time

ls_HADM_ID=df.HADM_ID.unique().tolist()
ls_embedding_USE_Results=[]
for i in ls_HADM_ID:

    ls=[]
    print('\n')
    print(i)
    a=df[df['HADM_ID']==i].canonical_name.tolist()
    print(a)

    try:
        inputs_1 = a
        sent_1_embed = model(inputs_1)[0]
        ls_embedding_USE_Results.append(['USE',i,sent_1_embed.tolist()])

    except:
        print("An exception occurred")
        print(i)
        print('\n')

In [None]:
df_USE=pd.DataFrame(ls_embedding_USE_Results,columns=['Model','HADM_ID','embedding'])
df_USE
df_USE.to_csv('/content/drive/MyDrive/csv/Embeddiing_USE_TFIDF.csv')

In [None]:
df_USE

Unnamed: 0,Model,HADM_ID,embedding
0,USE,183196,"[-0.055901601910591125, -0.05363648384809494, ..."
1,USE,113704,"[0.03339361399412155, 0.015583532862365246, -0..."
2,USE,184752,"[-0.05138344317674637, -0.054104436188936234, ..."
3,USE,141137,"[-0.009822143241763115, -0.061607301235198975,..."
4,USE,130564,"[0.015090403147041798, -0.03224306181073189, 0..."
...,...,...,...
1498,USE,144627,"[0.00865172315388918, -0.028927268460392952, -..."
1499,USE,160315,"[0.04544822499155998, -0.040899086743593216, 0..."
1500,USE,177843,"[-0.04324737936258316, -0.02594013139605522, -..."
1501,USE,125108,"[-0.01076168566942215, -0.035116761922836304, ..."


#Doc2vec

In [None]:
from gensim.test.utils import common_texts
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
import math
data =df.canonical_name.tolist()
print(len(data))
newlist=[]
for i in range(len(df)):
  x=df.iloc[i].canonical_name
  if (x is  np.nan):
    print(i)
  else:
    newlist.append([df.iloc[i].HADM_ID,df.iloc[i].canonical_name])

df2=pd.DataFrame(newlist,columns=['HADM_ID','canonical_name'])

1528
68
102
108
112
134
278
283
417
482
602
622
648
721
806
874
970
974
1013
1089
1131
1237
1275
1414
1458
1463


In [None]:


tagged_data = [TaggedDocument(words=word_tokenize(_d), tags=[str(i)]) for i, _d in enumerate(df2.canonical_name)]

max_epochs = 10
vec_size = 200
alpha = 0.025

model = Doc2Vec(size=vec_size,
                alpha=alpha,
                min_alpha=0.00025,
                min_count=1,
                dm =0,workers=10)

model.build_vocab(tagged_data)

for epoch in range(max_epochs):

    model.train(tagged_data,
                total_examples=model.corpus_count,
                epochs=model.iter)
    # decrease the learning rate
    model.alpha -= 0.0002
    # fix the learning rate, no decay
    model.min_alpha = model.alpha




In [None]:
model.save("d2v.model")
print("Model Saved")

Model Saved


[-0.023193661123514175,
 -1.311234951019287,
 1.117817759513855,
 -0.2301034927368164,
 1.7206670045852661,
 1.253474235534668,
 0.5260273814201355,
 0.873905599117279,
 -1.2135854959487915,
 -1.4309202432632446,
 -0.5149112343788147,
 0.539909839630127,
 0.5866551399230957,
 -1.4206856489181519,
 0.5555383563041687,
 1.2256603240966797,
 -0.6931719183921814,
 2.7608988285064697,
 -1.4708186388015747,
 0.08048756420612335]

In [None]:
list_vec=[]
for i in range(len(df2.HADM_ID.tolist())):
    #list_vec.append([list_sen[i][0],list_sen[i][1],model.docvecs[i],'Malignant',1])

    list_vec.append(['Doc2Vec_dm_0',df2.iloc[i].HADM_ID,model.docvecs[i].tolist()])



In [None]:
df_Doc2Vec=pd.DataFrame(list_vec,columns=['Model','HADM_ID','embedding'])
df_Doc2Vec
df_Doc2Vec.to_csv('/content/drive/MyDrive/csv/Embeddiing_Doc2Vec_dm_0_TFIDF.csv')

In [None]:
df_Doc2Vec

Unnamed: 0,Model,HADM_ID,embedding
0,Doc2Vec_dm_1,183196,"[0.17279177904129028, 0.27303844690322876, 0.4..."
1,Doc2Vec_dm_1,113704,"[-0.452552855014801, 0.04709945246577263, 0.09..."
2,Doc2Vec_dm_1,184752,"[0.07402870804071426, 0.21282319724559784, 0.1..."
3,Doc2Vec_dm_1,141137,"[0.06763401627540588, -0.09549178183078766, 0...."
4,Doc2Vec_dm_1,130564,"[0.208870068192482, -0.009179693646728992, -0...."
...,...,...,...
1498,Doc2Vec_dm_1,144627,"[0.5761002898216248, 0.4942508637905121, -0.13..."
1499,Doc2Vec_dm_1,160315,"[-0.34556853771209717, -0.5628878474235535, -0..."
1500,Doc2Vec_dm_1,177843,"[0.13409949839115143, -0.04859788715839386, 0...."
1501,Doc2Vec_dm_1,125108,"[0.0025058123283088207, -0.023107944056391716,..."


#FastText

In [None]:
!git clone https://github.com/facebookresearch/fastText.git
!cd fastText
!pip install fastText


Cloning into 'fastText'...
remote: Enumerating objects: 3854, done.[K
remote: Total 3854 (delta 0), reused 0 (delta 0), pack-reused 3854[K
Receiving objects: 100% (3854/3854), 8.22 MiB | 7.89 MiB/s, done.
Resolving deltas: 100% (2417/2417), done.
Checking out files: 100% (526/526), done.
Collecting fastText
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[K     |████████████████████████████████| 68 kB 2.6 MB/s 
[?25hCollecting pybind11>=2.2
  Using cached pybind11-2.9.1-py2.py3-none-any.whl (211 kB)
Building wheels for collected packages: fastText
  Building wheel for fastText (setup.py) ... [?25l[?25hdone
  Created wheel for fastText: filename=fasttext-0.9.2-cp37-cp37m-linux_x86_64.whl size=3136495 sha256=04de5547c93917e209208cebda055cae759642a2dcdbc3949db822fe811f5250
  Stored in directory: /root/.cache/pip/wheels/4e/ca/bf/b020d2be95f7641801a6597a29c8f4f19e38f9c02a345bab9b
Successfully built fastText
Installing collected packages: pybind11, fastText
Successfully installed fastText-

In [None]:
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')  # English
model = fasttext.load_model('cc.en.300.bin')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz


KeyboardInterrupt: ignored

In [None]:
import fasttext
import nltk
import numpy as np

from numpy import dot
from numpy.linalg import norm


def get_sentence_embed(sentence):
    return np.mean([model.get_word_vector(w) for w in nltk.word_tokenize(sentence)], axis=0)

def get_fasttext_similarity(sentence_pairs):
    """
    computes the fasttext embeddings of each sentence and its similarity with its corresponding pair
    Args:
        sentence_pairs(dict): dictionary of lists with the similarity type as key and a list of two sentences as value

    Returns:
        similarities(dict): dictionary with similarity type as key and the similarity measure as value
    """
    similarities = dict()
    for sim_type, sent_pair in sentence_pairs.items():
        sent_1_embed = get_sentence_embed(sent_pair[0])
        sent_2_embed = get_sentence_embed(sent_pair[1])
        similarities[sim_type] = dot(sent_1_embed, sent_2_embed)/(norm(sent_1_embed)* norm(sent_2_embed))
    return similarities


if __name__=='__main__':
  model = fasttext.train_unsupervised('data/enwik9')  # downloaded from https://fasttext.cc/docs/en/unsupervised-tutorial.html
  sentence_pairs = {'similar': ['the MRI of the abdomen is normal and without evidence of malignancy',
                                'no significant abnormalities involving the abdomen is observed'],
                  'dissimilar': ['mild scattered paranasal sinus mucosal thickening is observed',
                                 'deformity of the ventral thecal sac is observed']}
  print(get_fasttext_similarity(sentence_pairs))

ModuleNotFoundError: ignored

##Cosine Similarity

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cd /content/drive/MyDrive/csv

/content/drive/MyDrive/csv


In [None]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
df=pd.read_csv('df_canonical_name_TFIDF_l2.csv')

embed_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'
model = start_use_session(embed_url)

INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Graph was finalized.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Running local_init_op.


INFO:tensorflow:Done running local_init_op.


INFO:tensorflow:Done running local_init_op.


In [None]:

ls_HADM_ID=df.HADM_ID.unique().tolist()
ls_embedding_USE_Results=[]
for i in ls_HADM_ID:

    ls=[]
    print('\n')
    print(i)
    a=df[df['HADM_ID']==i].canonical_name.to_string()


    #try:
        #inputs_1 = umls_bert_tokenizer(a, return_tensors='pt')
    sent_1_embed =model([a])[0]
    ls_embedding_USE_Results.append(['USE',i,sent_1_embed.tolist()])

    #except:
        #print("An exception occurred")
        #print(i)
        #print('\n')

df_USE=pd.DataFrame(ls_embedding_USE_Results,columns=['Model','HADM_ID','embedding'])

df_USE

In [None]:
import numpy as np

ls=[]
for i in range(len(df_USE)):
    print(i)
    v= np.array(df_USE.iloc[i].embedding)
    ls.append(v)

In [None]:
var2=cosine_similarity(ls, ls)
var2

array([[1.        , 0.42301733, 0.29371756, ..., 0.49437554, 0.2867104 ,
        0.45351113],
       [0.42301733, 1.        , 0.25840592, ..., 0.5627734 , 0.2765667 ,
        0.36007039],
       [0.29371756, 0.25840592, 1.        , ..., 0.23236045, 0.27624033,
        0.20104187],
       ...,
       [0.49437554, 0.5627734 , 0.23236045, ..., 1.        , 0.36031456,
        0.36428355],
       [0.2867104 , 0.2765667 , 0.27624033, ..., 0.36031456, 1.        ,
        0.2590029 ],
       [0.45351113, 0.36007039, 0.20104187, ..., 0.36428355, 0.2590029 ,
        1.        ]])

In [None]:
df1=pd.DataFrame(var2, index=[i for i in df_USE.HADM_ID.tolist()])
df1.columns = df_USE.HADM_ID.tolist()
df1.to_csv('/content/drive/MyDrive/csv/dis_USE.csv')
df1.describe()

Unnamed: 0,183196,113704,184752,141137,130564,195782,162248,197204,177047,104141,130491,179653,127631,192421,199103,137072,185941,193143,128111,195320,184678,158948,119417,155910,196173,120433,111848,135411,115830,177958,193820,160730,147259,192777,115339,107306,147496,145457,155584,153844,...,128416,130493,164688,135644,152074,177240,105889,154684,110974,160802,106955,194572,159476,134984,145392,116105,174330,165899,197991,103926,134152,103671,179720,116955,136952,187241,169761,129519,186444,148275,131258,108303,151700,144460,167404,144627,160315,177843,125108,154714
count,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,...,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0,1528.0
mean,0.33694,0.313592,0.24045,0.270996,0.313538,0.263859,0.253271,0.232545,0.339388,0.231295,0.326379,0.291644,0.291854,0.226405,0.281955,0.325871,0.294945,0.329387,0.294027,0.232054,0.252832,0.31115,0.32416,0.282777,0.174545,0.257916,0.332634,0.232215,0.331669,0.290256,0.232183,0.292585,0.285642,0.276486,0.228859,0.190739,0.274138,0.233928,0.267467,0.243726,...,0.270339,0.316391,0.306907,0.311742,0.211905,0.305253,0.28427,0.24973,0.168519,0.207003,0.284143,0.322477,0.264343,0.285873,0.270961,0.28416,0.255078,0.359189,0.281357,0.32275,0.297285,0.269046,0.304201,0.334433,0.368668,0.279027,0.315634,0.372659,0.280961,0.310666,0.302891,0.313301,0.296761,0.323946,0.29433,0.292228,0.222879,0.326145,0.291764,0.339769
std,0.114905,0.100724,0.078103,0.108146,0.104273,0.108532,0.099605,0.10572,0.095068,0.075756,0.116958,0.087273,0.085109,0.083899,0.10438,0.118413,0.083949,0.094299,0.074882,0.085915,0.084177,0.103142,0.103017,0.115179,0.083227,0.094205,0.097588,0.09906,0.123623,0.108091,0.102212,0.105722,0.100588,0.102762,0.08016,0.106616,0.111232,0.088575,0.08352,0.084743,...,0.106984,0.113948,0.110635,0.106074,0.092961,0.107882,0.101481,0.116548,0.069435,0.088357,0.080967,0.104777,0.11341,0.108729,0.100245,0.122718,0.115749,0.113279,0.077494,0.109369,0.108514,0.11361,0.094931,0.10944,0.102764,0.107489,0.116004,0.120289,0.099823,0.098719,0.103776,0.097905,0.08811,0.111264,0.110051,0.090812,0.09289,0.11293,0.097159,0.100487
min,-0.086204,-0.036251,-0.016316,0.009597,-0.02151,-0.039663,-0.034277,-0.044164,-0.084345,0.020158,-0.038333,-0.050728,-0.027003,-0.077594,-0.028261,-0.111134,0.016574,-0.041338,0.049104,-0.043948,-0.038538,0.023769,0.02791,-0.074146,-0.080513,-0.053049,-0.018836,-0.032072,-0.093867,-0.070029,-0.124278,0.017158,-0.001031,0.007304,-0.005312,-0.115225,-0.021847,0.000259,-0.002708,0.027687,...,0.009785,-0.004486,0.013077,-0.051183,-0.113898,0.005375,-0.058238,-0.100337,-0.081599,-0.013444,0.015689,0.027807,-0.029663,-0.080759,-0.029985,-0.087662,-0.022649,-0.054543,0.061115,-0.035276,-0.021433,-0.037634,-0.046084,-0.045485,-4.5e-05,-0.067115,-0.047168,-0.068297,-0.084419,0.012027,-0.003093,-0.02564,-0.023586,-0.108125,-0.005104,-0.028137,-0.044744,-0.102658,-0.045516,0.004829
25%,0.263081,0.252639,0.192425,0.201151,0.24864,0.194825,0.186998,0.171932,0.287677,0.183758,0.250485,0.239146,0.238671,0.17587,0.217663,0.252595,0.240232,0.275901,0.248562,0.172705,0.204345,0.242806,0.254743,0.205383,0.124431,0.198588,0.266172,0.166647,0.254026,0.220466,0.167937,0.220333,0.218716,0.2162,0.179315,0.118124,0.197427,0.172157,0.21134,0.188525,...,0.194476,0.243007,0.240889,0.245538,0.151448,0.241707,0.224924,0.174533,0.123089,0.150481,0.234614,0.249837,0.190734,0.214864,0.2055,0.206464,0.193399,0.287049,0.230815,0.2521,0.231414,0.196766,0.243993,0.268036,0.310533,0.209748,0.238384,0.299193,0.219238,0.247278,0.236557,0.249726,0.248356,0.264609,0.22453,0.23807,0.163165,0.257665,0.238154,0.279748
50%,0.336699,0.31375,0.242424,0.255864,0.315539,0.254992,0.248241,0.221572,0.345777,0.226964,0.323405,0.291912,0.289916,0.226424,0.276153,0.31908,0.292244,0.338066,0.296548,0.229427,0.257378,0.303675,0.32066,0.272322,0.173272,0.255911,0.334183,0.224842,0.323672,0.291392,0.23314,0.290138,0.281378,0.270701,0.225087,0.186719,0.261745,0.234902,0.262017,0.23575,...,0.265978,0.308586,0.300867,0.313586,0.211179,0.303193,0.283218,0.238399,0.165881,0.1993,0.286477,0.317452,0.257796,0.285036,0.268751,0.2737,0.240198,0.358311,0.284204,0.324745,0.291532,0.252915,0.299298,0.33629,0.3719,0.272125,0.309106,0.372634,0.28397,0.303437,0.297029,0.312521,0.300946,0.329331,0.284452,0.295764,0.218595,0.329218,0.292043,0.339956
75%,0.412923,0.376209,0.287408,0.323645,0.379308,0.321406,0.306665,0.27146,0.398611,0.276944,0.398406,0.345367,0.340782,0.275701,0.3409,0.398152,0.344889,0.393739,0.341493,0.28507,0.30268,0.376866,0.388977,0.350981,0.223246,0.31582,0.398074,0.286433,0.40656,0.363033,0.294971,0.36247,0.34075,0.327005,0.272857,0.256644,0.341819,0.291129,0.314477,0.28896,...,0.33824,0.382099,0.361581,0.375463,0.274366,0.357476,0.338676,0.30972,0.212514,0.253793,0.337627,0.390839,0.327334,0.35256,0.334222,0.350444,0.290513,0.433214,0.330952,0.394855,0.364356,0.315879,0.361321,0.405927,0.437994,0.345517,0.388344,0.449074,0.344592,0.368501,0.359554,0.373986,0.350339,0.395041,0.35312,0.352165,0.2772,0.398778,0.349945,0.40583
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


#Cosin Similarity

---



In [None]:
import numpy as np

ls=[]
for i in range(len(df_BERT)):
    print(i)
    v= np.array(df_BERT.iloc[i].embedding)
    ls.append(v)



In [None]:

var2=cosine_similarity(ls, ls)
var2

array([[1.        , 0.8959684 , 0.96102092, ..., 0.9040968 , 0.9088356 ,
        0.96206052],
       [0.8959684 , 1.        , 0.90837879, ..., 0.89757883, 0.87645184,
        0.90951425],
       [0.96102092, 0.90837879, 1.        , ..., 0.94169238, 0.93783204,
        0.96271701],
       ...,
       [0.9040968 , 0.89757883, 0.94169238, ..., 1.        , 0.95286707,
        0.9355704 ],
       [0.9088356 , 0.87645184, 0.93783204, ..., 0.95286707, 1.        ,
        0.9113453 ],
       [0.96206052, 0.90951425, 0.96271701, ..., 0.9355704 , 0.9113453 ,
        1.        ]])

In [None]:
df1=pd.DataFrame(var2, index=[i for i in df_blue_bert.HADM_ID.tolist()])
df1.columns = df_BERT.HADM_ID.tolist()
df1.describe()


Unnamed: 0,183196,113704,184752,141137,130564,195782,162248,197204,177047,104141,130491,179653,127631,192421,199103,137072,185941,193143,128111,195320,184678,158948,119417,155910,196173,120433,111848,135411,115830,177958,193820,160730,147259,192777,115339,107306,147496,145457,155584,153844,...,128416,130493,164688,135644,152074,177240,105889,154684,110974,160802,106955,194572,159476,134984,145392,116105,174330,165899,197991,103926,134152,103671,179720,116955,136952,187241,169761,129519,186444,148275,131258,108303,151700,144460,167404,144627,160315,177843,125108,154714
count,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,...,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0,1503.0
mean,0.909705,0.888267,0.921308,0.877933,0.884907,0.914296,0.901592,0.889387,0.920343,0.895964,0.903431,0.911416,0.892431,0.887308,0.841217,0.881686,0.885907,0.912874,0.900758,0.910552,0.903155,0.896801,0.912546,0.830653,0.860905,0.907158,0.903197,0.896969,0.920231,0.857785,0.852713,0.894749,0.909898,0.904494,0.876624,0.87715,0.838442,0.857463,0.87575,0.903876,...,0.859241,0.888113,0.884411,0.847787,0.874154,0.905039,0.914369,0.883986,0.911113,0.907972,0.881821,0.89587,0.922468,0.887351,0.882836,0.908407,0.864837,0.911792,0.910202,0.913747,0.900621,0.908114,0.873761,0.897372,0.914264,0.908165,0.887987,0.85742,0.89982,0.880663,0.91553,0.902078,0.875116,0.87573,0.905251,0.911106,0.882139,0.893514,0.889699,0.91988
std,0.047952,0.030702,0.044121,0.043819,0.045247,0.039987,0.04197,0.055922,0.039034,0.044972,0.04879,0.045732,0.030827,0.038648,0.040276,0.052354,0.048971,0.042023,0.041004,0.040757,0.039358,0.052147,0.043036,0.040467,0.046583,0.043254,0.037786,0.042612,0.038366,0.040988,0.035007,0.047101,0.045256,0.044981,0.037272,0.037906,0.043231,0.03471,0.053815,0.045809,...,0.040226,0.051125,0.037915,0.046062,0.043246,0.04278,0.045829,0.044601,0.042896,0.042606,0.043176,0.045331,0.043661,0.053981,0.057494,0.040994,0.033505,0.044686,0.040194,0.046316,0.04257,0.048198,0.046618,0.035995,0.037738,0.04696,0.04504,0.035138,0.04441,0.034855,0.035642,0.040887,0.029685,0.049552,0.046529,0.040811,0.030128,0.053892,0.054763,0.04329
min,0.441352,0.522431,0.492256,0.494396,0.556049,0.522194,0.546109,0.479494,0.501438,0.465909,0.485227,0.465554,0.541117,0.52011,0.50856,0.50998,0.506174,0.479754,0.549911,0.525855,0.509808,0.526651,0.52291,0.457812,0.518662,0.489665,0.518076,0.490113,0.499859,0.510092,0.533041,0.469107,0.466358,0.508238,0.495791,0.555817,0.475072,0.476424,0.509775,0.516243,...,0.496712,0.484927,0.510343,0.535922,0.474077,0.504197,0.481705,0.518115,0.497806,0.518995,0.476473,0.508035,0.504999,0.518461,0.508759,0.490067,0.540379,0.496026,0.500875,0.494509,0.476553,0.448166,0.426266,0.513868,0.507115,0.485739,0.500521,0.47471,0.526718,0.536599,0.522189,0.544608,0.523862,0.506895,0.473501,0.47627,0.554137,0.493818,0.506769,0.480655
25%,0.886953,0.879541,0.907088,0.853227,0.86588,0.902023,0.88728,0.864011,0.907209,0.870813,0.884862,0.886419,0.881481,0.872886,0.815933,0.858402,0.864602,0.89025,0.888453,0.896973,0.880234,0.873251,0.899282,0.808769,0.838376,0.888251,0.892483,0.874543,0.899886,0.834262,0.83967,0.867662,0.889251,0.888594,0.857766,0.864221,0.816097,0.842386,0.849108,0.886854,...,0.842215,0.866027,0.872174,0.823611,0.849004,0.886693,0.899512,0.86759,0.888052,0.895149,0.855158,0.879623,0.908943,0.863412,0.853446,0.887694,0.847368,0.892529,0.895845,0.897804,0.873604,0.887638,0.845604,0.887612,0.905137,0.891889,0.87159,0.841766,0.884346,0.864229,0.899718,0.889395,0.866176,0.851064,0.888088,0.889144,0.872676,0.86862,0.864011,0.903249
50%,0.917754,0.893702,0.930245,0.884238,0.893461,0.92294,0.911333,0.900178,0.928193,0.905643,0.912203,0.921228,0.896837,0.895032,0.846974,0.891625,0.895022,0.922954,0.91022,0.918506,0.91189,0.908049,0.923476,0.834479,0.865331,0.913779,0.912491,0.905399,0.928588,0.865312,0.858296,0.903342,0.91715,0.915108,0.88245,0.882572,0.843744,0.862556,0.884115,0.914051,...,0.863924,0.897815,0.891604,0.853295,0.883288,0.913066,0.922131,0.893374,0.918925,0.918436,0.889508,0.90605,0.931815,0.897601,0.890894,0.917307,0.868917,0.918789,0.918655,0.922777,0.910375,0.915224,0.883864,0.904287,0.923867,0.917674,0.896683,0.862745,0.910492,0.885783,0.923644,0.911331,0.879281,0.882553,0.915839,0.919695,0.887067,0.904074,0.899823,0.926753
75%,0.94347,0.904977,0.949265,0.909426,0.915469,0.938666,0.927259,0.928349,0.944442,0.928845,0.934271,0.945324,0.910176,0.911317,0.870142,0.918211,0.920035,0.943004,0.926965,0.934827,0.932515,0.933125,0.939156,0.857032,0.893843,0.936928,0.924042,0.927018,0.947692,0.888424,0.873633,0.929884,0.940826,0.933637,0.902292,0.899812,0.865495,0.878646,0.913443,0.933975,...,0.88428,0.922632,0.905888,0.880761,0.906143,0.933995,0.942969,0.912071,0.942629,0.933846,0.914902,0.925493,0.950175,0.925224,0.924572,0.937347,0.88632,0.942555,0.935614,0.9426,0.932504,0.940159,0.908533,0.917303,0.934393,0.937884,0.915156,0.877729,0.927075,0.90303,0.93916,0.927731,0.890839,0.911889,0.936138,0.94002,0.899086,0.931489,0.927803,0.947473
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [None]:
df1.to_csv('/content/drive/MyDrive/csv/dis_Bert.csv')
