In [18]:
import random, os

from configobj import ConfigObj
from transformers import BertForMaskedLM, BertTokenizer
from batch_loaders.MLM_batch_loader import RandomTreeWalkBatchLoader
from batch_loaders.alignment_batch_loader import AlignmentBatchLoader
from batch_loaders.pair_alignment_batch_loader import PairAlignmentBatchLoader
from matchers.greedy_matcher import GreedyMatcher
from matchers.stable_marriage import StableMarriage
from metrics import Metrics
from models.mlm_model import MLMOntoBert

from models.pair_alignment_model import PairOntoBert
from models.sorbet import SORBET
from models.tf_idf_similarity import SubTokenSimilarity
from trainer import Trainer


from batch_loaders.ontology_parsing.ontology import Ontology
from batch_loaders.ontology_parsing.ontology_config import OntoConfig
import torch

config = ConfigObj('config.ini')

track = config['General']['track']

In [19]:
model = MLMOntoBert(from_pretrained="emilyalsentzer/Bio_ClinicalBERT")
tokenizer = BertTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")
model.tokenizer = tokenizer

onto1 = Ontology("/usagers/frgosb/Documents/datasets/anatomy/ontologies/human.owl", ontology_config=OntoConfig(config["anatomy"]['parsing_parameters']))
onto1.classes = ["NCI_C33550", "NCI_C33578", "NCI_C33581"]


Some weights of the model checkpoint at emilyalsentzer/Bio_ClinicalBERT were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [20]:
labels = {}
groups_indices = []
i = -1
for c in onto1.classes:
    concept_label = " ".join(onto1.get_id_label(c))
    labels[c] = (concept_label)
    class_labels = [concept_label]
    
    i = i+1
    groups_indices.append([i])


    for synonym in onto1.synonyms[c]:
        label = " ".join(onto1.get_id_label(synonym))
        if label not in class_labels:
            labels[synonym] = (label)
            class_labels.append(label)
            i=i+1
            groups_indices[-1].append(i) 

print(labels)
print(groups_indices)

{'NCI_C33550': 'sigmoid flexure', 'genid3764': 'sigmoid flexure colon', 'NCI_C33578': 'somatotrope cell', 'genid1608': 'somatotroph cell', 'genid1609': 'somatotroph', 'genid1611': 'gh cell', 'NCI_C33581': 'spermatic artery', 'genid6936': 'testicular artery', 'genid6938': 'internal spermatic artery'}
[[0, 1], [2, 3, 4, 5], [6, 7, 8]]


In [21]:

embeddings = None

with torch.no_grad():

    input = tokenizer(list(labels.values()), return_tensors='pt',
                   max_length=512, truncation=True, padding='max_length').to(model.device)

    outputs = model.model(**input)

    for i, j in enumerate(torch.count_nonzero(input["attention_mask"], dim=1)):
        input["attention_mask"][i,j-1] = 0
    input["attention_mask"][:,0] = 0

    for i, output in enumerate(outputs.hidden_states[-1]):
        if embeddings == None:
            embeddings = torch.mean(output[input["attention_mask"][i]], dim=0).view(1,-1)
        else:
            embeddings = torch.cat((embeddings, torch.mean(output[input["attention_mask"][i]], dim=0).view(1,-1)))


In [22]:
import torch.nn.functional as F


sim = F.normalize(embeddings,dim=1) @ F.normalize(embeddings,dim=1).t()

word_vectors = embeddings.cpu().tolist()


In [23]:
print(sim)
print(labels.values())


tensor([[1.0000, 0.9306, 0.8288, 0.8666, 0.8439, 0.7609, 0.9010, 0.9215, 0.9184],
        [0.9306, 1.0000, 0.8680, 0.9198, 0.9043, 0.7830, 0.8980, 0.9229, 0.9187],
        [0.8288, 0.8680, 1.0000, 0.9351, 0.9532, 0.8619, 0.8327, 0.8999, 0.8165],
        [0.8666, 0.9198, 0.9351, 1.0000, 0.9523, 0.8509, 0.8844, 0.9202, 0.8666],
        [0.8439, 0.9043, 0.9532, 0.9523, 1.0000, 0.8510, 0.8441, 0.8969, 0.8386],
        [0.7609, 0.7830, 0.8619, 0.8509, 0.8510, 1.0000, 0.7584, 0.8200, 0.7549],
        [0.9010, 0.8980, 0.8327, 0.8844, 0.8441, 0.7584, 1.0000, 0.9508, 0.9577],
        [0.9215, 0.9229, 0.8999, 0.9202, 0.8969, 0.8200, 0.9508, 1.0000, 0.9439],
        [0.9184, 0.9187, 0.8165, 0.8666, 0.8386, 0.7549, 0.9577, 0.9439, 1.0000]],
       device='cuda:0')
dict_values(['sigmoid flexure', 'sigmoid flexure colon', 'somatotrope cell', 'somatotroph cell', 'somatotroph', 'gh cell', 'spermatic artery', 'testicular artery', 'internal spermatic artery'])


In [44]:
import plotly
import numpy as np
import plotly.graph_objs as go
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

def display_tsne_scatterplot(word_vectors, words=None):

    
    three_dim = PCA(n_components = 2).fit_transform(word_vectors)

    three_dim[0,1]+= 0.05
    three_dim[7,1]-= 0.05

    # For 2D, change the three_dim variable into something like two_dim like the following:
    # two_dim = TSNE(n_components = 2, random_state=0, perplexity = perplexity, learning_rate = learning_rate, n_iter = iteration).fit_transform(word_vectors)[:,:2]

    data = []


    # for i in range (len(three_dim) - 1):

    #             trace = go.Scatter(
    #                 x = [three_dim[i,0], three_dim[i+1,0]], 
    #                 y = [three_dim[i,1], three_dim[i+1,1]],  
    #                 # z = three_dim[count:count+topn,2],
    #                 text = words[i],
    #                 textposition = "top center",
    #                 textfont_size = 20,
    #                 mode = 'markers+text',
    #                 marker = {
    #                     'size': 10,
    #                     'opacity': 0.8,
    #                     'color': 2
    #                 }
       
    #             )

    for i in range (len(groups_indices)):

                trace = go.Scatter(
                    x = three_dim[groups_indices[i][0]:groups_indices[i][-1]+1,0], 
                    y = three_dim[groups_indices[i][0]:groups_indices[i][-1]+1,1],  
                    # z = three_dim[count:count+topn,2],
                    text = words[groups_indices[i][0]:groups_indices[i][-1]+1],
                    name = words[groups_indices[i][0]],
                    textposition = "top center",
                    textfont_size = 20,
                    mode = 'markers+text',
                    marker = {
                        'size': 10,
                        'opacity': 0.8,
                        'color': 2
                    }
       
                )
                print()
                print([three_dim[groups_indices[i][0]:groups_indices[i][-1]+1,0]])
                print([three_dim[groups_indices[i][0]:groups_indices[i][-1]+1,1]])
                print(words[groups_indices[i][0]:groups_indices[i][-1]+1])
                
                # For 2D, instead of using go.Scatter3d, we need to use go.Scatter and delete the z variable. Also, instead of using
                # variable three_dim, use the variable that we have declared earlier (e.g two_dim)
            
                data.append(trace)

    # trace_input = go.Scatter(
    #                 x = three_dim[i,0], 
    #                 y = three_dim[i,1],  
    #                 text = words[i],
    #                 name = 'input words',
    #                 textposition = "top center",
    #                 textfont_size = 20,
    #                 mode = 'markers+text',
    #                 marker = {
    #                     'size': 10,
    #                     'opacity': 1,
    #                     'color': 'black'
    #                 }
    #                 )

    # # For 2D, instead of using go.Scatter3d, we need to use go.Scatter and delete the z variable.  Also, instead of using
    # # variable three_dim, use the variable that we have declared earlier (e.g two_dim)
            
    # data.append(trace_input)
    
# Configure the layout

    layout = go.Layout(
        margin = {'l': 0, 'r': 0, 'b': 0, 't': 0},
        showlegend=True,
        legend=dict(
        font=dict(
            family="Courier New",
            size=25,
            color="black"
        )),
        font = dict(
            family = " Courier New ",
            size = 15),
            autosize = False,
            width = 1400,
            height = 1000

        )


    plot_figure = go.Figure(data = data, layout = layout)
    plot_figure.update_layout(yaxis_range=[-3.5,5.5])
    plot_figure.update_layout(xaxis_range=[-5.5,6.5])
    plot_figure.show()

    plot_figure.write_image("fig2.png")


    
display_tsne_scatterplot(word_vectors, list(labels.values()))


[array([-2.97176106, -1.57486927])]
[array([ 0.5101017 , -1.34607619])]
['sigmoid flexure', 'sigmoid flexure colon']

[array([3.17196713, 1.58427993, 2.60003307, 5.48522031])]
[array([-1.94058681, -1.81396481, -2.55113829,  4.35033012])]
['somatotrope cell', 'somatotroph cell', 'somatotroph', 'gh cell']

[array([-3.19709516, -1.37068041, -3.72709454])]
[array([1.05557867, 0.40735343, 1.32840218])]
['spermatic artery', 'testicular artery', 'internal spermatic artery']
