In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
classifier = pipeline(model="facebook/bart-large-mnli")

Downloading config.json: 100%|██████████| 1.15k/1.15k [00:00<00:00, 226kB/s]
Downloading model.safetensors: 100%|██████████| 1.63G/1.63G [00:03<00:00, 447MB/s]
Downloading tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 7.31kB/s]
Downloading vocab.json: 100%|██████████| 899k/899k [00:00<00:00, 1.63MB/s]
Downloading merges.txt: 100%|██████████| 456k/456k [00:03<00:00, 126kB/s]
Downloading tokenizer.json: 100%|██████████| 1.36M/1.36M [00:00<00:00, 34.2MB/s]


In [3]:
classifier(
    "I have a problem with my iphone that needs to be resolved asap!!",
    candidate_labels=["urgent", "not urgent", "phone", "tablet", "computer"],
)

{'sequence': 'I have a problem with my iphone that needs to be resolved asap!!',
 'labels': ['urgent', 'phone', 'computer', 'not urgent', 'tablet'],
 'scores': [0.5036356449127197,
  0.47879964113235474,
  0.012600167654454708,
  0.002655783900991082,
  0.0023087572772055864]}

In [4]:
pipe = pipeline("fill-mask", model="microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")

Downloading config.json: 100%|██████████| 385/385 [00:00<00:00, 365kB/s]
Downloading pytorch_model.bin: 100%|██████████| 440M/440M [00:01<00:00, 390MB/s] 
Some weights of the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Downloading tokenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 28.0kB/s]
Downloading vocab.txt: 100%|██████████| 226k/226k [00:00<00:00, 10.0MB/s]


In [5]:
pipe("[MASK] is a tumor suppressor gene.")

[{'score': 0.28550955653190613,
  'token': 5105,
  'token_str': 'p53',
  'sequence': 'p53 is a tumor suppressor gene.'},
 {'score': 0.16949503123760223,
  'token': 13544,
  'token_str': 'tp53',
  'sequence': 'tp53 is a tumor suppressor gene.'},
 {'score': 0.0856199562549591,
  'token': 11779,
  'token_str': 'brca1',
  'sequence': 'brca1 is a tumor suppressor gene.'},
 {'score': 0.07339830696582794,
  'token': 9496,
  'token_str': 'pten',
  'sequence': 'pten is a tumor suppressor gene.'},
 {'score': 0.06466539949178696,
  'token': 2176,
  'token_str': 'it',
  'sequence': 'it is a tumor suppressor gene.'}]

In [6]:
pipe("[MASK] is implicated in type-2 diabetes.")

[{'score': 0.14128601551055908,
  'token': 4531,
  'token_str': 'inflammation',
  'sequence': 'inflammation is implicated in type - 2 diabetes.'},
 {'score': 0.11703887581825256,
  'token': 5379,
  'token_str': 'obesity',
  'sequence': 'obesity is implicated in type - 2 diabetes.'},
 {'score': 0.06760377436876297,
  'token': 9598,
  'token_str': 'leptin',
  'sequence': 'leptin is implicated in type - 2 diabetes.'},
 {'score': 0.04628118872642517,
  'token': 29587,
  'token_str': 'hyperinsulinemia',
  'sequence': 'hyperinsulinemia is implicated in type - 2 diabetes.'},
 {'score': 0.03999573364853859,
  'token': 2176,
  'token_str': 'it',
  'sequence': 'it is implicated in type - 2 diabetes.'}]

In [1]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")
model = AutoModelForMaskedLM.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")

  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
tokenizer.cls_token_id

2

In [3]:
tokenizer.model_max_length

1000000000000000019884624838656

In [3]:
import torch

In [4]:
inputs = tokenizer("[MASK] is a causal gene for Inflammatory Bowel Disease.", return_tensors="pt")
with torch.no_grad():
    logits = model(**inputs).logits

In [5]:
inputs.input_ids

tensor([[   2,    4, 1977,   43, 9263, 2359, 1958, 3769, 9472, 2573,   18,    3]])

In [6]:
print(tokenizer.sep_token_id)
print(tokenizer.mask_token_id)

3
4


In [7]:
tokenizer.decode(18)

'.'

In [11]:
inputs.input_ids == tokenizer.mask_token_id

tensor([[False,  True, False, False, False, False, False, False, False, False,
         False, False]])

In [12]:
mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
mask_token_index

tensor([1])

In [13]:
logits[0, mask_token_index]

tensor([[-4.5259,  2.0390, -4.7685,  ...,  6.4460, -4.4976,  0.9217]])

In [15]:
len(logits[0,mask_token_index][0])

30522

In [16]:
predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
tokenizer.decode(predicted_token_id)

'nod2'

In [18]:
sorted_tensor = torch.topk(logits[0,mask_token_index][0],10).indices
for i in range(10):
    print(tokenizer.decode(sorted_tensor[i]))

nod2
it
tnf
il10
tlr4
foxp3
fto
apc
stat3
ctla4


In [19]:
model.__dict__

{'training': False,
 '_parameters': OrderedDict(),
 '_buffers': OrderedDict(),
 '_non_persistent_buffers_set': set(),
 '_backward_hooks': OrderedDict(),
 '_is_full_backward_hook': None,
 '_forward_hooks': OrderedDict(),
 '_forward_pre_hooks': OrderedDict(),
 '_state_dict_hooks': OrderedDict(),
 '_load_state_dict_pre_hooks': OrderedDict(),
 '_load_state_dict_post_hooks': OrderedDict(),
 '_modules': OrderedDict([('bert',
               BertModel(
                 (embeddings): BertEmbeddings(
                   (word_embeddings): Embedding(30522, 768, padding_idx=0)
                   (position_embeddings): Embedding(512, 768)
                   (token_type_embeddings): Embedding(2, 768)
                   (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                   (dropout): Dropout(p=0.1, inplace=False)
                 )
                 (encoder): BertEncoder(
                   (layer): ModuleList(
                     (0): BertLayer(
                      

In [26]:
with torch.no_grad():
    embeddings = model(**inputs, output_hidden_states=True).hidden_states

In [29]:
len(embeddings)

13

In [30]:
for i,layer in enumerate(embeddings):
    print(layer.shape)

torch.Size([1, 12, 768])
torch.Size([1, 12, 768])
torch.Size([1, 12, 768])
torch.Size([1, 12, 768])
torch.Size([1, 12, 768])
torch.Size([1, 12, 768])
torch.Size([1, 12, 768])
torch.Size([1, 12, 768])
torch.Size([1, 12, 768])
torch.Size([1, 12, 768])
torch.Size([1, 12, 768])
torch.Size([1, 12, 768])
torch.Size([1, 12, 768])


In [31]:
embeddings[0][0]

tensor([[ 0.1079,  0.2989,  0.0111,  ...,  0.1501,  0.0076,  0.0868],
        [-0.1667, -0.2235, -0.0414,  ...,  0.2386, -0.0464, -0.1347],
        [-0.0484,  0.5851,  0.4326,  ...,  0.1765, -0.0968,  0.6698],
        ...,
        [-0.1578, -0.0203,  0.1617,  ..., -0.2010, -0.4561,  0.5157],
        [ 0.0249,  0.4300, -0.1002,  ..., -0.2891,  0.1824,  0.5265],
        [ 0.1931,  0.2004, -0.4961,  ..., -0.0038,  0.1135,  0.5273]])

In [32]:
embeddings[-1][0]

tensor([[-0.1726, -0.6324, -0.0109,  ..., -0.2206,  0.8523,  0.1826],
        [-0.1518, -0.3321,  0.1190,  ..., -0.0778,  0.2135, -0.0367],
        [-0.1118, -0.2569,  0.0689,  ..., -0.1819, -0.0332,  0.2606],
        ...,
        [ 0.1249,  0.1328,  0.1745,  ..., -0.3186, -0.0856,  0.4124],
        [-0.1887, -0.2955, -0.0451,  ..., -0.1536,  0.2173,  0.1748],
        [-0.1887, -0.2954, -0.0451,  ..., -0.1535,  0.2173,  0.1747]])

In [33]:
inputs2 = tokenizer("[MASK] is a causal gene for IBD.", return_tensors="pt")
with torch.no_grad():
    embeddings2 = model(**inputs2, output_hidden_states=True).hidden_states

In [34]:
embeddings2[0][0]

tensor([[ 0.1079,  0.2989,  0.0111,  ...,  0.1501,  0.0076,  0.0868],
        [-0.1667, -0.2235, -0.0414,  ...,  0.2386, -0.0464, -0.1347],
        [-0.0484,  0.5851,  0.4326,  ...,  0.1765, -0.0968,  0.6698],
        ...,
        [-0.2087,  0.2629,  0.4794,  ..., -0.2845,  0.1548,  0.4426],
        [ 0.0414,  0.4427,  0.0579,  ..., -0.1193,  0.0763,  0.4232],
        [ 0.1203,  0.1681, -0.2495,  ...,  0.0127,  0.0307,  0.5334]])

In [35]:
embeddings2[-1][0]

tensor([[-1.4288e-01, -6.1786e-01,  1.1789e-01,  ..., -3.7112e-01,
          8.1703e-01,  3.8267e-02],
        [-2.4988e-01, -3.8653e-01,  1.6087e-01,  ..., -1.4411e-01,
          2.0424e-01, -9.3418e-02],
        [-1.0311e-01, -2.7191e-01,  8.3761e-02,  ..., -2.1988e-01,
         -5.2022e-02,  2.4860e-01],
        ...,
        [-1.2637e-01, -1.3083e-01,  1.0110e-04,  ..., -2.6098e-01,
          4.3356e-01,  6.3144e-02],
        [-1.7639e-01, -3.1500e-01,  5.4075e-03,  ..., -2.0239e-01,
          2.6251e-01,  1.0453e-01],
        [-1.7639e-01, -3.1489e-01,  5.4220e-03,  ..., -2.0232e-01,
          2.6252e-01,  1.0445e-01]])

In [37]:
all(embeddings[0][0][0] == embeddings2[0][0][0])

True

In [38]:
all(embeddings[0][0][1] == embeddings2[0][0][1])

True

In [39]:
all(embeddings[0][0][2] == embeddings2[0][0][2])

True

In [40]:
all(embeddings[0][0][3] == embeddings2[0][0][3])

True

In [41]:
all(embeddings[0][0][4] == embeddings2[0][0][4])

True

In [42]:
all(embeddings[0][0][5] == embeddings2[0][0][5])

True

In [43]:
all(embeddings[0][0][6] == embeddings2[0][0][6])

True

In [44]:
all(embeddings[0][0][7] == embeddings2[0][0][7])

False

In [45]:
all(embeddings[0][0][8] == embeddings2[0][0][8])

False

In [49]:
inputs3 = tokenizer("A rapid heartrate that exceeds the range of the normal resting heartrate for age.", return_tensors="pt")
with torch.no_grad():
    embeddings3 = model(**inputs3, output_hidden_states=True).hidden_states

In [50]:
embeddings3[-1][0]

tensor([[-0.3074, -0.1327,  0.0125,  ..., -0.4887,  0.1633,  0.2599],
        [-0.0959, -0.0583, -0.2491,  ..., -0.1717, -0.0475,  0.3064],
        [-0.2508,  0.0943, -0.4464,  ..., -0.2531, -0.8193,  0.1961],
        ...,
        [-0.3938, -0.1992, -0.3423,  ..., -0.2177, -0.0013,  0.0479],
        [-0.2669, -0.0567, -0.1716,  ..., -0.3587,  0.1777,  0.1864],
        [-0.2669, -0.0566, -0.1716,  ..., -0.3586,  0.1777,  0.1864]])

In [51]:
inputs4 = tokenizer("[CLS] Abnormal mitochondrial morphology", return_tensors="pt")
with torch.no_grad():
    embeddings4 = model(**inputs4, output_hidden_states=True).hidden_states

In [52]:
embeddings4[0][0]

tensor([[ 0.1079,  0.2989,  0.0111,  ...,  0.1501,  0.0076,  0.0868],
        [-0.3680, -0.1059,  0.0015,  ...,  0.2894, -0.0268, -0.0391],
        [-0.0907, -0.1663, -0.3307,  ...,  0.1273, -0.5079, -0.1308],
        [ 0.5174, -0.3276, -0.3643,  ...,  0.0572, -0.6786,  0.4968],
        [-0.0858, -0.2374,  0.5178,  ...,  0.3518, -0.5319, -0.3129],
        [ 0.0472, -0.0933, -0.2096,  ...,  0.1460,  0.0254,  0.3831]])

In [53]:
embeddings4[-1][0]

tensor([[-0.3730, -0.2231,  0.2971,  ..., -0.4035,  0.6624,  0.2740],
        [-0.3376, -0.1456,  0.5041,  ..., -0.2092,  0.2891,  0.1060],
        [-0.2214,  0.0038,  0.0025,  ..., -0.2787,  0.2070,  0.2588],
        [-0.1624, -0.0490,  0.2448,  ..., -0.0849, -0.1461,  0.1211],
        [-0.0432,  0.3704, -0.2316,  ..., -0.3551,  1.2227,  0.3730],
        [-0.2283, -0.1214,  0.0477,  ..., -0.1214,  0.1604,  0.0383]])

In [54]:
tokenizer.__dict__

{'_tokenizer': <tokenizers.Tokenizer at 0x5ce81720>,
 '_decode_use_source_tokenizer': False,
 'init_inputs': (),
 'init_kwargs': {'do_lower_case': True,
  'unk_token': '[UNK]',
  'sep_token': '[SEP]',
  'pad_token': '[PAD]',
  'cls_token': '[CLS]',
  'mask_token': '[MASK]',
  'tokenize_chinese_chars': True,
  'strip_accents': None,
  'special_tokens_map_file': None,
  'name_or_path': 'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext',
  'do_basic_tokenize': True,
  'never_split': None,
  'tokenizer_file': None},
 'name_or_path': 'microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext',
 '_processor_class': None,
 'model_max_length': 1000000000000000019884624838656,
 'padding_side': 'right',
 'truncation_side': 'right',
 'model_input_names': ['input_ids', 'token_type_ids', 'attention_mask'],
 'clean_up_tokenization_spaces': True,
 '_in_target_context_manager': False,
 '_bos_token': None,
 '_eos_token': None,
 '_unk_token': '[UNK]',
 '_sep_token': '[SEP]',
 '_pad_toke

In [55]:
inputs4

{'input_ids': tensor([[   2,    2, 4552, 4596, 6076,    3]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

In [57]:
input_list = ["Abnormality of the cardiovascular system","Abnormality on pulmonary function testing","Abnormal cell proliferation"]
inputs5 = tokenizer(input_list, return_tensors="pt",padding=True)
inputs5

{'input_ids': tensor([[    2, 14311,  1927,  1920,  5321,  2433,     3],
        [    2, 14311,  1990,  5352,  2347,  4213,     3],
        [    2,  4552,  2024,  4031,     3,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0, 0]])}

In [58]:
with torch.no_grad():
    embeddings5 = model(**inputs5, output_hidden_states=True).hidden_states

In [60]:
embeddings5[-1].shape

torch.Size([3, 7, 768])