In [1]:
from transformers import pipeline, AutoTokenizer, AutoModelForMaskedLM
from sentence_transformers import util

In [2]:
# model_name = "nasa-impact/bert-e-base-mlm"
# model = AutoModelForMaskedLM.from_pretrained(model_name)
# tokenizer = AutoTokenizer.from_pretrained(model_name)
# model.save_pretrained("./bert")
# tokenizer.save_pretrained("./bert")

model = AutoModelForMaskedLM.from_pretrained("./bert")
tokenizer = AutoTokenizer.from_pretrained("./bert")

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [37]:
document = """This year marks half a century since Stephen Hawking made his greatest scientific discovery by theoretically proving that “black holes ain’t so black”, as they behave like hot bodies with an absolute temperature that depends inversely on their mass. This discovery is expressed by a simple and elegant equation known as the Hawking temperature. The best way to commemorate this great scientific event is by bringing it to a wide audience. The simplest and most transparent and intuitive tool to achieve this goal is dimensional analysis. The objective of this work is to use this tool to derive the Hawking equation, reveal its meaning, and explore its main physical consequences"""

summary = """In this paper, the authors commemorate the half-century anniversary of Stephen Hawking\'s discovery that "black holes ain\'t so black" by describing its temperature with an absolute temperature proportional to its mass. They use a technique called dimensional analysis to derive the "Hawking equation" and explore its possible consequences. The authors acknowledge that the scientific value of this discovery remains largely unexplored, and they outline their plan to make it accessible to a wide audience. In particular, they focus on the "dimensional analysis" part of the task, which will be useful to gifted high school students, physics teachers, scientists and engineers, and"""

In [38]:
summary = summary.lower().strip()
document = document.lower().strip()

In [39]:
summary_tokens = tokenizer(summary, return_tensors="pt", padding=True, truncation=True, max_length=128)
document_tokens = tokenizer(document, return_tensors="pt", padding=True, truncation=True, max_length=128)

In [40]:
summary_embedding = model(**summary_tokens)
document_embedding = model(**document_tokens)

In [41]:
document_embedding[0].shape

torch.Size([1, 128, 31116])

In [42]:
summary_embedding[0].shape

torch.Size([1, 128, 31116])

In [43]:
a = document_embedding[0].detach().numpy().reshape(1,-1)
b = summary_embedding[0].detach().numpy().reshape(1,-1)

In [44]:
util.cos_sim(a,b)

tensor([[0.8463]])

In [45]:
util.euclidean_sim(a,b)

tensor([[-7250.8081]])

tensor(0., grad_fn=<LinalgVectorNormBackward0>)