In [2]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

def analyze_masks(sentence, model, tokenizer, search_word=None, top_n=5):
    """
    Inspect a masked-LM's predictions at every mask token in ``sentence``.

    For each mask token in the sentence:
    - Show rank/logit/prob of a searched word (if provided).
    - Show top_n most probable candidates.

    Only the first sequence of the encoded batch is analyzed.

    Args:
        sentence: Text containing zero or more of the tokenizer's mask tokens.
        model: Callable such that ``model(**inputs).logits`` has shape
            ``[batch, seq_len, vocab_size]``. If it exposes a ``device``
            attribute, the encoded inputs are moved there before the call.
        tokenizer: Callable returning a mapping with ``"input_ids"`` and
            providing ``mask_token_id``, ``convert_tokens_to_ids`` and
            ``convert_ids_to_tokens``.
        search_word: Optional *vocabulary token* (not free text) to look up
            at each mask, e.g. ``"Ġpain"``. Falsy values disable the lookup.
        top_n: Number of highest-scoring candidates to report per mask;
            clamped to the vocabulary size.

    Returns:
        dict mapping each mask's token position (int) to a dict with
        ``"top_predictions"`` (list of ``{"token", "logit", "prob"}``) and,
        when ``search_word`` is given, ``"searched_word"``
        (``{"word", "logit", "prob", "rank", "vocab_size"}``; rank is 1-based).

    Raises:
        ValueError: If ``search_word`` is not a token of the vocabulary.
    """
    inputs = tokenizer(sentence, return_tensors="pt")

    # Keep inputs and weights on the same device (e.g. a model moved to GPU).
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {
            name: (value.to(device) if hasattr(value, "to") else value)
            for name, value in inputs.items()
        }

    with torch.no_grad():
        logits = model(**inputs).logits  # [batch, seq_len, vocab_size]

    # Restrict the mask lookup to row 0 so the positions always index into
    # the same sequence whose logits are read below.
    mask_token_id = tokenizer.mask_token_id
    mask_positions = (inputs["input_ids"][0] == mask_token_id).nonzero(as_tuple=True)[0]

    search_word_id = None
    if search_word:
        search_word_id = tokenizer.convert_tokens_to_ids(search_word)
        # Unknown tokens are mapped to the unk id instead of failing; without
        # this check the stats of the unk token would be reported under the
        # searched word's name.
        unk_id = getattr(tokenizer, "unk_token_id", None)
        unk_token = getattr(tokenizer, "unk_token", None)
        fell_back_to_unk = search_word_id == unk_id and search_word != unk_token
        if search_word_id is None or fell_back_to_unk:
            raise ValueError(
                f"search_word {search_word!r} is not a token in the tokenizer "
                "vocabulary; pass an exact vocabulary token."
            )

    vocab_size = logits.shape[-1]
    k = min(top_n, vocab_size)  # torch.topk rejects k > vocab_size

    results = {}
    for pos in mask_positions.tolist():
        mask_logits = logits[0, pos]
        probs = torch.softmax(mask_logits, dim=-1)

        # Top-N predictions
        top_logits, top_ids = torch.topk(mask_logits, k)
        tokens = tokenizer.convert_ids_to_tokens(top_ids.tolist())

        entry = {
            "top_predictions": [
                {"token": token, "logit": logit, "prob": prob}
                for token, logit, prob in zip(
                    tokens, top_logits.tolist(), probs[top_ids].tolist()
                )
            ]
        }

        # If searching a specific word
        if search_word_id is not None:
            word_logit = mask_logits[search_word_id]
            # 1-based rank = number of strictly better tokens + 1. This is
            # deterministic; tokens with tied logits share the best rank.
            rank = int((mask_logits > word_logit).sum().item()) + 1

            entry["searched_word"] = {
                "word": search_word,
                "logit": word_logit.item(),
                "prob": probs[search_word_id].item(),
                "rank": rank,
                "vocab_size": vocab_size,
            }

        results[pos] = entry

    return results


# Example usage
if __name__ == "__main__":
    # Load the pretrained checkpoint together with its matching tokenizer.
    model_name = "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    # Probe sentence: a single real word surrounded by mask tokens.
    sentence = "<mask> <mask> <mask> abdominals <mask> <mask> <mask> <mask> <mask> <mask>"

    preds = analyze_masks(sentence, model, tokenizer, search_word="Ġpain", top_n=1)

    for pos, info in preds.items():
        # Report only the masks that sit before token position 20.
        if pos >= 20:
            continue

        print(f"\nMask at position {pos}:")

        # Stats for the searched token are present only when a search was requested.
        if "searched_word" in info:
            sw = info["searched_word"]
            print(f"  Word '{sw['word']}': logit={sw['logit']:.2f}, prob={sw['prob']:.8f}, rank={sw['rank']} / {sw['vocab_size']}")

        print("  Top predictions:")
        for c in info["top_predictions"]:
            print(f"    {c['token']:>10s} | logit={c['logit']:.2f} | prob={c['prob']:.8f}")



Mask at position 1:
  Word 'Ġpain': logit=0.97, prob=0.00000067, rank=10733 / 50265
  Top predictions:
             L | logit=12.13 | prob=0.04729617

Mask at position 2:
  Word 'Ġpain': logit=2.75, prob=0.00008734, rank=1681 / 50265
  Top predictions:
           Ġof | logit=8.40 | prob=0.02499210

Mask at position 3:
  Word 'Ġpain': logit=1.37, prob=0.00000512, rank=4879 / 50265
  Top predictions:
          Ġand | logit=11.72 | prob=0.16075793

Mask at position 6:
  Word 'Ġpain': logit=2.53, prob=0.00001743, rank=1714 / 50265
  Top predictions:
             , | logit=12.17 | prob=0.26642850

Mask at position 7:
  Word 'Ġpain': logit=1.81, prob=0.00005232, rank=2432 / 50265
  Top predictions:
          </s> | logit=8.33 | prob=0.03569919

Mask at position 8:
  Word 'Ġpain': logit=1.93, prob=0.00004543, rank=2527 / 50265
  Top predictions:
             , | logit=8.53 | prob=0.03357203

Mask at position 9:
  Word 'Ġpain': logit=1.87, prob=0.00003874, rank=2705 / 50265
  Top predictions:

In [3]:

# Show the sub-word tokens the tokenizer produces for a word with a leading
# space. Relies on the `tokenizer` object created in the previous cell.
word = " diversions"
tokens = tokenizer.tokenize(word)
print(tokens)

['Ġdivers', 'ions']
