In [25]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

def analyze_masks(sentence, model, tokenizer, search_word=None, top_n=5):
    """
    Inspect a masked-LM's predictions at every mask token of ``sentence``.

    For each mask position the function reports:
    - the ``top_n`` highest-scoring vocabulary tokens (token, logit, prob);
    - if ``search_word`` is given, that token's logit, probability and
      1-based rank within the whole vocabulary.

    Args:
        sentence: Text containing one or more mask tokens. Only the first
            sequence of the encoded batch is analysed.
        model: Callable masked-LM; ``model(**inputs).logits`` must be shaped
            ``[batch, seq_len, vocab_size]``.
        tokenizer: Tokenizer matching ``model``; must define a mask token.
        search_word: Optional *vocabulary token* to look up (written exactly
            as it is stored in the vocabulary, including any word-boundary
            marker). ``None`` or ``""`` disables the lookup.
        top_n: Number of top candidates to return per mask. Values larger
            than the vocabulary size are clamped to the vocabulary size.

    Returns:
        dict mapping each mask position (int index into the token sequence)
        to ``{"top_predictions": [...], "searched_word": {...}}``; the
        ``"searched_word"`` key is present only when ``search_word`` is given.
        An input without mask tokens yields an empty dict.

    Raises:
        ValueError: If the tokenizer has no mask token, or if ``search_word``
            is not a single token of the tokenizer's vocabulary.
    """
    mask_token_id = tokenizer.mask_token_id
    if mask_token_id is None:
        raise ValueError("tokenizer has no mask token; a masked-LM tokenizer is required")

    search_word_id = None
    if search_word:
        search_word_id = tokenizer.convert_tokens_to_ids(search_word)
        # An out-of-vocabulary token is mapped to the unknown-token id (or to
        # None when the tokenizer has no unknown token). Reporting statistics
        # for that id as if they belonged to `search_word` would be misleading.
        unk_id = getattr(tokenizer, "unk_token_id", None)
        fell_back_to_unk = (
            unk_id is not None
            and search_word_id == unk_id
            and search_word != getattr(tokenizer, "unk_token", None)
        )
        if search_word_id is None or fell_back_to_unk:
            raise ValueError(
                f"search_word {search_word!r} is not a single token in the tokenizer vocabulary"
            )

    inputs = tokenizer(sentence, return_tensors="pt")
    # Keep inputs on the same device as the model (no-op for CPU models or
    # for model objects that do not expose a `device` attribute).
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits  # [batch, seq_len, vocab_size]

    # Positions are taken from the first sequence only, matching logits[0] below.
    first_sequence_ids = inputs["input_ids"][0]
    mask_positions = (first_sequence_ids == mask_token_id).nonzero(as_tuple=True)[0].tolist()

    results = {}
    for pos in mask_positions:
        mask_logits = logits[0, pos]
        probs = torch.softmax(mask_logits, dim=-1)
        vocab_size = mask_logits.shape[0]

        # Top-N predictions; torch.topk raises if k exceeds the vocabulary size.
        k = min(top_n, vocab_size)
        top_logits, top_ids = torch.topk(mask_logits, k)
        tokens = tokenizer.convert_ids_to_tokens(top_ids.tolist())
        top_probs = probs[top_ids].tolist()

        entry = {
            "top_predictions": [
                {"token": t, "logit": l, "prob": p}
                for t, l, p in zip(tokens, top_logits.tolist(), top_probs)
            ]
        }

        # If searching a specific word
        if search_word_id is not None:
            word_logit = mask_logits[search_word_id]
            # 1-based rank = number of strictly larger logits + 1. This avoids
            # sorting the whole vocabulary and is deterministic under ties.
            rank = int((mask_logits > word_logit).sum().item()) + 1

            entry["searched_word"] = {
                "word": search_word,
                "logit": word_logit.item(),
                "prob": probs[search_word_id].item(),
                "rank": rank,
                "vocab_size": vocab_size,
            }

        results[pos] = entry

    return results


# Example usage
if __name__ == "__main__":
    # Load the pretrained checkpoint together with its matching tokenizer.
    model_name = "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    # One real word ("book") surrounded by mask tokens on both sides.
    sentence = "<mask> <mask> <mask> <mask> <mask> <mask> book <mask> <mask> <mask> <mask> <mask> <mask> <mask>"

    preds = analyze_masks(sentence, model, tokenizer, search_word="Ġpages", top_n=100)

    # Report a single mask only: the one at token index 6.
    # NOTE(review): presumably this is the mask right before "book" once the
    # tokenizer's leading special token is counted — confirm.
    pos = 6
    if pos in preds:
        info = preds[pos]
        print(f"\nMask at position {pos}:")

        sw = info.get("searched_word")
        if sw is not None:
            print(f"  Word '{sw['word']}': logit={sw['logit']:.2f}, prob={sw['prob']:.4f}, rank={sw['rank']} / {sw['vocab_size']}")

        print("  Top predictions:")
        for c in info["top_predictions"]:
            print(f"    {c['token']:>10s} | logit={c['logit']:.2f} | prob={c['prob']:.4f}")



Mask at position 6:
  Word 'Ġpages': logit=0.94, prob=0.0000, rank=5210 / 50265
  Top predictions:
          Ġnew | logit=10.81 | prob=0.1607
          Ġthe | logit=10.62 | prob=0.1332
            's | logit=9.74 | prob=0.0551
            Ġa | logit=9.19 | prob=0.0318
          ĠThe | logit=9.15 | prob=0.0305
        Ġfirst | logit=9.12 | prob=0.0297
          Ġhis | logit=8.82 | prob=0.0219
         Ġthis | logit=8.73 | prob=0.0201
       Ġsecond | logit=8.62 | prob=0.0180
    Ġbestselling | logit=8.35 | prob=0.0138
       Ġlatest | logit=8.24 | prob=0.0122
             , | logit=8.04 | prob=0.0101
         Ġnext | logit=7.77 | prob=0.0077
           The | logit=7.68 | prob=0.0070
         ĠThis | logit=7.63 | prob=0.0067
        Ġthird | logit=7.46 | prob=0.0056
          Ġher | logit=7.43 | prob=0.0055
         Ġbest | logit=7.39 | prob=0.0052
           Ġof | logit=7.29 | prob=0.0048
    Ġexcellent | logit=7.16 | prob=0.0042
            ĠA | logit=7.10 | prob=0.0039
           Ġmy

In [17]:
# Check how the tokenizer splits a single word into sub-word pieces.
# NOTE: `tokenizer` is not created in this cell; it must already exist in the
# kernel from an earlier-executed cell.
word = "toilet"
tokens = tokenizer.tokenize(word)
# Show the list of pieces produced for `word`.
print(tokens)

['to', 'ilet']
