In [56]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

def analyze_masks(sentence, model, tokenizer, search_word=None, top_n=5):
    """
    Inspect a masked-LM's predictions at every mask token of one sentence.

    For each mask position in the sentence:
    - Show rank/logit/prob of a searched word (if provided).
    - Show the top_n highest-scoring candidates.

    Args:
        sentence: A single string containing one or more occurrences of the
            tokenizer's mask token. Only the first sequence of the encoded
            batch is analysed.
        model: Masked-LM callable such that ``model(**inputs).logits`` has
            shape ``[batch, seq_len, vocab_size]``.
        tokenizer: Tokenizer matching ``model``; must define a mask token.
        search_word: Optional *vocabulary token* to look up. It is passed
            straight to ``tokenizer.convert_tokens_to_ids`` and is NOT
            tokenized, so it must be spelled exactly as in the vocabulary
            (the example in this notebook uses ``"Ġgrandmother"``).
        top_n: Number of candidates to report per mask. Values larger than
            the vocabulary size are clamped to the vocabulary size.

    Returns:
        dict mapping each mask position (int index into the token sequence)
        to an entry with:
        - ``"top_predictions"``: list of ``{"token", "logit", "prob"}`` dicts,
          best first.
        - ``"searched_word"`` (only when ``search_word`` is given):
          ``{"word", "logit", "prob", "rank", "vocab_size"}`` where ``rank``
          is 1-based.

    Raises:
        ValueError: if the tokenizer has no mask token, or if ``search_word``
            is not a token of the vocabulary (it would otherwise be silently
            replaced by the unknown token and produce misleading numbers).
    """
    mask_token_id = tokenizer.mask_token_id
    if mask_token_id is None:
        raise ValueError("tokenizer has no mask token; a masked-LM tokenizer is required")

    # Resolve the searched token before running the model so that a bad
    # argument fails fast instead of after an expensive forward pass.
    search_word_id = None
    if search_word:
        search_word_id = tokenizer.convert_tokens_to_ids(search_word)
        unk_id = getattr(tokenizer, "unk_token_id", None)
        unk_token = getattr(tokenizer, "unk_token", None)
        # Out-of-vocabulary tokens come back as the unknown-token id (or None).
        # Asking for the unknown token itself is still allowed.
        is_oov = search_word_id is None or (
            search_word_id == unk_id and search_word != unk_token
        )
        if is_oov:
            raise ValueError(
                f"search_word {search_word!r} is not a token in the tokenizer's "
                "vocabulary; pass the exact vocabulary token "
                "(see tokenizer.tokenize(...))"
            )

    inputs = tokenizer(sentence, return_tensors="pt")
    # Keep inputs on the same device as the model (e.g. after model.to("cuda")).
    device = getattr(model, "device", None)
    if device is not None:
        inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits  # [batch, seq_len, vocab_size]

    # Mask positions of the first (only) sequence, matching logits[0, ...] below.
    mask_positions = (inputs["input_ids"][0] == mask_token_id).nonzero(as_tuple=True)[0]

    vocab_size = logits.shape[-1]
    k = min(top_n, vocab_size)  # torch.topk raises if k > vocab_size

    results = {}
    for pos in mask_positions.tolist():
        mask_logits = logits[0, pos]
        probs = torch.softmax(mask_logits, dim=-1)

        # Top-N predictions
        top_logits, top_ids = torch.topk(mask_logits, k)
        tokens = tokenizer.convert_ids_to_tokens(top_ids.tolist())
        top_probs = probs[top_ids].tolist()

        entry = {
            "top_predictions": [
                {"token": t, "logit": l, "prob": p}
                for t, l, p in zip(tokens, top_logits.tolist(), top_probs)
            ]
        }

        # If searching a specific word
        if search_word_id is not None:
            word_logit = mask_logits[search_word_id]
            # 1-based rank = number of tokens scoring strictly higher, plus one.
            # This avoids sorting the whole vocabulary for every mask; on exact
            # ties it reports the best rank of the tied group.
            rank = int((mask_logits > word_logit).sum().item()) + 1

            entry["searched_word"] = {
                "word": search_word,
                "logit": word_logit.item(),
                "prob": probs[search_word_id].item(),
                "rank": rank,
                "vocab_size": vocab_size,
            }

        results[pos] = entry

    return results


# Example usage
if __name__ == "__main__":
    # Load the pretrained checkpoint together with its matching tokenizer.
    model_name = "roberta-base"
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForMaskedLM.from_pretrained(model_name)

    # Input consisting solely of mask tokens.
    sentence = "<mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask> <mask>"

    preds = analyze_masks(
        sentence=sentence,
        model=model,
        tokenizer=tokenizer,
        search_word="Ġgrandmother",
        top_n=2,
    )

    for pos, info in preds.items():
        # Report only masks located within the first 20 token positions.
        if not pos < 20:
            continue

        print(f"\nMask at position {pos}:")

        # Statistics for the searched token, present only when one was requested.
        if "searched_word" in info:
            sw = info["searched_word"]
            print(f"  Word '{sw['word']}': logit={sw['logit']:.2f}, prob={sw['prob']:.4f}, rank={sw['rank']} / {sw['vocab_size']}")

        # Best-scoring candidates for this mask, one per line.
        print("  Top predictions:")
        for c in info["top_predictions"]:
            print(f"    {c['token']:>10s} | logit={c['logit']:.2f} | prob={c['prob']:.4f}")



Mask at position 1:
  Word 'Ġgrandmother': logit=-4.19, prob=0.0000, rank=41109 / 50265
  Top predictions:
          </s> | logit=12.08 | prob=0.0854
           The | logit=11.20 | prob=0.0353

Mask at position 2:
  Word 'Ġgrandmother': logit=-2.90, prob=0.0000, rank=38365 / 50265
  Top predictions:
             : | logit=6.90 | prob=0.0250
          </s> | logit=6.83 | prob=0.0234

Mask at position 3:
  Word 'Ġgrandmother': logit=-3.27, prob=0.0000, rank=42499 / 50265
  Top predictions:
          </s> | logit=7.05 | prob=0.0316
             : | logit=6.85 | prob=0.0258

Mask at position 4:
  Word 'Ġgrandmother': logit=-3.11, prob=0.0000, rank=41060 / 50265
  Top predictions:
          </s> | logit=7.45 | prob=0.0475
             , | logit=7.09 | prob=0.0333

Mask at position 5:
  Word 'Ġgrandmother': logit=-3.04, prob=0.0000, rank=40347 / 50265
  Top predictions:
          </s> | logit=7.62 | prob=0.0566
             , | logit=7.16 | prob=0.0357

Mask at position 6:
  Word 'Ġgrandmot

In [57]:

# Tokenize a single word and print the resulting token strings.
# Relies on the `tokenizer` object created in the preceding cell, so that
# cell must have been run first.
word = " specifically"  # note the leading space; it appears as "Ġ" in the printed token
tokens = tokenizer.tokenize(word)
print(tokens)

['Ġspecifically']
