In [28]:
from transformers import RobertaTokenizer, RobertaForMaskedLM
import torch
import torch.nn.functional as F
import csv

# Load pretrained model
model_name = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(model_name)
model = RobertaForMaskedLM.from_pretrained(model_name)
model.eval()

# ---- Load CSV once, preserving rows ----
with open(r"NGSL_lists\NEWEST_NGSL_2.csv", newline="", encoding="utf-8") as f:
    reader2 = csv.reader(f)
    csv_rows = [row for row in reader2]   # list of lists, each row preserved


def masked_tokens_probabilities(sentence: str, M, word_prob, i):
    # Tokenize once
    inputs = tokenizer(sentence, return_tensors="pt")

    # Find all mask positions
    mask_positions = (inputs["input_ids"] == tokenizer.mask_token_id).nonzero(as_tuple=False)

    # Forward pass once
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.nn.functional.softmax(logits, dim=-1)


    # Iterate over mask positions
    for pos in mask_positions[:, 1]:
        if pos > 2 and pos < (mask_positions[:, 1][-1] - 1):
            for j, row in enumerate(csv_rows):
                for tok in row:
                    token_id = tokenizer.convert_tokens_to_ids(tok)
                    prob = probs[0, pos, token_id].item()
                    M[i][j] += word_prob * prob


In [29]:
from wordfreq import word_frequency

# Function to compute frequency of a word
def get_word_frequency(word: str, lang: str = "en") -> float:
    """
    Returns the frequency of a word in the given language.
    By default, English ("en") is used.
    
    The result is a probability between 0 and 1.
    """
    return word_frequency(word, lang)


In [30]:
import numpy as np
import csv

with open(r"NGSL_lists\NEWEST_NGSL.csv", newline="") as f:
    n = sum(1 for _ in f)

M = np.zeros((n, n), dtype=np.float64)

i=-1

with open(r"NGSL_lists\NEWEST_NGSL.csv", newline="", encoding='utf-8') as f:
    reader = csv.reader(f)
    for row in reader:
        i+=1
        if(i%50 == 0):
            print(i)
        for word in row:
            if(word != ""):
                word_prob = get_word_frequency(word, "en")
                masked_tokens_probabilities("<mask> <mask> <mask> " + word + " <mask> <mask> <mask> <mask> <mask> <mask>", M, word_prob, i)
            
            



0
50
100
150
200
250
300
350
400
450
500
550
600
650
700
750
800
850
900
950
1000
1050
1100
1150
1200
1250
1300
1350
1400
1450
1500
1550
1600
1650
1700
1750
1800
1850
1900
1950
2000
2050
2100
2150
2200
2250
2300
2350
2400
2450
2500
2550
2600
2650
2700
2750
2800
2850
2900
2950
3000
3050
3100
3150
3200
3250
3300
3350
3400
3450
3500
3550
3600
3650
3700
3750
3800
3850
3900
3950
4000
4050
4100
4150
4200
4250
4300
4350
4400
4450
4500
4550


In [31]:
np.save("Probabilities_NGSL.npy", M)

In [38]:
M[4494][4495]

np.float64(1.3907099308312355e-10)

In [39]:
get_word_frequency("unlimited")

1.26e-05