<a href="https://colab.research.google.com/github/George-Okello/Ambiguity/blob/main/Ambiguity.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Install HuggingFace Transformers and Datasets libraries
!pip install transformers datasets -q


In [2]:
from datasets import load_dataset

# Load the WSC dataset
dataset = load_dataset("super_glue", "wsc.fixed")
dataset = dataset["validation"]  # Using the validation set for testing


README.md: 0.00B [00:00, ?B/s]

wsc.fixed/train-00000-of-00001.parquet:   0%|          | 0.00/27.3k [00:00<?, ?B/s]

wsc.fixed/validation-00000-of-00001.parq(…):   0%|          | 0.00/10.2k [00:00<?, ?B/s]

wsc.fixed/test-00000-of-00001.parquet:   0%|          | 0.00/12.2k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/554 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/104 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/146 [00:00<?, ? examples/s]

In [3]:
# View a sample
sample = dataset[0]
for k, v in sample.items():
    print(f"{k}: {v}")


text: Bernard , who had not told the government official that he was less than 21 when he filed for a homestead claim, did not consider that he had done anything dishonest. Still, anyone who knew that he was 19 years old could take his claim away from him .
span1_index: 32
span2_index: 47
span1_text: anyone
span2_text: him
idx: 0
label: 0


In [4]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelForMaskedLM.from_pretrained("bert-base-uncased")
model.eval()


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwi

In [5]:
from torch.nn.functional import softmax

def score_sentence(sentence, target):
    # Tokenize and locate the token span
    inputs = tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        logits = model(**inputs).logits
    # Get score for entire sentence (log prob of each token)
    input_ids = inputs.input_ids[0]
    log_probs = softmax(logits[0], dim=-1)
    token_probs = [log_probs[i, input_ids[i]].item() for i in range(len(input_ids))]
    return sum(torch.log(torch.tensor(token_probs))).item()


In [6]:
text = sample["text"]
span1 = sample["span1_text"]
span2 = sample["span2_text"]

# Replace ambiguous pronoun with noun
text_with_sub = text.replace(span1, span2)

# Score both
original_score = score_sentence(text, span1)
sub_score = score_sentence(text_with_sub, span2)

print(f"Original: {text}")
print(f"Modified: {text_with_sub}")
print(f"Score with pronoun: {original_score:.2f}")
print(f"Score with candidate: {sub_score:.2f}")

# Prediction
prediction = int(sub_score > original_score)
print(f"Predicted label: {prediction}, True label: {sample['label']}")


  return forward_call(*args, **kwargs)


Original: Bernard , who had not told the government official that he was less than 21 when he filed for a homestead claim, did not consider that he had done anything dishonest. Still, anyone who knew that he was 19 years old could take his claim away from him .
Modified: Bernard , who had not told the government official that he was less than 21 when he filed for a homestead claim, did not consider that he had done anything dishonest. Still, him who knew that he was 19 years old could take his claim away from him .
Score with pronoun: -41.93
Score with candidate: -43.39
Predicted label: 0, True label: 0


In [7]:
correct = 0

for i in range(len(dataset)):
    ex = dataset[i]
    if ex['span1_text'] not in ex['text']:
        continue  # Skip malformed entries
    text = ex['text']
    span1 = ex['span1_text']
    span2 = ex['span2_text']
    label = ex['label']

    try:
        text_with_sub = text.replace(span1, span2)
        original_score = score_sentence(text, span1)
        sub_score = score_sentence(text_with_sub, span2)
        prediction = int(sub_score > original_score)
        correct += (prediction == label)
    except:
        continue

accuracy = correct / len(dataset)
print(f"\n📊 Accuracy: {accuracy * 100:.2f}%")



📊 Accuracy: 58.65%


Evaluate RoBERTa

In [8]:
from transformers import AutoTokenizer, AutoModelForMaskedLM
import torch
from torch.nn.functional import softmax
from datasets import load_dataset

# Load model
roberta_tokenizer = AutoTokenizer.from_pretrained("roberta-large")
roberta_model = AutoModelForMaskedLM.from_pretrained("roberta-large").eval()

# Load WSC data
wsc = load_dataset("super_glue", "wsc.fixed")["validation"]


tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

In [9]:
def roberta_score_sentence(sentence):
    inputs = roberta_tokenizer(sentence, return_tensors="pt")
    with torch.no_grad():
        logits = roberta_model(**inputs).logits
    input_ids = inputs.input_ids[0]
    probs = softmax(logits[0], dim=-1)
    token_probs = [probs[i, input_ids[i]].item() for i in range(len(input_ids))]
    return sum(torch.log(torch.tensor(token_probs))).item()


In [None]:
correct = 0
total = 0

for ex in wsc:
    try:
        t = ex['text']
        span1 = ex['span1_text']
        span2 = ex['span2_text']
        if span1 not in t or span2 not in t:
            continue
        modified = t.replace(span2, span1)
        orig_score = roberta_score_sentence(t)
        mod_score = roberta_score_sentence(modified)
        pred = int(mod_score > orig_score)
        if pred == ex['label']:
            correct += 1
        total += 1
    except Exception:
        continue

print(f"📊 RoBERTa Accuracy: {correct / total * 100:.2f}%")


  return forward_call(*args, **kwargs)
