In [1]:
import json
import numpy as np
import spacy

nlp = spacy.load("en_core_web_sm")

valid_data = json.load(open("../data/valid_text_features_matres.json"))
valid_data_ids = json.load(open("../data/valid_features_matres.json"))
num_dict = {0: "before", 1: "after", 2: "equal", 3: "vague"}
ht_prob_dict = np.load("./ht_prob_dict.npy", allow_pickle=True)[()]

In [5]:
from collections import Counter
Counter([len([num_dict[l] for l in item['labels']]) for item in valid_data])

Counter({1: 6404})

In [8]:
ht_prob_dict[()][('predict', 'try')]

({'before': 1.0, 'after': 0.0, 'equal': 0.0}, 1)

# 1. ht bias

In [14]:
from tqdm import tqdm

selected_items = []
ht_freq_dict = {}
for item in tqdm(valid_data):
    h = nlp(item['e1'])[0].lemma_
    t = nlp(item['e2'])[0].lemma_
    rs = [num_dict[l] for l in item['labels']]
    assert len(rs) == 1
    r = rs[0]
    prob, freq = ht_prob_dict.get((h, t), ({"before":0, "after":0, "equal":0, "vague":0}, 0))
    prob = prob.get(r, 1)
    
    if r in ["before", "after"]:
        if prob < 0.3 and freq > 2:
            selected_items.append(item)
    if r == "equal":
        if prob < 0.1 and freq > 2: 
            selected_items.append(item)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6404/6404 [01:01<00:00, 103.68it/s]


In [15]:
len(selected_items)

128

In [None]:
with open('./valid_text_features_matres_erp_bias.json', 'w') as writer:
    json.dump(tense_selected_items, writer) 

# 2. tense bias

In [16]:
tense_prob_dict = np.load("./tense_prob_dict.npy", allow_pickle=True)[()]

In [17]:
tense_prob_dict

{('VBN',
  'VBN'): ({'before': 0.47752808988764045,
   'after': 0.47752808988764045,
   'equal': 0.0449438202247191}, 534),
 ('VBN',
  'VB'): ({'before': 0.7986577181208053,
   'after': 0.1610738255033557,
   'equal': 0.040268456375838924}, 149),
 ('VBN',
  'VBZ'): ({'before': 0.6629213483146067,
   'after': 0.30337078651685395,
   'equal': 0.033707865168539325}, 178),
 ('VB',
  'VBZ'): ({'before': 0.25,
   'after': 0.6818181818181818,
   'equal': 0.06818181818181818}, 44),
 ('VBZ',
  'VBN'): ({'before': 0.30337078651685395,
   'after': 0.6629213483146067,
   'equal': 0.033707865168539325}, 178),
 ('VBZ',
  'VBD'): ({'before': 0.5120274914089347,
   'after': 0.44329896907216493,
   'equal': 0.044673539518900345}, 291),
 ('VBN',
  'VBD'): ({'before': 0.6976744186046512,
   'after': 0.27906976744186046,
   'equal': 0.023255813953488372}, 1032),
 ('VBD',
  'VBD'): ({'before': 0.48053352559480894,
   'after': 0.48053352559480894,
   'equal': 0.03893294881038212}, 2774),
 ('VBD',
  'VB'): (

In [None]:
tense_selected_items = []
tense_selected_items_ids = []

for item, item_ids in tqdm(zip(valid_data, valid_data_ids), total=len(valid_data)):
    text = item['text'].replace('[CLS]', "").replace('[SEP]', "").strip()
    parsed_text = nlp(text)
        
    for i in range(len(parsed_text)):
        if parsed_text[i].text == item['e1']:
            h = parsed_text[i].lemma_
            tense_h = parsed_text[i].tag_
        if parsed_text[i].text == item['e2']:
            t = parsed_text[i].lemma_
            tense_t = parsed_text[i].tag_
    
    rs = [num_dict[l] for l in item['labels']]
    assert len(rs) == 1
    r = rs[0]
    prob, freq = tense_prob_dict.get((tense_h, tense_t), ({"before":0, "after":0, "equal":0, "vague":0}, 0))
    prob = prob.get(r, 1)
    
    if r in ["before", "after"]:
        if prob < 0.3 and freq > 2:
            tense_selected_items.append(item)
            tense_selected_items_ids.append(item_ids)
    if r == "equal":
        if prob < 0.1 and freq > 2: 
            tense_selected_items.append(item)
            tense_selected_items_ids.append(item_ids)

In [28]:
len(tense_selected_items)

1093

In [23]:
# convert to the input_ids forms.

tense_selected_items[0]

{'text': "[CLS] Gonzalez's arrival should also help clear the way for the Justice Department to make arrangements to transfer custody of the child.[SEP] The Miami relatives have left unclear whether they will willingly relinquish custody of Elian, and have done their best to stir passions against the Justice Department among Cuban-Americans in South Florida.[SEP] In hopes of trumping the law, they have unreasonably demanded that a panel of child psychologists determine Elian's fate.[SEP]",
 'e1': 'left',
 'e2': 'stir',
 'labels': [2]}

In [29]:
with open('./valid_features_matres_tense_bias.json', 'w') as writer:
    json.dump(tense_selected_items_ids, writer) 

# General Reporting Bias by LMs

- RoBERTa-large
- GPT2-XL

In [1]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM

tokenizer = AutoTokenizer.from_pretrained("roberta-large")
model = AutoModelForMaskedLM.from_pretrained("roberta-large").to("cuda")

In [37]:
import torch.nn.functional as F

e1 = "arrest"
e2 = "kill"


relation_token_ids = {"before":tokenizer.encode("before", add_special_tokens=False)[0],
                      "after":tokenizer.encode("after", add_special_tokens=False)[0],
                      "equal":tokenizer.encode("during", add_special_tokens=False)[0]
                    }

with torch.no_grad():
  text = f"{e1} happens {tokenizer.mask_token} {e2}"
  input = tokenizer(text, return_tensors="pt").to("cuda")
  mask_token_index = torch.where(input["input_ids"][0] == tokenizer.mask_token_id)[0]
  token_logits = model(**input).logits
  mask_token_logits = token_logits[0, mask_token_index, :]
  mask_token_softmax = F.softmax(mask_token_logits, dim=1)

In [39]:
torch.argmax(mask_token_softmax[0][[relation_token_ids["before"], relation_token_ids["after"], relation_token_ids["equal"]]])

tensor(1, device='cuda:0')

In [30]:
mask_token_softmax[0][[relation_token_ids["before"], relation_token_ids["after"], relation_token_ids["equal"]]]

tensor([3.1666e-07, 1.0508e-07, 7.8023e-08], device='cuda:0')

In [11]:
tokenizer(text, return_tensors="pt")

{'input_ids': tensor([[    0, 16424,  2594, 50264,  2237,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}

## acquire from subset

In [32]:
features_valid_subset = json.load(open("../data/valid_subset_text.json"))

In [46]:
tense_subset_idx = []

for item in tqdm(features_valid_subset):
    text = item['text'].replace('[CLS]', "").replace('[SEP]', "").strip()
    parsed_text = nlp(text)
        
    for i in range(len(parsed_text)):
        if parsed_text[i].text == item['e1']:
            h = parsed_text[i].lemma_
            tense_h = parsed_text[i].tag_
        if parsed_text[i].text == item['e2']:
            t = parsed_text[i].lemma_
            tense_t = parsed_text[i].tag_
    
    rs = [num_dict[l] for l in item['labels']]
    assert len(rs) == 1
    r = rs[0]
    prob_dict, freq = tense_prob_dict.get((tense_h, tense_t), ({"before":0, "after":0, "equal":0, "vague":0}, 0))
    prob = prob_dict.get(r, 1)
    
    if r in ["before", "after"]:
        if prob < 0.3 and freq > 2:
            tense_subset_idx.append(item)
    if r == "equal":
        if prob < 0.1 and freq > 2: 
            tense_subset_idx.append(item)
    if r == "vague":
        if any(prob_dict[r_]>0.6 and freq > 2 for r_ in ["before", "after", "equal"]):
            tense_subset_idx.append(item)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:21<00:00, 46.51it/s]


In [38]:
with open('./valid_subset_text_tense_bias.json', 'w') as writer:
    json.dump(tense_subset_idx, writer) 

In [50]:
with open('./valid_subset_text_tense_bias_vague.json', 'w') as writer:
    json.dump(tense_subset_idx, writer) 

In [49]:
len(tense_subset_idx)

210