In [46]:
from pathlib import Path
import re
from typing import Dict, List
from collections import Counter, defaultdict
from itertools import combinations
import pandas as pd
import json
import numpy as np

You can find the full dataset with annotated inter-object causality at https://drive.google.com/file/d/17CTPMoZ4uJH6cSQaxD6Vmyv_bVLJwVJp/view



Some auxiliary functions

In [47]:
def clean_up(x: str) -> str:
    """Rewrite a stringified Python list/tuple cell into text that ``json.loads`` can parse.

    Single-quoted strings become double-quoted, parentheses become square
    brackets and ``None`` becomes ``null``. Apostrophes that sit between two
    word characters (as in "pitcher's") are kept as apostrophes.
    """
    placeholder = '@@TMP@@'
    # Hide in-word apostrophes so the quote swap below does not touch them.
    text = re.sub(r'(?<=\w)\'(?=\w)', placeholder, x)
    # Applied strictly in this order; each step sees the output of the previous one.
    substitutions = (
        ('\'', '\"'),
        ('\\\'', '\"'),  # cannot match any more: every apostrophe was replaced just above
        ('\"\"?\"\"', '\"?\"'),  # doubled quotes around a lone question mark
        ('(', '['),
        (')', ']'),
        (placeholder, '\''),  # restore the protected apostrophes
        ('None', 'null'),
    )
    for old, new in substitutions:
        text = text.replace(old, new)
    return text

def max_response(row, w_for_cfdnce_lvl: Dict[str, float]):
    '''
    Find the response with the largest (confidence-weighted) tally for one row.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the keys
            'cause_directions' and 'confidences', two parallel sequences.
        w_for_cfdnce_lvl: weight added to a response's tally for each
            confidence level found in row['confidences'].

    Returns: a tuple (max_resp, count_fraction, count) where
        max_resp is the response with the highest weighted tally (the first one
            seen wins ties),
        count_fraction is that tally divided by the number of responses,
        count is the weighted tally itself.
        If there is nothing to tally, (None, 0.0, 0) is returned.

    '''
    directions = row['cause_directions']
    c = Counter()
    for k, cfdnce_lvl in zip(directions, row['confidences']):
        c[k] += w_for_cfdnce_lvl[cfdnce_lvl]
    if not c:
        # No responses at all: report zero agreement instead of failing on [0] / division by zero.
        return None, 0.0, 0
    max_resp, count = c.most_common(1)[0]
    count_fraction = count / len(directions)
    return max_resp, count_fraction, count

In [48]:
DATA_PATH = 'mturk_responses_clean.csv'
# These columns hold stringified lists; each cell is normalised by clean_up and then parsed as JSON.
list_columns = ['cause_directions', 'confounders', 'confidences']
responses = pd.read_csv(
    DATA_PATH,
    index_col=0,
    converters=dict.fromkeys(list_columns, lambda cell: json.loads(clean_up(cell))),
)
# 'pair_of_objects' is split on " // " into its two words.
split_pairs = [pair.split(" // ") for pair in responses['pair_of_objects'].tolist()]
responses[['word_X', 'word_Y']] = pd.DataFrame(split_pairs, index=responses.index)
print(responses.head())

          pair_of_objects                             cause_directions                 confounders                                                             confidences      word_X    word_Y
0         trick // skater     [y-to-x, y-to-x, y-to-x, y-to-x, y-to-x]                  [, , , , ]  [confidence_3, confidence_3, confidence_3, confidence_3, confidence_2]       trick    skater
1         sinks // vanity    [x-is-y, y-to-x, x-to-y, z-to-xy, x-to-y]  [bathroom, , , bathroom, ]  [confidence_3, confidence_2, confidence_2, confidence_2, confidence_2]       sinks    vanity
2  toiletries // products    [z-to-xy, y-to-x, y-to-x, y-to-x, x-to-y]                 [?, , , , ]  [confidence_2, confidence_2, confidence_2, confidence_2, confidence_3]  toiletries  products
3             roof // sky  [z-to-xy, x-to-y, z-to-xy, x-to-y, z-to-xy]            [?, , ?, , rain]  [confidence_1, confidence_3, confidence_1, confidence_2, confidence_2]        roof       sky
4       laptops // office     [y-to

Filter the responses to use only those with sufficient agreement, possibly weighting agreement by confidence level

In [49]:
# Per-setting weights for (confidence_1, confidence_2, confidence_3).
w_for_cfdnce_setting = {
    "no_cfdnce_weight": [1, 1, 1],
    "half_cfdnce_weight": [.5, .75, 1],
    "full_cfdnce_weight": [1 / 3, 2 / 3, 3 / 3],
}
min_agreement = .8  # minimum max_resp_fraction for a pair to be kept
min_n_responses = 5  # minimum number of responses for a pair to be kept
cfdnce_setting = "no_cfdnce_weight"
cfdnce_weighting = w_for_cfdnce_setting[cfdnce_setting]
w_for_cfdnce_lvl = {
    "confidence_1": cfdnce_weighting[0],
    "confidence_2": cfdnce_weighting[1],
    "confidence_3": cfdnce_weighting[2],
    None: 0, # Ignore some cases where input is None
}
responses[['max_resp', 'max_resp_fraction', 'max_resp_count']] = pd.DataFrame(
    [max_response(row, w_for_cfdnce_lvl) for _, row in responses.iterrows()], index=responses.index)
# Column-wise mask instead of a row-wise apply: same three conditions, evaluated for all rows at once.
sufficient_agreement = (
    (responses['max_resp_fraction'] >= min_agreement)
    & (responses['max_resp'] != 'x-is-y')
    & (responses['cause_directions'].map(len) >= min_n_responses)
)
filtered_responses = responses[sufficient_agreement]
# Explicit copy: gt_for_pair gets new columns later on, which must not be written onto a slice.
gt_for_pair = filtered_responses[['word_X', 'word_Y', 'max_resp']].copy()
print(gt_for_pair.head())

     word_X  word_Y max_resp
0     trick  skater   y-to-x
4   laptops  office   y-to-x
7    person   shirt   x-to-y
8     table     man  z-to-xy
10     face    tree  z-to-xy


As explained in the paper, for DeVLBERT only one effect variable is used (in other words, the confounders are effectively causes).
To extract triplets from the data, use the following code.

In [50]:
# Keep only the pairs whose agreed answer is a direct causal direction.
cause_only_responses = filtered_responses[filtered_responses['max_resp'].isin(['x-to-y', 'y-to-x'])]
# Orient every pair as (cause, effect).
x_to_y = [(a.word_X, a.word_Y) if a.max_resp == 'x-to-y' else (a.word_Y, a.word_X) for a in
          cause_only_responses.itertuples(index=False)]
# cause -> list of its effects, in order of appearance.
ys_for_x = defaultdict(list)
for x, y in x_to_y:
    ys_for_x[x].append(y)


# One triple per unordered pair of effects sharing a cause; combinations() yields
# nothing for causes with fewer than two effects, so no length check is needed.
confnd_triples = [f"{eff1}⬅{cause}➡{eff2}"
                  for cause, effects in ys_for_x.items()
                  for eff1, eff2 in combinations(effects, 2)]
print(len(confnd_triples))
print(confnd_triples[:10])

778
['laptops⬅office➡computers', 'laptops⬅office➡telephone', 'laptops⬅office➡office chair', 'laptops⬅office➡calculator', 'laptops⬅office➡monitors', 'computers⬅office➡telephone', 'computers⬅office➡office chair', 'computers⬅office➡calculator', 'computers⬅office➡monitors', 'telephone⬅office➡office chair']


In [51]:
# Globally silences pandas' chained-assignment (SettingWithCopy) warning for the rest of the session.
pd.options.mode.chained_assignment = None  # default='warn'
class GT:
    """Cached lookup of the annotated pairs that involve a given object id."""

    def __init__(self, gt_for_pair):
        """
        Args:
            gt_for_pair: DataFrame with (at least) the columns 'ID_X', 'ID_Y' and 'max_resp'.
        """
        self.cache = {}
        self.gt_for_pair = gt_for_pair

    def get_known_gts(self, effect_object_id):
        """
        Return the rows of ``gt_for_pair`` in which ``effect_object_id`` is ID_X or ID_Y.

        Two columns are added to the returned frame:
            cause_candidate: the id of the other object of the pair.
            cause_candidate_label: 'cause' if the candidate causes the queried object
                (candidate is X with 'x-to-y', or candidate is Y with 'y-to-x'),
                'mere_correlate' if max_resp is 'z-to-xy', otherwise 'effect'.

        Results are cached per ``effect_object_id``; repeated calls return the same object.
        """
        if effect_object_id in self.cache:
            return self.cache[effect_object_id]

        pairs = self.gt_for_pair
        involves_object = (effect_object_id == pairs['ID_X']) | (effect_object_id == pairs['ID_Y'])
        # Copy so the new columns are written to an independent frame, not to a slice of gt_for_pair.
        known_gts = pairs[involves_object].copy()
        # The candidate is whichever side of the pair is not the queried object.
        known_gts['cause_candidate'] = known_gts['ID_X'].where(known_gts['ID_Y'] == effect_object_id,
                                                               known_gts['ID_Y'])
        candidate_is_x = known_gts['ID_X'] == known_gts['cause_candidate']
        candidate_is_y = known_gts['ID_Y'] == known_gts['cause_candidate']
        # The candidate is a cause when the annotated arrow starts at it, whichever side it is on.
        candidate_is_cause = (candidate_is_x & (known_gts['max_resp'] == 'x-to-y')) | \
                             (candidate_is_y & (known_gts['max_resp'] == 'y-to-x'))
        known_gts['cause_candidate_label'] = np.where(
            candidate_is_cause,
            'cause', np.where(known_gts['max_resp'] == 'z-to-xy', 'mere_correlate', 'effect'))
        self.cache[effect_object_id] = known_gts
        return known_gts

with open("DeVLBert/dic/objects_vocab.txt", "r") as vocab:
    # 'background' is prepended, so vocabulary line i (0-based) gets id i + 1.
    CLASSES = ['background'] + [line.strip() for line in vocab]
# Reverse lookup built once; setdefault keeps the first occurrence, like list.index does.
_ID_FOR_CLASS = {}
for _class_id, _class_name in enumerate(CLASSES):
    _ID_FOR_CLASS.setdefault(_class_name, _class_id)
def word_to_id(word: str):
    """Return the index of ``word`` in CLASSES, or None if it is not in the vocabulary."""
    return _ID_FOR_CLASS.get(word)

# assign() builds a new frame, so no columns are written onto a slice of filtered_responses
# and re-running the cell gives the same result.
gt_for_pair = gt_for_pair.assign(
    ID_X=gt_for_pair['word_X'].map(word_to_id),
    ID_Y=gt_for_pair['word_Y'].map(word_to_id),
)
gt = GT(gt_for_pair=gt_for_pair)
pd.set_option('display.expand_frame_repr', False)
for effect_word in ('office', 'egg', 'shirt'):
    print(gt.get_known_gts(effect_object_id=word_to_id(effect_word)))

      word_X        word_Y max_resp  ID_X  ID_Y  cause_candidate cause_candidate_label
4    laptops        office   y-to-x  1229  1081             1229                effect
319   office     computers   x-to-y  1081  1032             1032                effect
346   office     telephone   x-to-y  1081   582              582                effect
366   office  office chair   x-to-y  1081   552              552                effect
748   office    calculator   x-to-y  1081    13               13                effect
765   office      monitors   x-to-y  1081   220              220                effect
    word_X word_Y max_resp  ID_X  ID_Y  cause_candidate cause_candidate_label
232    egg   yolk   x-to-y   525     1                1                effect
          word_X  word_Y max_resp  ID_X  ID_Y  cause_candidate cause_candidate_label
7         person   shirt   x-to-y   365    52              365                 cause
11        sleeve   shirt   y-to-x  1497    52             1497   

`cause_candidate` is the id of the other object in each annotated pair that involves `effect_object_id`. It is only a *candidate* because it may turn out to be a cause, an effect, or a mere correlate of that object.
`cause_candidate_label` then tells you which of these three the `cause_candidate` actually is.
This is used in `test_confounder_finding.py`.