In [75]:
import json
import re
from collections import Counter
from collections import defaultdict
from itertools import combinations
from multiprocessing import Pool, cpu_count

import nltk
import spacy
from tqdm.auto import tqdm

In [136]:
dataset_name = "wikievents"
wikievents246_input = open("../../Corpora/WikiEvents/gtt_format/all-test-dev-train.jsonl")
examples = [json.loads(l) for l in wikievents246_input.readlines()]
wikievents246_input.close()
example_len = 246
dev_start, test_start,train_start = 0, 20,40
type_to_roles =  json.load(open("../../Corpora/WikiEvents/gtt_format/EventTypeToRoles.json"))
types = [*type_to_roles.keys()]



In [117]:


def clean(string):
    string = string.lower()
    cleaned_string = re.sub(r'[^a-z]', '', string)
    return cleaned_string


def find_substring_indices(string, substring):
    indices = []
    length = len(substring)
    for i in range(len(string) - length + 1):
        if string[i:i + length] == substring:
            indices.append(i)
    return indices


def indices_in_tokenized_text(role_filler, tokens):
    cleaned_text = ""
    d = {}
    for i, token in enumerate(tokens):
        cleaned = clean(token)
        for ii in range(len(cleaned)):
            d[ii + len(cleaned_text)] = i
        cleaned_text += cleaned
    return [d[index] for index in find_substring_indices(cleaned_text, clean(role_filler))]


def flatten_list(nested_list):
    return [item for sublist in nested_list for item in
            (flatten_list(sublist) if isinstance(sublist, list) else [sublist])]


def generate_pairs(lists):
    pairs = set()
    for i in range(len(lists)):
        for j in range(i + 1, len(lists)):
            for elem1 in lists[i]:
                for elem2 in lists[j]:
                    if elem1 != elem2:
                        l = sorted([elem1, elem2])
                        pairs.add((l[0], l[1]))
    return pairs


def n_comb(l):
    return len(l) * (len(l) - 1) // 2
nlp = spacy.load("en_core_web_sm")

def is_not_pronoun(phrase):
    return phrase.lower().strip() not in {
        "he", "him", "his", "himself",
        "she", "her", "hers", 
        "it", "its", 
        "they", "them", "their", "theirs", "themselves",
        "we", "us", "our", "ours", "ourselves",
        "you", "your", "yours", "yourself", "yourselves",
        "one", "ones", "oneself",
        "who", "whom", "whose",
        "i", "me", "my", "mine", "myself", "that", "which", "some", "anyone", "those", "this", "these", "other", "another", "such", "each", "neither","both", "all"
    }
    doc = nlp(phrase)
    for token in doc:
        # Check if the token is a pronoun
        if token.text.lower() == "the":
            continue
        if token.pos_ == "PRON" or token.text.lower() in ["one", "ones"]:
            print(token)
            return False
    return True

In [118]:
example_template_entity_mentions = []
counts = defaultdict(lambda: 0)
mention_count = []
qualified_mention_count = []
coref_pairs = []
coref_negative_examples = []
coref_pairs_non_single_mention = []
coenv_positive_examples = []
coenv_negative_examples = []
event_typing_examples = []
roling_examples = []
all_role_examples = []
for ex in tqdm(examples):
    templates = ex['templates']
    text = ex['doctext']
    cleanedtext = clean(" ".join(text))
    template_of_entities = []
    flattened_list_all_templates = []
    per_example_coref_pairs = set()
    per_example_coref_pairs_non_single_mention = set()
    coref_negative_examples.append(set())
    coenv_positive_examples.append(set())
    event_typing_examples.append(set())
    roling_examples.append(set())
    coenv_negative_examples.append(set())
    list_of_mention_per_template = []
    for template in templates:
        flattened_list_within_template = []
        arguments = [l for l in template.values() if isinstance(l, list)]
        roles = type_to_roles[template['incident_type']]
        qualified_entities = []

        for role_index, role_fillers in enumerate(arguments):
            role = roles[role_index]
            list_of_entity_mentions = []
            for entity in role_fillers:
                qualified_mentions = []
                mentions_per_entity = []
                for mention_w_index in entity:
                    mention = mention_w_index[0]
                    if not is_not_pronoun(mention):
                        continue
                    #assert cleanedtext.count(clean(mention)) > 0
                    mentions_per_entity.append(mention)
                    roling_examples[-1].add((mention, role))
                    if cleanedtext.count(clean(mention)) == 1:
                        qualified_mentions.append(mention)
                        counts['qualified_mentions'] += 1

                        if mention in flattened_list_all_templates:
                            counts['duplicated qualified mention across templates'] += 1
                        flattened_list_all_templates.append(mention)

                        if mention in flattened_list_within_template:
                            counts['duplicated qualified mention within templates'] += 1
                        flattened_list_within_template.append(mention)

                if len(qualified_mentions) > 0:
                    counts['qualified_entities'] += 1
                    qualified_entities.append(qualified_mentions)

                    per_example_coref_pairs = per_example_coref_pairs.union(
                        [p for p in list(combinations(qualified_mentions, 2)) if p[0] < p[1]])
                per_example_coref_pairs_non_single_mention = per_example_coref_pairs_non_single_mention.union(
                    [p for p in list(combinations(mentions_per_entity, 2)) if p[0] < p[1]])
                mention_count.append(len(mentions_per_entity))
                qualified_mention_count.append(len(qualified_mentions))
                list_of_entity_mentions.append(mentions_per_entity)
            coref_negative_examples[-1] = coref_negative_examples[-1].union(generate_pairs(list_of_entity_mentions))

        counts['qualified_coenv'] += n_comb(set(flattened_list_within_template))
        coenv_positive_examples[-1] = coenv_positive_examples[-1].union(
            ([p for p in list(combinations(flattened_list_within_template, 2)) if p[0] < p[1]]))
        event_typing_examples[-1] = event_typing_examples[-1].union(([p for p in [p + (types.index(template["incident_type"].split("/")[0].strip()),) for p in
                                                                                  combinations(flattened_list_within_template, 2)]
                                                                      if p[0] < p[1]]))
        list_of_mention_per_template.append(flattened_list_within_template)
        template_of_entities.append(qualified_entities)
    all_role_examples.append(flattened_list_all_templates)
    coenv_negative_examples[-1] = coenv_negative_examples[-1].union(generate_pairs(list_of_mention_per_template))
    counts['actual_coref_pairs'] += len(per_example_coref_pairs)
    coref_pairs.append(per_example_coref_pairs)
    counts['actual_coref_pairs_non_single'] += len(per_example_coref_pairs_non_single_mention)
    coref_pairs_non_single_mention.append(per_example_coref_pairs_non_single_mention)

    example_template_entity_mentions.append(template_of_entities)

  0%|          | 0/246 [00:00<?, ?it/s]

In [119]:
def get_named_entities(text):
    nlp = spacy.load("en_core_web_sm")
    doc = nlp(text)
    entities = []
    for ent in doc.ents:
        entities.append((ent.text, ent.label_))
    return entities


counts['ne'] = 0
id_to_tag = []

pool = Pool(cpu_count())
for named_entities in tqdm(pool.imap(get_named_entities, [ex['doctext'] for ex in examples]), total=example_len ):
    id_to_tag.append(named_entities)
    counts['ne'] += len(named_entities)
pool.close()
pool.join()

  0%|          | 0/246 [00:00<?, ?it/s]

In [120]:
ne_examples = [[p[0] for p in ex if p[1] in {"PERSON","ORG"}] for ex in id_to_tag]
ne_examples = [list(set(ex)) for ex in ne_examples]
sum([len(ex) for ex in ne_examples])
ne_neg_examples = [[n for n in ne_examples[i] if n not in " ".join(all_role_examples[i])] for i in range(len(ne_examples))]

In [121]:
ne_pos_examples = [list(set(ex)) for ex in all_role_examples]

In [122]:
ne_neg_examples[1]

['Allah',
 'Prophet Muhammad',
 'Worlds',
 'Court',
 'Islam',
 'The Prophet Muhammad']

In [123]:
roles,counts

(['Detainee', 'Place', 'Jailer'],
 defaultdict(<function __main__.<lambda>()>,
             {'qualified_mentions': 4467,
              'qualified_entities': 3069,
              'qualified_coenv': 3934,
              'duplicated qualified mention across templates': 2132,
              'actual_coref_pairs': 414,
              'actual_coref_pairs_non_single': 3770,
              'duplicated qualified mention within templates': 103,
              'ne': 18110}))

In [124]:
Counter(mention_count)

Counter({16: 49,
         6: 194,
         9: 135,
         2: 875,
         1: 1899,
         3: 505,
         22: 43,
         5: 296,
         11: 83,
         4: 346,
         7: 153,
         0: 99,
         8: 152,
         18: 34,
         14: 38,
         12: 80,
         10: 119,
         13: 52,
         23: 2,
         28: 19,
         25: 7,
         15: 92,
         17: 39,
         20: 45,
         37: 7,
         35: 14,
         33: 4,
         45: 1,
         26: 3,
         34: 10,
         19: 14,
         21: 4,
         46: 11,
         24: 35,
         55: 11,
         58: 17,
         31: 6,
         29: 18,
         38: 5,
         30: 17,
         97: 3})

In [125]:
Counter(qualified_mention_count)

Counter({2: 573, 0: 2467, 1: 2177, 3: 202, 4: 65, 5: 34, 6: 18})

In [126]:
coref_pairs_nonempty = [coref_pairs[i] for i in range(len(coref_pairs)) if coref_pairs[i]]
coref_pairs_nonempty

[{('Amin', 'Milton J. Valencia')},
 {('Chechen', 'immigrant'),
  ('Dzhokhar Tsarnaev', 'immigrant'),
  ('Karen Rand McWatters', 'friend'),
  ('Patricia Campbell', 'mother')},
 {('country', 'where')},
 {('attacker', 'soldier')},
 {('Boylston Street', 'thoroughfare'), ('Sean Collier', 'policeman')},
 {('Both brothers', 'lifeguards'),
  ('Ruger pistol', 'Ruger semi-automatic handgun'),
  ('Ruger pistol', 'Ruger semi-automatic pistol'),
  ('Ruger pistol', 'a gun'),
  ('Ruger semi-automatic pistol', 'a gun'),
  ('Sean Collier', 'officer'),
  ('Stephen Silva', 'his close friend')},
 {('Osama bin Laden', 'suspect')},
 {('dead', 'victims')},
 {('Bilal Mohammed', 'client')},
 {('Nestor Reverol', 'minister')},
 {('Birmingham Mail', 'Birmingham Post & Mail'),
  ('Irishmen', 'bombers'),
  ('group', 'organisation')},
 {('Badia al-Sukhnah', 'spot'),
  ('Badia al-Sukhnah', 'zone'),
  ('Tass', 'agency'),
  ('spot', 'zone')},
 {('U.S. State Department', 'U.S. Treasury Department')},
 {('Christophe Cast

In [127]:
coref_pairs_non_single_mention_non_empty = [coref_pairs_non_single_mention[i] for i in range(len(coref_pairs)) if
                                            coref_pairs[i]]
coref_pairs_non_single_mention_non_empty

[{('Amin', 'Milton J. Valencia')},
 {('America', 'American'),
  ('America', 'Boston'),
  ('America', 'U.S'),
  ('America', 'U.S.'),
  ('America', 'here'),
  ('American', 'Boston'),
  ('American', 'U.S'),
  ('American', 'U.S.'),
  ('American', 'here'),
  ('Bomber', 'Tsarnaev'),
  ('Boston', 'U.S'),
  ('Boston', 'here'),
  ('Chechen', 'Tsarnaev'),
  ('Chechen', 'bomber'),
  ('Chechen', 'immigrant'),
  ('Dzhokhar Tsarnaev', 'Tsarnaev'),
  ('Dzhokhar Tsarnaev', 'bomber'),
  ('Dzhokhar Tsarnaev', 'immigrant'),
  ('Karen Rand McWatters', 'McWatters'),
  ('Karen Rand McWatters', 'friend'),
  ('Krystle Campbell', 'daughter'),
  ('Martin', 'brother'),
  ('Martin', 'son'),
  ('Patricia Campbell', 'mother'),
  ('Tamerlan', 'brother'),
  ('Tsarnaev', 'bomber'),
  ('Tsarnaev', 'immigrant'),
  ('U.S', 'here'),
  ('U.S.', 'here'),
  ('bomber', 'immigrant'),
  ('people', 'victims'),
  ('person', 'son')},
 {('Afghan', 'Afghanistan'),
  ('Afghan', 'country'),
  ('Afghan', 'where'),
  ('Afghanistan', 'co

In [128]:
list(filter(None,
            [x.difference(coref_pairs_nonempty[i]) for i, x in enumerate(coref_pairs_non_single_mention_non_empty)]))

[{('America', 'American'),
  ('America', 'Boston'),
  ('America', 'U.S'),
  ('America', 'U.S.'),
  ('America', 'here'),
  ('American', 'Boston'),
  ('American', 'U.S'),
  ('American', 'U.S.'),
  ('American', 'here'),
  ('Bomber', 'Tsarnaev'),
  ('Boston', 'U.S'),
  ('Boston', 'here'),
  ('Chechen', 'Tsarnaev'),
  ('Chechen', 'bomber'),
  ('Dzhokhar Tsarnaev', 'Tsarnaev'),
  ('Dzhokhar Tsarnaev', 'bomber'),
  ('Karen Rand McWatters', 'McWatters'),
  ('Krystle Campbell', 'daughter'),
  ('Martin', 'brother'),
  ('Martin', 'son'),
  ('Tamerlan', 'brother'),
  ('Tsarnaev', 'bomber'),
  ('Tsarnaev', 'immigrant'),
  ('U.S', 'here'),
  ('U.S.', 'here'),
  ('bomber', 'immigrant'),
  ('people', 'victims'),
  ('person', 'son')},
 {('Afghan', 'Afghanistan'),
  ('Afghan', 'country'),
  ('Afghan', 'where'),
  ('Afghanistan', 'country'),
  ('Afghanistan', 'where'),
  ('Pakistan', 'hub'),
  ('civilians', 'people'),
  ('drone', 'drones')},
 {('Andrew Cuomo', 'Cuomo'),
  ('Bill de Blasio', 'DeBlasio'),


In [129]:
example_template_entity_mentions[1]

[[['Amin', 'Milton J. Valencia']]]

In [130]:
coref_pairs

[set(),
 {('Amin', 'Milton J. Valencia')},
 {('Chechen', 'immigrant'),
  ('Dzhokhar Tsarnaev', 'immigrant'),
  ('Karen Rand McWatters', 'friend'),
  ('Patricia Campbell', 'mother')},
 {('country', 'where')},
 set(),
 {('attacker', 'soldier')},
 set(),
 set(),
 set(),
 {('Boylston Street', 'thoroughfare'), ('Sean Collier', 'policeman')},
 {('Both brothers', 'lifeguards'),
  ('Ruger pistol', 'Ruger semi-automatic handgun'),
  ('Ruger pistol', 'Ruger semi-automatic pistol'),
  ('Ruger pistol', 'a gun'),
  ('Ruger semi-automatic pistol', 'a gun'),
  ('Sean Collier', 'officer'),
  ('Stephen Silva', 'his close friend')},
 {('Osama bin Laden', 'suspect')},
 {('dead', 'victims')},
 {('Bilal Mohammed', 'client')},
 {('Nestor Reverol', 'minister')},
 set(),
 set(),
 {('Birmingham Mail', 'Birmingham Post & Mail'),
  ('Irishmen', 'bombers'),
  ('group', 'organisation')},
 set(),
 set(),
 {('Badia al-Sukhnah', 'spot'),
  ('Badia al-Sukhnah', 'zone'),
  ('Tass', 'agency'),
  ('spot', 'zone')},
 {('U

In [131]:
per_example_result = []
count = 0
for i, coref_pairs in enumerate(coref_pairs_non_single_mention):
    ex = examples[i]
    per_example_result.append([])
    templates = ex['templates']
    text = ex['doctext']

    for pair in set(coref_pairs):
        blocked = set()
        sorted_pairs = sorted(list(pair), key=len, reverse=True)
        example_result = []
        for word in sorted_pairs:
            pattern = re.compile(re.escape(word))
            positions = [match.start() for match in re.finditer(pattern, text)]
            positions = set(positions).difference(blocked)
            example_result.append(positions)
            for i in range(len(word)):
                blocked = blocked.union([p + i for p in positions])
        if len(example_result[0]) == 0 or len(example_result[1]) == 0:
            continue
        per_example_result[-1].append((sorted_pairs[0], example_result[0], sorted_pairs[1], example_result[1]))
        count += len(example_result[0]) * len(example_result[1])


In [132]:
count

200244

In [133]:
per_example_result

[[('The Taliban', {0, 4748, 4918}, 'insurgents', {3054, 3854, 4267}),
  ('insurgents', {3054, 3854, 4267}, 'militants', {5093}),
  ('official',
   {161, 349, 434, 887, 1247, 1381, 1894, 2013, 2284, 2317, 2982, 3148, 3659},
   'source',
   {1062, 5133}),
  ('The Taliban', {0, 4748, 4918}, 'militants', {5093}),
  ('compound', {89, 4515}, 'complex', {1194, 2389, 4423}),
  ('members', {33, 1123, 1303, 5155}, '100', {29, 1119, 1299}),
  ('building', {1846}, 'complex', {1194, 2389, 4423}),
  ('training center', {269, 661}, 'campus', {599, 758}),
  ('compound', {89, 4515}, 'campus', {599, 758}),
  ('training center', {269, 661}, 'complex', {1194, 2389, 4423}),
  ('militants',
   {5093},
   'Taliban',
   {4, 906, 2949, 3540, 3599, 3968, 4259, 4452, 4752, 4922, 5744}),
  ('member', {33, 1123, 1303, 1414, 5155}, 'Hotak', {1404, 1658}),
  ('Sharif Hotak', {1397}, 'Hotak', {1658}),
  ('over 100', {1294}, 'members', {33, 1123, 1303, 5155}),
  ('Sharif Hotak', {1397}, 'member', {33, 1123, 1303, 1414

In [137]:
print("#Test Coref: ", sum([sum([len(pairs[1]) * len(pairs[3]) for pairs in ex]) for ex in per_example_result[:test_start]]))
print("#Dev Coref:  ",
      sum([sum([len(pairs[1]) * len(pairs[3]) for pairs in ex]) for ex in per_example_result[test_start:train_start]]))
print("#Train Coref:", sum([sum([len(pairs[1]) * len(pairs[3]) for pairs in ex]) for ex in per_example_result[train_start:]]))

#Test Coref:  3626
#Dev Coref:   3076
#Train Coref: 193542


In [138]:
coref_positive_examples = coref_pairs_non_single_mention

In [139]:
gtt_tokens = json.load(open("../../Model/GTT/TokenizedDoc/GTT_MUC1700_FullTextTokenized.json"))
gttsent_tokens = json.load(open("../../Model/GTT/TokenizedDoc/GTT_MUC1700_SentCatTokenized.json"))
tanl_tokens = json.load(open("../../Model/TANL/TokenizedDoc/muc1700.json"))
#tanl_sent_tokens = json.load(open("../../Model/TANL/TokenizedDoc/tanl_token_25749.json"))
dygie_tokens = json.load(open("../../Model/Dygie/dygiepp/TokenizedDoc/muc1700_list_of_tokenlist.json"))
rawbert_tokens = json.load(open("../../Model/NaiveBERT/tokenized_muc1700.json"))
model_token_pairs = [(gtt_tokens, "gtt"), (gttsent_tokens, "gtt_sent"), (tanl_tokens, "tanl"),
                     (dygie_tokens, "dygie"), (rawbert_tokens, "raw")]#, (tanl_sent_tokens, "tanl_sent")]


for docid in tqdm(range(example_len )):
    tanl_clean = clean(" ".join(tanl_tokens[docid]))
    for i, (_, model_name) in enumerate(model_token_pairs):
        if "tanl" in model_name:
            continue
        truncate = 0
        total_len = 0
        while total_len < len(tanl_clean):
            total_len += len(clean(model_token_pairs[i][0][docid][truncate]))
            truncate += 1
            if truncate >= len(model_token_pairs[i][0][docid]):
                break
        model_token_pairs[i][0][docid] = model_token_pairs[i][0][docid][:truncate]

  0%|          | 0/246 [00:00<?, ?it/s]

In [232]:
# Coref
coref_data = defaultdict(lambda: defaultdict(list))
for tokens, model_name in model_token_pairs:
    all_results = set()
    for per_example_pairs, label in [(coref_positive_examples, 1), (coref_negative_examples, 0)]:
        for doc_i, example_pairs in tqdm(enumerate(per_example_pairs)):
            for pair in example_pairs:
                pair0_indices = indices_in_tokenized_text(tokens=tokens[doc_i], role_filler=pair[0])
                pair1_indices = indices_in_tokenized_text(tokens=tokens[doc_i], role_filler=pair[1])
                pair0_indices = set(pair0_indices)
                pair1_indices = set(pair1_indices).difference(pair0_indices)
                for index0 in pair0_indices:
                    for index1 in pair1_indices:
                        result = {"doc_i": doc_i, "index0": index0, "index1": index1, "label": label}
                        if tuple(result.values()) not in all_results:
                            all_results.add(result.values())
                        else:
                            continue
                        coref_data[model_name]["test" if doc_i < 200 else ("dev" if doc_i < 400 else "train")].append(
                            result)
json.dump(coref_data, open("../XY/XY_bool_CoRef.json", "w+"))

1700it [00:02, 835.69it/s] 
1700it [00:05, 286.53it/s]
1700it [00:01, 869.03it/s] 
1700it [00:05, 318.55it/s]
1700it [00:03, 557.68it/s]
1700it [00:08, 208.27it/s]
1700it [00:01, 968.34it/s] 
1700it [00:04, 351.79it/s]
1700it [00:01, 885.74it/s] 
1700it [00:06, 256.31it/s]


In [233]:
[[len(v[x]) for x in v] for v in coref_data.values()]

[[1479, 1073, 3149],
 [1479, 1073, 3149],
 [1483, 1113, 3167],
 [1487, 1144, 3180],
 [1487, 1144, 3181]]

In [234]:
coref_data

defaultdict(<function __main__.<lambda>()>,
            {'gtt': defaultdict(list,
                         {'test': [{'doc_i': 1,
                            'index0': 35,
                            'index1': 59,
                            'label': 1},
                           {'doc_i': 1,
                            'index0': 208,
                            'index1': 59,
                            'label': 1},
                           {'doc_i': 1,
                            'index0': 146,
                            'index1': 59,
                            'label': 1},
                           {'doc_i': 1,
                            'index0': 183,
                            'index1': 59,
                            'label': 1},
                           {'doc_i': 1,
                            'index0': 60,
                            'index1': 59,
                            'label': 1},
                           {'doc_i': 12,
                            'index0': 164

In [235]:
sum([sum([len(x) for x in c]) for c in coenv_positive_examples])

7168

In [236]:
sum([sum([len(x) for x in c]) for c in coenv_negative_examples])

10798

In [237]:
# Coenv
coenv_data = defaultdict(lambda: defaultdict(list))
for tokens, model_name in model_token_pairs:
    all_results = set()
    for per_example_pairs, label in [(coenv_positive_examples, 1), (coenv_negative_examples, 0)]:
        for doc_i, example_pairs in tqdm(enumerate(per_example_pairs)):
            for pair in example_pairs:
                pair0_indices = indices_in_tokenized_text(tokens=tokens[doc_i], role_filler=pair[0])
                pair1_indices = indices_in_tokenized_text(tokens=tokens[doc_i], role_filler=pair[1])
                pair0_indices = set(pair0_indices)
                pair1_indices = set(pair1_indices).difference(pair0_indices)
                for index0 in pair0_indices:
                    for index1 in pair1_indices:
                        result = {"doc_i": doc_i, "index0": index0, "index1": index1, "label": label}
                        if tuple(result.values()) not in all_results:
                            all_results.add(result.values())
                        else:
                            continue
                        coenv_data[model_name]["test" if doc_i < 200 else ("dev" if doc_i < 400 else "train")].append(result)
json.dump(coenv_data, open("../XY/XY_bool_CoEnv.json", "w+"))

1700it [00:08, 202.85it/s]
1700it [00:13, 124.87it/s]
1700it [00:06, 243.75it/s]
1700it [00:12, 137.15it/s]
1700it [00:11, 145.35it/s]
1700it [00:15, 107.08it/s]
1700it [00:06, 277.30it/s]
1700it [00:10, 160.32it/s]
1700it [00:07, 241.04it/s]
1700it [00:11, 145.68it/s]


In [238]:
[(k, [len(v[x]) for x in v]) for k, v in coenv_data.items()]

[('gtt', [1216, 1204, 4056]),
 ('gtt_sent', [1216, 1204, 4056]),
 ('tanl', [1218, 1244, 4059]),
 ('dygie', [1239, 1291, 4161]),
 ('raw', [1239, 1291, 4157])]

In [239]:
# Typing
typing_data = defaultdict(lambda: defaultdict(list))
for tokens, model_name in model_token_pairs:
    all_results = set()
    for doc_i, example_pairs in tqdm(enumerate(event_typing_examples)):
        for pair in example_pairs:
            label = pair[2]
            pair0_indices = indices_in_tokenized_text(tokens=tokens[doc_i], role_filler=pair[0])
            pair1_indices = indices_in_tokenized_text(tokens=tokens[doc_i], role_filler=pair[1])
            pair0_indices = set(pair0_indices)
            pair1_indices = set(pair1_indices).difference(pair0_indices)
            for index0 in pair0_indices:
                for index1 in pair1_indices:
                    result = {"doc_i": doc_i, "index0": index0, "index1": index1, "label": label}
                    if tuple(result.values()) not in all_results:
                        all_results.add(result.values())
                    else:
                        continue
                    typing_data[model_name]["test" if doc_i < 200 else ("dev" if doc_i < 400 else "train")].append(result)
json.dump(typing_data, open("../XY/XY_6way_typing.json", "w+"))

1700it [00:07, 226.32it/s]
1700it [00:07, 230.12it/s]
1700it [00:11, 143.54it/s]
1700it [00:06, 257.07it/s]
1700it [00:07, 238.69it/s]


In [240]:
[(k, [len(v[x]) for x in v]) for k, v in typing_data.items()]

[('gtt', [619, 542, 1843]),
 ('gtt_sent', [619, 542, 1843]),
 ('tanl', [622, 547, 1871]),
 ('dygie', [642, 552, 1895]),
 ('raw', [642, 552, 1891])]

In [241]:
# Role labeling
rolelabeling_data = defaultdict(lambda: defaultdict(list))
for tokens, model_name in model_token_pairs:
    all_results = set()
    for doc_i, example_pairs in tqdm(enumerate(roling_examples)):
        for pair in example_pairs:
            label = pair[1]
            pair0_indices = indices_in_tokenized_text(tokens=tokens[doc_i], role_filler=pair[0])
            pair0_indices = set(pair0_indices)
            for index0 in pair0_indices:
                result = {"doc_i": doc_i, "index0": index0, "label": label}
                if tuple(result.values()) not in all_results:
                    all_results.add(result.values())
                else:
                    continue
                rolelabeling_data[model_name]["test" if doc_i < 200 else ("dev" if doc_i < 400 else "train")].append(result)
json.dump(rolelabeling_data, open("../XY/XY_5way_rolelabeling.json", "w+"))

1700it [00:04, 355.08it/s]
1700it [00:05, 326.32it/s]
1700it [00:07, 222.06it/s]
1700it [00:04, 388.87it/s]
1700it [00:04, 364.56it/s]


In [242]:
[(k, [len(v[x]) for x in v]) for k, v in rolelabeling_data.items()]

[('gtt', [1196, 994, 4535]),
 ('gtt_sent', [1196, 994, 4535]),
 ('tanl', [1186, 993, 4497]),
 ('dygie', [1190, 1001, 4536]),
 ('raw', [1190, 1001, 4533])]

In [243]:
# Argument Detection
isarg_data = defaultdict(lambda: defaultdict(list))
for tokens, model_name in model_token_pairs:
    all_results = set()
    for ne_examples, label in  [(ne_pos_examples, 1), (ne_neg_examples, 0)]:
        for doc_i, mentions in tqdm(enumerate(ne_examples)):
            for mention in mentions:
                pair0_indices = indices_in_tokenized_text(tokens=tokens[doc_i], role_filler=mention)
                pair0_indices = set(pair0_indices)
                for index0 in pair0_indices:
                    result = {"doc_i": doc_i, "index0": index0, "label": label}
                    if tuple(result.values()) not in all_results:
                        all_results.add(result.values())
                    else:
                        continue
                    isarg_data[model_name]["test" if doc_i < 200 else ("dev" if doc_i < 400 else "train")].append(result)
json.dump(isarg_data, open("../XY/XY_bool_isArg.json", "w+"))

1700it [00:03, 487.95it/s]
1700it [00:06, 260.38it/s]
1700it [00:03, 501.87it/s]
1700it [00:06, 265.74it/s]
1700it [00:05, 284.09it/s]
1700it [00:08, 190.04it/s]
1700it [00:02, 578.54it/s]
1700it [00:07, 220.75it/s]
1700it [00:03, 515.23it/s]
1700it [00:06, 271.11it/s]


In [244]:
[(k, [len(v[x]) for x in v]) for k, v in isarg_data.items()]

[('gtt', [1679, 1750, 9406]),
 ('gtt_sent', [1679, 1750, 9407]),
 ('tanl', [1671, 1754, 9428]),
 ('dygie', [1675, 1756, 9454]),
 ('raw', [1683, 1761, 9458])]