In [2]:
import json
import re
from collections import Counter
from collections import defaultdict
from itertools import combinations

from tqdm import tqdm

In [3]:
dataset_name = "muc1700"
muc_1700_input = open("../../Corpora/MUC/muc/processed2/muc_1700_GTT_style-test-dev-train.json")
examples = json.load(muc_1700_input)
muc_1700_input.close()



In [121]:

def clean(string):
    string = string.lower()
    cleaned_string = re.sub(r'[^a-z]', '', string)
    return cleaned_string


def find_substring_indices(string, substring):
    indices = []
    length = len(substring)
    for i in range(len(string) - length + 1):
        if string[i:i + length] == substring:
            indices.append(i)
    return indices


def indices_in_tokenized_text(role_filler, tokens):
    cleaned_text = ""
    d = {}
    for i, token in enumerate(tokens):
        cleaned = clean(token)
        for ii in range(len(cleaned)):
            d[ii + len(cleaned_text)] = i
        cleaned_text += cleaned
    return [d[index] for index in find_substring_indices(cleaned_text, clean(role_filler))]


def flatten_list(nested_list):
    return [item for sublist in nested_list for item in
            (flatten_list(sublist) if isinstance(sublist, list) else [sublist])]



def generate_pairs(lists):
    pairs = set()
    for i in range(len(lists)):
        for j in range(i + 1, len(lists)):
            for elem1 in lists[i]:
                for elem2 in lists[j]:
                    if elem1 != elem2:
                        l = sorted([elem1, elem2])
                        pairs.add((l[0], l[1]))
    return pairs


def n_comb(l):
    return len(l) * (len(l) - 1) // 2

In [140]:
example_template_entity_mentions = []
counts = defaultdict(lambda: 0)
mention_count = []
qualified_mention_count = []
coref_pairs = []
coref_negative_examples = []
coref_pairs_non_single_mention = []
coenv_positive_examples = []
coenv_negative_examples = []
for ex in tqdm(examples):
    templates = ex['templates']
    text = ex['doctext']
    cleanedtext = clean(" ".join(text))
    template_of_entities = []
    flattened_list_all_templates = []
    per_example_coref_pairs = set()
    per_example_coref_pairs_non_single_mention = set()
    coref_negative_examples.append(set())
    coenv_positive_examples.append(set())
    coenv_negative_examples.append(set())
    list_of_mention_per_template = []
    for template in templates:
        flattened_list_within_template = []
        arguments = [l for l in template.values() if isinstance(l, list)]
        qualified_entities = []

        for role_fillers in arguments:
            list_of_entity_mentions = []
            for entity in role_fillers:
                qualified_mentions = []
                mentions_per_entity = []
                for mention_w_index in entity:
                    mention = mention_w_index[0]
                    #assert cleanedtext.count(clean(mention)) > 0
                    mentions_per_entity.append(mention)
                    if cleanedtext.count(clean(mention)) == 1:
                        qualified_mentions.append(mention)
                        counts['qualified_mentions'] += 1

                        if mention in flattened_list_all_templates:
                            counts['duplicated qualified mention across templates'] += 1
                        flattened_list_all_templates.append(mention)

                        if mention in flattened_list_within_template:
                            counts['duplicated qualified mention within templates'] += 1
                        flattened_list_within_template.append(mention)

                if len(qualified_mentions) > 0:
                    counts['qualified_entities'] += 1
                    qualified_entities.append(qualified_mentions)

                    per_example_coref_pairs = per_example_coref_pairs.union(
                        [p for p in list(combinations(qualified_mentions, 2)) if p[0] < p[1]])
                per_example_coref_pairs_non_single_mention = per_example_coref_pairs_non_single_mention.union(
                    [p for p in list(combinations(mentions_per_entity, 2)) if p[0] < p[1]])
                mention_count.append(len(mentions_per_entity))
                qualified_mention_count.append(len(qualified_mentions))
                list_of_entity_mentions.append(mentions_per_entity)
            coref_negative_examples[-1] = coref_negative_examples[-1].union(generate_pairs(list_of_entity_mentions))

        counts['qualified_coenv'] += n_comb(set(flattened_list_within_template))
        coenv_positive_examples[-1] = coenv_positive_examples[-1].union(([p for p in list(combinations(flattened_list_within_template, 2)) if p[0] < p[1]]))
        list_of_mention_per_template.append(flattened_list_within_template)
        template_of_entities.append(qualified_entities)
    coenv_negative_examples[-1] = coenv_negative_examples[-1].union(generate_pairs(list_of_mention_per_template))
    counts['actual_coref_pairs'] += len(per_example_coref_pairs)
    coref_pairs.append(per_example_coref_pairs)
    counts['actual_coref_pairs_non_single'] += len(per_example_coref_pairs_non_single_mention)
    coref_pairs_non_single_mention.append(per_example_coref_pairs_non_single_mention)

    example_template_entity_mentions.append(template_of_entities)

100%|██████████| 1700/1700 [00:00<00:00, 1957.12it/s]


In [123]:
counts

defaultdict(<function __main__.<lambda>()>,
            {'qualified_coenv': 8541,
             'actual_coref_pairs': 506,
             'actual_coref_pairs_non_single': 997,
             'qualified_mentions': 4229,
             'qualified_entities': 3269,
             'duplicated qualified mention across templates': 602,
             'duplicated qualified mention within templates': 35})

In [124]:
Counter(mention_count)

Counter({4: 97, 1: 2835, 2: 926, 3: 217, 5: 19, 6: 12, 9: 1, 7: 4})

In [125]:
Counter(qualified_mention_count)

Counter({3: 111, 1: 2549, 2: 553, 0: 842, 4: 42, 5: 11, 6: 3})

In [126]:
coref_pairs_nonempty = [coref_pairs[i] for i in range(len(coref_pairs)) if coref_pairs[i]]
coref_pairs_nonempty

[{('people',
   'people who, through a government radio network, had accused him of being an "accomplice" of the rebels'),
  ('several people',
   'several people who, through a government radio network, had accused him of being an "accomplice" of the rebels')},
 {('jose jesus pena',
   'jose jesus pena--alleged chief of security for the nicaraguan embassy in tegucigalpa')},
 {('arena assassins', 'assassins'),
  ('fecmasan',
   'msgr oscar arnulfo romero federation of committees of mothers and relatives')},
 {('five unidentified individuals', 'unidentified individuals')},
 {('vehicle', 'vehicle in which the reporters were traveling')},
 {('fmln guerrillas', 'terrorists'), ('fmln guerrillas', 'urban guerrillas')},
 {('cano limon-covenas pipeline', 'pipeline')},
 {('four peruvians', 'peruvians'),
  ('four peruvians', 'peruvians suspects'),
  ('four peruvians suspects', 'peruvians'),
  ('four peruvians suspects', 'peruvians suspects')},
 {('alleged commando groups',
   'alleged commando g

In [127]:
coref_pairs_non_single_mention_non_empty = [coref_pairs_non_single_mention[i] for i in range(len(coref_pairs)) if
                                            coref_pairs[i]]
coref_pairs_non_single_mention_non_empty

[{('people',
   'people who, through a government radio network, had accused him of being an "accomplice" of the rebels'),
  ('several people',
   'several people who, through a government radio network, had accused him of being an "accomplice" of the rebels')},
 {('jose jesus pena',
   'jose jesus pena--alleged chief of security for the nicaraguan embassy in tegucigalpa')},
 {('arena', 'government'),
  ('arena assassins', 'assassins'),
  ('army', "army's 1st infantry brigade"),
  ('fecmasan',
   'msgr oscar arnulfo romero federation of committees of mothers and relatives')},
 {('five unidentified individuals', 'unidentified individuals')},
 {('vehicle', 'vehicle in which the reporters were traveling')},
 {('1st infantry brigade', '1st infantry brigade garrison'),
  ('60-mm mortar', 'mortar'),
  ('60-mm mortar', 'mortars'),
  ('60-mm mortars', 'mortar'),
  ('60-mm mortars', 'mortars'),
  ('fmln guerrillas', 'terrorists'),
  ('fmln guerrillas', 'urban guerrillas'),
  ('guerrillas', 'ter

In [128]:
list(filter(None,
            [x.difference(coref_pairs_nonempty[i]) for i, x in enumerate(coref_pairs_non_single_mention_non_empty)]))

[{('arena', 'government'), ('army', "army's 1st infantry brigade")},
 {('1st infantry brigade', '1st infantry brigade garrison'),
  ('60-mm mortar', 'mortar'),
  ('60-mm mortar', 'mortars'),
  ('60-mm mortars', 'mortar'),
  ('60-mm mortars', 'mortars'),
  ('guerrillas', 'terrorists'),
  ('guerrillas', 'urban guerrillas')},
 {('gunmen', 'men'),
  ('men', 'men on motorcycles'),
  ('men', 'two men on motorcycles')},
 {('armed forces soldiers', 'soldiers'),
  ('salvadoran armed forces soldiers', 'soldiers')},
 {('detachments', 'fmln detachments'),
  ('farabundo marti national liberation front', 'fmln')},
 {('fmln guerrillas', 'guerrillas')},
 {('individuals', 'two individuals')},
 {('fmln', 'fmln-fdr'),
  ('fmln terrorists', 'people'),
  ('fmln terrorists', 'terrorists')},
 {('armed forces', 'army'),
  ('farabundo marti national liberation front', 'fmln'),
  ('guerrilla group', 'guerrilla members'),
  ('guerrilla group', 'rebels')},
 {('guerrillas', 'urban commandos'),
  ('guerrillas',
   

In [129]:
example_template_entity_mentions[1]

[[['some military', 'military suspects', 'armed forces members'],
  ['armed forces']]]

In [130]:
coref_pairs

[set(),
 set(),
 set(),
 set(),
 set(),
 {('people',
   'people who, through a government radio network, had accused him of being an "accomplice" of the rebels'),
  ('several people',
   'several people who, through a government radio network, had accused him of being an "accomplice" of the rebels')},
 {('jose jesus pena',
   'jose jesus pena--alleged chief of security for the nicaraguan embassy in tegucigalpa')},
 set(),
 set(),
 set(),
 set(),
 set(),
 {('arena assassins', 'assassins'),
  ('fecmasan',
   'msgr oscar arnulfo romero federation of committees of mothers and relatives')},
 {('five unidentified individuals', 'unidentified individuals')},
 {('vehicle', 'vehicle in which the reporters were traveling')},
 set(),
 set(),
 {('fmln guerrillas', 'terrorists'), ('fmln guerrillas', 'urban guerrillas')},
 {('cano limon-covenas pipeline', 'pipeline')},
 {('four peruvians', 'peruvians'),
  ('four peruvians', 'peruvians suspects'),
  ('four peruvians suspects', 'peruvians'),
  ('four p

In [131]:
per_example_result = []
count = 0
for i, coref_pairs in enumerate(coref_pairs_non_single_mention):
    ex = examples[i]
    per_example_result.append([])
    templates = ex['templates']
    text = ex['doctext']

    for pair in coref_pairs:
        blocked = set()
        sorted_pairs = sorted(list(pair), key=len, reverse=True)
        example_result = []
        for word in sorted_pairs:
            pattern = re.compile(re.escape(word))
            positions = [match.start() for match in re.finditer(pattern, text)]
            positions = set(positions).difference(blocked)
            example_result.append(positions)
            for i in range(len(word)):
                blocked = blocked.union([p + i for p in positions])
        if len(example_result[0]) == 0 or len(example_result[1]) == 0:
            continue
        per_example_result[-1].append((sorted_pairs[0], example_result[0], sorted_pairs[1], example_result[1]))
        count += len(example_result[0]) * len(example_result[1])


In [132]:
count

2106

In [133]:
per_example_result

[[],
 [('some military', {295}, 'military', {141, 766, 1002, 1110}),
  ('military suspects', {1002}, 'military', {141, 300, 766, 1110})],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [],
 [('government',
   {1095, 1369, 1623, 1746, 1804, 2033, 2687},
   'arena',
   {736, 768, 2703}),
  ('msgr oscar arnulfo romero federation of committees of mothers and relatives',
   {2281},
   'fecmasan',
   {2271}),
  ("army's 1st infantry brigade", {532}, 'army', {602})],
 [],
 [],
 [],
 [('farabundo marti national liberation front',
   {0},
   'fmln',
   {43, 676, 714, 799, 931, 1088, 1212, 1376, 1405})],
 [('urban guerrillas', {373}, 'guerrillas', {0, 189, 577, 2641}),
  ('60-mm mortars', {593}, 'mortars', {2657}),
  ('1st infantry brigade garrison', {24}, '1st infantry brigade', {266, 438}),
  ('guerrillas', {0, 189, 379, 577, 2641}, 'terrorists', {641}),
  ('urban guerrillas', {373}, 'fmln guerrillas', {184}),
  ('60-mm mortars', {593}, 'mortar', {1533, 2657}),
  ('60-mm mortar', {593, 1527}, '

In [134]:
print("#Test Coref: ", sum([sum([len(pairs[1]) * len(pairs[3]) for pairs in ex]) for ex in per_example_result[:200]]))
print("#Dev Coref:  ",
      sum([sum([len(pairs[1]) * len(pairs[3]) for pairs in ex]) for ex in per_example_result[200:400]]))
print("#Train Coref:", sum([sum([len(pairs[1]) * len(pairs[3]) for pairs in ex]) for ex in per_example_result[400:]]))

#Test Coref:  563
#Dev Coref:   356
#Train Coref: 1187


In [136]:
coref_positive_examples = coref_pairs_non_single_mention

In [137]:
gtt_tokens = json.load(open("../../Model/GTT/TokenizedDoc/muc1700_list_of_tokenlist.json"))
tanl_tokens = json.load(open("../../Model/TANL/TokenizedDoc/muc1700.json"))
dygie_tokens = json.load(open("../../Model/Dygie/dygiepp/TokenizedDoc/muc1700_list_of_tokenlist.json"))

In [138]:
# Coref
coref_data = defaultdict(lambda :defaultdict(list))
for tokens, model_name in [(gtt_tokens, "gtt"), (tanl_tokens, "tanl"), (dygie_tokens, "dygie")]:
    for per_example_pairs, label in [(coref_positive_examples, 1), (coref_negative_examples, 0)]:
        for doc_i, example_pairs in tqdm(enumerate(per_example_pairs)):
            for pair in example_pairs:
                pair0_indices = indices_in_tokenized_text(tokens=tokens[doc_i], role_filler=pair[0])
                pair1_indices = indices_in_tokenized_text(tokens=tokens[doc_i], role_filler=pair[1])
                pair0_indices = set(pair0_indices)
                pair1_indices = set(pair1_indices).difference(pair0_indices)
                for index0 in pair0_indices:
                    for index1 in pair1_indices:
                        coref_data[model_name]["test" if doc_i<200 else ("dev" if doc_i < 400 else "train")].append({"doc_i": doc_i, "index0": index0, "index1": index1, "label": label})
json.dump(coref_data, open("muc_coref.json", "w+"))

1700it [00:01, 952.81it/s] 
1700it [00:05, 333.88it/s]
1700it [00:02, 663.30it/s] 
1700it [00:07, 229.87it/s]
1700it [00:02, 724.44it/s] 
1700it [00:07, 235.10it/s]


In [117]:
[[len(v[x]) for x in v] for v in coref_data.values()]

[[1589, 1165, 3465], [1483, 1113, 3167], [2012, 1327, 4287]]

In [118]:
coref_data

defaultdict(<function __main__.<lambda>()>,
            {'gtt': defaultdict(list,
                         {'test': [{'doc_i': 1,
                            'index0': 35,
                            'index1': 59,
                            'label': 1},
                           {'doc_i': 1,
                            'index0': 208,
                            'index1': 59,
                            'label': 1},
                           {'doc_i': 1,
                            'index0': 146,
                            'index1': 59,
                            'label': 1},
                           {'doc_i': 1,
                            'index0': 183,
                            'index1': 59,
                            'label': 1},
                           {'doc_i': 1,
                            'index0': 60,
                            'index1': 59,
                            'label': 1},
                           {'doc_i': 12,
                            'index0': 155

In [143]:
sum([sum([len(x) for x in c]) for c in coenv_positive_examples])

7168

In [144]:
sum([sum([len(x) for x in c]) for c in coenv_negative_examples])

10798

In [None]:
# Coenv
coenv_data = defaultdict(lambda :defaultdict(list))
for tokens, model_name in [(gtt_tokens, "gtt"), (tanl_tokens, "tanl"), (dygie_tokens, "dygie")]:
    for per_example_pairs, label in [(coenv_positive_examples, 1), (coenv_negative_examples, 0)]:
        for doc_i, example_pairs in tqdm(enumerate(per_example_pairs)):
            for pair in example_pairs:
                pair0_indices = indices_in_tokenized_text(tokens=tokens[doc_i], role_filler=pair[0])
                pair1_indices = indices_in_tokenized_text(tokens=tokens[doc_i], role_filler=pair[1])
                pair0_indices = set(pair0_indices)
                pair1_indices = set(pair1_indices).difference(pair0_indices)
                for index0 in pair0_indices:
                    for index1 in pair1_indices:
                        coenv_data[model_name]["test" if doc_i<200 else ("dev" if doc_i < 400 else "train")].append({"doc_i": doc_i, "index0": index0, "index1": index1, "label": label})
json.dump(coenv_data, open("muc_coenv.json", "w+"))

1700it [00:06, 255.75it/s]
1700it [00:10, 157.57it/s]
1700it [00:11, 142.70it/s]
598it [00:13, 30.92it/s] 