In [1]:
import nltk
# nltk.download('punkt')
import numpy as np
from nltk.tokenize import sent_tokenize
import json
import itertools
import re
import os
from collections import defaultdict

In [2]:
# base_path = "../../data/test/"
# file_name = "164726"
base_path = "../../../train/"

In [3]:
all_files = os.listdir(base_path)
filenames = set([name.split(".")[0] for name in all_files])

In [4]:
def parse_entities_relationships(file_content):
    """
    parses out entities and relationships from the entities file
    """
    entities = {}
    relationships = {}

    for line in file_content.strip().split('\n'):
        # madarchod line might be annotator comment
        if line.strip().startswith("#"):
            continue
        cols = line.split('\t')
        identifier = cols[0]
        if "Arg1:" in cols[1]:
            relationship_type, arg1, arg2 = cols[1].split()
            relationships[identifier] = {
                'type': relationship_type,
                'arg1': arg1.split(':')[1],
                'arg2': arg2.split(':')[1]
            }
            
        else:
            split_cols = cols[1].split()
            entity_type = split_cols[0]
            value = cols[2]
            start = split_cols[1]
            end = split_cols[-1]
            entities[identifier] = {
                'type': entity_type,
                'start': int(start),
                'end': int(end),
                'value': value
            }

    return entities, relationships

In [5]:
def get_spans(entities, relationships, txt_str, split_char="\n"):
    """
    first pass through spans
    """
    spans = {}
    for rel in relationships.keys():
        arg1 = entities[relationships[rel]['arg1']]
        arg2 = entities[relationships[rel]['arg2']]
        min_left = min(arg1['start'], arg2['start'])
        max_right = max(arg1['end'], arg2['end'])
        # find dots
        prev_char = txt_str[:min_left].rfind(split_char)
        next_char = txt_str[max_right:].find(split_char)+max_right
        # see if no exist
        assert(prev_char != -1)
        assert(next_char != -1)
        # see that span is not insanely large
        if max_right-min_left > 10:
            assert((next_char-prev_char) <= 20*(max_right-min_left))
        # return spans
        spans[rel] = {
            'start':prev_char,
            'end':next_char
        }
    return spans

In [6]:
def get_ents_in_spans(spans, ents):
    span_ent = defaultdict(list)
    for span in spans.keys():
        s = spans[span]
        for ent in ents.keys():
            e = ents[ent]
            if s['start'] <= e['start'] and s['end'] >= e['end']:
                span_ent[span].append(ent)
    return span_ent

In [7]:
# CHECKING FOR ALL FILES MA CHUDAO
for file_name in filenames:
    print(file_name)
    # read annotations
    ann_str = None
    with open(base_path + file_name + ".ann", "r") as file:
        ann_str = file.read()
    # parse entities and relations
    ent, rel = parse_entities_relationships(ann_str)
    # parse text string
    txt_str = None
    with open(base_path + file_name + ".txt", "r") as file:
        txt_str = file.read()
    # first pass through spans
    spans = get_spans(ent, rel, txt_str, "\n")
    # second pass through spans
    span_ents = get_ents_in_spans(spans, ent)
#     print(span_ents)

149687
114923
113824
116451
185800
111041
160574
110186
106384
195784
102087
100039
121467
176448
124917
177331
103315
112014
109170
101331
131434
110445
151688
117609
109617
108809
168831
114144
103677
176106
104653
120237
102698
115157
103074
127263
110384
103926
116853
111298
103722
115232
182160
115347
195689
161384
102527
105614
110342
115244
168936
109697
102296
101215
112249
104682
106064
190056
103142
196798
109527
197869
105104
155980
110326
127356
157609
108754
186876
115464
110727
102053
174150
112628
160707
101427
102937
101813
143451
108073
101372
106038
159079
100187
102456
149614
186788
122468
107869
105778
109667
120301
141586
118418
178331
184696
110037
110335
119144
187782
104799
157284
105747
106415
107047
114965
101665
192877
115667
182040
112615
109770
128041
102173
114836
103293
106621
161477
102365
121514
109191
100229
131903
126380
115789
113391
106361
105445
164253
114959
106939
104979
100564
111458
106629
105254
178143
114004
187736
177721
113200
190444
177370