In [1]:
import os
import json
import pandas as pd
from xml.etree import ElementTree

In [2]:
!pwd

/scratch/hkaredla/Event-Extraction


In [3]:
def get_file_paths():
    df = pd.read_csv('/scratch/hkaredla/Event-Extraction/data/data_list.csv')
    types = df['type'].unique()
    train, valid, test = [], [], []
    
    for i in range(len(df)):
        path = os.path.join( '/scratch/hkaredla/Event-Extraction/data/English/English', df['path'].iloc[i] + '.apf.xml')
        if df['type'].iloc[i] == 'train':
            train.append(path)
        elif df['type'].iloc[i] == 'valid':
            valid.append(path)
        elif df['type'].iloc[i] == 'test':
            test.append(path)
        
    return train, valid, test

In [4]:
class Parser:
    def __init__(self, xml_path):
        self.entity_mentions = []
        self.event_mentions = []
        self.parse(xml_path)
        
    def parse(self, xml_path):
        tree = ElementTree.parse(xml_path)
        root = tree.getroot()
        #print(tree)
        #print(root)
        
        for child in root[0]:
            if child.tag == 'entity':
                self.entity_mentions.extend(self.parse_entity_tag(child))
            elif child.tag in ['value', 'timex2']:
                self.entity_mentions.extend(self.parse_value_timex(child))
            elif child.tag == 'event':
                self.event_mentions.extend(self.parse_event_tag(child))
                
     
    def parse_entity_tag(self, node):
        entity_mentions = []
        for child in node:
            if child.tag != 'entity_mention':  
                continue
            extent = child[0]
            charset = extent[0]

            entity_mention = dict()
            entity_mention['entity_id'] = child.attrib['ID']
            entity_mention['entity_type'] = '{}:{}'.format(node.attrib['TYPE'], node.attrib['SUBTYPE'])
            entity_mention['text'] = charset.text
            entity_mention['position'] = (int(charset.attrib['START']), int(charset.attrib['END']))

            entity_mentions.append(entity_mention)
        return entity_mentions
    
    def parse_event_tag(self, node):
        event_mentions = []
        for child in node:
            if child.tag == 'event_mention':
                event_mention = dict()
                event_mention['event_type'] = '{}:{}'.format(node.attrib['TYPE'], node.attrib['SUBTYPE'])
                event_mention['arguments'] = []
                for child2 in child:
                    if child2.tag == 'ldc_scope':
                        charset = child2[0]
                        event_mention['text'] = charset.text
                        event_mention['position'] = (int(charset.attrib['START']), int(charset.attrib['END']))
                    if child2.tag == 'anchor':
                        charset = child2[0]
                        event_mention['trigger'] = charset.text
                    if child2.tag == 'event_mention_argument':
                        extent = child2[0]
                        charset = extent[0]
                        event_mention['arguments'].append({
                            'text': charset.text,
                            'role': child2.attrib['ROLE'],
                            'entity_id': child2.attrib['REFID']
                        })
                event_mentions.append(event_mention)
        return event_mentions

                                            
    def parse_value_timex(self, node):
        entity_mentions = []

        for child in node:
            extent = child[0]
            charset = extent[0]

            entity_mention = dict()
            entity_mention['entity_id'] = child.attrib['ID']

            if 'TYPE' in node.attrib:
                entity_mention['entity_type'] = node.attrib['TYPE']
            if 'SUBTYPE' in node.attrib:
                entity_mention['entity_type'] += ':{}'.format(node.attrib['SUBTYPE'])
            if child.tag == 'timex2_mention':
                entity_mention['entity_type'] = 'TIM:time'

            entity_mention['text'] = charset.text
            entity_mention['position'] = (int(charset.attrib['START']), int(charset.attrib['END']))

            entity_mentions.append(entity_mention)

        return entity_mentions
    
    
    def get_data(self):
        data = []
#         print(len(self.entity_mentions))
        #print(len(self.event_mentions))
        for event_mention in self.event_mentions:
            item = dict()
            #print(event_mention)
            item['sentence'] = event_mention['text']

            entity_map = dict()
            item['golden_entity_mentions'] = []
            text_position = event_mention['position']
            for entity_mention in self.entity_mentions:
                entity_position = entity_mention['position']
                if text_position[0] <= entity_position[0] and entity_position[1] <= text_position[1]:
                    item['golden_entity_mentions'].append({
                        'text': entity_mention['text'],
                        'entity_type': entity_mention['entity_type']
                    })
                    entity_map[entity_mention['entity_id']] = entity_mention

            event_arguments = []
            print("2")
            for argument in event_mention['arguments']:
                event_arguments.append({
                    'role': argument['role'],
                    'entity_type': entity_map[argument['entity_id']]['entity_type'],
                    'text': argument['text'],
                })

            item['golden_event_mention'] = {
                'trigger': event_mention['trigger'],
                'arguments': event_arguments,
                'event_type': event_mention['event_type'],
            }
            
            data.append(item)
        return data
                                            
            
            
    
        

In [5]:
def preprocessing(data_type, file_path_list):
    data = []
    
    for filename in file_path_list:
        #Parser(filename)
        x = Parser(filename).get_data()
        data.extend(x)
    #print(len(data))
    with open('/scratch/hkaredla/Event-Extraction/data/output/{}.json'.format(data_type),'w') as f:
        json.dump(data, f, indent=2)
        
        

In [6]:
train_files, valid_files, test_files = get_file_paths()

In [7]:
#preprocessing('train', train_files)

In [8]:
preprocessing('train', train_files)
preprocessing('valid', valid_files)
preprocessing('test', test_files)

{'event_type': 'Movement:Transport', 'arguments': [{'text': 'a 30-foot Cuban patrol boat with four heavily armed men', 'role': 'Vehicle', 'entity_id': 'CNN_CF_20030303.1900.00-E96-192'}, {'text': 'four heavily armed men', 'role': 'Artifact', 'entity_id': 'CNN_CF_20030303.1900.00-E98-194'}, {'text': 'American shores', 'role': 'Destination', 'entity_id': 'CNN_CF_20030303.1900.00-E99-195'}], 'text': 'Even as the\nsecretary of homeland security was putting his people on high alert last\nmonth, a 30-foot Cuban patrol boat with four heavily armed men landed on\nAmerican shores, utterly undetected by the Coast Guard Secretary Ridge\nnow leads', 'position': (277, 514), 'trigger': 'landed'}
2
{'event_type': 'Movement:Transport', 'arguments': [{'text': 'four armed Cubans', 'role': 'Artifact', 'entity_id': 'CNN_CF_20030303.1900.00-E98-70'}, {'text': 'these\nbozos', 'role': 'Agent', 'entity_id': 'CNN_CF_20030303.1900.00-E4-69'}, {'text': 'our shores', 'role': 'Destination', 'entity_id': 'CNN_CF_20

{'event_type': 'Conflict:Attack', 'arguments': [], 'text': 'Definitely put up a fight', 'position': (1328, 1352), 'trigger': 'fight'}
2
{'event_type': 'Conflict:Attack', 'arguments': [{'text': 'a Jeep', 'role': 'Target', 'entity_id': 'CNN_IP_20030405.1600.02-E18-32'}, {'text': 'a Humvee', 'role': 'Target', 'entity_id': 'CNN_IP_20030405.1600.02-E19-33'}, {'text': 'These', 'role': 'Instrument', 'entity_id': 'CNN_IP_20030405.1600.02-E16-31'}], 'text': 'These\ncould take out a Jeep or a Humvee in a heartbeat', 'position': (1418, 1471), 'trigger': 'take out'}
2
{'event_type': 'Movement:Transport', 'arguments': [{'text': 'you', 'role': 'Artifact', 'entity_id': 'CNN_IP_20030405.1600.02-E73-246'}, {'text': 'these\nbuildings', 'role': 'Destination', 'entity_id': 'CNN_IP_20030405.1600.02-E90-207'}], 'text': "It's -- yes, it's extremely dangerous, because once you enter these\nbuildings, we found, actually, enemy, you know, guarding these sites", 'position': (2626, 2761), 'trigger': 'enter'}
2
{'

{'event_type': 'Personnel:End-Position', 'arguments': [{'text': 'a leader whose fate is unknown\ntonight', 'role': 'Person', 'entity_id': 'CNN_ENG_20030410_183644.8-E23-7'}], 'text': 'people in baghdad celebrate the fall of a leader whose fate is unknown\ntonight', 'position': (477, 554), 'trigger': 'fall'}
2
{'event_type': 'Life:Die', 'arguments': [{'text': 'saddam hussein', 'role': 'Victim', 'entity_id': 'CNN_ENG_20030410_183644.8-E23-15'}], 'text': "we'll be talking with an expert on saddam hussein, author\nandrew coburn about whether saddam hussein is dead or alive and where\nin the world is he", 'position': (557, 702), 'trigger': 'dead'}
2
{'event_type': 'Conflict:Attack', 'arguments': [{'text': 'coalition', 'role': 'Attacker', 'entity_id': 'CNN_ENG_20030410_183644.8-E28-45'}, {'text': 'it', 'role': 'Target', 'entity_id': 'CNN_ENG_20030410_183644.8-E24-42'}, {'text': 'coalition bombs and missiles', 'role': 'Instrument', 'entity_id': 'CNN_ENG_20030410_183644.8-E26-43'}, {'text': 'c

{'event_type': 'Movement:Transport', 'arguments': [{'text': 'they', 'role': 'Artifact', 'entity_id': 'CNN_ENG_20030617_173115.22-E13-12'}, {'text': 'the sea', 'role': 'Destination', 'entity_id': 'CNN_ENG_20030617_173115.22-E5-15'}], 'text': 'they were jumping in the sea when they saw the rescuers\nwere on the way', 'position': (1067, 1137), 'trigger': 'jumping'}
2
{'event_type': 'Movement:Transport', 'arguments': [{'text': 'the kramers', 'role': 'Artifact', 'entity_id': 'CNN_ENG_20030617_193116.10-E3-95'}, {'text': 'the u.s', 'role': 'Origin', 'entity_id': 'CNN_ENG_20030617_193116.10-E20-96'}, {'text': 'china', 'role': 'Destination', 'entity_id': 'CNN_ENG_20030617_193116.10-E24-98'}], 'text': 'well, it is our turn to meet the kramers back in the u.s. after their\njourney to china to bring home the newest and cutest perhaps, member\nof their family, in los angeles, the proud parents of 11 mnth abigail\nkramer', 'position': (2762, 2976), 'trigger': 'journey'}
2
{'event_type': 'Movement:T

{'event_type': 'Life:Die', 'arguments': [{'text': 'Arafat', 'role': 'Victim', 'entity_id': 'fsh_29592-E9-216'}], 'text': 'Death of Arafat', 'position': (64, 78), 'trigger': 'Death'}
2
{'event_type': 'Life:Die', 'arguments': [{'text': 'his', 'role': 'Victim', 'entity_id': 'fsh_29592-E9-259'}], 'text': 'What do you think caused his illness, and could it\nhave been something malicious (like poisoning', 'position': (219, 314), 'trigger': 'poisoning'}
2
{'event_type': 'Life:Die', 'arguments': [{'text': 'PLO leader Yasser Arafat', 'role': 'Victim', 'entity_id': 'fsh_29592-E9-1'}, {'text': 'a\nParis hospital', 'role': 'Place', 'entity_id': 'fsh_29592-E55-255'}, {'text': 'Paris', 'role': 'Place', 'entity_id': 'fsh_29592-E56-256'}, {'text': 'last week', 'role': 'Time-Within', 'entity_id': 'fsh_29592-T3-1'}], 'text': 'PLO leader PLO leader Yasser Arafat died in a\nParis hospital last week, after a lot of controversy surrounding his\nillness and death', 'position': (84, 215), 'trigger': 'died'}
2

{'event_type': 'Movement:Transport', 'arguments': [{'text': 'Kelly', 'role': 'Artifact', 'entity_id': 'APW_ENG_20030424.0532-E19-43'}, {'text': 'Seoul', 'role': 'Destination', 'entity_id': 'APW_ENG_20030424.0532-E27-45'}, {'text': 'Friday', 'role': 'Time-Within', 'entity_id': 'APW_ENG_20030424.0532-T8-1'}], 'text': 'Discussions were scheduled to end Friday, when Kelly was to fly to\nSeoul the same day to meet with South Korean officials', 'position': (2228, 2348), 'trigger': 'fly'}
2
{'event_type': 'Contact:Meet', 'arguments': [{'text': 'North Korea', 'role': 'Entity', 'entity_id': 'APW_ENG_20030424.0532-E17-171'}, {'text': 'the United States', 'role': 'Entity', 'entity_id': 'APW_ENG_20030424.0532-E1-172'}, {'text': 'China', 'role': 'Entity', 'entity_id': 'APW_ENG_20030424.0532-E24-173'}, {'text': 'Thursday', 'role': 'Time-Ending', 'entity_id': 'APW_ENG_20030424.0532-T3-1'}], 'text': "Talks between North Korea, the United States and China ended Thursday\nwith U.S. State Department offi

{'event_type': 'Movement:Transport', 'arguments': [{'text': 'he', 'role': 'Artifact', 'entity_id': 'misc.legal.moderated_20050129.2225-E32-164'}, {'text': 'the step-down unit', 'role': 'Destination', 'entity_id': 'misc.legal.moderated_20050129.2225-E47-176'}, {'text': 'the ICU', 'role': 'Origin', 'entity_id': 'misc.legal.moderated_20050129.2225-E44-168'}, {'text': 'the fourth day of his stay', 'role': 'Time-Within', 'entity_id': 'misc.legal.moderated_20050129.2225-T7-1'}], 'text': 'On the fourth day of his stay, he was moved\nout of the ICU to the step-down unit, with a diagnosis of "ICU Psychosis\nsecondary to narcotic administration" and post thoracostomy', 'position': (344, 519), 'trigger': 'moved'}
2
{'event_type': 'Movement:Transport', 'arguments': [{'text': 'he', 'role': 'Artifact', 'entity_id': 'misc.legal.moderated_20050129.2225-E32-16'}, {'text': 'home', 'role': 'Destination', 'entity_id': 'misc.legal.moderated_20050129.2225-E1-160'}], 'text': "Then the\norderly was apparently

{'event_type': 'Life:Die', 'arguments': [{'text': 'Yasser Arafat', 'role': 'Victim', 'entity_id': 'FLOPPINGACES_20050101.2244.048-E1-105'}], 'text': 'One is the death of Yasser Arafat', 'position': (521, 553), 'trigger': 'death'}
2
{'event_type': 'Life:Die', 'arguments': [{'text': 'Arafat', 'role': 'Victim', 'entity_id': 'FLOPPINGACES_20050101.2244.048-E1-2'}], 'text': "But since Arafat's death, Europeans and Americans have been\nable to find common ground: supporting Ariel Sharon's withdrawal from\nGaza, putting pressure on Israel to let the Palestinians hold elections\nand, covertly, backing Mahmoud Abbas to become the next Palestinian\nleader", 'position': (955, 1230), 'trigger': 'death'}
2
{'event_type': 'Personnel:Elect', 'arguments': [{'text': 'Mahmoud Abbas', 'role': 'Person', 'entity_id': 'FLOPPINGACES_20050101.2244.048-E6-14'}, {'text': 'the Palestinians', 'role': 'Entity', 'entity_id': 'FLOPPINGACES_20050101.2244.048-E8-13'}], 'text': "But since Arafat's death, Europeans and 

{'event_type': 'Personnel:Start-Position', 'arguments': [{'text': 'Former senior banker Callum McCarthy', 'role': 'Person', 'entity_id': 'AFP_ENG_20030401.0476-E5-43'}, {'text': "one of the most\nimportant jobs in London's financial world", 'role': 'Position', 'entity_id': 'AFP_ENG_20030401.0476-V1-1'}, {'text': 'September', 'role': 'Time-Within', 'entity_id': 'AFP_ENG_20030401.0476-T4-1'}], 'text': "Former senior banker Callum McCarthy begins what is one of the most\nimportant jobs in London's financial world in September, when\nincumbent Howard Davies steps down", 'position': (340, 503), 'trigger': 'begins'}
2
{'event_type': 'Personnel:End-Position', 'arguments': [{'text': 'Former senior banker Callum McCarthy', 'role': 'Person', 'entity_id': 'AFP_ENG_20030401.0476-E5-43'}], 'text': "Former senior banker Callum McCarthy begins what is one of the most\nimportant jobs in London's financial world in September, when\nincumbent Howard Davies steps down", 'position': (340, 503), 'trigger':