Read in the syntactic and lexico-syntactic features for argument component identification from the output files in the *src/main/resources/syntactic folder* within the **preprocessing** Java project.

In [39]:
import os
from collections import defaultdict 

def get_lca(syn_file): 
    ''' 
    Note that c_token means the constituent type of the current token.
    Likewise for the preceding and following tokens, i.e., c_preceding and c_following. 
    '''
    info = defaultdict(list)
    with open(syn_file,"r") as f: 
        for line in f.readlines(): 
            line = line.split("\t")
            sentIdx = line[0]
            label = line[2]
            position = label.split("-")[1]
            if sentIdx not in info: 
                info[sentIdx] = []
            token_info = {
                "token": line[1], "position": int(position), "c_token": line[3],
                "preceding": line[4], "c_preceding": line[5], "lcaPath_preceding": line[6],
                "following": line[7], "c_following": line[8], "lcaPath_following": line[9].strip("\n")
            }
            info[sentIdx].append(token_info)
            break
    print(dict(info))

def get_lex(lex_file): 
    ''' 
    Info for sentences are divided by newlines. 
    Formatting of info for each sentence: 
    1. For each token, <token_label>\t<label of uppermost node>
    2. (if applicable) <token_label>\t"child_of_uppermost"\t<label of child> 
    3. (if applicable)  <token_label>\t"right_sibling_of_child"\t<right_sibling_label>\t<its lexical head> 
    4. List with each entry being <node_label>\t<node_index>
    5. HashMap mapping <node_index>=<lexical_head>
    '''
    token_info = defaultdict(dict)
    node_info = {}
    sentIdx = 0
    node_indices = {}
    lexical_heads = defaultdict(list)
    with open(lex_file,"r") as f: 
        for line in f.readlines(): 
            if line == "\n": # reached a new sentence 
                node_info[sentIdx] = dict(lexical_heads) 
                print(dict(token_info))
                print(dict(node_info))
                sentIdx += 1 
                node_indices = defaultdict(list)
                lexical_heads = defaultdict(list)
                if sentIdx == 1: break # for sample printing purposes 
                continue
            if line[0] == "[":
                # reached info of type 4 
                line = line.replace("]","").replace("[","").split(", ")
                for entry in line: 
                    entry = entry.strip("\n").split("\t")
                    node_indices[entry[1]] = entry[0]
            elif line[0] == "{": 
                # reached info of type 5 
                line = line.replace("{","").replace("}","").split(", ")
                for entry in line: 
                    entry = entry.split("=")
                    lexical_heads[entry[0]].append({"node": node_indices[entry[0]], 
                                               "head": entry[1] })
            else: 
                line = line.strip("\n").split("\t")
                token = line[0].split("-")[0]
                position = int(line[0].split("-")[1])
                if len(line) == 2: 
                    uppermost = line[1]
                    token_dict = { "token": token, "uppermost": uppermost}
                    token_info[sentIdx][position] = token_dict
                else: # intermediary info 
                    type = line[1]
                    label = line[2]
                    token_info[sentIdx][position][type] = label
                    if type == "right_sibling_of_child": 
                        token_info[sentIdx][position]["right_sibling_type"] = line[3]

In [40]:
syn_dir = "preprocessing/src/main/resources/syntactic/"
lca_dir = syn_dir + "LCA_info"
lex_dir = syn_dir + "lexico_syntactic"
verb_dir = syn_dir + "main_verbs"

for file in sorted(os.listdir(lca_dir)):
    syn_file = f"{lca_dir}/{file}"
    lex_file = f"{lex_dir}/{file}"
    get_lca(syn_file)
    get_lex(lex_file)

{'0': [{'token': 'It', 'position': 1, 'c_token': 'PRP', 'preceding': '', 'c_preceding': '', 'lcaPath_preceding': '-1.000000', 'following': 'is', 'c_following': 'VBZ', 'lcaPath_following': '0.250000'}]}
{0: {1: {'token': 'It', 'uppermost': 'NP', 'child_of_uppermost': 'PRP'}, 2: {'token': 'is', 'uppermost': 'VBZ'}, 3: {'token': 'always', 'uppermost': 'ADVP', 'child_of_uppermost': 'RB'}, 4: {'token': 'said', 'uppermost': 'ROOT', 'child_of_uppermost': 'S'}, 5: {'token': 'that', 'uppermost': 'IN'}, 6: {'token': 'competition', 'uppermost': 'NP', 'child_of_uppermost': 'NN'}, 7: {'token': 'can', 'uppermost': 'MD'}, 8: {'token': 'effectively', 'uppermost': 'ADVP', 'child_of_uppermost': 'RB'}, 9: {'token': 'promote', 'uppermost': 'SBAR', 'child_of_uppermost': 'S'}, 10: {'token': 'the', 'uppermost': 'DT'}, 11: {'token': 'development', 'uppermost': 'NP', 'child_of_uppermost': 'NP', 'right_sibling_of_child ': 'PP'}, 12: {'token': 'of', 'uppermost': 'IN'}, 13: {'token': 'economy', 'uppermost': 'PP',