Read in the syntactic and lexico-syntactic features for argument component identification from the output files in the *src/main/resources* folder of the **preprocessing** Java project.

In [52]:
import os
from collections import defaultdict 

def get_lca(syn_file, max_lines=1, verbose=True):
    '''
    Parse a tab-separated syntactic feature file into per-sentence token records.

    Every non-blank line is split on tabs and read positionally:
    sentence index, token, token label (``<token>-<position>``), c_token,
    preceding, c_preceding, lcaPath_preceding, following, c_following,
    lcaPath_following.

    Note that c_token means the constituent type of the current token.
    Likewise for the preceding and following tokens, i.e., c_preceding and
    c_following.

    Parameters:
        syn_file:  path of the file to read.
        max_lines: maximum number of records to parse. The default of 1 keeps
                   the original "peek at the first line" behaviour; pass None
                   to parse the whole file.
        verbose:   when True (default) the collected mapping is printed, as
                   the original implementation did.

    Returns:
        defaultdict(list) mapping the sentence index (kept as the string found
        in the file) to the list of token dicts of that sentence, in file
        order. All values are strings except "position", which is an int.

    Raises:
        IndexError: a non-blank line has fewer than ten fields.
        ValueError: the token label does not end in ``-<integer>``.
    '''
    info = defaultdict(list)
    parsed = 0
    with open(syn_file, "r") as f:
        for raw in f:
            if max_lines is not None and parsed >= max_lines:
                break
            if not raw.strip():
                # Tolerate blank separator/trailing lines instead of crashing.
                continue
            fields = raw.rstrip("\n").split("\t")
            # The position is whatever follows the LAST hyphen, so tokens that
            # themselves contain hyphens ("well-known-2") are handled correctly.
            position = fields[2].rpartition("-")[2]
            info[fields[0]].append({
                "token": fields[1], "position": int(position), "c_token": fields[3],
                "preceding": fields[4], "c_preceding": fields[5], "lcaPath_preceding": fields[6],
                "following": fields[7], "c_following": fields[8], "lcaPath_following": fields[9],
            })
            parsed += 1
    if verbose:
        print(info)
    return info

def get_lex(lex_file, max_sentences=1, verbose=True):
    '''
    Parse a lexico-syntactic feature file into per-sentence token and node info.

    Info for sentences are divided by newlines (a completely empty line ends
    the current sentence). Within a sentence the code recognises:
    1. Token lines with two tab-separated fields:
       <token_label>\t<constituent type of uppermost node>,
       where <token_label> is <token>-<position>.
    2. One line starting with "[": a list whose entries are
       <node_label>\t<node_index>, separated by ", ".
    3. One line starting with "{": a map of <node_index>=<lexical_head>
       entries, separated by ", ". It must come after the "[" line because
       node labels are looked up by index.
    4. Any other line: <token_label>\t<child>\t<head>, appended as a
       (child, head) tuple to the "intermediaries" of an already-seen token.

    Parameters:
        lex_file:      path of the file to read.
        max_sentences: stop after this many sentences. The default of 1 keeps
                       the original "first sentence only" behaviour; pass None
                       to read the whole file.
        verbose:       when True (default) the accumulated token_info and
                       node_info are printed at every sentence boundary, as
                       the original implementation did.

    Returns:
        (token_info, node_info) where
        token_info[sentence][position] = {"token", "uppermost", "intermediaries"}
        node_info[sentence][node_index] = [{"node": label, "head": head}, ...]
        Sentence indices are ints starting at 0; node indices stay strings.

    Raises:
        KeyError:   a "{" entry names an index missing from the "[" line, or
                    an intermediary line refers to a position not seen yet.
        IndexError: a line has fewer fields than its kind requires.
        ValueError: a token label does not end in ``-<integer>``.
    '''
    token_info = defaultdict(dict)
    node_info = {}
    sent_idx = 0
    node_indices = {}
    lexical_heads = defaultdict(list)
    with open(lex_file, "r") as f:
        for raw in f:
            if max_sentences is not None and sent_idx >= max_sentences:
                break
            if raw == "\n":  # reached a new sentence
                node_info[sent_idx] = lexical_heads
                if verbose:
                    print(token_info)
                    print(node_info)
                sent_idx += 1
                node_indices = {}
                # Must stay a defaultdict(list): a plain dict would make the
                # .append below fail for every sentence after the first.
                lexical_heads = defaultdict(list)
                continue
            line = raw.rstrip("\n")
            if line[0] == "[":
                # info of type 2: node label / node index pairs
                for entry in line.replace("]", "").replace("[", "").split(", "):
                    if not entry:  # empty list "[]"
                        continue
                    parts = entry.split("\t")
                    node_indices[parts[1]] = parts[0]
            elif line[0] == "{":
                # info of type 3: node index -> lexical head
                for entry in line.replace("{", "").replace("}", "").split(", "):
                    if not entry:  # empty map "{}"
                        continue
                    # Split on the first "=" only so a head containing "="
                    # is kept intact.
                    index, _, head = entry.partition("=")
                    lexical_heads[index].append({"node": node_indices[index],
                                                 "head": head})
            else:
                fields = line.split("\t")
                # The position follows the LAST hyphen, so hyphenated tokens
                # ("well-known-2") keep their full surface form.
                token, _, position = fields[0].rpartition("-")
                position = int(position)
                if len(fields) == 2:
                    token_info[sent_idx][position] = {
                        "token": token, "uppermost": fields[1],
                        "intermediaries": []}
                else:  # intermediary info
                    token_info[sent_idx][position]["intermediaries"].append(
                        (fields[1], fields[2]))
    # A final sentence that is not followed by a blank line would otherwise
    # be missing from node_info.
    if lexical_heads or sent_idx in token_info:
        node_info[sent_idx] = lexical_heads
    return token_info, node_info

In [54]:
# Output folders of the preprocessing project: one for the syntactic (LCA)
# features and one for the lexico-syntactic features.
syn_dir = "preprocessing/src/main/resources/LCA_info"
lex_dir = "preprocessing/src/main/resources/lexico_syntactic"

# Walk the syntactic folder in sorted order; the same file name is looked up
# in the lexico-syntactic folder, and both files are handed to their parser.
file_names = os.listdir(syn_dir)
file_names.sort()
for file in file_names:
    syn_file = "/".join((syn_dir, file))
    lex_file = "/".join((lex_dir, file))
    get_lca(syn_file)
    get_lex(lex_file)

defaultdict(<class 'list'>, {'0': [{'token': 'It', 'c_token': 'PRP', 'preceding': '', 'c_preceding': '', 'lcaPath_preceding': '-1.000000', 'following': 'is', 'c_following': 'VBZ', 'lcaPath_following': '0.250000'}]})
defaultdict(<class 'dict'>, {0: {1: {'token': 'It', 'uppermost': 'NP', 'intermediaries': [('PRP', 'It-1')]}, 2: {'token': 'is', 'uppermost': 'VBZ', 'intermediaries': []}, 3: {'token': 'always', 'uppermost': 'ADVP', 'intermediaries': [('RB', 'always-3')]}, 4: {'token': 'said', 'uppermost': 'ROOT', 'intermediaries': [('S', 'said-4')]}, 5: {'token': 'that', 'uppermost': 'IN', 'intermediaries': []}, 6: {'token': 'competition', 'uppermost': 'NP', 'intermediaries': [('NN', 'competition-6')]}, 7: {'token': 'can', 'uppermost': 'MD', 'intermediaries': []}, 8: {'token': 'effectively', 'uppermost': 'ADVP', 'intermediaries': [('RB', 'effectively-8')]}, 9: {'token': 'promote', 'uppermost': 'SBAR', 'intermediaries': [('IN', 'that-5'), ('S', 'promote-9')]}, 10: {'token': 'the', 'uppermost