In [27]:
import numpy as np
import sys
from io import open
import os
from logging import debug, info, warning, error

## Reading data

In [19]:
class PARSE_LAYER:
    """Named integer constants for the SYM, SEM, CAT, SNS and ROL layers.

    NOTE(review): the values look like 0-based column positions inside a
    whitespace-split CoNLL row -- confirm against the code that indexes
    rows with them.
    """
    # Consecutive values 2..6, in this exact order.
    SYM, SEM, CAT, SNS, ROL = range(2, 7)

In [25]:
# From: https://github.com/RikVN/DRS_parsing

def get_conll_blocks(in_file, split_lines=True, add_doc=False):
    '''Read a CoNLL formatted input file and return its blocks with one document id per block.

    A block is a run of data lines terminated by a blank line (or by end of file).
    Lines starting with '#' are comments; a '# newdoc' comment marks the start of
    a new document and advances the document counter.

    Args:
        in_file: path of the CoNLL file to read.
        split_lines: if True, every data line is stored as the list of its
            whitespace-separated fields; otherwise as the stripped line.
        add_doc: if True, the stripped '# newdoc' lines are kept inside the
            block that follows them.

    Returns:
        A tuple (docs, doc_ids). docs is the list of blocks. doc_ids is a list
        with one int per block: the 0-based count of '# newdoc' markers seen
        when the block was closed, or simply the block index when the file
        contains no '# newdoc' marker at all. Both are empty for a file
        without data lines.

    Raises:
        ValueError: if a data line does not have exactly 7 whitespace-separated fields.
    '''
    docs = []
    cur_doc = []
    doc_ids = []
    num_lines = -1
    # Context manager: the handle is closed even when a malformed line raises.
    with open(in_file, 'r') as conll_file:
        for line in conll_file:
            stripped = line.strip()
            if not stripped and cur_doc:
                # Blank line closes the block collected so far.
                docs.append(cur_doc)
                cur_doc = []
                doc_ids.append(num_lines)
            elif stripped.startswith('# newdoc'):
                # Keep track of start of new documents in doc_ids
                # We form a list of all sentences in docs, but at some
                # point we have to put multi-sent docs in a single file
                num_lines += 1
                if add_doc:
                    cur_doc.append(stripped)
            elif stripped and not stripped.startswith('#'):
                fields = line.split()
                if len(fields) != 7:
                    # The message now names the number that is actually enforced.
                    raise ValueError("Line should always consist of 7 columns, found {0}\n{1}".format(len(fields), stripped))
                cur_doc.append(fields if split_lines else stripped)
    # Add left over one if there's not an ending last line
    if cur_doc:
        docs.append(cur_doc)
        doc_ids.append(num_lines)
    # If num_lines is never increased, this means that the # newdoc information was not added
    # In that case we just assume the default of 1 doc per block
    if num_lines == -1:
        info("Assuming 1 document per CoNLL block")
        # Materialised so both branches hand back the same type (a list).
        doc_ids = list(range(len(docs)))
    # Guard the empty case: doc_ids[-1] would raise IndexError when nothing was read.
    num_docs = doc_ids[-1] + 1 if doc_ids else 0
    info("Extracted {0} sents, for {1} docs".format(len(docs), num_docs))
    return docs, doc_ids

In [26]:


def read_data(directory, filename):
    """Load the CoNLL blocks stored in ``filename`` inside ``directory``.

    Calls get_conll_blocks with its default options and returns only the
    first element of its result (the blocks); the document ids are dropped.
    """
    conll_path = os.path.join(directory, filename)
    blocks, _ = get_conll_blocks(conll_path)
    return blocks


# Dataset selection: both values become path components under ./data/4.0.0.
language = 'en'
standard = 'gold'

# Folder expected to hold train.conll and test.conll for this selection.
directory = os.path.join('./data/4.0.0', language, standard)
print(directory)

# Load both splits; each is the list of blocks returned by read_data.
train_data = read_data(directory, 'train.conll')
test_data = read_data(directory, 'test.conll')




./data/4.0.0/en/gold
O
girl.n.01
O
style.v.02
female.n.02
hair.n.01
