In [7]:
import sys
import os
import random
import ujson
import argparse
import pandas as pd
import numpy as np
import pickle
import json
import csv
from collections import defaultdict, OrderedDict
import jsonlines

In [8]:
def clean(tokens):
    """Normalise a list of tokens in place and return that same list.

    Bracket placeholder tokens such as ``-LRB-`` / ``-RRB-`` are replaced by
    the literal bracket character they stand for, and every token is
    lower-cased.

    Args:
        tokens: mutable sequence of string tokens; it is modified in place.

    Returns:
        The same ``tokens`` object that was passed in.
    """
    bracket_for = {
        '-LRB-': '(', '-RRB-': ')',
        '-LSB-': '[', '-RSB-': ']',
        '-LCB-': '{', '-RCB-': '}',
    }
    for idx, tok in enumerate(tokens):
        # Placeholders map to their bracket; anything else is kept as is.
        tokens[idx] = bracket_for.get(tok, tok).lower()
    return tokens


In [11]:
def extract_subj_obj(tokens, d):
    """Return the subject/object mention text and their word-offset spans.

    Args:
        tokens: list of string tokens for the sentence.
        d: dict holding ``subj_start``, ``subj_end``, ``obj_start`` and
            ``obj_end``. The end indices are treated as inclusive (the
            mention text is ``tokens[start:end + 1]``).

    Returns:
        A 4-tuple ``(subj, subj_span, obj, obj_span)``. ``subj`` and ``obj``
        are the mention tokens joined by single spaces. The spans are
        ``"start:end"`` strings with an *exclusive* end, which is the
        convention shown in the JSONL example at the bottom of this notebook
        (a one-word alias at word 5 has span ``"5:6"``).
    """
    subj_start, subj_end = d['subj_start'], d['subj_end']
    subj = ' '.join(tokens[subj_start:subj_end + 1])
    # The inclusive end index is shifted by one so the span is [start, end).
    subj_span = "{}:{}".format(subj_start, subj_end + 1)

    # Named obj_start/obj_end rather than `os`, which would shadow the
    # imported `os` module inside this function.
    obj_start, obj_end = d['obj_start'], d['obj_end']
    obj = ' '.join(tokens[obj_start:obj_end + 1])
    obj_span = "{}:{}".format(obj_start, obj_end + 1)
    return subj, subj_span, obj, obj_span

In [12]:
def create_one_file(train, dev, test, all_out, subjobj=False):
    """Merge the train, dev and test JSON files into one JSONL file.

    Each input file is loaded with ``json.load`` and iterated as a sequence
    of example dicts. Every example is cleaned with ``clean`` and written to
    ``all_out`` as one JSON object per line, holding the sentence text, the
    example id, the subject/object aliases, their spans, and a running
    ``sent_idx_unq`` counter that keeps increasing across the three splits
    (train first, then dev, then test).

    Args:
        train: path to the training-split JSON file.
        dev: path to the dev-split JSON file.
        test: path to the test-split JSON file.
        all_out: path of the JSONL file to write (opened in ``'w'`` mode, so
            an existing file is overwritten).
        subjobj: currently unused; kept so existing callers keep working.

    Side effects:
        Prints the number of examples in each split after loading, and the
        running ``sent_idx_unq`` value after each split has been written.
    """
    # Load all three splits up front so the size printouts precede any
    # writing, in train/dev/test order.
    splits = []
    for path in (train, dev, test):
        with open(path) as infile:
            data = json.load(infile)
        print(len(data))
        splits.append(data)

    unq_id = 0
    with open(all_out, 'w') as outfile:
        for data in splits:
            for d in data:
                # clean() modifies d['token'] in place and returns it.
                tokens = clean(d['token'])
                example = ' '.join(tokens)
                subj, subj_span, obj, obj_span = extract_subj_obj(tokens, d)
                entry = {"sentence": example,
                         "id": d['id'],
                         "aliases": [subj, obj],
                         "spans": [subj_span, obj_span],
                         "sent_idx_unq": unq_id
                        }
                # The json module has no `save`; `dump` serialises to the
                # open file handle.
                json.dump(entry, outfile)
                outfile.write('\n')
                unq_id += 1
            print("UNQ ID is: ", unq_id)

        

In [13]:
# Output directory and input directory for this run. Both are absolute paths
# that end with '/', so file names are appended to them directly below.
expt_dir = '/dfs/scratch1/simran/tacred/tacred-relation-bootleg/dataset_bootleg_cidr_model/bootleg_09132020/subjobj_candjen_only_2/'
source_path = '/dfs/scratch1/simran/tacred/tacred-relation-bootleg/dataset_bootleg_cidr_model/tacred/base_data/'
# Train/dev/test input files located under source_path.
inname_train = "{}train.json".format(source_path)
inname_dev = "{}dev_rev.json".format(source_path)
inname_test = "{}test_rev.json".format(source_path)
# Single merged JSONL output written into expt_dir.
outname_all = '{}candgen_prepped_for_bootinput.jsonl'.format(expt_dir)
# Merge the three splits into one file; prints each split's size and the
# running unique sentence index after each split.
create_one_file(inname_train, inname_dev, inname_test, outname_all, subjobj = False)

68124
22631
15509
UNQ ID is:  68124
UNQ ID is:  90755
UNQ ID is:  106264


In [6]:
# JSONL file example:


# {"sentence": <sentence>,
# "aliases": [<list of mentions you want to extract>],
# "spans": [<list of word offsets for each alias>],
# "sent_idx_unq": <unique sentence index>}

# For example:
# {"sentence": "Barak enjoys walks on the CA beach with Michelle",
# "aliases": ["barak", "ca"],
# "spans": ["0:1", "5:6"],
# "sent_idx_unq": 4}