In [18]:
# run this before going through candidate generation to augment the base data

In [43]:
import pandas as pd
pd.set_option('display.max_colwidth', -1)

import numpy as np
import sklearn
from sklearn import metrics
import nltk 
from collections import Counter
import matplotlib.pyplot as plt
import itertools
import ast
import json
import ujson

nltk.download('stopwords')
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     /lfs/1/simran/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
base_data = '/dfs/scratch1/simran/tacred/tacred-relation-bootleg/dataset_bootleg_iclr_model/tacred/base_data'

train_file = "{}/train.json".format(base_data)
with open(train_file) as train:
    df_train = json.load(train)
    df_train = pd.DataFrame.from_dict(df_train, orient='columns')
    print(df_train.shape)
    
dev_file = "{}/dev_rev.json".format(base_data)
with open(dev_file) as dev:
    df_dev = json.load(dev)
    df_dev = pd.DataFrame.from_dict(df_dev, orient='columns')
    print(df_dev.shape)
    
test_file = "{}/test_rev.json".format(base_data)
with open(test_file) as test:
    df_test = json.load(test)
    df_test = pd.DataFrame.from_dict(df_test, orient='columns')
    print(df_test.shape)

(68124, 14)
(22631, 14)
(15509, 14)


In [45]:
print(df_train.columns)

Index(['id', 'docid', 'relation', 'token', 'subj_start', 'subj_end',
       'obj_start', 'obj_end', 'subj_type', 'obj_type', 'stanford_pos',
       'stanford_ner', 'stanford_head', 'stanford_deprel'],
      dtype='object')


# augment pronouns

In [46]:
def print_row(row):
    tokens = row['token']
    ss, se = row['subj_start'], row['subj_end']
    os, oe = row['obj_start'], row['obj_end']
    subj = ' '.join(tokens[ss:se+1])
    obj = ' '.join(tokens[os:os+1])
    print("TOKENS: ", tokens)
    print("SUBJ: ", subj)
    print("OBJ: ", obj)

    print()

In [48]:
# length
pronouns = ['he', 'she', 'her', 'his', 'him']
length = 1
df_list = [df_train, df_dev, df_test]
df_names = ['train', 'dev', 'test']
for i in range(len(df_list)):
    df = df_list[i]
    num_one_person_var = 0
    subj_pronoun = 0
    obj_pronoun = 0
    for index, row in df.iterrows(): 
        tokens = row['token']
        ss, se = row['subj_start'], row['subj_end']
        os, oe = row['obj_start'], row['obj_end']
        subj = ' '.join(tokens[ss:se+1])
        obj = ' '.join(tokens[os:os+1])
        if any(pronoun for pronoun in pronouns if pronoun == subj or pronoun == obj): #there's a pronoun subj/obj
            ner_tags = row['stanford_ner']
            if len([tag for tag in ner_tags if tag=="PERSON"]) == length and 'said' not in tokens:
                
                print('tokens:',tokens)
                print('ner_tags:', ner_tags)
                print('subj:', tokens[ss:se+1])
                print('obj:', tokens[os:oe+1])
                
                index_person = ner_tags.index('PERSON')
                if index_person < len(tokens)-length and ner_tags[index_person:index_person+length] == ['PERSON']*length:
                    num_one_person_var += 1
                    person = tokens[index_person:index_person+length]
                    
                    #print_row(row)
                if any(pronoun for pronoun in pronouns if pronoun == subj):
                    tokens[ss:se+length+1] = person
                    ner_tags[ss:se+length+1] = 'PERSON'
                    subj_pronoun +=1 
                    se = se + length
                if any(pronoun for pronoun in pronouns if pronoun == obj):
                    tokens[os:oe+length+1] = person
                    ner_tags[os:oe+length+1] = 'PERSON'
                    obj_pronoun +=1  
                    oe = oe + length
                    
                df.at[index, "token"] = tokens
                df.at[index, "stanford_ner"] = ner_tags
                df.at[index, "subj_end"] = se
                df.at[index, "obj_end"] = oe
                
                print('tokens:',tokens)
                print('ner_tags:', ner_tags)
                print('subj:', tokens[ss:se+length+1])
                print('obj:', tokens[os:oe+length+1])
                
                break
        
                
                
    print("Split:", df_names[i])            
    print("Of the examples with pronouns, ", num_one_person_var, " have just {} person ner tag in them.".format(length))
    print("Of these examples pronouns, replaced", subj_pronoun, " subj as a pronoun")
    print("Of these examples pronouns, replaced", obj_pronoun, " obj as a pronoun")
    print()

 

tokens: ['-LSB-', 'By', 'the', 'way', ',', 'I', 'first', 'pointed', 'out', 'Alice', 'when', 'she', 'was', 'announced', 'as', 'being', 'the', 'new', 'Agent', 'Provocateur', 'face', ',', 'since', 'then', 'they', "'ve", 'been', 'giving', 'her', 'a', 'MASSIVE', 'PR', 'push', '-LRB-', 'obviously', 'got', 'big', 'plans', 'for', 'her', '-RRB-', ',', 'in', 'The', 'Sun', 'in', 'particular', ',', 'which', 'I', 'missed', '.']
ner_tags: ['O', 'O', 'O', 'O', 'O', 'O', 'ORDINAL', 'O', 'O', 'PERSON', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'MISC', 'MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'MISC', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
subj: ['Alice']
obj: ['she']
tokens: ['-LSB-', 'By', 'the', 'way', ',', 'I', 'first', 'pointed', 'out', 'Alice', 'when', 'Alice', 'announced', 'as', 'being', 'the', 'new', 'Agent', 'Provocateur', 'face', ',', 'since', 'then', 'they', "'ve", 'been', 'giving', 'her', 'a', 'MASSIVE', 'PR', 'p

In [None]:
Split: train
Of the examples with pronouns,  1783  have just 2 person ner tag in them.
Of these examples pronouns, replaced 4190  subj as a pronoun
Of these examples pronouns, replaced 2196  obj as a pronoun

Split: dev
Of the examples with pronouns,  744  have just 2 person ner tag in them.
Of these examples pronouns, replaced 1746  subj as a pronoun
Of these examples pronouns, replaced 776  obj as a pronoun

Split: test
Of the examples with pronouns,  651  have just 2 person ner tag in them.
Of these examples pronouns, replaced 1698  subj as a pronoun
Of these examples pronouns, replaced 752  obj as a pronoun


In [81]:
out_dir = '/dfs/scratch1/simran/tacred/tacred-relation-bootleg/dataset_bootleg_iclr_model/bootleg_model_1002/experiments_noft/everything'


In [82]:
train_out = df_train.to_json(r'{}train_coref.json'.format(out_dir),orient='records')

In [83]:
dev_out = df_dev.to_json(r'{}dev_coref.json'.format(out_dir),orient='records')

In [84]:
test_out = df_test.to_json(r'{}test_coref.json'.format(out_dir),orient='records')