In [1]:
# Enable IPython's autoreload extension in mode 2, which reloads imported
# modules before each cell executes so edits to helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
import ujson as json
from tqdm import tqdm
import numpy as np
from pathlib import Path
from collections import defaultdict
# `display` and `HTML` belong to the public `IPython.display` module; importing
# them from the internal `IPython.core.display` path is deprecated.
from IPython.display import display, HTML
# Make the modules in the tutorial's `bootleg_utilities` directory importable
# from this notebook.
sys.path.append("/dfs/scratch0/lorr1/projects/bootleg/tutorials/downstream_tutorial/bootleg_utilities")
# Widen the notebook container to 90% of the page so wide outputs are readable.
display(HTML("<style>.container { width:90% !important; }</style>"))

### Instructions
Before running this notebook, we recommend filtering the alias candidate map so that mention extraction does not produce spurious mentions.

To do this, first run

`python3 -m bootleg.utils.preprocessing.compute_statistics --data_dir <wiki_train_dir> --save_dir <wiki_train_dir> --no_types --num_workers 30 --strip --lower`

and then run the `filter_alias_cand_map.ipynb` notebook.

In [3]:
# Mention-extraction entry point plus the filesystem locations used by the rest
# of this notebook. All paths are absolute and specific to the original
# author's machine; adjust them before running elsewhere.
from bootleg.end2end.extract_mentions import extract_mentions
# Directory whose `entity_db` subfolder provides the entity db and alias candidate map.
wiki_train_dir = Path("/dfs/scratch0/lorr1/projects/bootleg-data/data/korealiases_title_0122/")
# TACRED data directory; converted inputs and Bootleg-augmented outputs are written here.
tacred_dir = Path("/dfs/scratch0/lorr1/projects/bootleg-data/downstream/tacred/")
# Embedding directory; also holds the `pretrained_bert_models` cache used in the model config.
emb_dir = Path("/dfs/scratch0/lorr1/projects/bootleg-data/embs")
# Bootleg model run directory containing `run_config.yaml` and `last_model.pth`.
root_boot_model = Path("/dfs/scratch1/lorr1/projects/bootleg/logs_guid/base/2021_01_23/23_45_31/2f2b98c2")

In [5]:
# Converts three separate files into one for mention extraction
# (presumably the TACRED train/dev/test splits; the three counts printed below
# match the train/dev/test shapes reported later in this notebook).
# Outputs: tacred_dir / all_tacred_bootinput.jsonl
# NOTE(review): `convert_to_jsonl` is a local helper module, presumably found
# via the `bootleg_utilities` directory appended to `sys.path` in the first
# code cell — confirm.
import convert_to_jsonl
convert_to_jsonl.main(source_path=tacred_dir)

68124
22631
15509
Data is saved to /dfs/scratch0/lorr1/projects/bootleg-data/downstream/tacred/all_tacred_bootinput.jsonl


In [5]:
# Filtered alias candidate map stored inside the entity db of `wiki_train_dir`.
cand_map = wiki_train_dir / 'entity_db/entity_mappings/alias2qids_filt.json'
# Input to mention extraction: the combined file written by the conversion step.
infile = tacred_dir / "all_tacred_bootinput.jsonl"
# Output of mention extraction; later used as the test dataset file for the model.
outfile = tacred_dir / "all_tacred_bootinput_ent.jsonl"

In [None]:
# Run Bootleg mention extraction over `infile` using the filtered candidate
# map, writing the result to `outfile` with 20 worker processes.
extract_mentions(
    in_filepath=infile,
    out_filepath=outfile,
    cand_map_file=cand_map,
    num_workers=20,
)

In [10]:
# Optional inspection step: load the extracted mentions and look at a few rows.
# NOTE(review): `load_mentions` is a tutorial helper not shown here; judging by
# the displayed output it yields a table with sentence / aliases / spans
# columns — confirm.
from tutorials.utils import load_mentions
# If you want to do error analysis and examine the mentions
bootleg_mentions_df = load_mentions(outfile)
# Display 20 sampled rows. NOTE(review): no seed is passed to `sample`, so the
# rows shown presumably differ from run to run.
display(bootleg_mentions_df.sample(20))

Unnamed: 0,sentence,aliases,spans
5385,"Kurnaz was seized by US forces in Pakistan shortly after the September 11 , 2001 attacks on the United States and was later sent to a US prison in Afghanistan before being incarcerated at Guantanamo Bay in 2002 .","[us forces, pakistan, september 11 2001 attacks, united states, us prison, afghanistan, guantanamo bay]","[[4, 6], [7, 8], [11, 16], [18, 20], [26, 28], [29, 30], [34, 36]]"
26969,"Abigail Johnson , Massachusetts , 44 , $ 12.5 , Fidelity 29 .","[massachusetts, fidelity]","[[3, 4], [10, 11]]"
19609,"The youthful zeal of the Paul movement `` does recall the early Goldwater movement , which was also jampacked with people dropping out of graduate school , college , maybe even high school , to devote themselves 24/7 to what they called the ` revolution , '' ' said Rick Perlstein , author of `` Before the Storm : Barry Goldwater and the Unmaking of the American Consensus '' ( Hill and Wang , 2002 ) .","[zeal, paul, goldwater, revolution, rick perlstein, storm, barry goldwater, hill, wang]","[[2, 3], [5, 6], [12, 13], [44, 45], [49, 51], [57, 58], [59, 61], [70, 71], [72, 73]]"
73045,He is also charged with violating his squadron commander 's October 2009 order to notify his sexual partners about his HIV status before having sexual relations and to use condoms .,"[commander, october 2009, sexual partners, hiv, sexual relations]","[[8, 9], [10, 12], [16, 18], [20, 21], [24, 26]]"
32591,"`` We reserve the right to take further legal action as we see fit , '' Haifa University President Aaron Ben Zeev told The Jerusalem Post .","[legal action, haifa university, the jerusalem post]","[[8, 10], [16, 18], [23, 26]]"
92664,1 on the US album chart with `` Something For Everybody '' -- his sixth US No,"[us album chart, us]","[[3, 6], [15, 16]]"
31391,High Score Table PRC 79 Jo 74 Peeb 70 Sebs 68 Helen 67 Jimbo 66 Snake 66 Sumi 66 Lozz 66 STEPHEN 65 Alistair 65 Robz 65 Murf 64 Stroma 64 Peggie 64 Martyn 64 Gee 63 Alison 63 Moog 62 Simon 61 Jan 60 Nicky 60 Bluenose 60 Barry 60 Eyan 57 Dan 53 Piablo 52 Cally 45 AJ 41 Jim D 28,"[prc, jo, sebs, helen, jimbo, snake, sumi, alistair, stroma, peggie, martyn, gee, alison, moog, simon, jan, nicky, bluenose, barry, dan, aj]","[[3, 4], [5, 6], [9, 10], [11, 12], [13, 14], [15, 16], [17, 18], [23, 24], [29, 30], [31, 32], [33, 34], [35, 36], [37, 38], [39, 40], [41, 42], [43, 44], [45, 46], [47, 48], [49, 50], [53, 54], [59, 60]]"
54516,Ambac plunged 23.82 percent to 2.59 and MBIA dropped 21.22 percent to 8.24 after reporting third-quarter losses and steep write-downs on soured assets .,"[mbia, losses, writedowns]","[[7, 8], [16, 17], [19, 20]]"
79440,"Iraqi Shiite leader Abdel Aziz al-Hakim has died in a Tehran hospital after a long battle with lung cancer , his son Mohsen Hakim told AFP .","[alhakim, tehran, lung, afp]","[[5, 6], [10, 11], [17, 18], [25, 26]]"
94508,"He said his wife went to see Piedra for teeth whitening , but ended up with a recommendation for extensive work and an unauthorized charge of $ 3,218 on his credit card .","[piedra, recommendation, credit card]","[[7, 8], [17, 18], [30, 32]]"


In [10]:
from bootleg.utils.parser.parser_utils import parse_boot_and_emm_args
from bootleg.utils.utils import load_yaml_file, dump_yaml_file
from bootleg.run import run_model

# Start from the run config stored next to the model checkpoint and adapt it
# for running over the mention-extracted TACRED file.
config_in_path = root_boot_model / 'run_config.yaml'
config_args = load_yaml_file(config_in_path)

# Run settings: this is a small file, so only a few data threads are needed.
run_cfg = config_args["run_config"]
run_cfg["dataset_threads"] = 2
run_cfg["log_level"] = "info"

# Emmental settings: model checkpoint to load and directory for run logs.
emmental_cfg = config_args["emmental"]
emmental_cfg["model_path"] = str(root_boot_model / 'last_model.pth')
emmental_cfg["log_path"] = "bootleg_results"
# Put on CPU
# emmental_cfg["device"] = -1

data_cfg = config_args["data_config"]
# Entity db directory and the file name of the candidate map within it.
data_cfg["entity_dir"] = str(wiki_train_dir / 'entity_db')
data_cfg["alias_cand_map"] = cand_map.name
# Data directory and the mention-extracted file to run the model on.
data_cfg["data_dir"] = str(tacred_dir)
data_cfg["test_dataset"]["file"] = str(outfile)
# Embedding directory and the cache for the pretrained word-embedding model.
data_cfg["emb_dir"] = str(emb_dir)
data_cfg["word_embedding"]["cache_dir"] = str(emb_dir / 'pretrained_bert_models')

# Save the edited config so the same run can be launched from the command line:
# python3 -m bootleg.run --mode dump_embs --config_script /dfs/scratch0/lorr1/projects/bootleg/notebooks/tacred/tacred_eval_config.yaml
config_out_path = "tacred_eval_config.yaml"
dump_yaml_file(config_out_path, config_args)

config_args = parse_boot_and_emm_args(config_args) # or you can pass in the config_out_path

# Uncomment to run the model and dump labels and embeddings from this notebook:
# bootleg_label_file, bootleg_emb_file = run_model(mode="dump_embs", config=config_args)

In [16]:
# Hard-coded locations of the dumped Bootleg embeddings and labels.
# NOTE(review): these look like the outputs of an earlier `run_model(mode="dump_embs", ...)`
# run (the call is commented out in the config cell) — update them to match your own run.
bootleg_emb_file = "/dfs/scratch1/lorr1/projects/bootleg/bootleg_results/2021_02_03/18_34_36/c4128349/all_tacred_bootinput_ent/last_model/bootleg_embs.npy"
bootleg_label_file = "/dfs/scratch1/lorr1/projects/bootleg/bootleg_results/2021_02_03/18_34_36/c4128349/all_tacred_bootinput_ent/last_model/bootleg_labels.jsonl"

In [18]:
import add_bootleg_feature

# Threshold passed to the feature script; its printed summary reports how many
# items were removed at this threshold.
threshold = 0.5

# Write train/dev/test files augmented with Bootleg features into `tacred_dir`.
add_bootleg_feature.main(
    os.path.dirname(bootleg_label_file),  # directory holding the Bootleg label file
    str(tacred_dir),                      # TACRED data directory
    threshold,
)

Index(['id', 'sentence', 'aliases', 'spans', 'gold', 'cand_probs', 'qids',
       'sent_idx_unq', 'probs', 'ctx_emb_ids', 'entity_ids'],
      dtype='object')
(106264, 11)
TRAIN SHAPE:  (68124, 14)
DEV SHAPE:  (22631, 14)
TEST SHAPE (15509, 14)
Removed 50156 out of 106264 with threshold 0.5
Saved datasets with Bootleg features to train_ent.json, dev_ent.json, test_ent.json in /dfs/scratch0/lorr1/projects/bootleg-data/downstream/tacred directory


In [21]:
import prepare_entity_vocab_bootleg

# Build the entity vocabulary from the dumped Bootleg embeddings; the script
# reports saving `ent_vocab.pkl` in the TACRED directory.
prepare_entity_vocab_bootleg.main(
    os.path.dirname(bootleg_emb_file),  # directory holding the Bootleg embedding file
    str(tacred_dir),                    # TACRED data directory
)

Saved ent at /dfs/scratch0/lorr1/projects/bootleg-data/downstream/tacred/ent_vocab.pkl
