In [1]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so edits to local packages are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

In [4]:
import os, sys
from tutorials.utils import load_train_data, score_predictions
import ujson as json
import jsonlines
from tqdm import tqdm
from itertools import chain, islice
import random
import numpy as np
from fuzzywuzzy import fuzz, process
from pathlib import Path
from collections import defaultdict
from bootleg.symbols.entity_symbols import EntitySymbols
import shutil
# `display` and `HTML` are taken from the public `IPython.display` module;
# importing them from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
# Inject a CSS rule that sets elements with class `container` to 90% width,
# so wide DataFrame outputs are easier to read.
display(HTML("<style>.container { width:90% !important; }</style>"))

In [9]:
# Locations of the train / test / dev splits, all relative to `data_dir`.
data_dir = Path("/dfs/scratch0/lorr1/projects/bootleg-data/data/medmentions_0203")
train_file, test_file, dev_file = (
    data_dir / rel_path
    for rel_path in (
        "spacy_10_noNC/train.jsonl",
        "spacy_10_noNC/test.jsonl",
        "spacy_10_noNC/dev.jsonl",
    )
)

# Load the entity database and pull out the two lookup tables used by every
# later cell: alias -> candidate entries, and qid -> title.
print("Loading entity symbols")
entity_map_dir = os.path.join(data_dir, "spacy_10_noNC/entity_db/entity_mappings")
es = EntitySymbols(load_dir=entity_map_dir)
print("Reading a2q")
a2q = es.get_alias2qids()
q2title = es.get_qid2title()

Loading entity symbols
Reading a2q


In [7]:
# Count how many times each qid appears in the "qids" lists of the training
# file; later cells look these counts up with `.get(qid, 0)`.
qid2cnt = defaultdict(int)
with jsonlines.open(train_file) as reader:
    for gold_qid in chain.from_iterable(record["qids"] for record in reader):
        qid2cnt[gold_qid] += 1

In [10]:
# Both splits are loaded with the same options: candidates come from `a2q`
# and no type / KG symbols are attached.
load_kwargs = dict(cands_map=a2q, type_symbols=None, kg_symbols=None)
train_df = load_train_data(train_file, q2title, **load_kwargs)
dev_df = load_train_data(dev_file, q2title, **load_kwargs)

100%|██████████| 27008/27008 [00:01<00:00, 17119.79it/s]
100%|██████████| 8949/8949 [00:02<00:00, 3258.28it/s] 


In [6]:
# Prediction files written by the base model run.
base_run_dir = Path("/dfs/scratch1/lorr1/projects/bootleg/logs_medmentions/base/2021_02_01/23_38_13/28ec0c96")
# The "dev" predictions are not scored in this cell. This path used to be
# assigned to `dev_pred_file` and then immediately overwritten by the
# "dev_titlecue" path, so it now has its own name to make the choice explicit.
dev_full_pred_file = base_run_dir / "dev/last_model/bootleg_labels.jsonl"
dev_pred_file = base_run_dir / "dev_titlecue/last_model/bootleg_labels.jsonl"

# NOTE(review): predictions come from the "dev_titlecue" run while `orig_file`
# is the full dev file -- confirm this pairing is what score_predictions expects.
dev_preds_df = score_predictions(
    orig_file=dev_file,
    pred_file=dev_pred_file,
    title_map=q2title,
    cands_map=a2q,
    type_symbols=[],
    kg_symbols=[],
)

100%|██████████| 6949/6949 [00:00<00:00, 7975.69it/s]


In [7]:
# Number of scored rows, and number of rows whose prediction equals the gold qid.
n_mentions = dev_preds_df.shape[0]
n_correct = dev_preds_df[dev_preds_df["pred_qid"] == dev_preds_df["gold_qid"]].shape[0]
print(n_mentions, n_correct)

15045 7102


In [8]:
# Split each candidate entry into parallel columns (element 0 -> name,
# element 1 -> probability). The guard keeps the cell re-runnable once the
# raw "cands" column has been dropped.
if "cands" in dev_preds_df:
    dev_preds_df["num_cands"] = dev_preds_df["cands"].apply(len)
    dev_preds_df["cand_names"] = dev_preds_df["cands"].apply(lambda cands: [c[0] for c in cands])
    dev_preds_df["cand_probs"] = dev_preds_df["cands"].apply(lambda cands: [c[1] for c in cands])
    del dev_preds_df["cands"]
# Spans become tuples (hashable, unlike lists).
dev_preds_df["span"] = dev_preds_df["span"].apply(tuple)
# Whether the gold title is among the candidate names at all.
dev_preds_df["in_cand"] = dev_preds_df.apply(lambda row: row["gold_title"] in row["cand_names"], axis=1)
# Training-set frequency of the gold and predicted qids (0 when unseen).
dev_preds_df["qid_cnt"] = dev_preds_df["gold_qid"].apply(lambda qid: qid2cnt.get(qid, 0))
dev_preds_df["pred_qid_cnt"] = dev_preds_df["pred_qid"].apply(lambda qid: qid2cnt.get(qid, 0))
# Drop rows whose pred_qid is -1. `.copy()` makes the result an independent
# frame, so assigning columns to it later (including re-running this cell)
# does not raise SettingWithCopyWarning on a filtered slice.
dev_preds_df = dev_preds_df[dev_preds_df["pred_qid"] != -1].copy()

In [9]:
# Same two counts as before, now on the frame without pred_qid == -1 rows.
n_mentions = dev_preds_df.shape[0]
n_correct = dev_preds_df[dev_preds_df["pred_qid"] == dev_preds_df["gold_qid"]].shape[0]
print(n_mentions, n_correct)

15045 7102


In [11]:
def compute_fuzz_score(df):
    """Report how often fuzzy string matching alone picks the gold title.

    For every row, the mention text is rebuilt from the whitespace tokens of
    ``row["sentence"]`` selected by the ``[start, end)`` pair in
    ``row["span"]``, and matched against ``row["cand_names"]`` with
    ``process.extractOne``. A row counts as correct when the best-matching
    candidate equals ``row["gold_title"]``. Rows with an empty candidate list
    are skipped and excluded from the accuracy denominator.

    Prints four values: correct rows, rows without candidates, total rows and
    accuracy. Accuracy is ``nan`` when no row had any candidates (this used
    to raise ``ZeroDivisionError``).

    Args:
        df: DataFrame with ``cand_names``, ``span``, ``sentence`` and
            ``gold_title`` columns.
    """
    n_correct = 0
    n_no_cands = 0
    n_rows = df.shape[0]
    for _, row in tqdm(df.iterrows(), total=n_rows):
        cand_names = row["cand_names"]
        if len(cand_names) == 0:
            n_no_cands += 1
            continue
        sp_l, sp_r = row["span"]
        mention = " ".join(row["sentence"].split()[sp_l:sp_r])
        best = process.extractOne(mention, cand_names)
        # Guard against a missing match before indexing into the result.
        if best is not None and best[0] == row["gold_title"]:
            n_correct += 1

    n_scored = n_rows - n_no_cands
    accuracy = n_correct / n_scored if n_scored else float("nan")
    print(n_correct, n_no_cands, n_rows, accuracy)

In [None]:
# Fuzzy-match baseline on the full dev set.
print("DEV")
compute_fuzz_score(dev_df)

# Same baseline restricted to the mentions the model got wrong.
print("DEV WRONG")
is_wrong = dev_preds_df["pred_qid"] != dev_preds_df["gold_qid"]
dev_wrong_df = dev_preds_df[is_wrong]
compute_fuzz_score(dev_wrong_df)

# And on the training set.
print("TRAIN")
compute_fuzz_score(train_df)

In [11]:
def subsample_data(orig, new):
    """Write a copy of a jsonl file keeping only fuzzy-matchable mentions.

    Each input line is a JSON object with parallel ``aliases``, ``spans`` and
    ``qids`` lists plus ``sentence``, ``sent_idx_unq`` and ``doc_id``. A
    mention is kept when the candidate title that ``process.extractOne``
    ranks best for the mention text (the whitespace tokens of the sentence
    selected by the span) equals the title of the gold qid. Candidates are
    looked up in the notebook-level ``a2q`` map, titles in ``q2title``.
    Sentences with no kept mention are not written. Every kept mention gets
    ``gold`` set to ``True``.

    Changes from the previous version:
    * the handle used to count lines for the progress bar is now closed;
    * "did we keep anything" is decided from the kept aliases instead of the
      ``doc_id != -1`` sentinel, which dropped sentences whose real
      ``doc_id`` is -1;
    * the summary message reads "Out of" instead of "Our of".

    Args:
        orig: Path of the jsonl file to read.
        new: Path of the jsonl file to write (overwritten).
    """
    # Count lines up front so tqdm can show a total.
    with open(orig) as count_f:
        n_lines = sum(1 for _ in count_f)

    n_seen = 0
    n_kept = 0
    with open(orig) as in_f, open(new, "w") as out_f:
        for raw_line in tqdm(in_f, total=n_lines):
            line = json.loads(raw_line)
            kept_aliases, kept_qids, kept_spans = [], [], []
            for al, sp, gld in zip(line["aliases"], line["spans"], line["qids"]):
                n_seen += 1
                # Element 0 of each candidate entry is the qid.
                cand_names = [q2title[p[0]] for p in a2q[al]]
                if len(cand_names) == 0:
                    continue
                sp_l, sp_r = sp
                mention = " ".join(line["sentence"].split()[sp_l:sp_r])
                best = process.extractOne(mention, cand_names)
                # Guard against a missing match before indexing into the result.
                if best is not None and best[0] == q2title[gld]:
                    n_kept += 1
                    kept_aliases.append(al)
                    kept_spans.append(sp)
                    kept_qids.append(gld)
            if kept_aliases:
                # Key order matches the layout written previously.
                new_line = {
                    "aliases": kept_aliases,
                    "qids": kept_qids,
                    "spans": kept_spans,
                    "gold": [True] * len(kept_aliases),
                    "sentence": line["sentence"],
                    "sent_idx_unq": line["sent_idx_unq"],
                    "doc_id": line["doc_id"],
                }
                out_f.write(json.dumps(new_line) + "\n")

    print(f"Kept: {n_kept} Out of: {n_seen}")

In [12]:
# Output locations for the filtered ("titlecue") copies of each split.
new_train = data_dir / "spacy_10_noNC/train_titlecue.jsonl"
new_dev = data_dir / "spacy_10_noNC/dev_titlecue.jsonl"

# Dev first, then train -- same order as the progress output expects.
for src_file, dst_file in ((dev_file, new_dev), (train_file, new_train)):
    subsample_data(src_file, dst_file)

 33%|███▎      | 8949/26993 [00:21<00:42, 425.45it/s]
  0%|          | 45/26993 [00:00<01:02, 428.85it/s]

Kept: 17348 Our of: 40817


27008it [01:00, 449.34it/s]                           

Kept: 50707 Our of: 122002





In [12]:
# Score the predictions written by the "titlecue" run at checkpoint 9.0.
dev_pred_file_ft = Path("/dfs/scratch1/lorr1/projects/bootleg/logs_medmentions/titlecue/2021_02_02/21_44_32/de3aeb1d/dev_titlecue/checkpoint_9.0/bootleg_labels.jsonl")
dev_preds_ft_df = score_predictions(
    orig_file=dev_file,
    pred_file=dev_pred_file_ft,
    title_map=q2title,
    cands_map=a2q,
    type_symbols=[],
    kg_symbols=[],
)

# Split each candidate entry into parallel columns (element 0 -> name,
# element 1 -> probability), then drop the raw column.
if "cands" in dev_preds_ft_df:
    dev_preds_ft_df["num_cands"] = dev_preds_ft_df["cands"].apply(len)
    dev_preds_ft_df["cand_names"] = dev_preds_ft_df["cands"].apply(lambda cands: [c[0] for c in cands])
    dev_preds_ft_df["cand_probs"] = dev_preds_ft_df["cands"].apply(lambda cands: [c[1] for c in cands])
    del dev_preds_ft_df["cands"]
# Spans become tuples (hashable, unlike lists).
dev_preds_ft_df["span"] = dev_preds_ft_df["span"].apply(tuple)
# Whether the gold title is among the candidate names at all.
dev_preds_ft_df["in_cand"] = dev_preds_ft_df.apply(lambda row: row["gold_title"] in row["cand_names"], axis=1)
# Training-set frequency of the gold and predicted qids (0 when unseen).
dev_preds_ft_df["qid_cnt"] = dev_preds_ft_df["gold_qid"].apply(lambda qid: qid2cnt.get(qid, 0))
dev_preds_ft_df["pred_qid_cnt"] = dev_preds_ft_df["pred_qid"].apply(lambda qid: qid2cnt.get(qid, 0))
# Drop rows whose pred_qid is -1. `.copy()` makes the result an independent
# frame, so later column assignments do not raise SettingWithCopyWarning on a
# filtered slice.
dev_preds_ft_df = dev_preds_ft_df[dev_preds_ft_df["pred_qid"] != -1].copy()

100%|██████████| 6949/6949 [00:00<00:00, 7886.21it/s]


In [18]:
# Row count and number of correct predictions: base model first, then the
# fine-tuned model.
for preds in (dev_preds_df, dev_preds_ft_df):
    n_correct = preds[preds["pred_qid"] == preds["gold_qid"]].shape[0]
    print(preds.shape[0], n_correct)
print(dev_preds_ft_df.columns)

# Columns shown when inspecting individual predictions.
cols = [
    "sent_idx",
    "sentence",
    "in_cand",
    "qid_cnt",
    "pred_qid_cnt",
    "span",
    "gold_qid",
    "gold_title",
    "pred_qid",
    "pred_title",
    "all_spans",
    "cand_names",
    "cand_probs",
]

15045 7102
15045 12371
Index(['sentence', 'sent_idx', 'aliases', 'span', 'slices', 'alias',
       'alias_idx', 'is_gold_label', 'gold_qid', 'pred_qid', 'gold_title',
       'pred_title', 'all_gold_qids', 'all_pred_qids', 'gold_label_aliases',
       'all_is_gold_labels', 'all_spans', 'num_cands', 'cand_names',
       'cand_probs', 'in_cand', 'qid_cnt', 'pred_qid_cnt'],
      dtype='object')


In [21]:
# Show up to 10 correctly predicted mentions. `random_state` pins the draw so
# the displayed rows are reproducible on re-run, and the sample size is
# capped so the cell does not raise when fewer than 10 rows are correct.
correct_preds = dev_preds_df.loc[dev_preds_df["pred_qid"] == dev_preds_df["gold_qid"], cols]
correct_preds.sample(min(10, len(correct_preds)), random_state=0)

Unnamed: 0,sent_idx,sentence,in_cand,qid_cnt,pred_qid_cnt,span,gold_qid,gold_title,pred_qid,pred_title,all_spans,cand_names,cand_probs
14050,42414,"Hypertension is a prevalent and costly chronic condition in the U.S. and worldwide , and alcohol use is a modifiable hypertension risk factor .",True,157,157,"(21, 23)",C0035648,Risk factor (observable entity),C0035648,Risk factor (observable entity),"[[0, 1], [6, 8], [10, 11], [20, 21], [21, 23]]","[Risk factor (observable entity), Risk assessment (procedure), Disease Predictive Factor, management procedures risk, RISKSOC, trigger, Risk factor;cardiovascular, Lipid risk factors, rndx suicide risk, psychological risk factors, High risk of (qualifier value), Risk Factor Assessment, Biologic agent, Prognostic/Survival Factor, Cardiovascular event risk (finding), Disease Susceptibility [Disease/Finding], Health Social Determinant, High risk factors, rndx aspiration risk, Intermediate risk,...","[0.9956347346, 0.0034684855, 0.000121272, 8.756e-07, 1.43699e-05, 5.1e-08, 1.51812e-05, 5.789e-07, 2.7885e-06, 6.2792e-06, 0.0001006576, 8.0376e-06, 3.265e-07, 0.0005208465, 5.141e-07, 4.397e-07, 1.0984e-06, 3.00663e-05, 1.982e-07, 8.1053e-06, 2.127e-07, 3.005e-07, 3.2301e-06, 1.2819e-05, 9.1972e-06, 2.25587e-05, 1.67112e-05]"
6419,38831,"Although we focus here on the detection of positive selection from multiple population data , the local score approach is general and can be applied to other genome scans for selection or other genomewide analyses such as GWAS .",True,20,20,"(37, 38)",C2350277,WGA study,C2350277,WGA study,"[[9, 10], [30, 31], [37, 38]]","[WGA study, RNA SEQ DATABASES, research, WGS, global genomic nucleotide-excision repair, genome sequence analysis (lab test), GENET SEQ DATABASES, Sequencing, High-Throughput RNA, Editing, Genome, Human Genome Sequence Index, PROTEIN SEQ DATABASES, Total RNA Sequencing, SEQ DETERMINATIONS RNA, GENET DATA BASES, Gene Sequencing, genome mapping, Large-Scale Sequencing, genome sequencing, Genomics Research, SEQ DATA MOL, HUGO, Study, Genetic Association, Intellectual Product, genome database, R...","[0.583927393, 0.0014656227, 0.0140366936, 0.1867334694, 0.0017058838, 0.0062172795, 0.0032819642, 0.0025903168, 0.0027275078, 0.0017878308, 0.0031466258, 0.0021873149, 0.0008279632, 0.0024420489, 0.0065079276, 0.0025940447, 0.0033537529, 0.0025265771, 0.0021622237, 0.0097124791, 0.0051232204, 0.0062213833, 0.0288446359, 0.0030783813, 0.0011065813, 0.0058318549, 0.0078671239, 0.0026091295, 0.0877004415, 0.0116822729]"
7821,19809,"Here , we report that SL4 is able to inhibit the proliferation of different types of breast cancer cell in vitro and in vivo by inducing G2/M cell cycle arrest .",True,79,79,"(11, 12)",C0596290,Cell Proliferation Process,C0596290,Cell Proliferation Process,"[[3, 4], [11, 12], [22, 24]]","[Cell Proliferation Process, cellular growth, Replicative Senescence, Size Growth, Cell, Cancer Cell Growth, chondrocyte cell proliferation, Growth, cell viability, Lymphocyte proliferation, Cell Division Process, Cancer Induction, Neoplasms [Disease/Finding], fibroblast proliferation, (Neoplasms) or (cancers) (disorder), Cell Transformation, Neoplastic [Disease/Finding], DIFFER CELL, Precancerous Cells, microglial cell proliferation, 76-77 PROLIFERATIONS, movement of a cell, cellular quiesc...","[0.8633034229, 0.0021705497, 4.22184e-05, 1.49012e-05, 0.0079335384, 4.20618e-05, 1.84874e-05, 0.0017639453, 4.5908e-05, 6.8e-05, 0.0033850214, 2.7083e-06, 1.01164e-05, 0.0006899759, 2.40165e-05, 0.0064120954, 1.67286e-05, 9.67916e-05, 0.0001102814, 1.96556e-05, 4.781e-06, 0.0026278798, 0.1088375822, 3.91722e-05, 9.95814e-05, 0.0019197067, 0.0002409555, 2.08371e-05, 2.34108e-05, 1.56345e-05]"
12360,27854,"The phylogeny of the Phasianidae ( pheasants , partridges , and allies ) has been studied extensively .",True,0,0,"(4, 5)",C0325635,Family phasianidae (organism),C0325635,Family phasianidae (organism),"[[4, 5]]","[Subfamily Meleagridinae (organism), Family phasianidae (organism), Family Psychodidae (organism), Phasianellidae, Family Apidae (organism), cockle, Family fasciolidae (organism), Drosophilidae, Phasmatidae, Family Fagaceae (organism), Bird (organism), Flabellidae, Family Camelidae, pompanos, Nuttalliellidae, Cricket (organism), Macropsyllidae, Family scopidae (organism), Family helicidae (organism), Subfamily Phasianinae (organism), Cossidae, crawling seals, Beetle (organism), Family Cerato...","[0.0746359602, 0.2339954674, 0.016072534, 0.0522450842, 0.0266559962, 0.0008992016, 0.009943421, 0.0310628954, 0.0796381533, 0.0037184386, 0.0216003191, 0.009292556, 0.0443421341, 0.0130970199, 0.0099032298, 0.0042195749, 0.047467865, 0.047710672, 0.0076917084, 0.0679474473, 0.0372577235, 0.0045364117, 0.0015941198, 0.0081791338, 0.0180011839, 0.0765092224, 0.0145792477, 0.0247758832, 0.002785614, 0.0096416771]"
7015,14623,"While UPS pathways in plants are certainly not yet exhaustively researched , an emerging notion is that induction of UPS pathways is correlated with pathogenesis and stress responses .",True,68,68,"(24, 25)",C0699748,Pathogeneses (qualifier value),C0699748,Pathogeneses (qualifier value),"[[24, 25]]","[Pathogeneses (qualifier value), Phenomenon, Cell Physiological, Biological function (qualifier value), Organismal Process, cellular growth, homeostatic, obsolete senescence, Cell Division Process, biological regulation, Nerve Degeneration [Disease/Finding], Responses, Plant Immune, Hormeses, cell viability, growth plant, Cancer Induction, heat generation, Biosynthetic, Adaptation (function), Host Defense Mechanism, Inflammation Process, response, Adaption, organic evolution, molecular funct...","[0.7317121029, 0.005195566, 0.0812990293, 0.0003993542, 0.0010534395, 0.0004561713, 3.60625e-05, 1.31006e-05, 0.0112475483, 3.5014e-05, 0.0003875243, 0.001447318, 0.0071535646, 0.0015100059, 0.0119849704, 0.0002388787, 0.0021963543, 0.0612392388, 0.0135968598, 0.0037065973, 0.0028056111, 0.0005883113, 0.0349430181, 0.0178439002, 0.0007553959, 0.0009803183, 0.0008553963, 0.0009085981, 5.2684e-06, 0.0054055182]"
347,10786,The rapid development of programmable nuclease -based genome editing technologies has enabled targeted gene disruption and correction both in vitro and in vivo This revolution opens up the possibility of precise genome editing at target genomic sites to modulate gene function in animals and plants .,True,169,169,"(21, 23)",C1515655,In Vivo,C1515655,In Vivo,"[[7, 9], [13, 14], [21, 23], [31, 33], [39, 41]]","[In Vivo, In situ (qualifier value), External (modifier) (qualifier value), Possible (qualifier value), Intracellular (qualifier value), Places (environment), Right and left (qualifier value), Unlimited, Countries (geographic location), Present (qualifier value), Left (qualifier value), obsolete uniport, Site (qualifier value), in vivo test, Union, Proximal (qualifier value), far, SITE, INFUSION, Local (modifier) (qualifier value), Invention, dextros, orthotopic, Juxta-posed (qualifier value...","[0.9919037223, 0.0001091692, 2.0448e-06, 1.182e-07, 0.0008722071, 8.2e-09, 3.343e-06, 2.8e-09, 3.04e-08, 5.51853e-05, 6.529e-07, 3.77e-08, 0.0067462865, 0.0001638336, 2.32e-08, 1.8034e-06, 1.2141e-06, 2.4744e-06, 6.1368e-06, 1.9e-09, 1.064e-07, 1.777e-07, 1.23635e-05, 6.5e-09, 0.0001183262, 1.728e-07, 1.106e-07, 4.003e-07]"
14129,13992,It has multifaceted challenges including pre and post-operative histopathological diagnosis and optimal modality of treatment .,True,339,339,"(9, 10)",C0011900,diagnosis (DX),C0011900,diagnosis (DX),"[[2, 3], [3, 4], [9, 10]]","[diagnostic technique, diagnosis (DX), Diagnostic Study, Detected, Diagnostic radiography (procedure), TESTS DIAG, diagnostic dental procedures (procedure), Clinical Trials, Diagnosis, Histology (situation), Diagnostic assessment (procedure), Aspiration - diagnostic (procedure), anesthetics [specialty], exploratory surgical procedures, Clinical diagnosis (contextual qualifier), DIAG IMAGE, Diagnostic physical therapy procedure (regime/therapy), Molecular diagnosis, Assay technique (qualifier...","[0.0992526859, 0.8262358904, 0.0619286932, 2.14429e-05, 1.00967e-05, 1.7923e-06, 5.1824e-06, 1.77074e-05, 4.15721e-05, 0.0001979803, 4.119e-07, 1.4087e-06, 2.5742e-06, 0.0022436169, 0.0002482856, 1.21512e-05, 1.38552e-05, 6.714e-06, 1.27748e-05, 3.809e-06, 0.0072641, 8.1127e-06, 3.94625e-05, 1.8209e-06, 7.373e-07, 6.5719e-06, 1.21138e-05, 7.319e-07, 0.0024075531]"
6171,1132,"Most common complications were pneumonia ( 12 % ) , UTI ( 9 % ) , and wound infection ( 7 % ) .",True,19,19,"(4, 5)",C0032285,Pneumonia [Disease/Finding],C0032285,Pneumonia [Disease/Finding],"[[2, 3], [4, 5], [17, 19]]","[Pneumonia [Disease/Finding], Pneumonia, Aspiration [Disease/Finding], flus, Interstitial pneumonia (disorder), Asthma [Disease/Finding], Inflammation of the bronchioles, Pneumonia, Bacterial [Disease/Finding], Diphtheria [Disease/Finding], rndx hyperthermia, Bronchitis: [unspecified (& chest infection)] or [recurrent wheezy] (disorder), Tuberculosis (A15-A19), Haemophilus influenzae (organism), Tetanus [Disease/Finding], H1N1 Flu (Swine Flu), Rabies [Disease/Finding], Trichomonas Vaginitis ...","[0.3845651448, 0.0100375302, 0.0033778062, 0.0057115485, 0.0435556173, 0.0015481486, 0.0330408588, 0.0009850387, 0.0035451469, 0.0053827083, 0.0061343969, 0.0005698901, 0.0013258026, 0.0060026408, 0.0056630005, 0.0018701184, 0.0069308607, 0.0091250185, 0.035779763, 0.090170145, 0.2405107468, 0.0123125678, 0.0015676591, 0.0044830889, 0.0015606153, 0.0564073175, 0.0013988719, 0.0248376764, 0.0007023888, 0.0008978558]"
11008,37360,Ventricular pacing site separation by cardiac computed tomography : validation for the prediction of clinical response to cardiac resynchronization therapy \n,True,13,13,"(14, 16)",C4055223,CLINRESP,C4055223,CLINRESP,"[[0, 2], [5, 6], [9, 10], [14, 16], [17, 20]]","[CLINRESP, successful treatment]","[0.867752254, 0.132247746]"
8142,43393,NLP -based pneumonia information extraction of pediatric diagnostic imaging reports performed better than domain experts in this pilot study .,True,31,31,"(17, 19)",C0031928,pilot study,C0031928,pilot study,"[[2, 3], [17, 19]]","[pilot study, demonstration programs, CROSS SECTIONAL ANAL, prospective longitudinal study, research, Screening Trial, Clinical Studies, experimental studies, Observational Trial, Experiment Design, study, feasibility study, Study, Clinical Trial as Topic, Blinded Clinical Trial, test validation, STATIST MODEL, ECOLOGIC OR COMMUNITY, Prospective Study, prospective, OPEN LABEL, OBSMODEL, Preclinical Models, SAMPLING STUDIES, INTMODEL, RETROSPECTIVE, Epidemiologic Trial, CONTROLLED CLIN TRIALS...","[0.5386298895, 0.00177816, 0.0009545347, 0.1068802699, 0.0024110547, 0.0065684123, 0.0108929649, 0.022775542, 0.0025842458, 0.0167698301, 0.0069796336, 0.0422489122, 0.0069796336, 0.0210474283, 0.0030183748, 0.0020100079, 0.0006917809, 0.0015169021, 0.1144548208, 0.0428132229, 0.0018555331, 0.0006530134, 0.00064764, 0.0261748433, 0.0006700994, 0.0099118603, 0.0013201906, 0.0045359465, 0.001533453, 0.0006917412]"
