In [61]:
import re
import nltk
import spacy
import numpy as np
import pandas as pd
import networkx as nx
from tqdm.notebook import tqdm

tqdm.pandas()

pd.set_option("display.max.columns", None)
pd.set_option("display.max.rows", None)

In [None]:
DATA_PATH = "../../data/parsed_data"

In [59]:
df_ner_train = pd.read_parquet(f"{DATA_PATH}/ner_train.parquet")
df_rel_train = pd.read_parquet(f"{DATA_PATH}/rel_train.parquet")

df_ner_test = pd.read_parquet(f"{DATA_PATH}/ner_test.parquet")
df_rel_test = pd.read_parquet(f"{DATA_PATH}/rel_test.parquet")

In [119]:
def get_pos_tags(df):
    df['pos'] = df.groupby(
        ['uid', 'contains_rel', 'sid']
    )['token'].transform(lambda s: [x[1] for x in nltk.pos_tag(s)])
    return df


def extract_group_feats(df, window=2):
    df_feats = df.copy()
    df_feats['pos'] = df_feats['pos'].astype('string')
    df_feats['istitle'] = df_feats['token'].str.istitle().astype('string')
    df_feats['isupper'] = df_feats['token'].str.isupper().astype('string')
    df_feats['isalpha'] = df_feats['token'].str.isalpha().astype('string')
    df_feats['isnumeric'] = df_feats['token'].str.isnumeric().astype('string')
    df_feats['containsnumbers'] = df_feats['token'].apply(
        lambda x: len(re.findall(r'\d', x))>0
        ).astype('string')
    df_feats = df_feats.drop(columns=['uid', 'contains_rel', 'sid'])
    df_feats = df_feats.set_index(['token', 'tag'])
    ini_cols = df_feats.columns
    for s in range(-window, window+1):
        if s != 0:
            cols = [f"{s}_{c}" for c in ini_cols]
            shift_df = df_feats[ini_cols].shift(-s)
            shift_df.columns = cols
            df_feats = pd.concat([df_feats, shift_df], axis=1)
    return df_feats


def extract_feats(df_ner):
    df_ner = get_pos_tags(df_ner)
    return df_ner.groupby(['uid', 'contains_rel', 'sid']).progress_apply(
        extract_group_feats
    )


def get_dependency_graph(token_lst, spacy_pipe):
    doc = spacy_pipe(" ".join(token_lst))
    edges = []
    for token in doc:
        for child in token.children:
            edges.append((token.text, child.text))
    return nx.from_edgelist(edges)


def get_sdp(token_lst, s_idx, t_idx, spacy_pipe):
    source = token_lst[s_idx]
    target = token_lst[t_idx]
    selected_tokens = token_lst[: max(s_idx, t_idx) + 1]
    g = get_dependency_graph(selected_tokens, spacy_pipe)
    try:
        sdp = " ".join(nx.shortest_path(g, source=source, target=target))
    except Exception as e:
        # print(e)
        sdp = " ".join(selected_tokens)
    return sdp

In [63]:
df_ner_train = extract_feats(df_ner_train)
df_ner_train.to_parquet(f"{DATA_PATH}/ner_train_baseline.parquet")

  0%|          | 0/40523 [00:00<?, ?it/s]

In [66]:
df_ner_test = extract_feats(df_ner_test)
df_ner_test.to_parquet(f"{DATA_PATH}/ner_test_baseline.parquet")

  0%|          | 0/26056 [00:00<?, ?it/s]

In [68]:
nlp = spacy.load('en_core_web_sm')

In [120]:
df_rel_train['sdp'] = df_rel_train.progress_apply(
    lambda row: get_sdp(
        row['text'].split('|'),
        int(row['arg1'].split(':')[:-1][0]),
        int(row['arg2'].split(':')[:-1][-1]),
        nlp
    ), axis=1
)

  0%|          | 0/36346 [00:00<?, ?it/s]

Either source 25mg or target Topiramate is not in G
Either source 50mg or target Topiramate is not in G
No path between Ventilator and vanc.
No path between Ventilator and cefepime.
No path between Ventilator and cipro.
No path between Hypertension and lisinopril.
Either source 20mg or target lisinopril is not in G
Either source 1-2 or target Xanax is not in G
No path between Two and bisacodyl.
Either source PO or target bisacodyl is not in G
No path between DAILY and bisacodyl.
Either source Solution or target ipratropium is not in G
No path between One and ipratropium.
No path between neb and ipratropium.
No path between Inhalation and ipratropium.
No path between every and ipratropium.
No path between Nebulization and albuterol.
No path between One and albuterol.
No path between neb and albuterol.
No path between Inhalation and albuterol.
No path between every and albuterol.
Either source Solution or target albuterol is not in G
No path between Nebulization and albuterol.
No path be

In [122]:
df_rel_train.to_parquet(f"{DATA_PATH}/rel_train_baseline.parquet")

In [123]:
df_rel_test['sdp'] = df_rel_test.progress_apply(
    lambda row: get_sdp(
        row['text'].split('|'),
        int(row['arg1'].split(':')[:-1][0]),
        int(row['arg2'].split(':')[:-1][-1]),
        nlp
    ), axis=1
)

  0%|          | 0/23462 [00:00<?, ?it/s]

No path between one and Decadron.
No path between fevers and Dilantin.
Either source q.i.d or target Polysporin is not in G
Either source q.d or target Lipitor is not in G
No path between Acute and chemo.
No path between Acute and carboplatin.
No path between Leukopenia and chemotherapy.
No path between thrombocytopenia and chemotherapy.
Either source 1-2 or target Artificial is not in G
Either source 20-50 or target Propofol is not in G
Either source moderate/heavy or target Propofol is not in G
Either source Oral or target Chlorhexidine is not in G
Either source 100 or target Hydrocortisone is not in G
No path between Q8H and Hydrocortisone.
Either source PUFF or target Albuterol-Ipratropium is not in G
Either source IH or target Albuterol-Ipratropium is not in G
Either source Q6H or target Albuterol-Ipratropium is not in G
Either source 25-100 or target Fentanyl is not in G
No path between IV and Hydrocortisone.
No path between thrombocytopenia and platelets.
Either source 1gram/hou

In [124]:
df_rel_test.to_parquet(f"{DATA_PATH}/rel_test_baseline.parquet")