In [7]:
import re
import nltk
import spacy
import numpy as np
import pandas as pd
import networkx as nx

# Show every column when a DataFrame is displayed. Uses the canonical option
# key; the previous "display.max.columns" only worked because pandas treats the
# key as a regular expression in which "." happens to match "_".
pd.set_option("display.max_columns", None)

In [2]:
# Sample sentence used as input in this notebook. It is kept verbatim
# (including the original line breaks and spelling) and assembled from one
# string per line so the embedded newlines are explicit.
text = "\n".join(
    [
        "He also may have",
        "recurrent seizures which should be treated with ativan IV or IM",
        "and do not neccessarily indicate patient needs to return to",
        "hospital unless they continue for greater than 5 minutes or he",
        "has multiple recurrent seizures or complications such as",
        "aspiration.",
    ]
)
text

'He also may have\nrecurrent seizures which should be treated with ativan IV or IM\nand do not neccessarily indicate patient needs to return to\nhospital unless they continue for greater than 5 minutes or he\nhas multiple recurrent seizures or complications such as\naspiration.'

In [52]:
def extract_token_feats(text, window=2):
    """Build a per-token feature table with neighbouring-token context.

    The text is tokenized and POS-tagged with NLTK. For every token the table
    holds its POS tag plus five orthographic flags, all stored with the pandas
    'string' dtype. The same features of the surrounding tokens are appended
    as extra columns named ``"<offset>_<feature>"``, where a negative offset
    refers to a preceding token and a positive offset to a following one.

    Args:
        text: Raw text to tokenize.
        window: Number of neighbouring tokens to include on each side.

    Returns:
        A DataFrame indexed by token, with the token's own features first and
        the context features after them, ordered from offset ``-window`` to
        ``+window``. Context cells that fall outside the text are missing.
    """
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    base = pd.DataFrame(tagged, columns=['token', 'pos'])
    tokens = base['token']
    base['pos'] = base['pos'].astype('string')

    # Orthographic flags, in the column order of the resulting table.
    flag_columns = (
        ('istitle', tokens.str.istitle()),
        ('isupper', tokens.str.isupper()),
        ('isalpha', tokens.str.isalpha()),
        ('isnumeric', tokens.str.isnumeric()),
        (
            'containsnumbers',
            tokens.apply(lambda tok: re.search(r'\d', tok) is not None),
        ),
    )
    for flag_name, flag_values in flag_columns:
        base[flag_name] = flag_values.astype('string')

    base = base.set_index('token')
    feature_names = list(base.columns)

    def _neighbour_feats(offset):
        # Shifting by -offset lines row i up with the token at i + offset.
        neighbour = base.shift(-offset)
        neighbour.columns = [f"{offset}_{name}" for name in feature_names]
        return neighbour

    frames = [base]
    frames.extend(
        _neighbour_feats(offset)
        for offset in range(-window, window + 1)
        if offset != 0
    )
    return pd.concat(frames, axis=1)


def get_dependency_graph(token_lst, spacy_pipe, by_index=False):
    """Build an undirected graph of head-child dependency arcs.

    The tokens are joined with single spaces and parsed by ``spacy_pipe``;
    every (head, child) pair reachable through ``token.children`` becomes an
    edge of the graph.

    Args:
        token_lst: Sequence of token strings.
        spacy_pipe: Callable that parses a string into a spaCy doc whose
            tokens expose ``text``, ``i`` and ``children``.
        by_index: When False (default) nodes are token texts, so every
            occurrence of a repeated word shares a single node. When True
            nodes are token positions in the parsed doc, each carrying a
            ``"text"`` attribute, and every token is added as a node even if
            it takes part in no arc.

    Returns:
        A ``networkx.Graph`` with one edge per head-child arc.
    """
    doc = spacy_pipe(" ".join(token_lst))
    if by_index:
        graph = nx.Graph()
        graph.add_nodes_from((tok.i, {"text": tok.text}) for tok in doc)
        graph.add_edges_from(
            (head.i, child.i) for head in doc for child in head.children
        )
        return graph
    return nx.from_edgelist(
        [(head.text, child.text) for head in doc for child in head.children]
    )


def get_sdp(text, source, target, spacy_pipe):
    """Return the shortest dependency path between two words of ``text``.

    The text is tokenized with NLTK and cut right after the later of the two
    words (first occurrences); only that prefix is parsed. The path is searched
    on a graph keyed by token position rather than token text, so repeated
    words elsewhere in the prefix cannot create shortcuts that do not exist in
    the parse.

    Args:
        text: Raw text containing both words.
        source: Token text where the path starts (first occurrence is used).
        target: Token text where the path ends (first occurrence is used).
        spacy_pipe: Callable spaCy pipeline used for dependency parsing.

    Returns:
        The token texts along the path, joined by single spaces.

    Raises:
        ValueError: If ``source`` or ``target`` is not an NLTK token of
            ``text``.
        networkx.NodeNotFound: If either word is not a token of the parse.
        networkx.NetworkXNoPath: If the parse does not connect the two tokens.
    """
    token_lst = nltk.word_tokenize(text)
    s_idx = token_lst.index(source)
    t_idx = token_lst.index(target)
    selected_tokens = token_lst[: max(s_idx, t_idx) + 1]
    g = get_dependency_graph(selected_tokens, spacy_pipe, by_index=True)

    # Map each token text to the position of its first occurrence in the parse.
    first_position = {}
    for position in sorted(g.nodes):
        first_position.setdefault(g.nodes[position]["text"], position)

    for word in (source, target):
        if word not in first_position:
            # Same exception type shortest_path raises for an unknown node.
            raise nx.NodeNotFound(
                f"Token {word!r} is not a token of the dependency parse"
            )

    path = nx.shortest_path(
        g, source=first_position[source], target=first_position[target]
    )
    return " ".join(g.nodes[position]["text"] for position in path)

In [50]:
# Load the 'en_core_web_sm' spaCy pipeline once; it is passed to get_sdp as
# the `spacy_pipe` argument.
nlp = spacy.load('en_core_web_sm')

In [54]:
# Shortest dependency path between "recurrent" and "ativan" in the sample text.
get_sdp(text, source='recurrent', target='ativan', spacy_pipe=nlp)

'recurrent seizures treated with ativan'