In [1]:
import re
import nltk
import spacy
import numpy as np
import pandas as pd
import networkx as nx
from tqdm import tqdm

tqdm.pandas()

pd.set_option("display.max.columns", None)
pd.set_option("display.max.rows", None)

In [2]:
DATA_PATH = "../../data/parsed_data"

In [3]:
df_ner_train = pd.read_parquet(f"{DATA_PATH}/ner_train.parquet")
df_rel_train = pd.read_parquet(f"{DATA_PATH}/rel_train.parquet")

df_ner_test = pd.read_parquet(f"{DATA_PATH}/ner_test.parquet")
df_rel_test = pd.read_parquet(f"{DATA_PATH}/rel_test.parquet")

In [4]:
def get_pos_tags(df):
    """
    This function takes a pandas dataframe and adds a new column 'pos' containing the 
    part-of-speech tags for the 'token' column using the nltk.pos_tag() function. The 
    part-of-speech tags are grouped by the 'uid', 'contains_rel', and 'sid' columns.
    
    Args:
    - df: pandas dataframe
    
    Returns:
    - df: pandas dataframe with a new column 'pos' containing part-of-speech tags
    
    """
    df['pos'] = df.groupby(
        ['uid', 'contains_rel', 'sid']
    )['token'].transform(lambda s: [x[1] for x in nltk.pos_tag(s)])
    return df


def extract_group_feats(df, window=2):
    """
    This function takes a pandas dataframe, adds new columns for various features such as 
    whether the token is a title, uppercase, alphabetic, numeric, or contains numbers. It 
    also sets the index to 'token' and 'tag'. The function then creates new columns for 
    each feature shifted by the given window size. 
    
    Args:
    - df: pandas dataframe
    - window: integer, default=2
    
    Returns:
    - df_feats: pandas dataframe with added columns for shifted features
    
    """
    df_feats = df.copy()
    df_feats['pos'] = df_feats['pos'].astype('string')
    df_feats['istitle'] = df_feats['token'].str.istitle().astype('string')
    df_feats['isupper'] = df_feats['token'].str.isupper().astype('string')
    df_feats['isalpha'] = df_feats['token'].str.isalpha().astype('string')
    df_feats['isnumeric'] = df_feats['token'].str.isnumeric().astype('string')
    df_feats['containsnumbers'] = df_feats['token'].apply(
        lambda x: len(re.findall(r'\d', x))>0
        ).astype('string')
    df_feats = df_feats.drop(columns=['uid', 'contains_rel', 'sid'])
    df_feats = df_feats.set_index(['token', 'tag'])
    ini_cols = df_feats.columns
    for s in range(-window, window+1):
        if s != 0:
            cols = [f"{s}_{c}" for c in ini_cols]
            shift_df = df_feats[ini_cols].shift(-s)
            shift_df.columns = cols
            df_feats = pd.concat([df_feats, shift_df], axis=1)
    return df_feats


def extract_feats(df_ner):
    """
    This function takes a pandas dataframe, gets the part-of-speech tags using the 
    'get_pos_tags' function, and applies the 'extract_group_feats' function to group the 
    features for each set of 'uid', 'contains_rel', and 'sid' columns in the dataframe. 
    
    Args:
    - df_ner: pandas dataframe
    
    Returns:
    - df_ner: pandas dataframe with added columns for features grouped by 'uid', 'contains_rel', 
             and 'sid'
    
    """
    df_ner = get_pos_tags(df_ner)
    return df_ner.groupby(['uid', 'contains_rel', 'sid']).progress_apply(
        extract_group_feats
    )


def get_dependency_graph(token_lst, spacy_pipe):
    """
    This function takes a list of tokens and a spacy pipeline object and returns a 
    NetworkX graph representing the dependency parse of the input sentence. It uses 
    spacy_pipe to get the dependencies and then creates an edge list of the dependencies 
    in the sentence. 
    
    Args:
    - token_lst: list of tokens
    - spacy_pipe: spacy pipeline object
    
    Returns:
    - g: NetworkX graph object representing the dependency parse of the input sentence
    
    """
    doc = spacy_pipe(" ".join(token_lst))
    edges = []
    for token in doc:
        for child in token.children:
            edges.append((token.text, child.text))
    return nx.from_edgelist(edges)


def get_sdp(token_lst, s_idx, t_idx, spacy_pipe):
    """
    This function takes a list of tokens, two indices for the source and target tokens, and a 
    spacy pipeline object. It then creates a dependency graph using the 'get_dependency_graph' 
    function and finds the shortest dependency path between the two tokens. If there is no path 
    found, it returns the entire sentence. 
    
    Args:
    - token_lst: list of tokens
    - s_idx: integer, index of source token
    - t_idx: integer, index of target token
    - spacy_pipe: spacy pipeline object
    
    Returns:
    - sdp: string representing the shortest dependency path between the source and target 
           tokens, or the entire sentence if no path was found
    
    """
    source = token_lst[s_idx]
    target = token_lst[t_idx]
    selected_tokens = token_lst[: max(s_idx, t_idx) + 1]
    g = get_dependency_graph(selected_tokens, spacy_pipe)
    try:
        sdp = " ".join(nx.shortest_path(g, source=source, target=target))
    except Exception as e:
        # print(e)
        sdp = " ".join(selected_tokens)
    return sdp

In [5]:
df_ner_train = extract_feats(df_ner_train)
df_ner_train.to_parquet(f"{DATA_PATH}/ner_train_baseline.parquet")

100%|██████████| 39339/39339 [13:01<00:00, 50.31it/s] 


In [6]:
df_ner_test = extract_feats(df_ner_test)
df_ner_test.to_parquet(f"{DATA_PATH}/ner_test_baseline.parquet")

100%|██████████| 25401/25401 [08:25<00:00, 50.22it/s]


In [5]:
nlp = spacy.load('en_core_web_sm')

In [10]:
df_rel_train['sdp'] = df_rel_train.progress_apply(
    lambda row: get_sdp(
        row['text'].split('|'),
        int(row['arg1'].split(':')[:-1][0]),
        int(row['arg2'].split(':')[:-1][-1]),
        nlp
    ), axis=1
)

100%|██████████| 36346/36346 [05:14<00:00, 115.64it/s]


In [11]:
df_rel_train.to_parquet(f"{DATA_PATH}/rel_train_baseline.parquet")

In [8]:
df_rel_test['sdp'] = df_rel_test.progress_apply(
    lambda row: get_sdp(
        row['text'].split('|'),
        int(row['arg1'].split(':')[:-1][0]),
        int(row['arg2'].split(':')[:-1][-1]),
        nlp
    ), axis=1
)

100%|██████████| 23462/23462 [03:28<00:00, 112.33it/s]


In [9]:
df_rel_test.to_parquet(f"{DATA_PATH}/rel_test_baseline.parquet")

In [12]:
row

text     She|was|started|on|prophylactic|Oxacillin|to|c...
arg1                                                 22:24
arg2                                                   5:6
label                                          Reason-Drug
sid                                                      2
uid                                                 100130
Name: 0, dtype: object

In [10]:
get_sdp(
        row['text'].split('|'),
        int(row['arg1'].split(':')[:-1][0]),
        int(row['arg2'].split(':')[:-1][-1]),
        nlp
    )

'ophthalmic for consulted started on Oxacillin'

In [15]:
row['arg1'].split(':')[:-1]

['22']

In [17]:
row['text'].split('|')[int(row['arg1'].split(':')[:-1][0])], row['text'].split('|')[int(row['arg2'].split(':')[:-1][0])]

('ophthalmic', 'Oxacillin')

In [11]:
" ".join(row['text'].split('|'))

'She was started on prophylactic Oxacillin to cover skin flora , and Dermatology was consulted along with Neurology and Ophthalmology for the ophthalmic involvement'