In [1]:
import re
import nltk
import spacy
import numpy as np
import pandas as pd
import networkx as nx
from tqdm.notebook import tqdm

# Register tqdm's `progress_apply` on pandas objects; the feature-extraction
# code in this notebook calls it on DataFrames and groupby objects.
tqdm.pandas()

# Lift the display limits so DataFrames are rendered with every column and row.
# The canonical option keys are used on purpose: the dotted spellings
# ("display.max.columns") only resolve through pandas' regex fallback, where
# "." happens to match "_", and would break if another option ever matched too.
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

In [2]:
# Directory (relative to the working directory) that the parquet inputs are
# read from and that this notebook's `*_baseline.parquet` outputs are written to.
DATA_PATH = "../../data/parsed_data"

In [3]:
# Load the `ner_*` and `rel_*` parquet files for the training split ...
df_ner_train = pd.read_parquet(f"{DATA_PATH}/ner_train.parquet")
df_rel_train = pd.read_parquet(f"{DATA_PATH}/rel_train.parquet")

# ... and for the test split.
df_ner_test = pd.read_parquet(f"{DATA_PATH}/ner_test.parquet")
df_rel_test = pd.read_parquet(f"{DATA_PATH}/rel_test.parquet")

In [4]:
def get_pos_tags(df):
    """Attach a ``pos`` column holding the NLTK part-of-speech tag of each token.

    Tokens are tagged one group at a time, where a group is the set of rows
    sharing the same ``(uid, contains_rel, sid)`` values, so the tagger sees
    each group's ``token`` values together.

    Note:
        The column is written onto ``df`` itself (the input is modified in
        place); the same object is returned for convenience.

    Args:
        df: DataFrame with ``uid``, ``contains_rel``, ``sid`` and ``token`` columns.

    Returns:
        The same DataFrame, now carrying the ``pos`` column.
    """
    def _tags_only(tokens):
        # nltk.pos_tag yields (token, tag) pairs; only the tag is kept.
        return [tag for _, tag in nltk.pos_tag(tokens)]

    grouped_tokens = df.groupby(['uid', 'contains_rel', 'sid'])['token']
    df['pos'] = grouped_tokens.transform(_tags_only)
    return df


def extract_group_feats(df, window=2):
    """Build per-token features for one group of rows, plus context-window copies.

    For every row the following string-typed features are produced: the
    existing ``pos`` value and the ``istitle`` / ``isupper`` / ``isalpha`` /
    ``isnumeric`` / ``containsnumbers`` flags of ``token``. Each feature is
    then replicated for the neighbouring rows at offsets ``-window .. window``
    (excluding 0) under the column name ``"{offset}_{feature}"``; a negative
    offset refers to a preceding row, a positive one to a following row. Rows
    without a neighbour at that offset get a missing value.

    Args:
        df: DataFrame with at least ``token``, ``tag`` and ``pos`` columns.
            The grouping keys ``uid`` / ``contains_rel`` / ``sid`` are removed
            when present and tolerated when absent.
        window: number of rows of context to add on each side; values <= 0
            add no context columns.

    Returns:
        A new DataFrame indexed by ``(token, tag)``; ``df`` is left untouched.
    """
    # The grouping keys are identifiers, not features. They can already be
    # missing (e.g. when groupby.apply excludes the grouping columns), so a
    # missing key must not raise.
    feats = df.drop(columns=['uid', 'contains_rel', 'sid'], errors='ignore')
    token = feats['token']
    feats = feats.assign(
        pos=feats['pos'].astype('string'),
        istitle=token.str.istitle().astype('string'),
        isupper=token.str.isupper().astype('string'),
        isalpha=token.str.isalpha().astype('string'),
        isnumeric=token.str.isnumeric().astype('string'),
        # Vectorised "contains at least one digit"; missing tokens yield a
        # missing value, like the other .str-based flags above.
        containsnumbers=token.str.contains(r'\d', regex=True).astype('string'),
    ).set_index(['token', 'tag'])

    # shift(-offset) moves the row found at position i + offset onto row i.
    context = [
        feats.shift(-offset).add_prefix(f"{offset}_")
        for offset in range(-window, window + 1)
        if offset != 0
    ]
    # One concat instead of growing the frame inside the loop.
    return pd.concat([feats, *context], axis=1)


def extract_feats(df_ner, window=2):
    """POS-tag ``df_ner`` and build the windowed token features group by group.

    ``get_pos_tags`` is applied first, then ``extract_group_feats`` is run on
    every ``(uid, contains_rel, sid)`` group through tqdm's ``progress_apply``
    (which requires ``tqdm.pandas()`` to have been called beforehand).

    Args:
        df_ner: DataFrame with ``uid``, ``contains_rel``, ``sid``, ``token``
            and ``tag`` columns.
        window: rows of context on each side, forwarded to
            ``extract_group_feats``. The default matches that function's own
            default, so existing calls behave as before.

    Returns:
        The concatenated per-group feature frames as returned by
        ``groupby(...).progress_apply``.
    """
    df_ner = get_pos_tags(df_ner)
    grouped = df_ner.groupby(['uid', 'contains_rel', 'sid'])
    return grouped.progress_apply(
        lambda group: extract_group_feats(group, window=window)
    )


def get_dependency_graph(token_lst, spacy_pipe, by_offset=False):
    """Parse the space-joined tokens and return the dependency tree as an undirected graph.

    Args:
        token_lst: sequence of token strings; they are joined with single
            spaces and the resulting text is handed to ``spacy_pipe``.
        spacy_pipe: callable that turns a text into a spaCy ``Doc``.
        by_offset: selects how nodes are keyed.
            * ``False`` (default, the historical behaviour): nodes are token
              texts. Every occurrence of a repeated word shares one node, so
              paths through such a graph can take shortcuts that do not exist
              in the parse.
            * ``True``: nodes are ``token.idx`` (the character offset of the
              token in the joined text) and carry the token text in the
              ``"text"`` node attribute, so every occurrence is distinct.

    Returns:
        A ``networkx.Graph`` with one edge per (head, child) pair of the parse.
    """
    doc = spacy_pipe(" ".join(token_lst))
    if not by_offset:
        return nx.from_edgelist(
            [(head.text, child.text) for head in doc for child in head.children]
        )

    graph = nx.Graph()
    graph.add_nodes_from([(tok.idx, {"text": tok.text}) for tok in doc])
    graph.add_edges_from(
        [(head.idx, child.idx) for head in doc for child in head.children]
    )
    return graph


def get_sdp(token_lst, s_idx, t_idx, spacy_pipe):
    """Return the shortest dependency path between two tokens as a space-joined string.

    Only the tokens up to and including the later of the two positions are
    parsed. The two endpoints are located in the parse by character offset
    rather than by text, so a word that occurs several times in the sentence
    no longer merges into a single graph node (which used to produce paths
    that jump between unrelated occurrences).

    If spaCy splits an endpoint token into several pieces, the path starts or
    ends at the first piece.

    Args:
        token_lst: sequence of token strings.
        s_idx: position of the source token in ``token_lst``.
        t_idx: position of the target token in ``token_lst``.
        spacy_pipe: callable that turns a text into a spaCy ``Doc``.

    Returns:
        The texts of the parsed tokens along the path, joined by spaces. When
        an endpoint cannot be found in the parse or no path connects the two,
        the parsed token prefix itself, joined by spaces, is returned instead.

    Raises:
        IndexError: if ``s_idx`` or ``t_idx`` is outside ``token_lst``.
    """
    # Character offset at which each token starts once the tokens are joined
    # with single spaces; the offsets of a prefix equal those of the full list.
    starts = []
    cursor = 0
    for tok in token_lst:
        starts.append(cursor)
        cursor += len(tok) + 1  # +1 for the joining space
    source, target = starts[s_idx], starts[t_idx]

    selected_tokens = token_lst[: max(s_idx, t_idx) + 1]
    g = get_dependency_graph(selected_tokens, spacy_pipe, by_offset=True)
    try:
        path = nx.shortest_path(g, source=source, target=target)
    except (nx.NetworkXNoPath, nx.NodeNotFound):
        # Best-effort fallback kept from the original implementation.
        return " ".join(selected_tokens)
    return " ".join(g.nodes[node]["text"] for node in path)

In [5]:
# Build the token features for the training NER frame and persist them.
# NOTE(review): `df_ner_train` is rebound to the feature frame, whose `token`
# column has moved into the index, so re-running this cell without re-running
# the load cell first will most likely fail — TODO confirm.
df_ner_train = extract_feats(df_ner_train)
df_ner_train.to_parquet(f"{DATA_PATH}/ner_train_baseline.parquet")

  0%|          | 0/39367 [00:00<?, ?it/s]

In [6]:
# Build the token features for the test NER frame and persist them.
# NOTE(review): `df_ner_test` is rebound to the feature frame, whose `token`
# column has moved into the index, so re-running this cell without re-running
# the load cell first will most likely fail — TODO confirm.
df_ner_test = extract_feats(df_ner_test)
df_ner_test.to_parquet(f"{DATA_PATH}/ner_test_baseline.parquet")

  0%|          | 0/25417 [00:00<?, ?it/s]

In [7]:
# spaCy pipeline handed to get_sdp as the parser for the relation frames.
nlp = spacy.load('en_core_web_sm')

In [8]:
# Add an `sdp` column to the training relation frame: for every row, the
# result of get_sdp between a position taken from `arg1` and one from `arg2`.
# NOTE(review): `arg1` / `arg2` look like ':'-separated fields whose last
# field is not a position (presumably a label) — confirm against the parser.
df_rel_train['sdp'] = df_rel_train.progress_apply(
    lambda row: get_sdp(
        row['text'].split('|'),  # token list: `text` split on '|'
        int(row['arg1'].split(':')[:-1][0]),  # first field of arg1, final field excluded
        int(row['arg2'].split(':')[:-1][-1]),  # last field of arg2 before the final one
        nlp
    ), axis=1
)

  0%|          | 0/36346 [00:00<?, ?it/s]

In [9]:
# Persist the training relation frame, including the new `sdp` column.
df_rel_train.to_parquet(f"{DATA_PATH}/rel_train_baseline.parquet")

In [10]:
# Add an `sdp` column to the test relation frame: for every row, the result
# of get_sdp between a position taken from `arg1` and one from `arg2`.
# NOTE(review): `arg1` / `arg2` look like ':'-separated fields whose last
# field is not a position (presumably a label) — confirm against the parser.
df_rel_test['sdp'] = df_rel_test.progress_apply(
    lambda row: get_sdp(
        row['text'].split('|'),  # token list: `text` split on '|'
        int(row['arg1'].split(':')[:-1][0]),  # first field of arg1, final field excluded
        int(row['arg2'].split(':')[:-1][-1]),  # last field of arg2 before the final one
        nlp
    ), axis=1
)

  0%|          | 0/23462 [00:00<?, ?it/s]

In [11]:
# Persist the test relation frame, including the new `sdp` column.
df_rel_test.to_parquet(f"{DATA_PATH}/rel_test_baseline.parquet")