In [1]:
import pandas as pd
from itertools import combinations
import os
import time

In [64]:
def unique_without_none(x, none_value='-1'):
    """Return the distinct values of ``x`` as a set, minus the placeholder.

    Parameters
    ----------
    x : iterable of hashable values
    none_value : value to leave out of the result (default: the string '-1')

    Returns
    -------
    set
        Every distinct element of ``x`` except ``none_value``.
    """
    distinct = set(x)
    # discard() is a no-op when the placeholder is absent.
    distinct.discard(none_value)
    return distinct

def combinations_with_reverse(x):
    """Return every 2-element combination of ``x`` followed by its mirror.

    The result lists all ``(a, b)`` pairs produced by
    ``itertools.combinations(x, 2)`` first, then the swapped ``(b, a)`` pairs
    in the same order.

    Parameters
    ----------
    x : iterable

    Returns
    -------
    list of 2-tuples
    """
    forward = list(combinations(x, 2))
    backward = [pair[::-1] for pair in forward]
    return forward + backward

def get_subject(path):
    """Return the second underscore-separated field of the file name in ``path``.

    The file name is whatever follows the last ``os.sep`` in ``path``.

    Parameters
    ----------
    path : str

    Returns
    -------
    str
        The text between the first and second underscore of the file name
        (or everything after the first underscore when there is only one).

    Raises
    ------
    IndexError
        If the file name contains no underscore.
    """
    filename = path.rsplit(os.sep, 1)[-1]
    # Only the second field is needed, so stop splitting after it.
    return filename.split('_', 2)[1]

def read_file_to_dataframe(path, engine='c'):
    """Read a header-less, whitespace-delimited file into a DataFrame.

    Blank lines are kept (``skip_blank_lines=False``), so they show up as
    rows whose values are all missing.

    Parameters
    ----------
    path : str
        Location of the file; normalised with ``os.path.normpath`` first.
    engine : str
        Parser engine forwarded to ``pandas.read_csv`` (default ``'c'``).

    Returns
    -------
    pandas.DataFrame
        Columns: token, file, idx_start, idx_end, entity, entity_idx,
        ref_entity_idx, ref_label.
    """
    filepath = os.path.normpath(path)
    column_names = ['token', 'file', 'idx_start', 'idx_end',
                    'entity', 'entity_idx', 'ref_entity_idx', 'ref_label']
    # Raw string: '\s' is not a valid Python string escape, so the non-raw
    # form triggers an invalid-escape warning on newer interpreters.
    return pd.read_csv(filepath, sep=r'\s+', header=None, skip_blank_lines=False,
                       engine=engine, names=column_names)

def add_paragraph_index(df):
    """Add a ``paragraph_idx`` column numbering the paragraphs in ``df``.

    A paragraph boundary is a row whose ``file`` value is missing and whose
    preceding row is also missing (the position before the first row counts
    as missing). Each row receives the number of boundary rows that occur
    strictly before it, so a boundary row closes the paragraph it ends and
    rows after the last boundary get the next number. A frame without any
    boundary is numbered 0 throughout.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``file`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new column.
    """
    blank = df['file'].isna()
    boundary = blank & blank.shift(1, fill_value=True)
    flags = boundary.astype(int)
    # Running count minus the row's own flag = boundaries strictly before it.
    # A direct column assignment avoids chained in-place fillna calls, which
    # do not write back to the frame under pandas Copy-on-Write.
    df['paragraph_idx'] = flags.cumsum() - flags
    return df

def add_sentence_index(df):
    """Add a ``sentence_idx`` column numbering the sentences in ``df``.

    Every row whose ``file`` value is missing is a sentence boundary. Each
    row receives the number of boundary rows that occur strictly before it,
    so a boundary row closes the sentence it ends and rows after the last
    boundary get the next number. A frame without any boundary is numbered 0
    throughout.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``file`` column. Modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object with the new column.
    """
    flags = df['file'].isna().astype(int)
    # Running count minus the row's own flag = boundaries strictly before it.
    # A direct column assignment avoids chained in-place fillna calls, which
    # do not write back to the frame under pandas Copy-on-Write.
    # Stored as float: the feature table displayed in this notebook reports
    # sentence differences as floats.
    df['sentence_idx'] = (flags.cumsum() - flags).astype(float)
    return df

def clean_data(df):
    """Drop blank rows and the first two remaining rows of every paragraph.

    Rows with a missing ``file`` value are removed from ``df`` itself (in
    place). The survivors are renumbered from 0, and only rows whose
    ``paragraph_idx`` equals that of both the previous row and the row two
    positions earlier are kept.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``file`` and ``paragraph_idx`` columns.

    Returns
    -------
    pandas.DataFrame
        Filtered rows; the index keeps the post-renumbering labels.
    """
    df.dropna(subset=['file'], inplace=True)
    renumbered = df.reset_index(drop=True)
    paragraph = renumbered.paragraph_idx
    same_as_previous = paragraph == paragraph.shift(1)
    same_as_two_back = paragraph == paragraph.shift(2)
    return renumbered.loc[same_as_previous & same_as_two_back]

def get_entity_pairs(df):
    """Build every ordered pair of entities per paragraph and attach labels.

    For each ``paragraph_idx`` the distinct ``entity_idx`` values (minus the
    placeholder removed by ``unique_without_none``) are paired in both
    directions. Pairs are then left-joined against the distinct
    (paragraph_idx, entity_idx, ref_entity_idx, ref_label) rows of ``df``;
    pairs without a match get the label 'Dummy'.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs paragraph_idx, entity_idx, ref_entity_idx and ref_label columns.

    Returns
    -------
    pandas.DataFrame
        Columns: paragraph_idx, entity_idx, ref_entity_idx, ref_label.
    """
    entities_per_paragraph = df.groupby('paragraph_idx').agg({"entity_idx": unique_without_none})['entity_idx']
    pairs_per_paragraph = entities_per_paragraph.apply(combinations_with_reverse).to_dict()

    rows = []
    for paragraph, pairs in pairs_per_paragraph.items():
        for first, second in pairs:
            rows.append((paragraph, first, second))
    pairs_df = pd.DataFrame(rows, columns=['paragraph_idx', 'entity_idx', 'ref_entity_idx'])

    known_relations = df[['paragraph_idx', 'entity_idx', 'ref_entity_idx', 'ref_label']].drop_duplicates()
    # No explicit keys: the join uses the columns the two frames share.
    labelled = pairs_df.merge(known_relations, how='left')
    return labelled.fillna({'ref_label': 'Dummy'})

def create_dataset(entities, df, subject):
    """Compute one feature row per entity pair.

    Parameters
    ----------
    entities : pandas.DataFrame
        One row per pair, with paragraph_idx, entity_idx, ref_entity_idx and
        ref_label columns.
    df : pandas.DataFrame
        Token-level rows with entity, entity_idx, idx_start, idx_end,
        sentence_idx and paragraph_idx columns.
    subject : value copied unchanged into the ``subject`` column.

    Returns
    -------
    pandas.DataFrame
        One row per row of ``entities`` (an empty frame when ``entities`` is
        empty), with columns:
        entity_{1,2}_type    ``entity`` of the first row, first 2 chars removed
                             (presumably a tag prefix such as 'B-' — confirm)
        entity_{1,2}_tokens  number of rows belonging to the entity
        entity_{1,2}_signs   last ``idx_end`` minus first ``idx_start``
        signs_between        larger of the two start-minus-end gaps
        tokens_between       rows spanned by both entities, minus their own
        same_sentence        1 if both first rows share ``sentence_idx``
        sentences_difference first-row ``sentence_idx`` of 1 minus that of 2
        paragraph_len / paragraph_sentences / paragraph_entities
                             size, distinct sentences and distinct entities of
                             the pair's paragraph
        subject, target      ``subject`` and the pair's ``ref_label``
    """
    # Collect plain dicts and build the frame once; concatenating a growing
    # frame on every iteration copies all previous rows each time.
    records = []
    for _, pair in entities.iterrows():
        first = df.loc[df['entity_idx'] == pair['entity_idx']]
        second = df.loc[df['entity_idx'] == pair['ref_entity_idx']]
        paragraph = df.loc[df['paragraph_idx'] == pair['paragraph_idx']]

        first_start, first_end = first['idx_start'].iloc[0], first['idx_end'].iloc[-1]
        second_start, second_end = second['idx_start'].iloc[0], second['idx_end'].iloc[-1]

        # Label-based slice (inclusive on both ends) covering both entities.
        span_lo = min(first.index.min(), second.index.min())
        span_hi = max(first.index.max(), second.index.max())
        spanned_rows = len(df.loc[span_lo:span_hi])

        first_sentence = first['sentence_idx'].iloc[0]
        second_sentence = second['sentence_idx'].iloc[0]

        records.append({
            'entity_1_type': first['entity'].str[2:].iloc[0],
            'entity_1_tokens': len(first),
            'entity_1_signs': first_end - first_start,
            'entity_2_type': second['entity'].str[2:].iloc[0],
            'entity_2_tokens': len(second),
            'entity_2_signs': second_end - second_start,
            # Whichever entity comes first, one of the gaps is the real
            # distance and the other is negative, so take the larger.
            'signs_between': max(second_start - first_end, first_start - second_end),
            'tokens_between': spanned_rows - len(first) - len(second),
            'same_sentence': int(first_sentence == second_sentence),
            'sentences_difference': first_sentence - second_sentence,
            'paragraph_len': len(paragraph),
            'paragraph_sentences': len(paragraph['sentence_idx'].unique()),
            'paragraph_entities': len(unique_without_none(paragraph['entity_idx'])),
            'subject': subject,
            'target': pair['ref_label'],
        })
    return pd.DataFrame(records)

def load_data(path):
    """Turn one annotation file into a table of entity-pair features.

    The file is parsed with the 'c' engine first and, if that raises, with
    the 'python' engine. When both attempts fail the second error is printed
    and an empty DataFrame is returned.

    Parameters
    ----------
    path : str

    Returns
    -------
    pandas.DataFrame
        Output of ``create_dataset`` for the file, or an empty frame when the
        file could not be parsed.
    """
    subject = get_subject(path)

    for engine in ('c', 'python'):
        try:
            tokens = read_file_to_dataframe(path, engine=engine)
        except Exception as err:
            # Remember the latest failure; only the last one is reported.
            failure = err
        else:
            break
    else:
        print(failure)
        return pd.DataFrame()

    tokens = clean_data(add_sentence_index(add_paragraph_index(tokens)))
    pairs = get_entity_pairs(tokens)
    return create_dataset(pairs, tokens, subject)

In [58]:
# Build the feature table from every entry of `dir_name` and time the run.
# NOTE(review): `dir_name` is not assigned in any cell visible here; it must
# be defined before this cell can run on a fresh kernel.
ctr = time.time()
df = pd.concat(
    [load_data(os.path.abspath(os.path.join(dir_name, entry))) for entry in os.listdir(dir_name)],
    ignore_index=True,
)
# Elapsed wall-clock seconds, shown as the cell output.
time.time() - ctr

133.84686040878296

In [70]:
# One-hot encode the categorical feature columns (first level of each dropped).
# The result is only displayed here; `df` itself is left unchanged.
pd.get_dummies(
    data=df,
    columns=['entity_1_type', 'entity_2_type', 'subject'],
    drop_first=True,
)

Unnamed: 0,entity_1_tokens,entity_1_signs,entity_2_tokens,entity_2_signs,signs_between,tokens_between,same_sentence,sentences_difference,paragraph_len,paragraph_sentences,...,entity_2_type_Definition,entity_2_type_Definition-frag,entity_2_type_Ordered-Definition,entity_2_type_Ordered-Term,entity_2_type_Qualifier,entity_2_type_Referential-Definition,entity_2_type_Referential-Term,entity_2_type_Secondary-Definition,entity_2_type_Term,entity_2_type_Term-frag
0,14,85.0,2,16.0,16.0,2,1,0.0,66,3,...,0,0,0,0,0,0,0,0,1,0
1,2,16.0,14,85.0,16.0,2,1,0.0,66,3,...,1,0,0,0,0,0,0,0,0,0
2,2,19.0,16,92.0,4.0,1,1,0.0,66,3,...,1,0,0,0,0,0,0,0,0,0
3,16,92.0,2,19.0,4.0,1,1,0.0,66,3,...,0,0,0,0,0,0,0,0,1,0
4,2,10.0,18,115.0,102.0,18,0,-1.0,53,3,...,1,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37217,8,61.0,9,54.0,8.0,2,1,0.0,78,3,...,0,0,0,0,0,0,0,0,1,0
37218,23,154.0,9,54.0,29.0,5,0,-1.0,78,3,...,0,0,0,0,0,0,0,0,1,0
37219,8,61.0,3,22.0,246.0,39,0,1.0,78,3,...,0,0,0,0,0,0,0,0,1,0
37220,23,154.0,3,22.0,1.0,0,1,0.0,78,3,...,0,0,0,0,0,0,0,0,1,0


In [61]:
# Display the assembled feature table (the rendering below is truncated).
df

Unnamed: 0,entity_1_type,entity_1_tokens,entity_1_signs,entity_2_type,entity_2_tokens,entity_2_signs,signs_between,tokens_between,same_sentence,sentences_difference,paragraph_len,paragraph_sentences,paragraph_entities,subject,target
0,Definition,14,85.0,Term,2,16.0,16.0,2,1,0.0,66,3,2,biology,Direct-Defines
1,Term,2,16.0,Definition,14,85.0,16.0,2,1,0.0,66,3,2,biology,Dummy
2,Term,2,19.0,Definition,16,92.0,4.0,1,1,0.0,66,3,2,biology,Dummy
3,Definition,16,92.0,Term,2,19.0,4.0,1,1,0.0,66,3,2,biology,Direct-Defines
4,Term,2,10.0,Definition,18,115.0,102.0,18,0,-1.0,53,3,4,biology,Dummy
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37217,Definition,8,61.0,Term,9,54.0,8.0,2,1,0.0,78,3,4,biology,Direct-Defines
37218,Definition,23,154.0,Term,9,54.0,29.0,5,0,-1.0,78,3,4,biology,Dummy
37219,Definition,8,61.0,Term,3,22.0,246.0,39,0,1.0,78,3,4,biology,Dummy
37220,Definition,23,154.0,Term,3,22.0,1.0,0,1,0.0,78,3,4,biology,Direct-Defines
