In [2]:
import pandas as pd
import numpy as np
import random
import json
import os

from itertools import permutations
from itertools import combinations 
import matplotlib.pyplot as plt
from collections import defaultdict

In [3]:
abstracts_fname = "data/abstracts_train.csv"
abstracts_df = pd.read_csv(abstracts_fname,sep='\t')
# abstracts_df.head()

In [5]:
entities_fname = "data/entities_train.csv"
entities_df = pd.read_csv(entities_fname,sep='\t')
# entities_df.head()

In [6]:
relations_fname = "data/relations_train.csv"
relations_df = pd.read_csv(relations_fname,sep='\t')
# relations_df.head()

In [7]:
print(len(abstracts_df))
print(len(entities_df))
print(len(relations_df))

400
13636
4280


In [8]:
id2title ={ abs_id: title for abs_id, title in abstracts_df[['abstract_id','title']].values }
id2abstr ={ abs_id: abs_text for abs_id, abs_text in abstracts_df[['abstract_id','abstract']].values }

In [9]:
def get_dic(abstracts_df, relations_df):
    
    id2title = { abs_id: title for abs_id, title in abstracts_df[['abstract_id','title']].values }
    id2abstr = { abs_id: abs_text for abs_id, abs_text in abstracts_df[['abstract_id','abstract']].values }

    relations = relations_df.type.unique()
    relations2dic = {k: v for v, k in enumerate(relations)}
    
    return id2title, id2abstr, relations2dic
id2title, id2abstr, relations2dic = get_dic(abstracts_df, relations_df)

In [10]:
# entity-type dictionary
entityid2type ={ entity_ids: type for entity_ids, type in entities_df[['entity_ids','type']].values }
# entityid2type

mention2type = { mention: type for mention, type in entities_df[['mention','type']].values }
mention2entityid = { mention: entity_ids for mention, entity_ids in entities_df[['mention','entity_ids']].values }

In [11]:
relations_df['entity_1_type'] = relations_df['entity_1_id'].map(entityid2type)
relations_df['entity_2_type'] = relations_df['entity_2_id'].map(entityid2type)

In [12]:
relations_df.head()

Unnamed: 0,id,abstract_id,type,entity_1_id,entity_2_id,novel,entity_1_type,entity_2_type
0,0,1353340,Association,410,D007966,No,GeneOrGeneProduct,DiseaseOrPhenotypicFeature
1,1,1353340,Positive_Correlation,rs74315458,D007966,Novel,SequenceVariant,DiseaseOrPhenotypicFeature
2,2,1671881,Positive_Correlation,D010661,rs62514952,Novel,DiseaseOrPhenotypicFeature,SequenceVariant
3,3,1671881,Positive_Correlation,D010661,rs62514953,Novel,DiseaseOrPhenotypicFeature,SequenceVariant
4,4,1671881,Association,5053,D010661,No,GeneOrGeneProduct,DiseaseOrPhenotypicFeature


In [13]:
entity_type = relations_df[['entity_1_type','entity_2_type']].value_counts()
# print(entity_type.index)
entity_type_valid = []
for i in entity_type.items():
    entity_type_valid.append(i[0])
entity_type_valid

[('GeneOrGeneProduct', 'GeneOrGeneProduct'),
 ('GeneOrGeneProduct', 'DiseaseOrPhenotypicFeature'),
 ('ChemicalEntity', 'DiseaseOrPhenotypicFeature'),
 ('DiseaseOrPhenotypicFeature', 'GeneOrGeneProduct'),
 ('SequenceVariant', 'DiseaseOrPhenotypicFeature'),
 ('ChemicalEntity', 'GeneOrGeneProduct'),
 ('DiseaseOrPhenotypicFeature', 'SequenceVariant'),
 ('DiseaseOrPhenotypicFeature', 'ChemicalEntity'),
 ('ChemicalEntity', 'ChemicalEntity'),
 ('GeneOrGeneProduct', 'ChemicalEntity'),
 ('SequenceVariant', 'ChemicalEntity'),
 ('ChemicalEntity', 'SequenceVariant'),
 ('SequenceVariant', 'SequenceVariant'),
 ('SequenceVariant', 'GeneOrGeneProduct'),
 ('GeneOrGeneProduct', 'SequenceVariant')]

In [14]:
# sentence2entitise = defaultdict(list)
dataset_list = list()
#check_list = list()
for key, df in entities_df.groupby('abstract_id'):
    #if key == 1671881:
    text = id2title[key] + ' ' +  id2abstr[key]
    sentences = text.split('. ')
    entity_pos2ids = dict()
    entity_pos2mention = dict()
    entity_id2mention = defaultdict(list)
    for i, row in df.iterrows():
        entity_pos2ids[row['offset_start'], row['offset_finish']] = row['entity_ids']
        entity_id2mention[row['entity_ids']].append(row['mention'])
    entity_id2mention = { key : list(set(value)) for key, value in entity_id2mention.items()}
    entity_id2mention_temp = dict()
    all_entities = entity_id2mention.keys()
    for entity in all_entities:
        if ',' in entity:
            entity_x = entity.split(',')
            for i in entity_x:
                entity_id2mention_temp[i] = entity_id2mention[entity]
    entity_id2mention.update(entity_id2mention_temp)

    entitie2relation = dict()
    rels_df = relations_df[relations_df['abstract_id'] == key]
    for j, row in rels_df.iterrows():
        entitie2relation[row['entity_1_id'], row['entity_2_id']] = row['type']
        entitie2relation[row['entity_2_id'], row['entity_1_id']] = row['type']

    sentence2entitise = defaultdict(list)
    for idx in range(len(sentences)):
        if idx == 0:
            sent_left = 0
            sent_right = len(sentences[idx]+'. ')
        else:
            sent_left = sent_right 
            sent_right += len(sentences[idx]+'. ')
        for entity_pos in entity_pos2ids.keys():
            if entity_pos[0] > sent_left and entity_pos[1] < sent_right:
                sentence2entitise[sentences[idx]].append(entity_pos2ids[entity_pos])

    relation_entities = []
    for s in sentence2entitise:
        entities = sentence2entitise[s]
        entities = list(set(entities))
        comb = combinations(entities, 2)
        for c in comb:
            if c in entitie2relation.keys():
                sent_relation_type = entitie2relation[c]
            else:
                # I think we do not need so much Negative samples
                prob_list = list(range(5))
                dst = random.choice(prob_list)
                if dst == 1:
                    sent_relation_type = "Negative"
                else:
                    continue

            entity_1_id,  entity_2_id = c[0], c[1]        
            entity_1_mentions, entity_2_mentions = entity_id2mention[c[0]], entity_id2mention[c[1]]
          
            for mention in entity_1_mentions:
                if mention in s:
                    entity_1_mention = mention
                else:
                    continue
            for mention in entity_2_mentions:
                if mention in s:
                    entity_2_mention = mention 
                else:
                    continue
            if len(entity_1_mention) > len(entity_2_mention):
                s_new = s.replace(entity_1_mention, '<e1> ' + entity_1_mention + ' </e1>')
                s_new = s_new.replace(entity_2_mention, '<e2> ' + entity_2_mention + ' </e2>')
            else:
                s_new = s.replace(entity_2_mention, '<e2> ' + entity_2_mention + ' </e2>')
                s_new = s_new.replace(entity_1_mention, '<e1> ' + entity_1_mention + ' </e1>')

            if ('<e1>' in s_new) and ('</e2>' in s_new):
                dataset_list.append([key,
                                     s_new, 
                                     entity_1_id,
                                     entity_2_id,
                                     entity_1_mention,
                                     entity_2_mention,
                                     sent_relation_type])
            #else:
                #check_list.append([key, s_new])
                #print(entity_1_mention, '----' ,entity_2_mention)
                #print(s_new,'\n')

print('dataset size:  ',len(dataset_list))
#print(len(check_list))

dataset size:   7192


In [15]:
dataset_df =pd.DataFrame(dataset_list, columns=['abstract_id','sentences','entity_1_id','entity_2_id','entity_1_mention','entity_2_mention','type'])
print(len(dataset_df))

7192


In [16]:
#
dataset_df.type.value_counts()

Association             2514
Negative                2279
Positive_Correlation    1278
Negative_Correlation     890
Comparison                89
Bind                      72
Cotreatment               53
Drug_Interaction          13
Conversion                 4
Name: type, dtype: int64

In [17]:
from sklearn import model_selection
def randomKFold_func(df, n_splits=5):
    # creat a new colunm called kfold and fill it with -1
    df['kfold'] = -1
    # randomize the rows of the data
    df = df.sample(frac=1).reset_index(drop=True)
    # initiate the kfold class for model selection module
    kf = model_selection.KFold(n_splits=n_splits)
    # fill the new kfold column
    for f, (t_, v_) in enumerate(kf.split(X=df)):
        df.loc[v_, 'kfold'] = f 
    return df

dataset_df_kfold = randomKFold_func(dataset_df, n_splits=5)

In [18]:
dataset_df_kfold.sample(5)

Unnamed: 0,abstract_id,sentences,entity_1_id,entity_2_id,entity_1_mention,entity_2_mention,type,kfold
4238,27090298,The enzyme <e1> FASN </e1> (fatty acid synthas...,2194,D008659,FASN,metabolic dysfunction,Association,2
4750,28151486,"In <e2> breast cancer </e2> patients, high lev...",64852,D001943,Star-PAP,breast cancer,Association,3
4743,20705401,"In the haplotype-wise analysis, we detected an...",D011605,D008694,psychosis,METH,Positive_Correlation,3
608,18779591,This sole substitution was sufficient to confe...,5618,6776,PrlR,STAT5,Association,0
1111,20523265,"PURPOSE: Genes in the complement pathway, incl...",D008268,718,age-related macular degeneration,C3,Association,0


In [19]:
dataset_df_kfold.to_csv('./data/train_relation_candidates.csv',index=None)