In [1]:
import pandas as pd
import numpy as np
import random
import json
import os

from itertools import permutations
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
# Load the test abstracts. The file carries a .csv extension but is read as
# tab-separated.
abstracts_test_fname = "data/abstracts_test.csv"
abstracts_test_df = pd.read_csv(
    abstracts_test_fname,
    sep="\t",
)
abstracts_test_df.head(n=5)

Unnamed: 0,abstract_id,title,abstract
0,1711760,Delayed institution of hypertension during foc...,The effect of induced hypertension instituted ...
1,6086495,Localisation of the Becker muscular dystrophy ...,A linkage study in 30 Becker muscular dystroph...
2,7018927,Pituitary response to luteinizing hormone-rele...,The effects of a 6-hour infusion with haloperi...
3,7811247,X-linked adrenoleukodystrophy (ALD): a novel m...,Fragments of the adrenoleukodystrophy (ALD) cD...
4,8944024,Detection of heterozygous mutations in BRCA1 u...,The ability to scan a large gene rapidly and a...


In [4]:
# Load the entity annotations for the test abstracts (read as tab-separated,
# despite the .csv extension).
entities_test_fname = "data/entities_test.csv"
entities_test_df = pd.read_csv(
    entities_test_fname,
    sep="\t",
)
entities_test_df.head(n=5)

Unnamed: 0,id,abstract_id,offset_start,offset_finish,type,mention,entity_ids
0,0,1711760,23,35,DiseaseOrPhenotypicFeature,hypertension,D006973
1,1,1711760,49,66,DiseaseOrPhenotypicFeature,cerebral ischemia,D002545
2,2,1711760,78,89,DiseaseOrPhenotypicFeature,brain edema,D001929
3,3,1711760,113,125,DiseaseOrPhenotypicFeature,hypertension,D006973
4,4,1711760,165,197,DiseaseOrPhenotypicFeature,middle cerebral artery occlusion,D020244


In [5]:
# Lookup tables keyed by abstract_id: one for the title, one for the abstract body.
# If an abstract_id occurred on several rows, the last row's text would win.
test_id2title = dict(zip(abstracts_test_df['abstract_id'], abstracts_test_df['title']))
test_id2abstr = dict(zip(abstracts_test_df['abstract_id'], abstracts_test_df['abstract']))

In [6]:
# Row counts: abstracts first, entity annotations second (one number per line).
print(abstracts_test_df.shape[0], entities_test_df.shape[0], sep="\n")

100
3263


In [7]:
# Corpus-wide lookup tables built from every row of the entity annotations.
# A key that appears on several rows keeps the value from its last row.
entityid2type = dict(zip(entities_test_df['entity_ids'], entities_test_df['type']))
mention2type = dict(zip(entities_test_df['mention'], entities_test_df['type']))
mention2entityid = dict(zip(entities_test_df['mention'], entities_test_df['entity_ids']))

In [8]:
# Ordered (entity_1 type, entity_2 type) combinations accepted as relation
# candidates. The candidate-building loop keeps a mention pair only when the
# tuple of the two mentions' types appears in this list.
# Among the four type names used below, every ordered combination is listed
# except (DiseaseOrPhenotypicFeature, DiseaseOrPhenotypicFeature); a pair
# involving any other type value can never match.
entity_type_valid = [('GeneOrGeneProduct', 'GeneOrGeneProduct'),
                     ('GeneOrGeneProduct', 'DiseaseOrPhenotypicFeature'),
                     ('ChemicalEntity', 'DiseaseOrPhenotypicFeature'),
                     ('DiseaseOrPhenotypicFeature', 'GeneOrGeneProduct'),
                     ('SequenceVariant', 'DiseaseOrPhenotypicFeature'),
                     ('ChemicalEntity', 'GeneOrGeneProduct'),
                     ('DiseaseOrPhenotypicFeature', 'SequenceVariant'),
                     ('DiseaseOrPhenotypicFeature', 'ChemicalEntity'),
                     ('ChemicalEntity', 'ChemicalEntity'),
                     ('GeneOrGeneProduct', 'ChemicalEntity'),
                     ('SequenceVariant', 'ChemicalEntity'),
                     ('ChemicalEntity', 'SequenceVariant'),
                     ('SequenceVariant', 'SequenceVariant'),
                     ('SequenceVariant', 'GeneOrGeneProduct'),
                     ('GeneOrGeneProduct', 'SequenceVariant')]


In [9]:
### Generate the test dataset: one row per ordered (entity_1, entity_2) mention
### pair that co-occurs in a sentence and has an accepted type combination.
#
# Reads (from earlier cells): entities_test_df, test_id2title, test_id2abstr,
#                             mention2type, entity_type_valid.
# Produces:
#   relation_candidates - list of [abstract_id, sentence, entity_1_mention,
#                         entity_2_mention, entity_1_id, entity_2_id] rows.
#   sentence2entitise   - sentence text -> mentions whose character offsets
#                         fall inside that sentence.
sentence2entitise = defaultdict(list)
sentence_infos = dict()  # never filled in this cell; kept so the name stays defined
relation_candidates = []

# Hash-based membership test instead of scanning the list for every pair.
valid_type_pairs = set(entity_type_valid)

for key, df in entities_test_df.groupby('abstract_id'):
    text = test_id2title[key] + ' ' + test_id2abstr[key]
    # Naive sentence split: every '. ' is treated as a sentence boundary.
    sentences = text.split('. ')

    # Per-abstract lookups. The mention -> id map deliberately does NOT reuse
    # the name `mention2entityid`, so the corpus-wide dict of that name is not
    # overwritten by whichever abstract happens to be processed last.
    entity_infos = dict()        # (offset_start, offset_finish) -> mention
    abstract_mention2id = {}     # mention -> entity_ids (last row wins)
    for start, finish, mention, entity_ids in zip(df['offset_start'],
                                                  df['offset_finish'],
                                                  df['mention'],
                                                  df['entity_ids']):
        entity_infos[start, finish] = mention
        abstract_mention2id[mention] = entity_ids

    # Map mentions to sentences by character offset.
    # NOTE(review): assumes the offsets index into title + ' ' + abstract - confirm.
    sent_left = 0
    for sentence in sentences:
        # Each sentence is followed by the '. ' separator that split() removed.
        sent_right = sent_left + len(sentence + '. ')
        for (start, finish), mention in entity_infos.items():
            # `>=` (not `>`): a mention that begins exactly at the start of the
            # sentence (including offset 0) belongs to that sentence.
            if start >= sent_left and finish < sent_right:
                sentence2entitise[sentence].append(mention)
        sent_left = sent_right

    for s in sentences:
        # Mentions of this abstract whose text occurs in the sentence.
        # NOTE(review): this is a plain substring test, so a short mention also
        # matches inside a longer word; the offset-based mapping above is not
        # used for candidate generation.
        # The dict keys are already unique, so no set() round-trip is needed;
        # skipping it keeps the row order identical from run to run (string
        # set iteration order varies between interpreter sessions).
        entities = [m for m in abstract_mention2id if m in s]

        for entity_1_mention, entity_2_mention in permutations(entities, 2):
            type_pair = (mention2type[entity_1_mention], mention2type[entity_2_mention])
            # Pairs whose type combination is not in entity_type_valid are skipped.
            if type_pair not in valid_type_pairs:
                continue
            relation_candidates.append([key,
                                        s,
                                        entity_1_mention,
                                        entity_2_mention,
                                        abstract_mention2id[entity_1_mention],
                                        abstract_mention2id[entity_2_mention]])
    



In [10]:
# Turn the collected candidate rows into a frame; the column order mirrors the
# order in which each row's fields were appended.
dataset_df = pd.DataFrame(
    data=relation_candidates,
    columns=[
        'abstract_id',
        'sentence',
        'entity_1_mention',
        'entity_2_mention',
        'entity_1_id',
        'entity_2_id',
    ],
)
print('size of test data: ', dataset_df.shape[0])

size of test data:  8480


In [11]:
# Spot-check a few rows. The fixed random_state makes the draw repeatable for
# a given frame, and the size is clamped so a frame with fewer than 5 rows
# does not raise.
dataset_df.sample(n=min(5, len(dataset_df)), random_state=42)

Unnamed: 0,abstract_id,sentence,entity_1_mention,entity_2_mention,entity_1_id,entity_2_id
6879,26516699,"Interestingly, the differential expression of ...",Let-7b,MSST1,406884,-
4212,22048266,Significantly lower frequency of SIM2 C-G hapl...,rs2073416,DS,rs2073416,D004314
1585,17318851,The same variants in the IRF6 gene that are as...,orofacial clefts,rs861019,C566121,rs861019
6621,26516699,"In addition, from 10 to 50 weeks of age, stage...",Let-96,miR-195,-,387190
5456,24853300,Immunohistochemistry showed that CBKOTg mice h...,CB,amyloid b-peptide,12307,11820


In [12]:
# Write the candidates as a comma-separated file without the row index.
# `index` is documented as a bool, so pass False explicitly instead of
# relying on None being falsy.
dataset_df.to_csv('./data/test_data2.csv', index=False)