In [1]:
import pandas as pd
import numpy as np
import random
import json
import os

from itertools import permutations
import matplotlib.pyplot as plt
from collections import defaultdict

In [2]:
# Load the test abstracts; the file is tab-separated despite its .csv extension.
abstracts_test_fname = "../data/test/abstracts_test.csv"
abstracts_test_df = pd.read_csv(
    abstracts_test_fname,
    sep="\t",
)
abstracts_test_df.head()

Unnamed: 0,abstract_id,title,abstract
0,1711760,Delayed institution of hypertension during foc...,The effect of induced hypertension instituted ...
1,6086495,Localisation of the Becker muscular dystrophy ...,A linkage study in 30 Becker muscular dystroph...
2,7018927,Pituitary response to luteinizing hormone-rele...,The effects of a 6-hour infusion with haloperi...
3,7811247,X-linked adrenoleukodystrophy (ALD): a novel m...,Fragments of the adrenoleukodystrophy (ALD) cD...
4,8944024,Detection of heterozygous mutations in BRCA1 u...,The ability to scan a large gene rapidly and a...


In [3]:
# Load the test entity annotations; tab-separated like the abstracts file.
entities_test_fname = "../data/test/entities_test.csv"
entities_test_df = pd.read_csv(
    entities_test_fname,
    sep="\t",
)
entities_test_df.head()

Unnamed: 0,id,abstract_id,offset_start,offset_finish,type,mention,entity_ids
0,0,1711760,23,35,DiseaseOrPhenotypicFeature,hypertension,D006973
1,1,1711760,49,66,DiseaseOrPhenotypicFeature,cerebral ischemia,D002545
2,2,1711760,78,89,DiseaseOrPhenotypicFeature,brain edema,D001929
3,3,1711760,113,125,DiseaseOrPhenotypicFeature,hypertension,D006973
4,4,1711760,165,197,DiseaseOrPhenotypicFeature,middle cerebral artery occlusion,D020244


In [4]:
# Lookup tables keyed by abstract_id: one for titles, one for abstract bodies.
test_id2title = dict(zip(abstracts_test_df['abstract_id'], abstracts_test_df['title']))
test_id2abstr = dict(zip(abstracts_test_df['abstract_id'], abstracts_test_df['abstract']))

In [5]:
# Row counts of the abstract table and the entity table, one per line.
print(len(abstracts_test_df), len(entities_test_df), sep="\n")

100
3263


In [6]:
# Lookup tables built from the entity table. When a key occurs in several
# rows, the value from the last such row is the one that is kept.
entityid2type = dict(zip(entities_test_df['entity_ids'], entities_test_df['type']))
mention2type = dict(zip(entities_test_df['mention'], entities_test_df['type']))
mention2entityid = dict(zip(entities_test_df['mention'], entities_test_df['entity_ids']))

In [7]:
# Ordered (entity-1 type, entity-2 type) combinations accepted as relation
# candidates. This is every ordered pair of the four entity types except
# (DiseaseOrPhenotypicFeature, DiseaseOrPhenotypicFeature).
entity_type_valid = [
    ('GeneOrGeneProduct', 'GeneOrGeneProduct'),
    ('GeneOrGeneProduct', 'DiseaseOrPhenotypicFeature'),
    ('ChemicalEntity', 'DiseaseOrPhenotypicFeature'),
    ('DiseaseOrPhenotypicFeature', 'GeneOrGeneProduct'),
    ('SequenceVariant', 'DiseaseOrPhenotypicFeature'),
    ('ChemicalEntity', 'GeneOrGeneProduct'),
    ('DiseaseOrPhenotypicFeature', 'SequenceVariant'),
    ('DiseaseOrPhenotypicFeature', 'ChemicalEntity'),
    ('ChemicalEntity', 'ChemicalEntity'),
    ('GeneOrGeneProduct', 'ChemicalEntity'),
    ('SequenceVariant', 'ChemicalEntity'),
    ('ChemicalEntity', 'SequenceVariant'),
    ('SequenceVariant', 'SequenceVariant'),
    ('SequenceVariant', 'GeneOrGeneProduct'),
    ('GeneOrGeneProduct', 'SequenceVariant'),
]


In [15]:
!pip install spacy --upgrade

Requirement already up-to-date: spacy in /opt/conda/lib/python3.8/site-packages (3.2.2)


In [16]:
import spacy
#from spacy.lang.en import English

In [17]:
#!python -m spacy download en_core_web_sm

In [18]:
# Load the spaCy pipeline named "en_core_web_sm". It must already be installed
# (see the commented-out download command in the previous cell).
nlp = spacy.load("en_core_web_sm")

In [19]:
# Use abstract 1711760 (the first row of the abstracts preview) as the sample text.
text = test_id2abstr[1711760]

In [20]:
# Parse the sample text, then show its noun chunks, verb lemmas and named entities.
doc = nlp(text)

noun_phrases = [chunk.text for chunk in doc.noun_chunks]
print("Noun phrases:", noun_phrases)

verb_lemmas = [token.lemma_ for token in doc if token.pos_ == "VERB"]
print("Verbs:", verb_lemmas)

for entity in doc.ents:
    print(entity.text, entity.label_)


Noun phrases: ['The effect', 'induced hypertension', 'a 2-h delay', 'middle cerebral artery occlusion', 'MCAO', 'brain edema formation', 'histochemical injury', 'isoflurane anesthesia', 'the MCA', '14 spontaneously hypertensive rats', 'the control group', '=', 'the mean arterial pressure', 'MAP', 'the hypertensive group', '=', 'the MAP', '25-30 mm', '2 h', 'MCAO', 'MCAO', 'the rats', 'the brains', 'The brains', 'coronal planes', 'the distribution', 'ischemia', 'MCAO', 'Specific gravity', 'SG', 'the subcortex', 'two sites', 'the cortex', 'core', 'periphery', 'the ischemic territory', 'The extent', 'neuronal injury', '2,3,5-triphenyltetrazolium staining', 'the ischemic core', 'no difference', 'SG', 'the subcortex', 'cortex', 'the two groups', 'the periphery', 'the ischemic territory', 'SG', 'the cortex', 'greater (less edema accumulation', 'the hypertensive group', '+', 'P', 'The area', 'histochemical injury', 'a percent', 'the cross-sectional area', 'the hemisphere', 'the hypertensive g

In [35]:
def getSentences(text):
    """Split ``text`` into sentences with the spaCy pipeline ``nlp``.

    The argument itself is parsed here, so the result always corresponds to the
    text passed in rather than to whatever document the global ``doc`` holds.

    Args:
        text: string to segment.

    Returns:
        list of the sentence spans yielded by ``nlp(text).sents``.
    """
    return list(nlp(text).sents)
    

In [32]:
def printToken(token):
    """Print a token's text and its dependency label as ``<text> -> <dep>``."""
    word, dependency = token.text, token.dep_
    print(word, "->", dependency)

In [36]:
# Preview the sentence segmentation returned for the sample text.
getSentences(text)

[The effect of induced hypertension instituted after a 2-h delay following middle cerebral artery occlusion (MCAO) on brain edema formation and histochemical injury was studied.,
 Under isoflurane anesthesia, the MCA of 14 spontaneously hypertensive rats was occluded.,
 In the control group (n = 7), the mean arterial pressure (MAP) was not manipulated.,
 In the hypertensive group (n = 7), the MAP was elevated by 25-30 mm,
 Hg beginning 2 h after MCAO.,
 Four hours after MCAO, the rats were killed and the brains harvested.,
 The brains were sectioned along coronal planes spanning the distribution of ischemia produced by MCAO.,
 Specific gravity (SG) was determined in the subcortex and in two sites in the cortex (core and periphery of the ischemic territory).,
 The extent of neuronal injury was determined by 2,3,5-triphenyltetrazolium staining.,
 In the ischemic core, there was no difference in SG in the subcortex and cortex in the two groups.,
 In the periphery of the ischemic territory

In [37]:
def appendChunk(original, chunk):
    """Return ``original`` and ``chunk`` joined by a single space.

    No stripping is done: an empty ``original`` yields a leading space.
    """
    return ' '.join((original, chunk))

In [38]:
def isRelationCandidate(token):
    """Tell whether a token's dependency label marks it as part of the relation.

    The check is a substring match of each marker against ``token.dep_``.
    """
    label = token.dep_
    for marker in ("ROOT", "adj", "attr", "agent", "amod"):
        if marker in label:
            return True
    return False

In [40]:
def isConstructionCandidate(token):
    """Tell whether a token's dependency label marks it as a modifier/connector.

    The check is a substring match, so any label containing "mod"
    (e.g. "amod", "nummod") is accepted as well.
    """
    label = token.dep_
    for marker in ("compound", "prep", "conj", "mod"):
        if marker in label:
            return True
    return False

In [51]:
def processSubjectObjectPairs(tokens):
    """Build a crude (subject, relation, object) triple from parsed tokens.

    Every token is first echoed with ``printToken``. Punctuation tokens are
    then skipped. From the remaining tokens, in order, this collects:

    * the lemma of each token accepted by ``isRelationCandidate``,
    * the text of each token whose dependency label contains "subj",
    * the text of each token whose dependency label contains "obj".

    Each group is joined with single spaces. The triple is printed and returned.

    NOTE(review): an earlier version also kept "construction" buffers for
    compound/prep/conj/mod tokens, but they started empty and were only
    extended when already non-empty, so they never contributed to the result.
    That dead branch is dropped here; reinstate it deliberately if modifier
    tokens are meant to be attached to the subject/object.

    Args:
        tokens: iterable of token objects exposing ``text``, ``lemma_`` and
            ``dep_``.

    Returns:
        tuple ``(subject, relation, object)`` of whitespace-stripped strings;
        a part is ``''`` when no token matched it.
    """
    subject_words = []
    relation_lemmas = []
    object_words = []
    for token in tokens:
        printToken(token)
        if "punct" in token.dep_:
            continue
        # Independent checks: one token may feed more than one part.
        if isRelationCandidate(token):
            relation_lemmas.append(token.lemma_)
        if "subj" in token.dep_:
            subject_words.append(token.text)
        if "obj" in token.dep_:
            object_words.append(token.text)

    subject = ' '.join(subject_words).strip()
    relation = ' '.join(relation_lemmas).strip()
    obj = ' '.join(object_words).strip()  # named `obj` to avoid shadowing the builtin `object`

    print('sub', subject, ",pred:", relation, ",obj", obj)
    return (subject, relation, obj)

In [52]:
def processSentence(sentence):
    """Parse one sentence string with ``nlp`` and extract its triple.

    Returns whatever ``processSubjectObjectPairs`` returns for the parsed tokens.
    """
    return processSubjectObjectPairs(nlp(sentence))

In [54]:
# Smoke test: echo and process only the first sentence of the sample text.
for sent in getSentences(text)[:1]:
    print(sent)
    processSentence(str(sent))

The effect of induced hypertension instituted after a 2-h delay following middle cerebral artery occlusion (MCAO) on brain edema formation and histochemical injury was studied.
The -> det
effect -> nsubjpass
of -> prep
induced -> amod
hypertension -> pobj
instituted -> acl
after -> prep
a -> det
2 -> nummod
- -> punct
h -> compound
delay -> pobj
following -> prep
middle -> amod
cerebral -> amod
artery -> compound
occlusion -> pobj
( -> punct
MCAO -> appos
) -> punct
on -> prep
brain -> compound
edema -> compound
formation -> pobj
and -> cc
histochemical -> amod
injury -> conj
was -> auxpass
studied -> ROOT
. -> punct
sub effect ,pred: induce middle cerebral histochemical study ,obj hypertension delay occlusion formation


In [21]:
# Generate the test-set relation candidates.
#
# For each abstract, the title and abstract text are concatenated and split
# into sentences on '. '. Every ordered pair of distinct mentions found in the
# same sentence is kept when its (type, type) combination is listed in
# `entity_type_valid`.
#
# NOTE(review): a mention counts as "in" a sentence by plain substring match,
# so a short mention also matches inside longer words, and the offset columns
# of the entity table are not consulted. Mention types are looked up in the
# corpus-wide `mention2type`.
valid_type_pairs = set(entity_type_valid)  # O(1) membership instead of a list scan
relation_candidates = []
for abstract_id, abstract_entities in entities_test_df.groupby('abstract_id'):
    full_text = test_id2title[abstract_id] + ' ' + test_id2abstr[abstract_id]

    # Per-abstract mention -> entity id map (the last row wins for a repeated
    # mention). It has its own name so that the corpus-wide `mention2entityid`
    # built earlier is not overwritten.
    abstract_mention2id = dict(
        zip(abstract_entities['mention'], abstract_entities['entity_ids'])
    )

    for sentence in full_text.split('. '):
        # Dict keys are already unique; iterating them directly keeps a stable
        # order, whereas going through a set would let the row order vary
        # between runs.
        mentions_in_sentence = [m for m in abstract_mention2id if m in sentence]
        for first, second in permutations(mentions_in_sentence, 2):
            if (mention2type[first], mention2type[second]) in valid_type_pairs:
                relation_candidates.append([
                    abstract_id,
                    sentence,
                    first,
                    second,
                    abstract_mention2id[first],
                    abstract_mention2id[second],
                ])
    

In [19]:
# One row per candidate pair; columns follow the field order of each appended row.
candidate_columns = [
    'abstract_id',
    'sentence',
    'entity_1_mention',
    'entity_2_mention',
    'entity_1_id',
    'entity_2_id',
]
dataset_df = pd.DataFrame(relation_candidates, columns=candidate_columns)

In [15]:
# Show five random rows; the fixed seed makes the preview reproducible across runs.
dataset_df.sample(5, random_state=42)

Unnamed: 0,abstract_id,sentence,entity_1_mention,entity_2_mention,entity_1_id,entity_2_id
4578,22369755,These pro-oncogenic effects of NNK were abolis...,SLURP-1,NNK,57152,C016583
117,14510914,The diagnosis of ITD was suspected because of ...,iodide,ITD,D007454,C564766
4849,24114426,"RATIONALE: Ecstasy (3,4-methylenedioxymethamph...",MDMA,"3,4-methylenedioxymethamphetamine",D018817,D018817
866,15824163,Germ line mutations in BRAF have not been iden...,melanoma,BRAF,D008545,673
4563,22369755,NNK decreased expression of the CTNNB1 gene en...,FOXD3,tumor,27022,D009369


In [16]:
# Total number of candidate rows.
len(dataset_df)

8480

In [17]:
# Persist the candidates as CSV without writing the DataFrame index.
dataset_df.to_csv('../data/test_data2.csv', index=False)