In [1]:
import re
import pandas as pd
import bs4
import requests
import spacy
from spacy import displacy
nlp = spacy.load('en_core_web_sm')

from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')
from nltk.corpus import stopwords
stop = stopwords.words('english')
import string

from spacy.matcher import Matcher 
from spacy.tokens import Span 

import networkx as nx
import matplotlib.pyplot as plt
from tqdm import tqdm

pd.set_option('display.max_colwidth', 200)
%matplotlib inline

[nltk_data] Downloading package punkt to /home/ali/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# import wikipedia sentences
candidate_sentences = pd.read_csv("DataSet/DDIdataset.csv")
candidate_sentences.shape

(6120, 3)

In [3]:
candidate_sentences['Sentence'].sample(5)

5363    The possibility of hypotensive effects with OTHER_DRUG or OTHER_DRUG can be minimized by either discontinuing the OTHER_DRUG or increasing the salt intake prior to initiation of treatment with OTH...
5420                                              Magnesium- and OTHER_DRUG-containing OTHER_DRUG, administered concomitantly with OTHER_DRUG, significantly decreased the bioavailability (48%) of OTHER_DRUG.
4069                                                                                                                                                                 Sedative/hypnotics: OTHER_DRUG, OTHER_DRUG
428     Drugs that reportedly may increase oral OTHER_DRUG response, ie, increased prothrombin response, in man include:OTHER_DRUG*;OTHER_DRUG;OTHER_DRUG;OTHER_DRUG;OTHER_DRUG;OTHER_DRUG;OTHER_DRUG;OTHER_...
2761    Interactions may also occur with the following: OTHER_DRUG/OTHER_DRUG, drugs used to treat an overactive thyroid, OTHER_DRUG (e.g., OTHER_DRUG), OTHER_DRUG, OTH

In [4]:
#For Cleaning Data
def review_cleaning(text):
    tokens = word_tokenize(text.replace('-', ' ').replace('/',' '))
    tokens = [w.lower() for w in tokens if len(w.lower())>2]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    words = [word for word in stripped if word.isalpha() and word not in stop]
    clean_text = ' '.join(words)
    return clean_text

In [5]:
def get_entities(sent):
  sent = review_cleaning(sent)

  ## chunk 1
  ent1 = ""
  ent2 = ""

  prv_tok_dep = ""    # dependency tag of previous token in the sentence
  prv_tok_text = ""   # previous token in the sentence

  prefix = ""
  modifier = ""

  #############################################################
  
  for tok in nlp(sent):
    ## chunk 2
    # if token is a punctuation mark then move on to the next token
    if tok.dep_ != "punct":
      # check: token is a compound word or not
      if tok.dep_ == "compound":
        prefix = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          prefix = prv_tok_text + " "+ tok.text
      
      # check: token is a modifier or not
      if tok.dep_.endswith("mod") == True:
        modifier = tok.text
        # if the previous word was also a 'compound' then add the current word to it
        if prv_tok_dep == "compound":
          modifier = prv_tok_text + " "+ tok.text
      
      ## chunk 3
      if tok.dep_.find("subj") == True:
        ent1 = modifier +" "+ prefix + " "+ tok.text
        prefix = ""
        modifier = ""
        prv_tok_dep = ""
        prv_tok_text = ""      

      ## chunk 4
      if tok.dep_.find("obj") == True:
        ent2 = modifier +" "+ prefix +" "+ tok.text
        
      ## chunk 5  
      # update variables
      prv_tok_dep = tok.dep_
      prv_tok_text = tok.text
  #############################################################

  return [ent1.strip(), ent2.strip()]

In [6]:
entity_pairs = []

for i in tqdm(candidate_sentences["Sentence"]):
  entity_pairs.append(get_entities(i))

100%|██████████| 6120/6120 [01:24<00:00, 72.68it/s]


In [7]:
entity_pairs[10:20]

[['', 'organophosphate carbamate insecticides'],
 ['patients', 'various clearance hydrodolasetron'],
 ['aid sleep drugs', 'drowsy'],
 ['otherdrug  otherdrug', 'loop'],
 ['rare interaction otherdrug nifedipine', ''],
 ['cold number antihyperlipidemics', 'otherdrug beta blockers'],
 ['otherdrug  otherdrug', ''],
 ['thioxanthene otherdrug monoamine oxidase inhibitors', 'otherdrug'],
 ['otherdrug ace otherdrug', 'adverse  interactions'],
 ['concomitant administration compounds', 'metabolism docetaxel']]

In [8]:
def get_relation(sent):
  sent = review_cleaning(sent)
  doc = nlp(sent)

  # Matcher class object 
  matcher = Matcher(nlp.vocab)

  #define the pattern 
  pattern = [{'DEP':'ROOT'}, 
            {'DEP':'prep','OP':"?"},
            {'DEP':'agent','OP':"?"},  
            {'POS':'ADJ','OP':"?"}] 

  matcher.add("matching_1", [pattern])
  matches = matcher(doc)
  #print("Matches -> ",matches)
  #print("Length of match",len(matches) - 1)
  k = len(matches) - 1
  #print("Len of K ", k)
  #print("Matches ", matches)
  #print("Matches k1",matches[k][1])
  #print("Matches k2",matches[k][2])
  
  #print("DOC of Matches k1",doc[matches[k][1]])
  #print("DOC of Matches k2",doc[matches[k][2]])
  #print("DOC of Matches k3",doc[matches[k][3]])

  #print(f"DOC ranges:K1 {doc[matches[k][1]]} and K2 {doc[matches[k][2]]}. ")
  #print("Matches ->", [doc[start:end].text for match_id, start, end in matches])

  span = doc[matches[k][1]:matches[k][2]] 
  
  #print("SPAN.text",doc[matches[k][1]:matches[k][2]].text)
  #print(span.text)
  return(span.text)

In [9]:
relations = [get_relation(i) for i in tqdm(candidate_sentences['Sentence'])]

100%|██████████| 6120/6120 [01:22<00:00, 73.79it/s]


In [10]:
pd.Series(relations).value_counts()[:50]

otherdrug                701
otherdrug otherdrug      396
found                    282
interact following       145
alter                    137
increase                 114
used                      87
reported                  84
decreased                 84
include                   69
reduce                    69
enhance neuromuscular     69
include otherdrug         63
interact                  61
increase oral             59
vitro                     58
wort                      53
include acute             53
inhibit                   53
enhance                   46
occur                     45
use                       44
decrease                  43
administered              42
resulted                  41
increased                 41
shown                     39
otherdrug overdosage      38
interfere                 38
result                    38
cause                     37
affect                    36
metabolized               36
tell                      33
substrates    

In [11]:
# extract subject
source = [i[0] for i in entity_pairs]

# extract object
target = [i[1] for i in entity_pairs]

kg_df = pd.DataFrame({'source':source, 'target':target, 'edge':relations})
kg_df

Unnamed: 0,source,target,edge
0,reason dose adrenaline,antiarrhythmic agent,restricted antiarrhythmic
1,eg eg otherdrug,otherdrug eg,otherdrug
2,otherdrug glucocorticoids,otherdrug otherdrug,found
3,therefore midamor otherdrug,desired effect diuretic,obtained
4,otherdrug otherdrug,healthy subjects,shown otherdrug
...,...,...,...
6115,agents inhibitors,otherdrug grapefruit juice otherdrug,found
6116,otherdrug,metabolism otherdrug result,vitro
6117,prostaglandin synthetase inhibitors,adverse interactions,used clinical
6118,oral solution,systemic absorption concentrations,interfere


In [12]:
# create a directed-graph from a dataframe
G=nx.from_pandas_edgelist(kg_df, "source", "target", 
                          edge_attr=True, create_using=nx.MultiDiGraph())



In [14]:
plt.figure(figsize=(12,12))

pos = nx.spring_layout(G)
nx.draw(G, with_labels=True, node_color='red', edge_cmap=plt.cm.Blues, pos = pos)
plt.show()