# Relation extraction with co-occurrences and HuggingFace


## Getting documents with pre-extracted entities

In [None]:
import requests

# Example PubMed ID
pmid = "20573926"

# PubTator API endpoint that exports the article as BioC XML
url = f"https://www.ncbi.nlm.nih.gov/research/pubtator-api/publications/export/biocxml?pmids={pmid}"

# Single source of truth for the output file, so the message below cannot
# drift away from the path that is actually written.
output_path = "example.bioc.xml"

# Make the request; the timeout keeps the cell from hanging indefinitely
# if the server never answers.
response = requests.get(url, timeout=30)

# Check for successful response
if response.status_code == 200:
    biocxml = response.text

    # Save to a local file (optional)
    with open(output_path, "w", encoding="utf-8") as f:
        f.write(biocxml)

    print(f"BioC XML for PMID {pmid} saved as {output_path}")
else:
    print(f"Error: {response.status_code}")


In [None]:
from xml.dom.minidom import parseString

# Parse the raw response body into a DOM tree, then render it with
# two-space indentation so the nesting is easy to read.
dom = parseString(response.text)
pretty_xml = dom.toprettyxml(indent="  ")
print(pretty_xml)

In [None]:
from bioc import biocxml

# The file was written as UTF-8, so read it back as UTF-8 explicitly instead
# of relying on the platform's default encoding.
with open('example.bioc.xml', encoding='utf-8') as f:
    collection = biocxml.load(f)

In [None]:
# Number of documents in the loaded collection.
len(collection.documents)

In [None]:
# Take the first document of the collection for inspection.
document = collection.documents[0]

In [None]:

# Number of passages in that document.
len(document.passages)

In [None]:
# Take the document's first passage and show its text.
passage = document.passages[0]
passage.text

In [None]:
# Show the infons mapping attached to the passage.
passage.infons

In [None]:
# Number of annotations on the passage.
len(passage.annotations)

In [None]:
# Dump each annotation's text, infons and span position. The loop variable
# must stay `anno`: the `=` specifier prints the expression text itself.
for anno in passage.annotations:
    print(
        f"{anno.text=}\n"
        f"{anno.infons=}\n"
        f"{anno.total_span.offset=}\n"
        f"{anno.total_span.length=}\n"
    )

In [None]:
# List the attributes of an annotation object; `anno` is whatever the
# preceding loop left bound (the last annotation it visited).
dir(anno)

## Calculating co-occurrences

In [None]:
from bioc import biocxml

# Decode explicitly rather than with the platform's default encoding.
# NOTE(review): assumes collection.bioc.xml is UTF-8 encoded — confirm.
with open('collection.bioc.xml', "r", encoding="utf-8") as f:
    collection = biocxml.load(f)

In [None]:
# Number of documents in the collection used for co-occurrence counting.
len(collection.documents)

In [None]:
# Map each identifier to the (type, text) of an annotation that carries it,
# so identifiers can be printed in readable form. When several annotations
# share an identifier, the last one visited wins.
quick_lookup = {}
for doc in collection.documents:
    for passage in doc.passages:
        for anno in passage.annotations:
            identifier = anno.infons.get('identifier', '-')
            # Skip annotations with no identifier or the '-' placeholder,
            # the same filter the co-occurrence counting applies.
            if identifier != '-':
                quick_lookup[identifier] = (anno.infons['type'], anno.text)

In [None]:
# Number of distinct identifiers collected.
len(quick_lookup)

In [None]:
import itertools
from collections import Counter
# Number of documents in which each pair of identifiers appears together.
cooccurrences = Counter()
# Number of documents in which each identifier appears at least once.
counts = Counter()

for doc in collection.documents:
    # Distinct identifiers in this document; annotations with no identifier
    # or the '-' placeholder are left out.
    unique_identifiers = {
        anno.infons['identifier']
        for passage in doc.passages
        for anno in passage.annotations
        if anno.infons.get('identifier', '-') != '-'
    }

    # Each identifier counts once per document.
    counts.update(unique_identifiers)

    # Sets have no defined order, so the same pair could come out as (a, b)
    # in one document and (b, a) in another and be tallied under two keys.
    # Sorting first gives every pair one canonical key.
    # Assumes identifiers are mutually comparable (strings).
    for id1, id2 in itertools.combinations(sorted(unique_identifiers), 2):
        cooccurrences[(id1, id2)] += 1


In [None]:
# Print the 50 most frequent pairs with each identifier's (type, text) entry
# from quick_lookup. After the loop, id1, id2 and count stay bound to the
# last pair printed; the contingency-table cell that follows reads them.
for (id1,id2),count in cooccurrences.most_common(50):
  print(id1, quick_lookup[id1], id2, quick_lookup[id2], count)

In [None]:
# Total number of documents; every count below is a number of documents.
doc_count = len(collection.documents)

# Documents containing both identifiers. The key is order-sensitive, and a
# Counter returns 0 for a key that was never counted.
count_1_and_2 = cooccurrences[(id1, id2)]

# Documents containing id1 but not id2.
count_1_and_not_2 = counts[id1] - count_1_and_2

# Documents containing id2 but not id1.
count_2_and_not_1 = counts[id2] - count_1_and_2

# Documents containing neither identifier.
count_not_1_or_2 = doc_count - count_1_and_2 - count_1_and_not_2 - count_2_and_not_1

# 2x2 table: first row/column = "present", second row/column = "absent"
contingency_table = [[count_1_and_2, count_1_and_not_2],
                     [count_2_and_not_1, count_not_1_or_2]]

contingency_table

In [None]:
# Share of the documents containing id1 that also contain id2.
count_1_and_2 / (count_1_and_2+count_1_and_not_2)

In [None]:
# Share of all documents that contain id1.
# NOTE(review): if this is meant as the baseline for the ratio computed just
# before (id2 given id1), the comparable figure would be
# counts[id2] / doc_count — confirm which was intended.
counts[id1] / doc_count

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency

# Chi-squared test of independence on the 2x2 table; correction=False turns
# off Yates' continuity correction.
chi2, p, dof, expected = chi2_contingency(contingency_table, correction=False)

# p-value of the test
p

In [None]:
# Placeholder: visits every counted pair but does no work yet. It still
# rebinds id1, id2 and count to the last pair visited.
for (id1,id2),count in cooccurrences.items():
  pass

## Task

Run the calculation above (contingency table and chi-squared p-value) for every co-occurring pair, at scale, across a large dataset of BioC files.

## A rule-based approach

In [None]:
import json

# Read the JSON as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open('sentences2.json', encoding='utf-8') as f:
    sentences = json.load(f)

# Number of top-level entries loaded
len(sentences)

In [None]:
# Inspect one record (index 2) to see its structure.
sentences[2]

In [None]:
# Worked example: one sentence plus the chemical and disease mentions
# listed for it.
sentence = dict(
    text='Warfarin is used for stroke prevention, and studies suggest it may help manage deep vein thrombosis.',
    chemicals=['Warfarin'],
    diseases=['stroke prevention', 'deep vein thrombosis'],
)

In [None]:
# A rule is a literal text pattern with placeholders standing in for the
# chemical and disease mentions.
rule = "[CHEMICAL] is used for [DISEASE]"

In [None]:
# Every (chemical, disease) combination of the mentions listed for the sentence.
pairs = [ (chemical,disease) for chemical in sentence['chemicals'] for disease in sentence['diseases'] ]
pairs

In [None]:
# Work through the first candidate pair.
chemical, disease = pairs[0]
chemical, disease

In [None]:
# Swap the two mentions for placeholders so the sentence can be compared with
# the rule text. str.replace substitutes every occurrence, and the disease
# replacement runs on the string that already had the chemical replaced.
sentence_with_placeholders = sentence['text'].replace(chemical,'[CHEMICAL]').replace(disease,'[DISEASE]')
sentence_with_placeholders

In [None]:
# The rule matches when its text occurs verbatim anywhere in the
# placeholder version of the sentence.
rule_matches = rule in sentence_with_placeholders

# Report the outcome together with the pair and the rewritten sentence.
print(f"Match: {rule_matches}")
print(f"  [CHEMICAL]={chemical}")
print(f"  [DISEASE]={disease}")
print(f"  {sentence_with_placeholders}")

## Task

Apply to all the sentences and come up with more rules

In [None]:
# Literal patterns checked as substrings of a sentence whose mentions have
# been replaced by the [CHEMICAL] / [DISEASE] placeholders.
# NOTE(review): the worked-example rule "[CHEMICAL] is used for [DISEASE]"
# is not in this list — confirm whether it should be.
rules = [
    # Chemical mentioned before the disease
    "[CHEMICAL] is used to treat [DISEASE]",
    "[CHEMICAL] treats [DISEASE]",
    "[CHEMICAL] is effective against [DISEASE]",
    "[CHEMICAL] has been shown to treat [DISEASE]",
    "[CHEMICAL] therapy for [DISEASE]",
    "[CHEMICAL] has therapeutic effects on [DISEASE]",
    "[CHEMICAL] is indicated for the treatment of [DISEASE]",
    "[CHEMICAL] is administered to manage [DISEASE]",
    "[CHEMICAL] is prescribed for [DISEASE]",
    "[CHEMICAL] is a treatment option for [DISEASE]",
    "[CHEMICAL] can be used for [DISEASE] therapy",
    "[CHEMICAL] is beneficial for patients with [DISEASE]",
    # Noun-phrase forms
    "Treatment of [DISEASE] with [CHEMICAL]",
    "Use of [CHEMICAL] in the treatment of [DISEASE]",
    "[CHEMICAL] alleviates symptoms of [DISEASE]",
]


In [None]:
import re


def _with_placeholders(text, chemical, disease):
    """Return *text* with both mentions swapped for placeholders in one pass.

    Chained ``str.replace`` calls interfere with each other: replacing the
    chemical first breaks up a disease mention that contains it (e.g. "iron"
    inside "iron deficiency"). A single regex pass that tries the longer
    mention first avoids that, and never rescans inserted placeholder text.
    """
    # Drop empty mentions (an empty alternative would match everywhere) and
    # duplicates, then try the longest mention first.
    mentions = sorted(
        (m for m in dict.fromkeys((chemical, disease)) if m),
        key=len,
        reverse=True,
    )
    if not mentions:
        return text

    pattern = re.compile("|".join(re.escape(m) for m in mentions))
    return pattern.sub(
        lambda match: '[CHEMICAL]' if match.group(0) == chemical else '[DISEASE]',
        text,
    )


# Try every (chemical, disease) pair of every sentence against all rules and
# print the pairs for which at least one rule occurs in the rewritten sentence.
for sentence in sentences:
    pairs = [
        (chemical, disease)
        for chemical in sentence['chemicals']
        for disease in sentence['diseases']
    ]

    for chemical, disease in pairs:
        sentence_with_placeholders = _with_placeholders(sentence['text'], chemical, disease)

        if any(rule in sentence_with_placeholders for rule in rules):
            print(f"{chemical} | {disease} | {sentence_with_placeholders}")

## A basic Open Information Extraction method

In [None]:
# Worked example for the OpenIE section: a sentence and the two entity
# mentions to relate.
sentence = dict(
    text="Cetuximab binds to the epidermal growth factor receptor, blocking cancer cell proliferation.",
    entities=["Cetuximab", "epidermal growth factor receptor"],
)

In [None]:
# The entity pair to locate in the sentence.
pair = ("Cetuximab", "epidermal growth factor receptor")

# Character offset of the first occurrence of each mention; str.index raises
# ValueError when the mention is not in the text.
loc1 = sentence['text'].index(pair[0])
loc2 = sentence['text'].index(pair[1])

loc1, loc2

In [None]:
import spacy

# Load the spaCy pipeline named "en_core_web_sm".
nlp = spacy.load("en_core_web_sm")

text = sentence['text']

# Run the pipeline over the sentence to get tagged tokens.
doc = nlp(text)

# Print every token tagged as a verb together with its character offset.
for token in doc:
    if token.pos_ == "VERB":
        print(f"Verb: '{token.text}' at {token.idx}")


## Task

Apply one of the methods above to the large set of sentences

In [None]:
# Read the JSON as UTF-8 explicitly so the result does not depend on the
# platform's default encoding.
with open('sentences-openie.json', encoding='utf-8') as f:
    sentences = json.load(f)

In [None]:
# For each sentence, print the verb linking the two entities when exactly one
# verb lies in the text between the two mentions.
for sentence in sentences:
    entity1, entity2 = sentence['entities']

    doc = nlp(sentence['text'])

    # (character offset, text) of every token tagged as a verb
    verbs = [(token.idx, token.text) for token in doc if token.pos_ == "VERB"]

    # First occurrence of each mention; str.index raises ValueError if a
    # mention is absent from the text.
    loc1 = sentence['text'].index(entity1)
    loc2 = sentence['text'].index(entity2)

    # Put the mentions in textual order, keeping each start with its length.
    (first_start, first_length), (second_start, _) = sorted(
        [(loc1, len(entity1)), (loc2, len(entity2))]
    )

    # The gap begins at the END of the earlier mention, so a verb-tagged
    # token inside a multi-word first entity is not counted as "between".
    gap_start = first_start + first_length
    verbs_between = [
        verb for verb_loc, verb in verbs if gap_start <= verb_loc < second_start
    ]

    if len(verbs_between) == 1:
        print(f"{verbs_between[0]} | {entity1} | {entity2} | {sentence['text']}")



## Optional Extras

- Calculate p-values for each co-occurrence by creating a contingency matrix of document counts of when two entities appear (and appear together)