## Load the raw resource

In [None]:
import datasets

# Load the raw BioDEX resource and keep only its 'train' split.
raw_splits = datasets.load_dataset("BioDEX/raw_dataset")
dataset = raw_splits['train']

print(len(dataset)) # 65,648

# Use the entry at index 1 as a worked example: its article and the first report attached to it.
entry = dataset[1]
article = entry['article']
report = entry['reports'][0]

# --- article-level fields ---
print(article['title'])    # Case Report: Perioperative Kounis Syndrome in an Adolescent With Congenital Glaucoma.
print(article['abstract']) # A 12-year-old male patient suffering from congenital glaucoma developed bradycardia, ...
print(article['fulltext']) # ...
print(article['fulltext_license']) # CC BY

# --- report-level fields (everything shown here is nested under 'patient') ---
patient = report['patient']
print(patient['patientsex']) # 1

# 'drug' is a list; each item nests the substance name under 'activesubstance'
drugs = patient['drug']
print(drugs[0]['activesubstance']['activesubstancename']) # ATROPINE SULFATE
print(drugs[0]['drugadministrationroute']) # 040
print(drugs[1]['activesubstance']['activesubstancename']) # MIDAZOLAM
print(drugs[1]['drugindication']) # Anaesthesia

# 'reaction' is a list as well
reactions = patient['reaction']
print(reactions[0]['reactionmeddrapt'])  # Kounis syndrome
print(reactions[1]['reactionmeddrapt'])  # Hypersensitivity

Optional: using our custom code. This takes some extra time to parse.

In [None]:
import datasets
from src.utils import get_matches

# Load the raw BioDEX resource, keep its 'train' split, and pass it through the
# project helper get_matches (from src.utils). The result is indexed and read via
# attribute access below instead of dictionary keys.
raw_train = datasets.load_dataset("BioDEX/raw_dataset")['train']
dataset = get_matches(raw_train)

print(len(dataset)) # 65,648

# Use the entry at index 1 as a worked example: its article and the first report attached to it.
entry = dataset[1]
article = entry.article
report = entry.reports[0]

# --- article-level fields ---
print(article.title)    # Case Report: Perioperative Kounis Syndrome in an Adolescent With Congenital Glaucoma.
print(article.abstract) # A 12-year-old male patient suffering from congenital glaucoma developed bradycardia, ...
print(article.fulltext) # ...
print(article.fulltext_license) # CC BY

# --- report-level fields (everything shown here hangs off `patient`) ---
patient = report.patient
print(patient.patientsex) # 1

# `drug` is indexable; each item exposes the substance name via `activesubstance`
drugs = patient.drug
print(drugs[0].activesubstance.activesubstancename) # ATROPINE SULFATE
print(drugs[0].drugadministrationroute) # 040
print(drugs[1].activesubstance.activesubstancename) # MIDAZOLAM
print(drugs[1].drugindication) # Anaesthesia

# `reaction` is indexable as well
reactions = patient.reaction
print(reactions[0].reactionmeddrapt)  # Kounis syndrome
print(reactions[1].reactionmeddrapt)  # Hypersensitivity


## Load the Report-Extraction dataset

In [None]:
import datasets

# Load all splits of the report-extraction dataset.
dataset = datasets.load_dataset("BioDEX/BioDEX-ICSR")

# Print the size of each split, in this order:
#   train 9,624 / validation 2,407 / test 3,628
for split_name in ('train', 'validation', 'test'):
    print(len(dataset[split_name]))

# Look at the first training example: the model input and its target string.
train_split = dataset['train']
example = train_split[0]

# Only the first 1000 characters of the processed full text are shown.
preview = example['fulltext_processed'][:1000]
print(preview, '...') # TITLE: # SARS-CoV-2-related ARDS in a maintenance hemodialysis patient ...
print(example['target']) # serious: 1 patientsex: 1 drugs: ACETAMINOPHEN, ASPIRIN ...

## Use the Report-Extraction model

In [None]:
from transformers import AutoTokenizer, T5ForConditionalGeneration
import datasets

# load the report-extraction dataset
dataset = datasets.load_dataset("BioDEX/BioDEX-ICSR")

# load the model and its tokenizer from the same checkpoint
model_path = "BioDEX/flan-t5-large-report-extraction"
model = T5ForConditionalGeneration.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained(model_path)

# get an input and encode it
# NOTE: named `input_text` (not `input`) so the built-in input() is not shadowed
# for the remainder of the kernel session.
input_text = dataset['validation'][1]['fulltext_processed']
# truncate/pad to 2048 tokens and return PyTorch tensors
input_encoded = tokenizer(input_text, max_length=2048, truncation=True, padding="max_length", return_tensors='pt')

# forward pass (generation is capped at max_length=256)
output_encoded = model.generate(**input_encoded, max_length=256)

# decode the generated ids back to text; the batch holds a single sequence,
# so `output` ends up as one string
decoded_batch = tokenizer.batch_decode(output_encoded, skip_special_tokens=True)
output = decoded_batch[0]

print(output) # serious: 1 patientsex: 2 drugs: AMLODIPINE BESYLATE, LISINOPRIL reactions: Intentional overdose, Metabolic acidosis, Shock

Evaluate performance

In [None]:
from src import Icsr
# Reference target string for validation example 1.
# `dataset` and `output` are expected to come from the preceding model cell.
validation_example = dataset['validation'][1]
target = validation_example['target']

# Build an Icsr from each string: first the reference, then the model output.
target_icsr = Icsr.from_string(target)
output_icsr = Icsr.from_string(output)

# Score the prediction against the reference and show the result.
score = output_icsr.score(target_icsr)
print(score)

## Get all full-text papers with a commercial license

In [None]:
import datasets

# Load the raw BioDEX resource and keep only its 'train' split.
raw_splits = datasets.load_dataset("BioDEX/raw_dataset")
dataset = raw_splits['train']
print(len(dataset)) # 65,648

# Licenses treated as allowing commercial use; the full text of any paper whose
# license is not listed here gets removed.
commercial_licenses = {
    'CC0',
    'CC BY',
    'CC BY-SA',
    'CC BY-ND',
}

def remove_noncom_paper(example):
    """Clear the full text of an article whose license is not in `commercial_licenses`.

    The example is modified in place and the same object is returned. Only
    ``example['article']['fulltext']`` is ever changed (set to ``None``); all
    other data in the example is kept.

    Args:
        example: mapping with an ``'article'`` entry that holds the keys
            ``'fulltext_license'`` and ``'fulltext'``.

    Returns:
        The ``example`` that was passed in.
    """
    article = example['article']
    is_commercial = article['fulltext_license'] in commercial_licenses
    if not is_commercial:
        article['fulltext'] = None
    return example

# Run the license filter over every example of the dataset.
dataset_commercial = dataset.map(function=remove_noncom_paper)
print(len(dataset_commercial)) # 65,648 (no examples were dropped, only some fulltext fields were removed)