In [1]:
from src import Match, Icsr
from src.utils import get_matches

from datetime import datetime
import random
import datasets
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load matches
dataset = datasets.load_dataset("BioDEX/raw_dataset")
matches = get_matches(dataset['train'])
print(len(matches))

Using custom data configuration BioDEX--raw_dataset-0b83cc0b498dbbb2


Downloading and preparing dataset json/BioDEX--raw_dataset to /Users/kldooste/.cache/huggingface/datasets/BioDEX___json/BioDEX--raw_dataset-0b83cc0b498dbbb2/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab...


Downloading data: 100%|██████████| 46.8M/46.8M [00:19<00:00, 2.34MB/s]
Downloading data: 100%|██████████| 46.7M/46.7M [00:24<00:00, 1.88MB/s]
Downloading data: 100%|██████████| 47.2M/47.2M [00:27<00:00, 1.73MB/s]
Downloading data: 100%|██████████| 46.8M/46.8M [00:26<00:00, 1.74MB/s]
Downloading data: 100%|██████████| 45.6M/45.6M [00:25<00:00, 1.77MB/s]
Downloading data: 100%|██████████| 45.9M/45.9M [00:25<00:00, 1.77MB/s]
Downloading data: 100%|██████████| 25.9M/25.9M [00:14<00:00, 1.76MB/s]
Downloading data files: 100%|██████████| 1/1 [02:54<00:00, 174.84s/it]
Extracting data files: 100%|██████████| 1/1 [00:04<00:00,  4.43s/it]
                                

Dataset json downloaded and prepared to /Users/kldooste/.cache/huggingface/datasets/BioDEX___json/BioDEX--raw_dataset-0b83cc0b498dbbb2/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:00<00:00, 42.54it/s]


65648


In [3]:
# arguments
report_cutoff = 10
# fulltext_only = True
fulltext_only = False
commercial_only = False
test_cutoff = datetime(year=2021, month=1, day=1)

### Filter too many reports
A few articles have many reports. They are typically survey articles, drop them.

In [4]:
matches = [m for m in matches if len(m.reports) <= report_cutoff]
print(f'Matches with <= {report_cutoff} reports: {len(matches):,}')

Matches with <= 10 reports: 62,168


### Get articles with a full text

In [5]:
if fulltext_only:
    matches = [m for m in matches if m.article.fulltext]
    print(f'Matches with full text: {len(matches):,}')

## (optional) Get articles with commercial license

In [6]:
# noncommercial_licenses = {'CC BY-NC', 'CC BY-NC-SA', 'CC BY-NC-ND'}
commercial_licenses = {'CC0', 'CC BY', 'CC BY-SA', 'CC BY-ND'}

if commercial_only:
    matches = [m for m in matches if m.article.fulltext_license in commercial_licenses]
    print(f'Fulltext commercial dataset: {len(matches):,}')

## Split the data

In [7]:
def split_data(matches, test_cutoff):
    test_matches = []
    train_matches = []

    for m in matches:
        pubdate = datetime.strptime(m.article.pubdate[:4], '%Y')
        if  pubdate >= test_cutoff:
            test_matches.append(m)
        else:
            train_matches.append(m)

    print(f'Train size: {len(train_matches):,}')
    print(f'Test size: {len(test_matches):,}')

    return train_matches, test_matches

In [8]:
cutoff = datetime(year=2021, month=1, day=1)
train, test = split_data(matches, cutoff)

Train size: 52,252
Test size: 9,916


### Format the data
For every article, sample one report as target.

Drop some fields and upload to huggingface.

In [9]:

def get_icsrs_from_split(split):
    random.seed(42)

    icsrs = []
    for m in split:
        # get all valid icsrs 
        new_icsrs = [Icsr.from_report(r) for r in m.reports]
        new_icsrs = [t for t in new_icsrs if t]
        # sample one
        if new_icsrs:
            new_icsr = random.choice(new_icsrs)
            icsrs.append(new_icsr)
        else:
            icsrs.append(None)
    return icsrs

def get_ds_from_split(split):
    icsrs = get_icsrs_from_split(split)
    # only keep data with valid icsrs
    valid = [m for index,m in enumerate(split) if icsrs[index]]
    icsrs = [i for i in icsrs if i]
    print(f'Found samples with valid icsrs: {len(valid):,}')

    # format the data
    data = [m.article.dict() for m in valid]
    for d, i in zip(data, icsrs):
        d.update({
            'target': i.to_string()
        })

    df = pd.DataFrame(data=data)
    # reorder some of the columns
    df = df[['title', 'abstract','fulltext','target', 'pmid', 'fulltext_license', 'title_normalized','issue', 'pages', 'journal', 'authors', 'pubdate', 'doi', 'affiliations', 'medline_ta', 'nlm_unique_id', 'issn_linking', 'country', 'mesh_terms', 'publication_types', 'chemical_list', 'keywords', 'references', 'delete', 'pmc', 'other_id']]
    df = df.fillna('')
    ds = datasets.Dataset.from_pandas(df)
    return ds

In [10]:
train_ds = get_ds_from_split(train)
test_ds = get_ds_from_split(test)

train_ds = train_ds.shuffle(42)
test_ds = test_ds.shuffle(42)

Found samples with valid icsrs: 40,294
Found samples with valid icsrs: 8,053


In [11]:
# create a validation set
len_train = int(len(train_ds) * 0.8)

val_ds = train_ds.select(range(len_train,len(train_ds)))
train_ds = train_ds.select(range(len_train))

In [12]:
print('train size: ', len(train_ds))
print('val size: ', len(val_ds))
print('test size: ', len(test_ds))

train size:  32235
val size:  8059
test size:  8053


### Preprocess fulltext

In [13]:
def remove_front(text):
    if '==== Body' in text:
        text = ('\n').join(text.split('==== Body')[1:])
    return text.strip()

def remove_refs(text):
    if '==== Refs' in text:
        text = ('\n').join(text.split('==== Refs')[:-1])
    return text.strip() 

def get_fulltext_input(row, fulltext_only=False):
    fulltext_filtered = remove_refs(remove_front(row['fulltext']))
    data = ['\nTITLE:', row['title'], '\nABSTRACT:', row['abstract']]
    if fulltext_only:
        data = data.extend(['\nTEXT:', fulltext_filtered])
    return ('\n').join(data).strip()

# numbered_titles_re = r'^(\d[\.\d]*) (.*)\n'
# capitalized_titles_re = r'^([A-Z ]+)\n'

In [14]:
ds = datasets.DatasetDict({
    'train': train_ds,
    'validation': val_ds,
    'test': test_ds
})

In [16]:
ds = ds.map(lambda example: {'fulltext_processed': get_fulltext_input(example, fulltext_only=fulltext_only)})

100%|██████████| 32235/32235 [00:03<00:00, 8478.86ex/s]
100%|██████████| 8059/8059 [00:00<00:00, 8447.30ex/s]
100%|██████████| 8053/8053 [00:01<00:00, 7607.17ex/s]


### Upload the data

In [18]:
if fulltext_only:
    ds.push_to_hub('BioDEX/BioDEX-ICSR')
else:
    ds.push_to_hub('BioDEX/BioDEX-ICSR-Abstract')


Pushing split train to the Hub.
Upload 1 LFS files: 100%|██████████| 1/1 [05:53<00:00, 353.23s/it]0:00<?, ?it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [05:54<00:00, 354.99s/it]
Pushing split validation to the Hub.
Upload 1 LFS files: 100%|██████████| 1/1 [01:20<00:00, 80.75s/it]00:00<?, ?it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [01:21<00:00, 81.75s/it]
Pushing split test to the Hub.
Upload 1 LFS files: 100%|██████████| 1/1 [01:58<00:00, 118.89s/it]0:00<?, ?it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [02:00<00:00, 120.03s/it]


### Load the data

In [2]:
ds = datasets.load_dataset('BioDEX/BioDEX-ICSR')

Using custom data configuration BioDEX--BioDEX-ICSR-40aa49fec6af4868
Found cached dataset parquet (/Users/kldooste/.cache/huggingface/datasets/BioDEX___parquet/BioDEX--BioDEX-ICSR-40aa49fec6af4868/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec)
100%|██████████| 3/3 [00:00<00:00, 252.46it/s]


In [3]:
# get the ranges
def get_ranges(dates):
    dates = [int(d[:4]) for d  in dates]
    return min(dates), max(dates)

print(get_ranges(ds['train']['pubdate']))
print(get_ranges(ds['validation']['pubdate']))
print(get_ranges(ds['test']['pubdate']))

(1990, 2020)
(1985, 2020)
(2021, 2022)


In [9]:
# average target length
target_lengths = [len(e['target']) for e in ds['train']]
print(sum(target_lengths) / len(target_lengths))
print(max(target_lengths))

# get percentile of a length
# Calculate percentile
import numpy as np
length = 128
percentile = np.percentile(target_lengths, (length / len(target_lengths)) * 100)

# Print percentile
print("The percentile of", length, "is", percentile)

163.05382377389859
1325
The percentile of 128 is 69.0
