In [1]:
from src import Match, Icsr
from src.utils import get_matches

from datetime import datetime
import random
import datasets
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load matches: fetch the raw dataset from the Hub and pass its 'train' split to get_matches
dataset = datasets.load_dataset("FAERS-PubMed/raw_dataset")
matches = get_matches(dataset['train'])
# number of matches before any filtering
print(len(matches))

Using custom data configuration FAERS-PubMed--raw_dataset-0b83cc0b498dbbb2
Found cached dataset json (/Users/kldooste/.cache/huggingface/datasets/FAERS-PubMed___json/FAERS-PubMed--raw_dataset-0b83cc0b498dbbb2/0.0.0/e6070c77f18f01a5ad4551a8b7edfba20b8438b7cad4d94e6ad9378022ce4aab)
100%|██████████| 1/1 [00:00<00:00, 50.42it/s]


65648


In [3]:
# arguments
report_cutoff = 10  # keep only matches with at most this many reports
fulltext_only = True  # when True, keep only matches whose article has a full text
commercial_only = False  # when True, keep only articles whose full-text license is in `commercial_licenses`
test_cutoff = datetime(year=2021, month=1, day=1)  # intended date boundary for the train/test split

### Filter out articles with too many reports
A few articles are linked to many reports. These are typically survey articles, so we drop them.

In [4]:
# drop every match that is linked to more than `report_cutoff` reports
matches = [match for match in matches if report_cutoff >= len(match.reports)]
print(f'Matches with <= {report_cutoff} reports: {len(matches):,}')

Matches with <= 10 reports: 62,168


### Get articles with a full text

In [5]:
# optionally restrict the data to articles that come with a full text
if fulltext_only:
    matches = [match for match in matches if match.article.fulltext]
    print(f'Matches with full text: {len(matches):,}')

Matches with full text: 18,678


## (Optional) Keep only articles with a license that permits commercial use

In [6]:
# licenses accepted by the commercial-only filter below; the non-commercial
# counterparts are 'CC BY-NC', 'CC BY-NC-SA' and 'CC BY-NC-ND'
commercial_licenses = {'CC0', 'CC BY', 'CC BY-SA', 'CC BY-ND'}

# optionally restrict the data to articles whose full-text license is in that set
if commercial_only:
    matches = [match for match in matches if match.article.fulltext_license in commercial_licenses]
    print(f'Fulltext commercial dataset: {len(matches):,}')

## Split the data

In [7]:
def split_data(matches, test_cutoff):
    """Split matches chronologically into a train and a test portion.

    Only the first four characters of ``m.article.pubdate`` (the year) are
    parsed, so every article is dated to January 1st of its publication year
    before it is compared with ``test_cutoff``.

    Args:
        matches: iterable of objects exposing ``article.pubdate`` as a string
            that starts with a four-digit year.
        test_cutoff: ``datetime``; articles dated on or after it go to test.

    Returns:
        Tuple ``(train_matches, test_matches)`` of lists, each preserving the
        input order.
    """
    train_matches, test_matches = [], []

    for match in matches:
        published = datetime.strptime(match.article.pubdate[:4], '%Y')
        # on-or-after the cutoff means test, everything earlier means train
        bucket = test_matches if published >= test_cutoff else train_matches
        bucket.append(match)

    print(f'Train size: {len(train_matches):,}')
    print(f'Test size: {len(test_matches):,}')

    return train_matches, test_matches

In [8]:
# Reuse the cutoff configured in the arguments cell instead of re-declaring a
# hard-coded date here, so changing `test_cutoff` actually changes the split.
# The `cutoff` name is kept in case other cells refer to it.
cutoff = test_cutoff
train, test = split_data(matches, cutoff)

Train size: 14,429
Test size: 4,249


### Format the data
For every article, sample one report as the target.

Drop some fields and upload to the Hugging Face Hub.

In [9]:

def get_icsrs_from_split(split):
    """Sample one icsr per match in ``split``.

    Every report of a match is passed to ``Icsr.from_report``; falsy results
    are discarded (presumably reports that could not be converted - confirm
    against ``Icsr.from_report``). One of the remaining icsrs is picked at
    random.

    The global ``random`` generator is re-seeded with 42 on every call, so the
    sampling is reproducible for a given input.

    Returns:
        A list aligned with ``split``: the sampled icsr for each match, or
        ``None`` when the match has no usable icsr.
    """
    random.seed(42)

    sampled = []
    for match in split:
        # convert all reports first, then keep only the truthy results
        converted = [Icsr.from_report(report) for report in match.reports]
        candidates = [icsr for icsr in converted if icsr]
        # None keeps the output aligned with the input when nothing is usable
        sampled.append(random.choice(candidates) if candidates else None)
    return sampled

def get_ds_from_split(split):
    """Turn a list of matches into a ``datasets.Dataset`` with a 'target' column.

    One icsr is sampled per match via ``get_icsrs_from_split``; matches without
    a sampled icsr are dropped. Each remaining article is converted with
    ``article.dict()`` and gets the icsr's ``to_string()`` value under the key
    'target'. The columns are then selected in a fixed order and missing
    values are replaced by empty strings.

    Returns:
        ``datasets.Dataset`` built from the resulting pandas DataFrame.
    """
    sampled = get_icsrs_from_split(split)

    # pair every match with its sampled icsr and keep only the pairs that have one
    kept = [(match, icsr) for match, icsr in zip(split, sampled) if icsr]
    print(f'Found samples with valid icsrs: {len(kept):,}')

    # format the data: article fields plus the serialized icsr as target
    records = [match.article.dict() for match, _ in kept]
    for record, (_, icsr) in zip(records, kept):
        record['target'] = icsr.to_string()

    # fixed column order of the published dataset
    column_order = [
        'title', 'abstract', 'fulltext', 'target', 'pmid', 'fulltext_license',
        'title_normalized', 'issue', 'pages', 'journal', 'authors', 'pubdate',
        'doi', 'affiliations', 'medline_ta', 'nlm_unique_id', 'issn_linking',
        'country', 'mesh_terms', 'publication_types', 'chemical_list',
        'keywords', 'references', 'delete', 'pmc', 'other_id',
    ]
    frame = pd.DataFrame(data=records)[column_order].fillna('')
    return datasets.Dataset.from_pandas(frame)

In [10]:
# build one dataset per split
train_ds = get_ds_from_split(train)
test_ds = get_ds_from_split(test)

# shuffle each split with a fixed seed so the row order is reproducible
train_ds = train_ds.shuffle(seed=42)
test_ds = test_ds.shuffle(seed=42)

Found samples with valid icsrs: 12,031
Found samples with valid icsrs: 3,628


In [11]:
# create a validation set: the first 80% of the rows of train_ds stay in train,
# the remaining rows become the validation set
# NOTE: this cell overwrites train_ds, so running it a second time without
# re-running the previous cell splits the already-reduced training set again
len_train = int(len(train_ds) * 0.8)

# val_ds must be selected before train_ds is overwritten on the next line
val_ds = train_ds.select(range(len_train,len(train_ds)))
train_ds = train_ds.select(range(len_train))

In [14]:
# report the final number of samples in each split
for split_label, split_ds in (('train', train_ds), ('val', val_ds), ('test', test_ds)):
    print(f'{split_label} size: ', len(split_ds))

train size:  9624
val size:  2407
test size:  3628


### Upload the data

In [16]:
# bundle the three splits into a single DatasetDict
ds = datasets.DatasetDict({
    'train': train_ds,
    'validation': val_ds,
    'test': test_ds
})

# push all splits to the 'FAERS-PubMed/BioDEX-ICSR' repository on the Hub
ds.push_to_hub('FAERS-PubMed/BioDEX-ICSR')

Pushing split train to the Hub.
Upload 1 LFS files: 100%|██████████| 1/1 [04:19<00:00, 259.02s/it]0:00<?, ?it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [04:22<00:00, 262.22s/it]
Pushing split validation to the Hub.
Upload 1 LFS files: 100%|██████████| 1/1 [01:04<00:00, 64.24s/it]00:00<?, ?it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [01:07<00:00, 67.03s/it]
Pushing split test to the Hub.
Upload 1 LFS files: 100%|██████████| 1/1 [01:39<00:00, 99.32s/it]00:00<?, ?it/s]
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [01:41<00:00, 101.16s/it]


### Load the data

In [2]:
# load the dataset back from the Hub; `ds` now refers to this downloaded copy
ds = datasets.load_dataset('FAERS-PubMed/BioDEX-ICSR')

Downloading readme: 100%|██████████| 1.48k/1.48k [00:00<00:00, 1.32MB/s]
Using custom data configuration FAERS-PubMed--BioDEX-ICSR-40aa49fec6af4868


Downloading and preparing dataset None/None to /Users/kldooste/.cache/huggingface/datasets/FAERS-PubMed___parquet/FAERS-PubMed--BioDEX-ICSR-40aa49fec6af4868/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data: 100%|██████████| 51.7M/51.7M [00:21<00:00, 2.40MB/s]
Downloading data: 100%|██████████| 202M/202M [01:24<00:00, 2.40MB/s]]
Downloading data: 100%|██████████| 83.9M/83.9M [00:35<00:00, 2.39MB/s]
Downloading data files: 100%|██████████| 3/3 [02:22<00:00, 47.45s/it]
Extracting data files: 100%|██████████| 3/3 [00:00<00:00, 1801.16it/s]
                                

Dataset parquet downloaded and prepared to /Users/kldooste/.cache/huggingface/datasets/FAERS-PubMed___parquet/FAERS-PubMed--BioDEX-ICSR-40aa49fec6af4868/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.


100%|██████████| 3/3 [00:00<00:00, 131.08it/s]


In [10]:
# get the ranges
def get_ranges(dates):
    """Return ``(earliest_year, latest_year)`` for a collection of date strings.

    Only the first four characters of each entry are used; they must form an
    integer year.
    """
    years = {int(stamp[:4]) for stamp in dates}
    return min(years), max(years)

# publication-year range of every split, in train / validation / test order
for split_name in ('train', 'validation', 'test'):
    print(get_ranges(ds[split_name]['pubdate']))

(1990, 2020)
(1985, 2020)
(2021, 2022)
